Example #1
    def fit(self, data, args):
        self.model = MinMaxScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
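The Timer helper is not shown in this excerpt; a minimal sketch of a context manager exposing the interval attribute the snippet relies on (an assumption, not the benchmark suite's actual implementation):

import time

class Timer:
    """Minimal timing context manager; exposes the elapsed time as `interval`."""

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.interval = time.perf_counter() - self._start
        return False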
Example #2
def test_min_max_scaler_1d():
    # Test scaling of dataset along single axis
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
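Note that this test targets an older scikit-learn; current releases reject 1-D input to fit/transform. A minimal sketch of the reshape that recent versions require (assuming only numpy and scikit-learn):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.random.RandomState(0).randn(5).reshape(-1, 1)  # single feature as a column
X_scaled = MinMaxScaler().fit_transform(X)
assert np.isclose(X_scaled.min(), 0.0) and np.isclose(X_scaled.max(), 1.0)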
Example #3
def load_data(n_samples, label_scaling: bool = False):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values
    y = np.hstack(
        [rg.reshape(-1, 1),
         gibbs.reshape(-1, 1),
         gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    greedy_indices = get_maxmin_samples(X, n_samples)

    nan_indices = np.unique(np.random.randint(0, len(y) - 1, int(len(y) / 3)))
    y[nan_indices, 2] = np.nan
    return X, y, greedy_indices
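When label_scaling is enabled, downstream predictions live in [0, 1]; they can be mapped back to the original units with the fitted scaler. A self-contained sketch of the round trip (illustrative values, not Brian's data):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

y = np.array([[1.0], [3.0], [5.0]])
label_scaler = MinMaxScaler()
y_scaled = label_scaler.fit_transform(y)            # values in [0, 1]
y_back = label_scaler.inverse_transform(y_scaled)   # back to original units
assert np.allclose(y, y_back)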
Example #4
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
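The feature_range behavior being tested follows directly from the documented formula: standardize each column to [0, 1], then stretch to (min, max). A sketch that reproduces it by hand (illustrative data):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0], [2.0], [4.0]])
fmin, fmax = -0.5, 0.6
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_manual = X_std * (fmax - fmin) + fmin
X_sklearn = MinMaxScaler(feature_range=(fmin, fmax)).fit_transform(X)
assert np.allclose(X_manual, X_sklearn)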
Example #5
 def parkinsons_replicated_data(self, park_dat):
     df_parkinson = pd.read_csv(park_dat, sep=',')
     ylabel = df_parkinson['Status']
     xfeatures = df_parkinson.drop(['Status', 'ID'], axis=1)
     xfeats = df_parkinson.drop(['Status', 'ID'], axis=1).values
     x = (xfeats - np.min(xfeats)) / (np.max(xfeats) - np.min(xfeats))
     y = df_parkinson['Status'].values
     xfeatsp = pd.DataFrame(xfeatures)
     minmax_scaling = MinMaxScaler()
     x_scaledp = minmax_scaling.fit_transform(xfeatsp)
     x_scaledp = pd.DataFrame(x_scaledp, columns=xfeatures.columns)  # keep names for the axis labels below
     f1 = plt.figure(figsize=(19, 16))
     plt.matshow(x_scaledp.corr(), fignum=f1.number)
     plt.xticks(range(x_scaledp.shape[1]),
                x_scaledp.columns,
                fontsize=10,
                rotation=45)
     cb = plt.colorbar()
     cb.ax.tick_params(labelsize=12)
     plt.show()
     for eachx in xfeatures:
         # proper min-max: divide by the range, not by the max alone
         xfeatures[eachx] = (xfeatures[eachx] - xfeatures[eachx].min()) / (
             xfeatures[eachx].max() - xfeatures[eachx].min())
     ylabel = ylabel.values
     # ydata = ylabel[:, None]
     xdata = x_scaledp.to_numpy()
     targets = np.array(ylabel).reshape(-1)
     y = np.eye(2)[targets]
     xtrain, xtest, y_train, y_test = train_test_split(
         x, y, test_size=0.30)  #, shuffle=False)
     print(y_test)
     #y_train = ytrain[:, None]
     #y_test = ytest[:, None]
     return xtrain, xtest, y_train, y_test
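The np.eye(2)[targets] idiom above one-hot encodes the binary labels; a minimal standalone check:

import numpy as np

targets = np.array([0, 1, 1, 0])
onehot = np.eye(2)[targets]
# [[1., 0.], [0., 1.], [0., 1.], [1., 0.]]
assert onehot.shape == (4, 2) and (onehot.argmax(axis=1) == targets).all()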
Example #6
def process_data():

    file_path = '/Users/fpena/Stuff/House Search/Dublin/viewings-ucd.csv'
    data_frame = pandas.read_csv(file_path)
    print(data_frame.columns.values.tolist())
    print(data_frame.head())
    print(data_frame.describe())
    print(data_frame['Price'])

    price_scaler = MinMaxScaler()
    data_frame['Price Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Price']])
    data_frame['Cycle Time Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Cycle Time']])
    data_frame['Score'] = 0.5 * (data_frame['Price Score'] +
                                 data_frame['Cycle Time Score'])
    data_frame['Rank'] = data_frame['Score'].rank(
        ascending=True) / (len(data_frame))

    cycle_hour_cost = 30
    working_days_per_month = 22
    data_frame['Money Score'] =\
        data_frame['Price'] + data_frame['Cycle Time'] / 60 * cycle_hour_cost * working_days_per_month
    data_frame.rename(columns={'Cycle Time': 'Cycle'}, inplace=True)
    # print(data_frame['Price Score'])
    # print(data_frame[['Score', 'Rank']])
    # with pandas.option_context('display.max_rows', 500, 'display.max_columns', 10):
    #   print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score']].sort_values('Rank', ascending=False))
    # print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score']].to_string())
    print(data_frame[[
        'Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score'
    ]].sort_values('Rank', ascending=False).to_string())

    # seaborn.(x='Price', y='Cycle Time', data_frame=data_frame)
    data_frame.plot.scatter(x='Price', y='Cycle')
    pyplot.savefig('/tmp/daft_scatter.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Price Score', y='Cycle Time Score')
    pyplot.savefig('/tmp/daft_scatter_norm.pdf')
    pyplot.cla()
    pyplot.clf()

    seaborn.stripplot(x='Accommodation Type',
                      y='Price',
                      data=data_frame,
                      jitter=True)
    pyplot.savefig('/tmp/daft_price.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Housemates', y='Price')
    pyplot.savefig('/tmp/daft_scatter_price_housemates.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.to_csv('/tmp/daft-houses-processed.csv')
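The 1 - fit_transform pattern above turns a "lower is better" column into a score where the cheapest row gets 1.0. A minimal sketch with made-up prices:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

prices = np.array([[1000.0], [1500.0], [2000.0]])
score = 1 - MinMaxScaler().fit_transform(prices)
# cheapest -> 1.0, priciest -> 0.0
assert score[0, 0] == 1.0 and score[-1, 0] == 0.0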
Example #7
def spambase_transform(input_path, features_path, labels_path, metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[:-1]):
            value = float(value)
            features[i, j] = value

        labels[i] = int(values[-1])

        i += 1

        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    assert num_negative_samples == NUM_SAMPLES[0]
    assert num_positive_samples == NUM_SAMPLES[1]

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
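Two details of the scaling step above are worth noting: copy=False lets the scaler work in place on the float32 array, and data_min_/data_max_ preserve the per-column extremes so the saved metadata can later invert the scaling. A small sketch (illustrative array):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

features = np.array([[1.0, 10.0], [2.0, 30.0], [4.0, 20.0]], dtype=np.float32)
scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
scaler.fit_transform(features)             # scales `features` in place
print(scaler.data_min_, scaler.data_max_)  # [ 1. 10.] [ 4. 30.]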
Example #8
def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]
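A usage sketch for pearson above (it assumes numpy, scipy.stats.pearsonr, and MinMaxScaler are already imported); two nearly identical matrices should score close to 1:

import numpy as np

rng = np.random.RandomState(0)
A = rng.rand(100, 3)
B = A + 0.01 * rng.rand(100, 3)  # strongly correlated copy
print(pearson(A, B))             # approximately 1.0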
Example #9
def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]
Example #10
def letter_recognition_transform(input_path, features_path, labels_path,
                                 metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[1:]):
            value = float(value)
            features[i, j] = value

        labels[i] = CLASS_TO_INDEX[values[0]]

        i += 1

        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    for class_index in range(len(NUM_SAMPLES)):
        num_samples_class = (labels == class_index).sum()
        assert num_samples_class == NUM_SAMPLES[class_index]

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example #11
def default_credit_card_transform(input_path, features_path, labels_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    variables = set(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            value = row[variable]
            if TYPES[variable] == "numerical":
                value = float(value)
                features[i, metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = value.replace(".0", "")
                assert value in VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[i] = int(row["default payment next month"].replace(".0", ""))

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"] - 1

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example #12
def test_min_max_scaler_zero_variance_features():
    # Check min max scaler on toy data with zero variance features
    X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)

    # function interface
    X_trans = minmax_scale(X)
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans = minmax_scale(X, feature_range=(1, 2))
    assert_array_almost_equal(X_trans, X_expected_1_2)
Example #13
class MinMaxScalerImpl():
    def __init__(self, feature_range=(0, 1), copy=True):
        self._hyperparams = {'feature_range': feature_range, 'copy': copy}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
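SKLModel is not defined in this excerpt; from the hyperparameters it is presumably an alias for sklearn's MinMaxScaler. A hypothetical usage under that assumption:

import numpy as np
from sklearn.preprocessing import MinMaxScaler as SKLModel  # assumed alias

impl = MinMaxScalerImpl(feature_range=(0, 1))
X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
print(impl.fit(X).transform(X))  # columns mapped onto [0, 1]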
Example #14
class Scaler(TransformerMixin):
    def __init__(self):
        self._scaler = MinMaxScaler(feature_range=(-1, 1))

    def transform(self, df, *_):
        assert_all_finite(df)
        scaled = self._scaler.transform(df)
        df = pd.DataFrame(scaled, columns=df.columns)
        assert_all_finite(df)
        return df

    def fit(self, df, *_):
        self._scaler.fit(df)
        return self
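A usage sketch for the Scaler wrapper (it assumes TransformerMixin from sklearn.base, assert_all_finite from sklearn.utils, and pandas are imported); note that fit must be called before transform:

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
scaler = Scaler().fit(df)
print(scaler.transform(df))  # both columns mapped into [-1, 1]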
Example #15
    def classify(X_train, y_train, X_test, y_test):

        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)

        liberatore_NB = GaussianNB()

        liberatore_NB.fit(X_train, y_train)
        del X_train

        X_test = scaler.transform(X_test)

        predictions = liberatore_NB.predict(X_test)

        return y_test, predictions
Example #16
def correct_values(values, min_value, max_value):
    '''
    Ensures that values are in given range
    @param values: 1d numpy array
    '''
    # scale
    # do nothing if valid values
    lowest_val = np.min(values)
    largest_val = np.max(values)
    lowest_val_valid = lowest_val >= min_value and lowest_val < max_value
    largest_val_valid = largest_val <= max_value and largest_val > min_value
    #print("allowed: min_val: ", min_value, " max_val: ", max_value)
    #print("current: min_val: ", lowest_val, "max_val: ", largest_val)
    if lowest_val_valid and largest_val_valid:
        pass
    else:
        #print("at least one not valid")
        # +/-1 to prevent AssertionErrors caused by rounding errors
        # -> +/-1 introduces a new exception: "ValueError: Minimum of desired
        # feature range must be smaller than maximum. Got (84.80001171045868,
        # 84)." -> Therefore used without +/-1 and adapted the assertions.
        min_value_for_scaler = min_value  # + 1
        max_value_for_scaler = max_value  # - 1
        # re-use max/min values in data if valid, otherwise all functions would
        # be in same range
        if lowest_val_valid:
            #print("lowest valid")
            min_value_for_scaler = lowest_val
        if largest_val_valid:
            #print("largest valid")
            max_value_for_scaler = largest_val
        scaler = MinMaxScaler(feature_range=(
            min_value_for_scaler, max_value_for_scaler))
        reshaped_values = values.reshape(-1, 1)  # otherwise DeprecationWarning
        scaler = scaler.fit(reshaped_values)
        values = scaler.transform(reshaped_values)
        values = np.reshape(values, len(values))  # original shape
    # print("afterwards: min_val: ", np.min(
    #    values), " max_val: ", np.max(values))
    min_in_scaled = np.min(values)
    max_in_scaled = np.max(values)
    # test whether min_value <= min_in_scaled
    assert min_value - min_in_scaled <= 0.0000001, "current min: " + \
        str(min_in_scaled) + " but allowed min is: " + str(min_value)
    # test whether max_in_scaled <= max_value
    assert max_in_scaled - max_value <= 0.000001, "current max: " + str(max_in_scaled) + \
        " but allowed max is: " + str(max_value)
    return values
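A usage sketch for correct_values: in-range extremes are reused as scaler bounds, so only offending data is compressed (illustrative numbers):

import numpy as np

values = np.array([5.0, 50.0, 120.0])
fixed = correct_values(values, min_value=0.0, max_value=100.0)
# 120.0 exceeds the range, so the data is rescaled into [5.0, 100.0];
# the valid minimum 5.0 is kept as the lower bound
print(fixed)  # [  5.    42.17 100.  ] (approximately)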
Example #17
def get_pipeline(features):
    feature_names = []
    for feature in features:
        feature_names += feature[1].FEATS
    print(feature_names)
    return Pipeline(features + [('transform', ToMatrix(
        features=feature_names)), ('norm', MinMaxScaler())])
Example #18
def run(test, train):
    # generate features

    # feats = get_serialized_pipeline(train)
    feats = get_serialized_pipeline(train)

    train_x = feats.fit_transform(train)
    test_x = feats.fit_transform(test)

    # train
    train_y = [1 if sent.label_test > 0 else 0 for sent in train]
    clf = MLPClassifier(max_iter=300, solver='sgd', alpha=4, hidden_layer_sizes=(200, 50),
                        random_state=42, activation='relu', learning_rate_init=0.04, batch_size=550)

    clf.fit(train_x, train_y)

    # predict
    predictions = clf.predict(test_x)
    pred_probs = clf.predict_proba(test_x)
    pred_probs = MinMaxScaler().fit_transform(np.reshape([pred[1] for pred in pred_probs], (-1, 1))).tolist()
    for i, sent in enumerate(test):
        sent.pred_label = predictions[i]
        sent.pred = pred_probs[i]

    return test
Example #19
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #20
 def _create_scaler(self, positivity):
     self.scaler_positivity = positivity
     if positivity is True:
         eps = 1e-9
         self._scaler = MinMaxScaler(feature_range=(eps, 1))
     else:
         self._scaler = StandardScaler()
     self.scaler_is_fitted = False
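The eps lower bound above keeps every scaled value strictly positive, which matters if a log or Box-Cox transform follows. A minimal check of that property:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

eps = 1e-9
X = np.array([[0.0], [1.0], [2.0]])
X_pos = MinMaxScaler(feature_range=(eps, 1)).fit_transform(X)
assert (X_pos > 0).all()  # no exact zeros, safe for np.log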
Example #21
def test_min_max_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.max(axis=0), 1.0)

    # Constant feature.
    X = np.zeros(5)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)
    assert_greater_equal(X_scaled.min(), 0.)
    assert_less_equal(X_scaled.max(), 1.)
Example #22
def test_warning_scaling_integers():
    # Check warning when scaling integer data
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "Data with input dtype uint8 was converted to float64"

    clean_warning_registry()
    assert_warns_message(DataConversionWarning, w, scale, X)
    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
Example #23
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "assumes floating point values as input, got uint8"

    clean_warning_registry()
    assert_warns_message(UserWarning, w, scale, X)
    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
Example #24
def classifier_dyer2012(X_train, y_train, X_test, y_test, time_train=None, time_test=None):

    obj = Dyer2012VNGPlusPlusClassifier()

    X_train, fields = dyer2012_tracestoInstances(obj, X_train, time_train)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    models1 = {
        'Bernoulli': BernoulliNB(),
        'Gaussian': GaussianNB(),
        'Multinomial': MultinomialNB(),
    }

    params1 = {
        'Bernoulli': {},
        'Gaussian': {},
        'Multinomial': {},
        #'SVC': [
        #    {'kernel': ['linear'], 'C': [1, 10]},
        #    {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        #]
    }

    dyer_NB = MultinomialNB()
    dyer_NB.fit(X_train, y_train)
    del X_train

    #test
    X_test, fields = dyer2012_tracestoInstances(obj, X_test, time_test, fields)
    X_test = scaler.transform(X_test)


    predictions = dyer_NB.predict(X_test)
    del X_test

    labels = []
    for l in y_train:
        if l not in labels:
            labels.append(l)

    return y_test, predictions
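MinMaxScaler is not an arbitrary choice here: MultinomialNB rejects negative feature values, so the traces must be mapped into [0, 1] first (StandardScaler would produce negatives). A minimal illustration:

import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

X = np.array([[-1.0, 2.0], [3.0, 0.5]])
y = [0, 1]
X_scaled = MinMaxScaler().fit_transform(X)  # now non-negative
MultinomialNB().fit(X_scaled, y)            # fitting raw X would raise ValueError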
Example #25
 def xtraintestdata(self, datarray, yarray, dfiletowrite):
     x_train, x_test, y_train, y_test = train_test_split(datarray,
                                                         yarray,
                                                         test_size=0.2,
                                                         random_state=1)
     x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                       y_train,
                                                       test_size=0.2,
                                                       random_state=1)
     min_max_scaler = MinMaxScaler()
     # feed in a numpy array
     x_train_norm = min_max_scaler.fit_transform(x_train)
     _ = np.c_[x_train_norm, y_train]
     dirme = dfiletowrite
     sio.savemat(dirme, mdict={'UCIDat': yarray})
     xy_valid = np.c_[x_val, y_val]
     xy_train = np.c_[x_train, y_train]
     xy_test = np.c_[x_test, y_test]
     return xy_train, xy_test, xy_valid
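Note that only x_train is normalized above; to avoid leakage, the same fitted scaler would be applied (not refitted) to the validation and test splits. A minimal sketch of that step:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

x_train = np.array([[0.0], [10.0]])
x_test = np.array([[5.0], [20.0]])
scaler = MinMaxScaler().fit(x_train)  # statistics from the training split only
print(scaler.transform(x_test))       # [[0.5], [2.0]] -- test values may exceed 1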
Example #26
def load_data(n_samples, label_scaling: bool = False, method: str = 'maxmin'):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    deltaGMax = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values  # pylint:disable=unused-variable
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    force_max = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_fit2.csv'))['F_repel_max'].values  # pylint:disable=unused-variable
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values
    y = np.hstack(
        [rg.reshape(-1, 1),
         gibbs.reshape(-1, 1),
         gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    if method == 'maxmin':
        greedy_indices = get_maxmin_samples(X, n_samples)

    elif method == 'kmeans':
        greedy_indices = get_kmeans_samples(X, n_samples)

    return X, y, greedy_indices
Example #27
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, StandardScaler().fit, X)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, MinMaxScaler().fit, X)
Example #28
class CreateMinMaxScaler(CreateModel):
    def fit(self, data, args):
        self.model = MinMaxScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Example #29
def read_svm_pred(test_sent, input_file):
    input = open(input_file)
    ranks = []
    for line in input:
        ranks.append(float(line.strip()))

    # modern scikit-learn requires 2-D input: reshape, scale, then flatten back
    ranks = MinMaxScaler().fit_transform(np.array(ranks).reshape(-1, 1)).ravel()
    for i, sent in enumerate(test_sent):
        test_sent[i].pred = ranks[i]
        test_sent[i].pred_label = 1 if ranks[i] >= 0.5 else 0
    return test_sent
Example #30
def prepare_df_for_violinplot(df,
                              feature_cols,
                              class_col,
                              class_indices=None,
                              minmaxscale=True):
    """
    Min-max-scale the data and then melt the dataframe into the long format
    """
    if class_indices:
        df = df.loc[list(class_indices)]
    df = df[feature_cols + [class_col]]

    if minmaxscale:
        from sklearn.preprocessing import MinMaxScaler  # public path; .data was a private module
        scaler = MinMaxScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

    prepared_df = pd.melt(df, value_vars=feature_cols, id_vars=class_col)

    return prepared_df
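A usage sketch for prepare_df_for_violinplot; the long-format result plugs directly into seaborn (illustrative frame):

import numpy as np
import pandas as pd

df = pd.DataFrame({'f1': np.random.rand(10),
                   'f2': 10 * np.random.rand(10),
                   'label': ['a', 'b'] * 5})
long_df = prepare_df_for_violinplot(df, ['f1', 'f2'], 'label')
# seaborn.violinplot(x='variable', y='value', hue='label', data=long_df)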
Example #31
def make_models(X, y, y_bin):
    return dict(ols=LinearRegression().fit(X, y),
                lr_bin=LogisticRegression().fit(X, y_bin),
                lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
                lr_mn=LogisticRegression(solver='lbfgs',
                                         multi_class='multinomial').fit(X, y),
                svc=SVC(kernel='linear').fit(X, y_bin),
                svr=SVR(kernel='linear').fit(X, y),
                dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
                dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
                rfc=RandomForestClassifier(n_estimators=3,
                                           max_depth=3,
                                           random_state=1).fit(X, y),
                rfr=RandomForestRegressor(n_estimators=3,
                                          max_depth=3,
                                          random_state=1).fit(X, y),
                gbc=GradientBoostingClassifier(n_estimators=3,
                                               max_depth=3,
                                               random_state=1).fit(X, y),
                gbr=GradientBoostingRegressor(n_estimators=3,
                                              max_depth=3,
                                              random_state=1).fit(X, y),
                abc=AdaBoostClassifier(algorithm='SAMME',
                                       n_estimators=3,
                                       random_state=1).fit(X, y),
                abc2=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y),
                abc3=AdaBoostClassifier(algorithm='SAMME',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                abc4=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                km=KMeans(1).fit(X),
                km2=KMeans(5).fit(X),
                pc1=PCA(1).fit(X),
                pc2=PCA(2).fit(X),
                pc3=PCA(2, whiten=True).fit(X),
                mlr1=MLPRegressor([2], 'relu').fit(X, y),
                mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
                mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
                mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
                mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
                bin=Binarizer(0.5),
                mms=MinMaxScaler().fit(X),
                mas=MaxAbsScaler().fit(X),
                ss1=StandardScaler().fit(X),
                ss2=StandardScaler(with_mean=False).fit(X),
                ss3=StandardScaler(with_std=False).fit(X),
                n1=Normalizer('l1'),
                n2=Normalizer('l2'),
                n3=Normalizer('max'))
Example #32
def test_min_max_scaler_zero_variance_features():
    """Check min max scaler on toy data with zero variance features"""
    X = [[0., 1., +0.5],
         [0., 1., -0.1],
         [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5],
             [-1., 1., 0.0],
             [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5],
                      [0., 0., 0.0],
                      [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500],
                          [-1., 0., 0.083],
                          [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5],
                      [1., 1., 1.0],
                      [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)
Example #33
def classifier_panchenko2016(X_train,
                             y_train,
                             X_test,
                             y_test,
                             separateClassifier=False):
    train_or_test_labels = ["train"
                            for i in y_train] + ["test" for i in y_test]
    y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels,
                                                       X_train + X_test,
                                                       y_train + y_test)

    y_train, X_train = features_extraction(
        y_train,
        X_train,
        separateClassifier=separateClassifier,
        featuresCount=100)

    y_test, X_test = features_extraction(y_test,
                                         X_test,
                                         separateClassifier=separateClassifier,
                                         featuresCount=100)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf",
                     C=2e11,
                     gamma=2e-1,
                     max_iter=5000,
                     class_weight="balanced",
                     verbose=1)

    print("fitting")
    classifier.fit(X_train, y_train)

    print("testing")
    y_predictions = classifier.predict(X_test)  #, y_test)

    return y_test, y_predictions
Example #34
def get_serialized_pipeline(train):
    from src.features import counting_feat, knn_similarity
    config = get_config()
    feature_names = [
        file_name for file_name in listdir(config['features_dump_dir'])
    ]
    return Pipeline([
        ('read', ReadFeatures(feature_names)),
        ("train_search", knn_similarity.TrainSearch(train=train)),
        ('tfidf', counting_feat.BagOfTfIDF(train)),  # cb
        ('transform', ToMatrix(features=feature_names)),
        ('norm', MinMaxScaler())
    ])