Example 1
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
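The assertions above follow directly from the min-max formula; a minimal sketch (using a small made-up array rather than the iris data) that checks the same identities with plain numpy:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [4.0, 40.0]])

# Hand-rolled scaling into the default (0, 1) range.
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
assert np.allclose(MinMaxScaler().fit_transform(X), X_std)

# Any other feature_range (a, b) is just an affine shift of X_std.
a, b = -0.5, 0.6
assert np.allclose(MinMaxScaler(feature_range=(a, b)).fit_transform(X),
                   X_std * (b - a) + a)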
Example 2
def test_min_max_scaler_zero_variance_features():
    # Check min max scaler on toy data with zero variance features
    X = [[0., 1., +0.5], [0., 1., -0.1], [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5], [-1., 1., 0.0], [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5], [0., 0., 0.0], [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500], [-1., 0., 0.083], [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5], [1., 1., 1.0], [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)

    # function interface
    X_trans = minmax_scale(X)
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans = minmax_scale(X, feature_range=(1, 2))
    assert_array_almost_equal(X_trans, X_expected_1_2)
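The expected values for the first two columns come from how zero-variance features are handled: scikit-learn replaces a zero data range with 1 internally, so a constant column ends up at the lower bound of feature_range. A small check of that behaviour (assuming only numpy and scikit-learn):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_const = np.array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])  # both columns constant

scaler = MinMaxScaler(feature_range=(1, 2))
print(scaler.fit_transform(X_const))  # every entry is 1.0, the lower bound
print(scaler.data_range_)             # [0. 0.]; the zero range is replaced by 1 when scaling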
Example 3
def test_min_max_scaler_zero_variance_features():
    """Check min max scaler on toy data with zero variance features"""
    X = [[0., 1., +0.5],
         [0., 1., -0.1],
         [0., 1., +1.1]]

    X_new = [[+0., 2., 0.5],
             [-1., 1., 0.0],
             [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5],
                      [0., 0., 0.0],
                      [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500],
                          [-1., 0., 0.083],
                          [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5],
                      [1., 1., 1.0],
                      [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)
Example 4
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
Example 5
def spambase_transform(input_path, features_path, labels_path, metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[:-1]):
            value = float(value)
            features[i, j] = value

        labels[i] = int(values[-1])

        i += 1

        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    assert num_negative_samples == NUM_SAMPLES[0]
    assert num_positive_samples == NUM_SAMPLES[1]

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
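The data_min_/data_max_ values stored in the metadata are enough to undo the scaling later; a minimal sketch of that inverse step (the file paths here are placeholders, not from the source):

import json
import numpy as np

with open("metadata.json") as f:        # placeholder path
    metadata = json.load(f)
features = np.load("features.npy")      # placeholder path; values scaled to [0, 1]

data_min = np.array(metadata["features_min"], dtype=np.float32)
data_max = np.array(metadata["features_max"], dtype=np.float32)

# Invert MinMaxScaler(feature_range=(0, 1)); a zero range means the feature was
# constant, so the original value is simply data_min.
data_range = np.where(data_max > data_min, data_max - data_min, 1.0)
original = features * data_range + data_min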
Example 6
def process_data():

    file_path = '/Users/fpena/Stuff/House Search/Dublin/viewings-ucd.csv'
    data_frame = pandas.read_csv(file_path)
    print(data_frame.columns.values.tolist())
    print(data_frame.head())
    print(data_frame.describe())
    print(data_frame['Price'])

    price_scaler = MinMaxScaler()
    data_frame['Price Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Price']])
    data_frame['Cycle Time Score'] = 1 - price_scaler.fit_transform(
        data_frame[['Cycle Time']])
    data_frame['Score'] = 0.5 * (data_frame['Price Score'] +
                                 data_frame['Cycle Time Score'])
    data_frame['Rank'] = data_frame['Score'].rank(
        ascending=True) / (len(data_frame))

    cycle_hour_cost = 30
    working_days_per_month = 22
    data_frame['Money Score'] =\
        data_frame['Price'] + data_frame['Cycle Time'] / 60 * cycle_hour_cost * working_days_per_month
    data_frame.rename(columns={'Cycle Time': 'Cycle'}, inplace=True)
    # print(data_frame['Price Score'])
    # print(data_frame[['Score', 'Rank']])
    # with pandas.option_context('display.max_rows', 500, 'display.max_columns', 10):
    #   print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score']].sort_values('Rank', ascending=False))
    # print(data_frame[['Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score']].to_string())
    print(data_frame[[
        'Address', 'Price', 'Cycle', 'Rank', 'Score', 'Money Score'
    ]].sort_values('Rank', ascending=False).to_string())

    # seaborn.(x='Price', y='Cycle Time', data_frame=data_frame)
    data_frame.plot.scatter(x='Price', y='Cycle')
    pyplot.savefig('/tmp/daft_scatter.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Price Score', y='Cycle Time Score')
    pyplot.savefig('/tmp/daft_scatter_norm.pdf')
    pyplot.cla()
    pyplot.clf()

    seaborn.stripplot(x='Accommodation Type',
                      y='Price',
                      data=data_frame,
                      jitter=True)
    pyplot.savefig('/tmp/daft_price.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.plot.scatter(x='Housemates', y='Price')
    pyplot.savefig('/tmp/daft_scatter_price_housemates.pdf')
    pyplot.cla()
    pyplot.clf()

    data_frame.to_csv('/tmp/daft-houses-processed.csv')
Example 7
def letter_recognition_transform(input_path, features_path, labels_path,
                                 metadata_path):
    metadata = create_metadata(
        VARIABLES, create_one_type_dictionary("numerical", VARIABLES), {},
        sum(NUM_SAMPLES), CLASSES)

    input_file = open(input_path, "r")

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    i = 0
    line = input_file.readline()
    while line != "":
        line = line.rstrip("\n")
        values = line.split(",")

        assert len(values) - 1 == len(VARIABLES), str(
            (len(values) - 1, len(VARIABLES)))

        for j, value in enumerate(values[1:]):
            value = float(value)
            features[i, j] = value

        labels[i] = CLASS_TO_INDEX[values[0]]

        i += 1

        line = input_file.readline()

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"]

    for class_index in range(len(NUM_SAMPLES)):
        num_samples_class = (labels == class_index).sum()
        assert num_samples_class == NUM_SAMPLES[class_index]

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example 8
def default_credit_card_transform(input_path, features_path, labels_path, metadata_path):
    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    variables = set(reader.fieldnames)
    variables.remove("ID")
    variables.remove("default payment next month")

    metadata = create_metadata(variables, TYPES, VALUES, NUM_SAMPLES, CLASSES)

    features = np.zeros((metadata["num_samples"], metadata["num_features"]), dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.int32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            value = row[variable]
            if TYPES[variable] == "numerical":
                value = float(value)
                features[i, metadata["value_to_index"][variable]] = value
            elif TYPES[variable] == "categorical":
                value = value.replace(".0", "")
                assert value in VALUES[variable], \
                    "'{}' is not a valid value for '{}'".format(value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0

        # the class needs to be transformed
        labels[i] = int(row["default payment next month"].replace(".0", ""))

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    assert i == metadata["num_samples"] - 1

    num_positive_samples = int(labels.sum())
    num_negative_samples = labels.shape[0] - num_positive_samples

    print("Negative samples: ", num_negative_samples)
    print("Positive samples: ", num_positive_samples)
    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)
Example 9
def load_data(n_samples, label_scaling: bool = False):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values
    y = np.hstack(
        [rg.reshape(-1, 1),
         gibbs.reshape(-1, 1),
         gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    greedy_indices = get_maxmin_samples(X, n_samples)

    nan_indices = np.unique(np.random.randint(0, len(y) - 1, int(len(y) / 3)))
    y[nan_indices, 2] = np.nan
    return X, y, greedy_indices
Example 10
def parkinsons_replicated_data(self, park_dat):
    df_parkinson = pd.read_csv(park_dat, sep=',')
    ylabel = df_parkinson['Status']
    xfeatures = df_parkinson.drop(['Status', 'ID'], axis=1)
    xfeats = df_parkinson.drop(['Status', 'ID'], axis=1).values
    x = (xfeats - np.min(xfeats)) / (np.max(xfeats) - np.min(xfeats))
    y = df_parkinson['Status'].values
    xfeatsp = pd.DataFrame(xfeatures)
    minmax_scaling = MinMaxScaler()
    x_scaledp = minmax_scaling.fit_transform(xfeatsp)
    x_scaledp = pd.DataFrame(x_scaledp)
    f1 = plt.figure(figsize=(19, 16))
    plt.matshow(x_scaledp.corr(), fignum=f1.number)
    plt.xticks(range(x_scaledp.shape[1]),
               x_scaledp.columns,
               fontsize=10,
               rotation=45)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=12)
    plt.show()
    for eachx in xfeatures:
        # min-max normalise each column: divide by (max - min), not max
        xfeatures[eachx] = (xfeatures[eachx] - xfeatures[eachx].min()) / (
            xfeatures[eachx].max() - xfeatures[eachx].min())
    ylabel = ylabel.values
    # ydata = ylabel[:, None]
    xdata = x_scaledp.to_numpy()
    targets = np.array(ylabel).reshape(-1)
    y = np.eye(2)[targets]
    xtrain, xtest, y_train, y_test = train_test_split(
        x, y, test_size=0.30)  #, shuffle=False
    print(y_test)
    #y_train = ytrain[:, None]
    #y_test = ytest[:, None]
    return xtrain, xtest, y_train, y_test
Example 11
def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]
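Since Pearson correlation is invariant to a positive affine rescaling of each column, the scale=True branch should not change the result beyond floating-point noise; a quick check of that (assuming scipy and scikit-learn are available):

import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
A = rng.normal(size=(50, 3))
B = rng.normal(size=(50, 3))

r_raw = pearsonr(A[:, 0], B[:, 0])[0]
r_scaled = pearsonr(MinMaxScaler().fit_transform(A)[:, 0],
                    MinMaxScaler().fit_transform(B)[:, 0])[0]
assert np.isclose(r_raw, r_scaled)  # Pearson r is unchanged by per-column min-max scaling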
Example 12
def pearson(A, B, scale=True):
    correlation = 0
    if scale:
        scaler = MinMaxScaler()
        A = scaler.fit_transform(A)
        B = scaler.fit_transform(B)
    for i in range(A.shape[1]):
        correlation = correlation + pearsonr(A[:, i], B[:, i])[0]
    return correlation / A.shape[1]
Example 13
def calculate_scores(data_frame):
    data_frame = filter_data(data_frame)
    min_max_scaler = MinMaxScaler()
    data_frame['cycle_time'] = data_frame['distance_to_ucd'] * 6
    data_frame['price_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['price']])
    data_frame['cycle_time_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['cycle_time']])
    data_frame['money'] = \
        data_frame['price'] + data_frame['cycle_time'] * 22 * TIME_PRICE_PER_HOUR / 60
    data_frame['money_score'] = 1 - min_max_scaler.fit_transform(
        data_frame[['money']])
    data_frame['score'] =\
        data_frame['price_score'] + data_frame['cycle_time_score']
    data_frame['score'] = min_max_scaler.fit_transform(data_frame[['score']])
    data_frame['money_rank'] = data_frame['money'].rank(
        ascending=False) / (len(data_frame))

    pandas.options.display.max_colwidth = 200

    return data_frame
Example 14
    def classify(X_train, y_train, X_test, y_test):

        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)

        liberatore_NB = GaussianNB()

        liberatore_NB.fit(X_train, y_train)
        del X_train

        X_test = scaler.transform(X_test)

        predictions = liberatore_NB.predict(X_test)

        return y_test, predictions
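The key pattern here is fitting the scaler on X_train only and reusing it for X_test, so no information from the test set leaks into the preprocessing. A minimal equivalent sketch using a Pipeline (X_train, y_train, X_test are assumed to be defined as above):

from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

model = make_pipeline(MinMaxScaler(), GaussianNB())
model.fit(X_train, y_train)          # the scaler is fitted on the training data only
predictions = model.predict(X_test)  # the test data reuses the fitted scaler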
Example 15
def classifier_dyer2012(X_train, y_train, X_test, y_test, time_train=None, time_test=None):

    obj = Dyer2012VNGPlusPlusClassifier()

    X_train, fields = dyer2012_tracestoInstances(obj, X_train, time_train)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)

    models1 = {
        'Bernoulli': BernoulliNB(),
        'Gaussian': GaussianNB(),
        'Multinomial': MultinomialNB(),
    }

    params1 = {
        'Bernoulli': {},
        'Gaussian': {},
        'Multinomial': {},
        #'SVC': [
        #    {'kernel': ['linear'], 'C': [1, 10]},
         #   {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
        #]
    }

    dyer_NB=MultinomialNB()
    dyer_NB.fit(X_train, y_train)
    del X_train

    #test
    X_test, fields = dyer2012_tracestoInstances(obj, X_test, time_test, fields)
    X_test = scaler.transform(X_test)


    predictions = dyer_NB.predict(X_test)
    del X_test

    labels = []
    for l in y_train:
        if l not in labels:
            labels.append(l)

    return y_test, predictions
Example 16
def xtraintestdata(self, datarray, yarray, dfiletowrite):
    x_train, x_test, y_train, y_test = train_test_split(
        datarray, yarray, test_size=0.2, random_state=1)
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.2, random_state=1)
    min_max_scaler = MinMaxScaler()
    # feed in a numpy array
    x_train_norm = min_max_scaler.fit_transform(x_train)
    _ = np.c_[x_train_norm, y_train]
    dirme = dfiletowrite
    sio.savemat(dirme, mdict={'UCIDat': yarray})
    xy_valid = np.c_[x_val, y_val]
    xy_train = np.c_[x_train, y_train]
    xy_test = np.c_[x_test, y_test]
    return xy_train, xy_test, xy_valid
Example 17
def load_data(n_samples, label_scaling: bool = False, method: str = 'maxmin'):
    """Take in Brian's data and spit out some numpy arrays for the PAL"""
    df_full_factorial_feat = pd.read_csv(
        os.path.join(DATADIR, 'new_features_full_random.csv'))[FEATURES].values
    a2 = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values
    deltaGMax = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_new.csv'))['A2_normalized'].values  # pylint:disable=unused-variable
    gibbs = pd.read_csv(os.path.join(
        DATADIR, 'b1-b21_random_deltaG.csv'))['deltaGmin'].values * (-1)
    gibbs_max = pd.read_csv(
        os.path.join(DATADIR,
                     'b1-b21_random_virial_large_new.csv'))['deltaGmax'].values
    force_max = pd.read_csv(
        os.path.join(
            DATADIR,
            'b1-b21_random_virial_large_fit2.csv'))['F_repel_max'].values  # pylint:disable=unused-variable
    rg = pd.read_csv(os.path.join(DATADIR, 'rg_results.csv'))['Rg'].values
    y = np.hstack(
        [rg.reshape(-1, 1),
         gibbs.reshape(-1, 1),
         gibbs_max.reshape(-1, 1)])
    assert len(df_full_factorial_feat) == len(a2) == len(gibbs) == len(y)

    feat_scaler = StandardScaler()
    X = feat_scaler.fit_transform(df_full_factorial_feat)

    if label_scaling:
        label_scaler = MinMaxScaler()
        y = label_scaler.fit_transform(y)

    if method == 'maxmin':
        greedy_indices = get_maxmin_samples(X, n_samples)

    elif method == 'kmeans':
        greedy_indices = get_kmeans_samples(X, n_samples)

    return X, y, greedy_indices
Example 18
def prepare_df_for_violinplot(df,
                              feature_cols,
                              class_col,
                              class_indices=None,
                              minmaxscale=True):
    """
    Min-max-scale the data and then melt the dataframe into the long format
    """
    if class_indices:
        df = df.loc[list(class_indices)]
    df = df[feature_cols + [class_col]]

    if minmaxscale:
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

    prepared_df = pd.melt(df, value_vars=feature_cols, id_vars=class_col)

    return prepared_df
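A short usage sketch for the helper above (the column names and the seaborn call are assumptions, not from the source):

import pandas as pd
import seaborn as sns

df = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0],
                   "f2": [10.0, 20.0, 30.0, 40.0],
                   "label": ["a", "a", "b", "b"]})

long_df = prepare_df_for_violinplot(df, ["f1", "f2"], "label")
# pd.melt uses 'variable'/'value' as the default long-format column names
sns.violinplot(x="variable", y="value", hue="label", data=long_df)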
Example 19
def classifier_panchenko2016(X_train,
                             y_train,
                             X_test,
                             y_test,
                             separateClassifier=False):
    train_or_test_labels = ["train"
                            for i in y_train] + ["test" for i in y_test]
    y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels,
                                                       X_train + X_test,
                                                       y_train + y_test)

    y_train, X_train = features_extraction(
        y_train,
        X_train,
        separateClassifier=separateClassifier,
        featuresCount=100)

    y_test, X_test = features_extraction(y_test,
                                         X_test,
                                         separateClassifier=separateClassifier,
                                         featuresCount=100)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf",
                     C=2e11,
                     gamma=2e-1,
                     max_iter=5000,
                     class_weight="balanced",
                     verbose=1)

    print("fitting")
    classifier.fit(X_train, y_train)

    print("testing")
    y_predictions = classifier.predict(X_test)  #, y_test)

    return y_test, y_predictions
Example 20
print(min_exercised_stock_options, max_exercised_stock_options)

features_list = [poi, feature_2, feature_1]
data = featureFormat(data_dict, features_list, remove_any_zeroes=True)
poi, finance_features = targetFeatureSplit(data)
finance_features = numpy.reshape(numpy.array(finance_features),
                                 (len(finance_features), 2))

##finance_features=sorted(finance_features,key=lambda x:x[0],reverse=True)
print(finance_features)
limit = len(finance_features)
scaler = MinMaxScaler()
###salaries=numpy.array([finance_features[0],[1000000.],finance_features[limit-1]])

rescaled_weight = scaler.fit_transform(finance_features)
print "The rescaled weight", rescaled_weight

data_dict.pop("TOTAL", 0)

### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]
data = featureFormat(data_dict, features_list, remove_any_zeroes=True)
poi, finance_features = targetFeatureSplit(data)
print "Last ", finance_features
Example 21
from sklearn import datasets

from sklearn.preprocessing import MinMaxScaler

iris = datasets.load_iris()

transformer = MinMaxScaler()
newX = transformer.fit_transform(iris.data)

print(iris.data)
print('==============')
print(newX)
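Since the scaling is invertible, a one-line check (reusing transformer and newX from above) that inverse_transform recovers the original measurements:

import numpy as np
assert np.allclose(transformer.inverse_transform(newX), iris.data)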
Example 22
import numpy
from sklearn.preprocessing import MinMaxScaler

# Each element of the numpy array is a different training point
# Each element within the training point is a feature
# This example one feature - the weights feature
# Three different training points
# old_weights = numpy.array([[115], [140], [175]])
weights = numpy.array([[115.], [140.], [175.]])
# print("type(weights) - {}\n".format(type(weights)))
#        type(weights) - <class 'numpy.ndarray'>

scaler = MinMaxScaler()
# print("type(scaler) - {}\n".format(type(scaler)))
#        type(scaler) - <class 'sklearn.preprocessing.data.MinMaxScaler'>

rescaled_weight = scaler.fit_transform(weights)
# 1 of 2 steps fit - find x_min, x_max
# 2 of 2 steps transform - applies the formula to the elements

# print("rescaled_weight - ")
# print(rescaled_weight)
# rescaled_weight -
# [[ 0.        ]
#  [ 0.41666667]
#  [ 1.        ]]

print("type(rescaled_weight) - {}\n".format(type(rescaled_weight)))
#      type(rescaled_weight) - <class 'numpy.ndarray'>
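The two steps mentioned in the comments can also be run separately; a short self-contained sketch that inspects what fit() learns before transform() applies the formula:

import numpy
from sklearn.preprocessing import MinMaxScaler

weights = numpy.array([[115.], [140.], [175.]])
scaler = MinMaxScaler()
scaler.fit(weights)               # step 1 of 2: learn the per-feature min and max
print(scaler.data_min_)           # [115.]
print(scaler.data_max_)           # [175.]
print(scaler.transform(weights))  # step 2 of 2: apply (x - min) / (max - min)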
Example 23
            print('................................xxxx{} - {}'.format(
                valueName, value))
        if valueName == 'salary' and value != 'NaN' and value > 190000 and value < 210000:
            print('................................xxxx{} - {}'.format(
                valueName, value))

    print()

print("smallest exercised_stock_options - {}".format(smallestESO))
print("largest exercised_stock_options - {}".format(largestESO))
print("smallestSalary - {}".format(smallestSalary))
print("largestSalary - {}\n".format(largestSalary))

scaler = MinMaxScaler()
original_salary = numpy.array([[smallestSalary], [200000.], [largestSalary]])
rescaled_salary = scaler.fit_transform(original_salary)
print('rescaled_salary ->')
print(rescaled_salary)

original_exercised_stock_options = numpy.array([[smallestESO], [1000000.],
                                                [largestESO]])
rescaled_exercised_stock_options = scaler.fit_transform(
    original_exercised_stock_options)
print('rescaled_exercised_stock_options ->')
print(rescaled_exercised_stock_options)

# class video - Introduction To Machine Learning - Clustering - Quiz: Clustering Features
# What features will your clustering algorithm use?
# answer - 1.) salary, 2.) exercised_stock_options
### the input features we want to use
### can be any key in the person-level dictionary (salary, director_fees, etc.)
Example 24
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):

    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be an array of dataframe colummns used"
        )

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train = train[features]

    X_scaler = MinMaxScaler()

    if 'load' in features:
        y_scaler = MinMaxScaler()
        y_scaler.fit(train[['load']])
    else:
        y_scaler = MinMaxScaler()

        tg = train[target]
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[features] = X_scaler.fit_transform(train)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(
        valid_start_time, time_format) - dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt)
        & (multi_time_series_df.index < test_start_time)]
    valid = valid[features]
    valid[features] = X_scaler.transform(valid)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test = test[features]
    test[features] = X_scaler.transform(test)
    test_inputs = TimeSeriesTensor(test,
                                   target=target,
                                   H=horizon,
                                   freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler
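The function returns y_scaler precisely so that predictions made in scaled units can be mapped back to the original target units; a minimal sketch of that step (the model object and test array are assumptions, y_scaler is the value returned above):

# scaled_pred: hypothetical model output, still in the scaler's (0, 1) units
scaled_pred = model.predict(X_test)
predictions = y_scaler.inverse_transform(scaled_pred.reshape(-1, 1))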
Example 25
    if dropnan:
        agg.dropna(inplace=True)
    return agg


# load dataset
dataset = read_csv('data/pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# specify the number of lag hours
n_hours = 1
n_features = 8  ### NOT USED YET
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]],
              axis=1,
              inplace=True)
print(reframed.head())

# split into train and test sets
values = reframed.values
n_train_hours = 365 * 24
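Because the scaler above was fitted on all eight columns at once, inverse_transform later needs arrays with those same eight columns; a generic sketch of undoing the scaling for just the first column (the array names here are assumptions):

import numpy as np

# yhat: hypothetical (n, 1) predictions for the first column, in scaled units
# other_cols: hypothetical (n, 7) block standing in for the remaining scaled columns
full = np.concatenate([yhat, other_cols], axis=1)  # back to the 8 fitted columns
inv_yhat = scaler.inverse_transform(full)[:, 0]    # first column in original units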
Example 26
def online_news_popularity_transform(input_path, features_path, labels_path,
                                     metadata_path):
    variables = []
    types = {}
    values = {}

    for original_variable, original_type in ORIGINAL_TYPES.items():
        if "_is_" in original_variable:
            index = original_variable.index("_is_")
            variable = original_variable[:index]
            value = original_variable[index + 4:]
            if variable not in types:
                assert variable not in values
                types[variable] = "categorical"
                if CAN_BE_EMPTY[variable]:
                    values[variable] = ["none"]
                else:
                    values[variable] = []
                variables.append(variable)
            values[variable].append(value)
        else:
            variables.append(original_variable)
            types[original_variable] = original_type

    metadata = create_metadata(variables, types, values, NUM_SAMPLES)

    input_file = open(input_path, "r")
    reader = csv.DictReader(input_file)

    reader.fieldnames = [variable.strip() for variable in reader.fieldnames]

    features = np.zeros((metadata["num_samples"], metadata["num_features"]),
                        dtype=np.float32)
    labels = np.zeros(metadata["num_samples"], dtype=np.float32)

    # transform
    for i, row in enumerate(reader):
        # the categorical variables are already one hot encoded
        for j, variable in enumerate(metadata["variables"]):
            if types[variable] == "numerical":
                value = float(row[variable])
                features[i, metadata["value_to_index"][variable]] = value
            elif types[variable] == "categorical":
                value = None
                for possible_value in values[variable]:
                    if possible_value == "none":
                        continue
                    real_variable = "{}_is_{}".format(variable, possible_value)
                    if read_binary(row[real_variable]) == 1:
                        if value is None:
                            value = possible_value
                        else:
                            raise Exception(
                                "'{}' was already defined".format(variable))
                if value is None:
                    if "none" in values[variable]:
                        value = "none"
                    else:
                        for possible_value in values[variable]:
                            if possible_value == "none":
                                continue
                            real_variable = "{}_is_{}".format(
                                variable, possible_value)
                            print(possible_value, real_variable,
                                  read_binary(row[real_variable]))
                        raise Exception(
                            "'{}' has no valid value".format(variable))
                features[i, metadata["value_to_index"][variable][value]] = 1.0
            elif types[variable] == "binary":
                value = read_binary(row[variable])
                assert value in [
                    0, 1
                ], "'{}' is not a valid value for '{}'".format(
                    value, variable)
                features[i, metadata["value_to_index"][variable][value]] = 1.0
            else:
                raise Exception("Unknown variable type.")

        labels[i] = row["shares"]

    # scale
    scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler.fit_transform(features)

    label_scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
    label_scaler.fit_transform(labels.reshape(-1, 1))

    assert i == metadata["num_samples"] - 1

    print("Total samples: ", features.shape[0])
    print("Features: ", features.shape[1])

    np.save(features_path, features)
    np.save(labels_path, labels)

    input_file.close()

    metadata["features_min"] = scaler.data_min_.tolist()
    metadata["features_max"] = scaler.data_max_.tolist()

    metadata["labels_min"] = label_scaler.data_min_.tolist()
    metadata["labels_max"] = label_scaler.data_max_.tolist()

    with open(metadata_path, "w") as metadata_file:
        json.dump(metadata, metadata_file)