def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csc.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
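The behavior these assertions pin down, as a standalone sketch (hedged: current sklearn.preprocessing import path; recent releases expose scale_ instead of the old std_ attribute):

import numpy as np
from scipy import sparse
from sklearn.preprocessing import StandardScaler

X = sparse.csr_matrix(np.random.RandomState(0).randn(4, 5))
scaler = StandardScaler(with_mean=False).fit(X)  # with_mean=True would raise ValueError on sparse X
X_scaled = scaler.transform(X)                   # sparsity pattern is preserved
X_back = scaler.inverse_transform(X_scaled)      # round-trips to the original values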
Example #3
 def preprocess(self):
     sc = StandardScaler()
     sc.fit(self.X_train)
     X_train_std = sc.transform(self.X_train)
     X_test_std = sc.transform(self.X_test)
     self.train_dataset = self.Dataset(data=X_train_std,
                                       target=self.y_train)
     self.test_dataset = self.Dataset(data=X_test_std, target=self.y_test)
Example #4
def imputeAndScale(X_train,X_test):
    imp= Imputer()
    X_train=imp.fit_transform(X_train)
    X_test=imp.transform(X_test)
    
    scaler= StandardScaler().fit(X_train)
    X_test=scaler.transform(X_test)
    X_train= scaler.transform(X_train)
    
    return X_train, X_test
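Imputer was removed from scikit-learn in 0.22; a minimal sketch of the same impute-then-scale pattern, assuming sklearn.impute.SimpleImputer as its replacement:

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

impute_scale = make_pipeline(SimpleImputer(), StandardScaler())
# X_train = impute_scale.fit_transform(X_train); X_test = impute_scale.transform(X_test)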
Example #5
File: ml.py Project: nvhuy/LM
def xval(feature_file, removed_columns=None):
    """
    Load features from file and run 5-fold stratified cross-validation
    :param feature_file: feature file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Load feature data ::: {}'.format(feature_file))
    clf = svm_clf()

    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    iX = fs[:, 0]
    X = fs[:, 1:n - 1]
    y = fs[:, n - 1]

    if removed_columns is not None and len(removed_columns) > 0:
        X = numpy.delete(X, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {}'.format(X.shape, n))

    y_true = numpy.array([])
    y_out = numpy.array([])
    y_prob = numpy.array([])
    y_i = numpy.array([])

    std_scaler = StandardScaler()

    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        std_scaler.fit(X_train)
        X_train_scaled = std_scaler.transform(X_train, copy=True)
        X_test_scaled = std_scaler.transform(X_test, copy=True)

        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
        y_logp = clf.predict_proba(X_test_scaled)

        y_true = numpy.hstack((y_true, y_test))
        y_out = numpy.hstack((y_out, y_pred))
        y_prob = numpy.hstack((y_prob, numpy.max(y_logp, axis=1)))

        iX_test = iX[test_index]
        y_i = numpy.hstack((y_i, iX_test))

    return write_prediction_output(y_i, y_true, y_out, feature_file.replace('.csv', '_pred.csv'), y_prob)
Example #6
    def prepare_time_data(data):
        data_scaler = StandardScaler()
        data_concat = np.concatenate(data, axis=0)
        data_scaler.fit(data_concat)
        new_data = [data_scaler.transform(data_) for data_ in data]

        return data_scaler, new_data
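The same idea as a self-contained sketch: pool every timestep to fit one scaler, then transform each variable-length sequence with the shared statistics (the shapes here are made up):

import numpy as np
from sklearn.preprocessing import StandardScaler

series = [np.random.randn(n, 3) for n in (10, 20, 15)]         # variable-length sequences
scaler = StandardScaler().fit(np.concatenate(series, axis=0))  # fit on pooled timesteps
scaled = [scaler.transform(s) for s in series]                 # one shared mean/std per feature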
Example #7
    def normalize_features(self, scaler: StandardScaler=None) \
            -> StandardScaler:
        '''
        Normalizes the features of the dataset using a StandardScaler
        (subtract mean, divide by standard deviation).

        If a scaler is provided, uses that scaler to perform the normalization.
        Otherwise fits a scaler to the features in the dataset and then
        performs the normalization.

        :param scaler: A fitted StandardScaler. Used if provided.
        Otherwise a StandardScaler is fit on this dataset and is then used.
        :return: A fitted StandardScaler. If a scaler is provided, this is the
        same scaler. Otherwise, this is a scaler fit on this dataset.
        '''
        if not self.data or not self.data[0].features:
            return None

        if not scaler:
            # fit a new scaler only when one was not provided, per the docstring
            scaler = StandardScaler()
            features = np.vstack([d.features for d in self.data])
            scaler.fit(features)

        for d in self.data:
            d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

        return scaler
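The provided-scaler contract from the docstring, reduced to plain scikit-learn (a sketch; the arrays are made up): fit once on training features, then hand the fitted scaler to other splits instead of refitting.

import numpy as np
from sklearn.preprocessing import StandardScaler

train_feats, test_feats = np.random.randn(100, 8), np.random.randn(20, 8)
scaler = StandardScaler().fit(train_feats)   # fit on the training features only
test_scaled = scaler.transform(test_feats)   # provided-scaler path: no refitting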
Example #8
 def __stdScaler(self):
     all_cols = list(self.data_df.columns.values)
     for col in all_cols:
         if col not in self.non_numeric_cols and col != 'time_to_failure':
             stdScaler = StandardScaler()
             stdScaler.fit(self.data_df[[col]])
             self.data_df[col] = stdScaler.transform(self.data_df[[col]])
     print('Standard Scaler applied ... ')
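Worth noting (a sketch with made-up columns): StandardScaler already standardizes each column independently, so the per-column loop above collapses to one fit_transform over all numeric columns.

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 4.0], "b": [3.0, 5.0, 6.0]})
df[["a", "b"]] = StandardScaler().fit_transform(df[["a", "b"]])  # per-column z-scores in one call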
Example #9
 def obtain_sets(self, psychological_construct, percentage):
     index = self.get_index(psychological_construct)
     logging.info("Psychological construct under analysis:" + 
                  psychological_construct)
     negative_students, positive_students = self.get_instances(index)
     train_set, dev_set, test_set = self.divide_sets(negative_students,
                                                     positive_students,
                                                     percentage)
     train_set_x, train_set_y = self.get_x_and_y(train_set, index)
     logging.info("Training set shape:" + str(train_set_x.shape))
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_train_set_x = self.reshape_numpy_array(train_set_x)
         scaler = StandardScaler()
         scaler.fit(reshaped_train_set_x)
         normalized_reshaped_train_x = scaler.transform(reshaped_train_set_x)
         normalized_train_set_x = np.reshape(normalized_reshaped_train_x,
                                             (train_set_x.shape[0],
                                              train_set_x.shape[1],
                                              train_set_x.shape[2],
                                              train_set_x.shape[3]))
     dev_set_x, dev_set_y = self.get_x_and_y(dev_set, index)
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_dev_x = self.reshape_numpy_array(dev_set_x)
         normalized_reshaped_dev_x = scaler.transform(reshaped_dev_x)
         normalized_dev_x = np.reshape(normalized_reshaped_dev_x,
                                       (dev_set_x.shape[0],
                                        dev_set_x.shape[1],
                                        dev_set_x.shape[2],
                                        dev_set_x.shape[3]))
     test_set_x, test_set_y = self.get_x_and_y(test_set, index,
                                               test_flag=True)
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_test_x = self.reshape_numpy_array(test_set_x)
         normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
         normalized_test_x = np.reshape(normalized_reshaped_test_x,
                                        (test_set_x.shape[0],
                                         test_set_x.shape[1],
                                         test_set_x.shape[2],
                                         test_set_x.shape[3]))
         return normalized_train_set_x, train_set_y, normalized_dev_x, dev_set_y, normalized_test_x, test_set_y
     else:
         return train_set_x, train_set_y, dev_set_x, dev_set_y, test_set_x, test_set_y
Example #10
    def test_simple_poly_dataset_scaled_cv(self):
        model = Model.create_model(
            model_type=Model.MODEL_TYPE_SVR,
            cross_validation=True,
            feature_scaling=True,
            C_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            kernel=Model.KERNEL_RBF
        )
        train_dataset, test_dataset = test_datasets.get_simple_polynomial_datasets(n=1000)

        scaler = StandardScaler()
        scaler.fit(train_dataset.data)
        print("Train mean: " + str(scaler.transform(train_dataset.data).mean(axis=0)))
        print("Test mean: " + str( scaler.transform(test_dataset.data).mean(axis=0)))
        print("Train std: " + str(scaler.transform(train_dataset.data).std(axis=0)))
        print("Test str: " + str( scaler.transform(test_dataset.data).std(axis=0)))

        self._test_dataset(model, train_dataset, test_dataset, 0, title="SVR with RBF kernel, scaled CV on poly dataset")
Example #11
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(
        n_rollout) + 'unroll'

    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name].values()]
                    for name in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x,
                                                               y,
                                                               aux_output,
                                                               test_size=0.2)

    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')

    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
Example #12
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)
        model = BaggingRegressor(base_estimator=SVR(kernel='rbf', C=parameters["C"], cache_size=5000), max_samples=parameters["max_samples"],n_estimators=parameters["n_estimators"], verbose=0, n_jobs=-1)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)
        all_obs.extend(testY)
        all_pred.extend(prediction)
        
    return rmseEval(all_obs, all_pred)[1]
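A hedged alternative to scaling y by hand and inverting afterwards (assuming scikit-learn >= 0.20): TransformedTargetRegressor applies the transformer to y during fit and inverse-transforms predictions automatically.

from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

model = TransformedTargetRegressor(regressor=SVR(kernel='rbf'), transformer=StandardScaler())
# model.fit(trainX, trainY); model.predict(testX) returns predictions on the original y scale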
Example #13
def test_scalar():
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    scaler = StandardScaler()

    training = pd.read_csv(TRAIN_FEATURES_CSV, nrows=200000)
    test = pd.read_csv(TEST_FEATURES_CSV)

    # normalize the values; scikit-learn expects 2-D input,
    # so select each column as a one-column frame
    for column in TOTAL_TRAINING_FEATURE_COLUMNS:
        training[column] = scaler.fit_transform(training[[column]])
        test[column] = scaler.transform(test[[column]])
Example #14
def evalOne(parameters):
    all_obs = []
    all_pred = []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")
        normalizer_X = StandardScaler()
        trainX = normalizer_X.fit_transform(trainX)
        testX = normalizer_X.transform(testX)
        normalizer_Y = StandardScaler()
        trainY = normalizer_Y.fit_transform(trainY)
        testY = normalizer_Y.transform(testY)

        layers = []
        for _ in range(0, parameters["hidden_layers"]):
            layers.append(
                Layer(parameters["hidden_type"],
                      units=parameters["hidden_neurons"]))
        layers.append(Layer("Linear"))
        model = Regressor(layers=layers,
                          learning_rate=parameters["learning_rate"],
                          n_iter=parameters["iteration"],
                          random_state=42)

        X = np.array(trainX)
        y = np.array(trainY)

        model.fit(X, y)
        prediction = model.predict(testX)
        prediction = normalizer_Y.inverse_transform(prediction)
        testY = normalizer_Y.inverse_transform(testY)

        print("location: " + str(location) + " -> " +
              str(rmseEval(prediction, testY)[1]))

        all_obs.extend(testY)
        all_pred.extend(prediction)

    return rmseEval(all_obs, all_pred)[1]
Example #15
def neural_net_2(train, test, val, train_out, test_out, val_out, BigSigma_inv):
    clf = MLPClassifier(solver='sgd',
                        alpha=1e-5,
                        hidden_layer_sizes=(100, 1),
                        activation='logistic',
                        batch_size=BATCH_HUMAN,
                        shuffle=True,
                        max_iter=5000)

    scaler = StandardScaler()
    scaler.fit(train)
    train1 = scaler.transform(train)
    # apply same transformation to test data
    test = scaler.transform(test)
    train_out = train_out.astype(float)
    clf.fit(X=train1, y=train_out)
    predict_test = clf.predict(test)
    val = scaler.transform(val)  # apply same transformation to validation data
    predict_val = clf.predict(val)
    print("TEST ERMS ACCURACY", mean_squared_error(test_out, predict_test),
          acc_manual(test_out, predict_test))
    print("VAL ERMS ACCURACY", mean_squared_error(val_out, predict_val),
          acc_manual(val_out, predict_val))
Example #16
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
       in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
Example #18
File: ml.py Project: nvhuy/LM
def train_test(feature_file, test_file, removed_columns=None):
    """
    Load train and test features from file, then train and evaluate the classifier
    :param feature_file: feature file
    :param test_file: test file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Train/test model ::: {} ::: {}'.format(feature_file, test_file))

    clf = svm_clf()

    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_train = fs[:, 1:n - 1]
    y_train = fs[:, n - 1]

    fs = numpy.loadtxt(test_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_test = fs[:, 1:n - 1]
    y_test = fs[:, n - 1]
    y_i = fs[:, 0]

    if removed_columns is not None and len(removed_columns) > 0:
        X_test = numpy.delete(X_test, removed_columns, 1)
        X_train = numpy.delete(X_train, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {} ::: {}'.format(X_train.shape, X_test.shape, n))

    std_scaler = StandardScaler()
    std_scaler.fit(X_train)
    X_train_scaled = std_scaler.transform(X_train, copy=True)
    X_test_scaled = std_scaler.transform(X_test, copy=True)

    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    y_logp = clf.predict_proba(X_test_scaled)

    return write_prediction_output(y_i, y_test, y_pred, test_file.replace('.csv', '_pred.csv'), y_logp)
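The fit-on-train, transform-both discipline above can also be packaged as a Pipeline so the scaler can never see test data (a sketch; SVC(probability=True) stands in for the snippet's own svm_clf() helper):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_train, y_train = np.random.randn(50, 4), np.random.randint(0, 2, 50)
clf = make_pipeline(StandardScaler(), SVC(probability=True))
clf.fit(X_train, y_train)                          # scaling statistics come from X_train only
proba = clf.predict_proba(np.random.randn(5, 4))   # test rows are scaled internally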
Example #19
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sparse.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = StandardScaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
Example #21
class StandardScalerImpl():
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self._hyperparams = {
            'copy': copy,
            'with_mean': with_mean,
            'with_std': with_std
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #22
    def _proccess_input(self, target_pos, target_speed, pos, vel, effort):
        x_target = np.array(target_pos)
        x_first = np.array([pos_[0] for pos_ in pos])
        x_speed = np.array(target_speed).reshape((-1, 1))
        aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

        x = np.concatenate((x_target, x_first, x_speed), axis=1)

        input_scaler = StandardScaler()
        x = input_scaler.fit_transform(x)
        output_scaler = StandardScaler()
        effort_concat = np.concatenate([a for a in effort], axis=0)
        output_scaler.fit(effort_concat)
        effort = [output_scaler.transform(eff) for eff in effort]

        y = pad_sequences(effort, padding='post', value=0.)
        aux_output = pad_sequences(aux_output, padding='post', value=0.)
        x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2)
        return x, x_test, y, y_test, y_aux, y_aux_test
Example #23
class CreateStandardScaler(CreateModel):
    def fit(self, data, args):
        self.model = StandardScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Example #24
    def test_iris(self):
        train_X, test_X, train_y, test_y = data_io.get_iris_train_test()
        print("train_X's shape = %s, train_y's shape = %s" %
              (train_X.shape, train_y.shape))
        print("test_X's shape = %s, test_y's shape = %s" %
              (test_X.shape, test_y.shape))

        print("Applying standard scaling ...")
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        test_X = scaler.transform(test_X)

        # train_X = test_X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        # train_y = test_y = np.array([0, 1, 1, 0])

        # train_X = test_X = np.array([[0], [1]])
        # train_y = test_y = np.array([0, 1])

        layers = [100]
        clf = MLPClassifier(layers,
                            batch_size=train_X.shape[0],
                            n_epochs=100,
                            learning_rate=0.1)
        print("clf: %s" % clf)

        print("Fitting ...")
        clf.fit(train_X, train_y)

        print("Predicting ...")
        pred_y = clf.predict(test_X)
        print("y = %s" % test_y)
        print("pred_y = \n%s" % pred_y)

        # pred_proba_y = clf.predict_proba(test_X)
        # print("pred_proba_y = \n%s" % pred_proba_y)

        accuracy = accuracy_score(test_y, pred_y)
        print("Accuracy = %g%%" % (100 * accuracy))

        self.assertGreaterEqual(accuracy, 0.89)
Example #25
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):

    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be an array of dataframe colummns used"
        )

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train_features = train[features]
    train_targets = train[target]

    # X_scaler = MinMaxScaler()
    # target_scaler = MinMaxScaler()
    # y_scaler = MinMaxScaler()

    X_scaler = StandardScaler()
    target_scaler = StandardScaler()
    y_scaler = StandardScaler()

    # 'load' is our key target. If it is in features, then we scale it.
    # if it not 'load', then we scale the first column
    if 'load' in features:
        tg = train[['load']]
        y_scaler.fit(tg)
    else:

        tg = train[target]
        ## scale the first column
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[target] = target_scaler.fit_transform(train_targets)

    X_scaler.fit(train_features)
    train[features] = X_scaler.transform(train_features)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(
        valid_start_time, time_format) - dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt)
        & (multi_time_series_df.index < test_start_time)]
    valid_features = valid[features]
    valid[features] = X_scaler.transform(valid_features)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test_features = test[features]
    test[features] = X_scaler.transform(test_features)
    test_inputs = TimeSeriesTensor(test,
                                   target=target,
                                   H=horizon,
                                   freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler
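The column-block scaling used above, isolated (a sketch with made-up frames): fit on the training span's feature columns, then reuse the same statistics on later spans.

import pandas as pd
from sklearn.preprocessing import StandardScaler

features = ['load', 'temp']
train = pd.DataFrame({'load': [1.0, 2.0, 3.0], 'temp': [10.0, 12.0, 9.0]})
valid = pd.DataFrame({'load': [2.5], 'temp': [11.0]})
X_scaler = StandardScaler().fit(train[features])        # statistics from the training span only
train[features] = X_scaler.transform(train[features])
valid[features] = X_scaler.transform(valid[features])   # later spans reuse those statistics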
Example #26
def test_scaler_int():
    # test that scaler converts integer input to floating
    # for both sparse and dense matrices
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    with warnings.catch_warnings(record=True):
        X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    with warnings.catch_warnings(record=True):
        scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
        X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    with warnings.catch_warnings(record=True):
        scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
        X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., 1.109, 1.856, 21., 1.559], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(
        X_csr_scaled.astype(np.float64))
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csc.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
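The integer-to-float conversion this test checks, as a standalone sketch (current import path assumed):

import numpy as np
from sklearn.preprocessing import StandardScaler

X_int = np.arange(20).reshape(4, 5)                              # integer input
X_scaled = StandardScaler(with_mean=False).fit_transform(X_int)
print(X_scaled.dtype)                                            # float64: upcast before scaling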
loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns)

all_features = deepcopy(columns)
all_features.remove("target")
all_features.remove("location")

output = open(OUTPUT_DATA_FILE, 'w')
output.write("location,observation,prediction\n")

for location in locations:
    print(str(location))
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")
    normalizer_X = StandardScaler()
    trainX = normalizer_X.fit_transform(trainX)
    testX = normalizer_X.transform(testX)
    normalizer_Y = StandardScaler()
    trainY = normalizer_Y.fit_transform(trainY)
    testY = normalizer_Y.transform(testY)
    model = BaggingRegressor(base_estimator=SVR(kernel='rbf',
                                                C=40,
                                                cache_size=5000),
                             max_samples=4200,
                             n_estimators=10,
                             verbose=0,
                             n_jobs=-1)
    model.fit(trainX, trainY)
    prediction = model.predict(testX)
    prediction = normalizer_Y.inverse_transform(prediction)
    testY = normalizer_Y.inverse_transform(testY)
Example #29
del preds
print(y.shape)
"""
y = DataFrame(clf1.predict(dataTest))
print("Prediction done")

res = DataFrame(np.nan, index=range(len(ids)), columns=["Id", "Response"])
res["Id"] = ids
res["Response"] = y.values

res.to_csv("submission1.csv", index=False)

#Scale
print("Scaling")
dataTest = imputer.transform(dataTest)
dataTest = scaler.transform(dataTest)

print("Predicting")
"""
preds=[]
for i in range(6):
    dtest=dataTest.ix[range(bounds[i],bounds[i+1])]
    y_pred=clf2.predict(dtest)
    del dtest
    preds.append(DataFrame(y_pred))
    gc.collect()
print(preds)
y=concat(preds,axis=0,copy=False)

del dataTest
del preds
Example #30
                      loss=loss_function)
 predicted_values = []
 real_values = []
 for student in students_gender_train:
     train_students = students_gender_train - set([student])
     print(train_students)
     test_student = set([student])
     print(test_student)
     train_x, train_y = dataset_loader.get_x_and_y(
         students_set=train_students, index=index, test_flag=False)
     test_x, test_y = dataset_loader.get_x_and_y(
         students_set=test_student, index=index, test_flag=True)
     reshaped_train_set_x = dataset_loader.reshape_numpy_array(train_x)
     scaler = StandardScaler()
     scaler.fit(reshaped_train_set_x)
     normalized_reshaped_train_x = scaler.transform(
         reshaped_train_set_x)
     normalized_train_set_x = np.reshape(
         normalized_reshaped_train_x,
         (train_x.shape[0], train_x.shape[1], train_x.shape[2],
          train_x.shape[3]))
     reshaped_test_x = dataset_loader.reshape_numpy_array(test_x)
     normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
     normalized_test_x = np.reshape(normalized_reshaped_test_x,
                                    (test_x.shape[0], test_x.shape[1],
                                     test_x.shape[2], test_x.shape[3]))
     predicted_values.extend(
         cnn_classifier.train(normalized_train_set_x,
                              train_y,
                              normalized_test_x,
                              test_y,
                              student=student))
Example #31
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from mpl_toolkits.mplot3d import Axes3D

irisdata = load_iris()
iris_X = irisdata.data
iris_y = irisdata.target
scale = StandardScaler()
scale.fit(iris_X)
iris_x = scale.transform(iris_X)
pca = PCA(n_components=3)
iris_x = pca.fit_transform(iris_x)
fig = plt.figure()
ax = fig.add_subplot(111)
# ax.scatter(iris_x[:, 0], iris_x[:, 1], iris_x[:, 2], marker='o', c=iris_y)
x_tran, x_test, y_tran, y_test = train_test_split(iris_x,
                                                  iris_y,
                                                  test_size=0.3,
                                                  random_state=42)
result = {}
test_number = len(y_test)
for i in range(1, 11, 1):
    clf = Lasso(alpha=i / 10).fit(x_tran, y_tran)
    y_pre = clf.predict(x_test)
    result[i / 10] = sum(m < 0.5 for m in abs(y_test - y_pre)) / test_number
print(result)
ax.plot(list(result.keys()), list(result.values()))
Example #32
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import mglearn

cancer = load_breast_cancer()

scaler = StandardScaler()
scaler.fit(cancer.data)
X_scaled = scaler.transform(cancer.data)

pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)

print("original {}, reduction {}".format(X_scaled.shape, X_pca.shape))

plt.figure(figsize=(8,8))
mglearn.discrete_scatter(X_pca[:,0], X_pca[:,1], cancer.target)
plt.legend(["malignancy(cancer)", "benign"], loc="best")
plt.gca().set_aspect("equal")
plt.xlabel("1st principal component")
plt.ylabel("2nd principal component")
plt.draw()

print("PCA PC shape:{}".format(pca.components_.shape))
print("PCA PC {}".format(pca.components_))
plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0,1], ["first principal component", "second principal component"])
plt.colorbar()
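The scale-then-project steps above chain naturally into a single estimator (a sketch; cancer is the dataset loaded in the snippet):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=2))
X_pca2 = pca_pipe.fit_transform(cancer.data)  # matches the explicit fit/transform result above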
Example #33
class SkRanker(Ranker, SkLearner):
    '''
    Basic ranker wrapping scikit-learn functions
    '''
    
    def train(self, dataset_filename, 
              scale=True, 
              feature_selector=None, 
              feature_selection_params={},
              feature_selection_threshold=.25, 
              learning_params={}, 
              optimize=True, 
              optimization_params={}, 
              scorers=['f1_score'],
              attribute_set=None,
              class_name=None,
              metaresults_prefix="./0-",
              **kwargs):
        
        plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf")
        data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name,  **kwargs)
        learner = self.learner
        
        #the class must remember the attribute_set and the class_name in order to reproduce the vectors
        self.attribute_set = attribute_set
        self.class_name = class_name

 
        #scale data to the mean
        if scale:
            log.info("Scaling datasets...")
            log.debug("Data shape before scaling: {}".format(data.shape))
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
            log.debug("Data shape after scaling: {}".format(data.shape))
            log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.std_))

        #avoid any NaNs and Infs that may have occurred due to the scaling
        data = np.nan_to_num(data)
        
        #feature selection
        if isinstance(feature_selection_params, basestring):
            feature_selection_params = eval(feature_selection_params)
        self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) 
        
        #initialize learning method and scoring functions and optimize
        self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers)

        log.info("Data shape before fitting: {}".format(data.shape))

        self.learner.fit(data, labels)
        self.fit = True
        return metadata
    
    def get_model_description(self):
        params = {}
        
        if self.scaler:
            params = self.scaler.get_params(deep=True)
        try: #these are for SVC
            if self.learner.kernel == "rbf":
                params["gamma"] = self.learner.gamma
                params["C"] = self.learner.C
                for i, n_support in enumerate(self.learner.n_support_):
                    params["n_{}".format(i)] = n_support
                log.debug(len(self.learner.dual_coef_))
                return params
            elif self.learner.kernel == "linear":
                coefficients = self.learner.coef_
                att_coefficients = {}
                for attname, coeff in zip(self.attribute_set.get_names_pairwise(), coefficients[0]):
                    att_coefficients[attname] = coeff
                return att_coefficients
        except AttributeError:
            pass
        try: #adaboost etc
            params = self.learner.get_params()
            numeric_params = OrderedDict()
            for key, value in params.iteritems():
                try:
                    value = float(value)
                except ValueError:
                    continue
                numeric_params[key] = value
            return numeric_params
        except:
            pass
        return {}
    
    
    def get_ranked_sentence(self, parallelsentence, critical_attribute="rank_predicted", 
                            new_rank_name="rank_hard", 
                            del_orig_class_att=False, 
                            bidirectional_pairs=False, 
                            ties=True,
                            reconstruct='hard'):
        """
        """
        if type(self.learner) == str:
            if self.classifier:
                self.learner = self.classifier
                # this is to provide backwards compatibility for old models 
                # whose classes used differeent attribute names
                try:
                    self.learner._dual_coef_ = self.learner.dual_coef_
                    self.learner._intercept_ = self.learner.intercept_
                except AttributeError:
                    # it's ok if the model doesn't have these variables
                    pass

                try: # backwards compatibility for old LogisticRegression
                    try_classes = self.learner.classes_
                except AttributeError:
                    self.learner.classes_ = [-1, 1]

        #de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences(bidirectional_pairs=bidirectional_pairs,
                                                                                     class_name=self.class_name,
                                                                                     ties=ties)        
        if len(parallelsentence.get_translations()) == 1:
            log.warning("Parallelsentence has only one target sentence")
            parallelsentence.tgt[0].add_attribute(new_rank_name, 1)
            return parallelsentence, {}
        elif len(parallelsentence.get_translations()) == 0:
            return parallelsentence, {}
        #list that will hold the pairwise parallel sentences including the learner's decision
        classified_pairwise_parallelsentences = []
        resultvector = {}
        
        for pairwise_parallelsentence in pairwise_parallelsentences:
            #convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(pairwise_parallelsentence, attribute_set=self.attribute_set)
            #scale data instance to mean, based on trained scaler
            if self.scaler:
                try:
                    instance = np.nan_to_num(instance)
                    instance = self.scaler.transform(instance)
                except ValueError as e:
                    log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e))
                    #raise ValueError(e)
                    pass
            try:
                if self.featureselector:
                    instance = np.nan_to_num(instance)
                    instance = self.featureselector.transform(instance)
            except AttributeError:
                pass
            log.debug('Instance = {}'.format(instance)) 
            #make sure no NaN or inf appears in the instance
            instance = np.nan_to_num(instance)
            #run learner for this instance
            predicted_value = self.learner.predict(instance)
            try:
                distribution = dict(zip(self.learner.classes_, self.learner.predict_proba(instance)[0]))
            except AttributeError: 
                #if learner does not support per-class probability (e.g. LinearSVC) assign 0.5
                distribution = dict([(cl, 0.5) for cl in self.learner.classes_])
            log.debug("Distribution: {}".format(distribution))
            log.debug("Predicted value: {}".format(predicted_value))
            #even if we have a binary learner, it may be that it cannot decide between two classes
            #for us, this means a tie
            if not bidirectional_pairs and distribution and len(distribution)==2 and float(distribution[1])==0.5:
                predicted_value = 0
                distribution[predicted_value] = 0.5
                
            log.debug("{}, {}, {}".format(pairwise_parallelsentence.get_system_names(), predicted_value, distribution))
            
            
            #gather several metadata from the classification, which may be needed 
            resultvector.update({'systems' : pairwise_parallelsentence.get_system_names(),
                                 'value' : predicted_value,
                                 'distribution': distribution,
                                 'confidence': distribution[int(predicted_value)],
#                                 'instance' : instance,
                                 })
            
            #add the new predicted ranks as attributes of the new pairwise sentence
            pairwise_parallelsentence.add_attributes({"rank_predicted":predicted_value,
                                                       "prob_-1":distribution[-1],
                                                       "prob_1":distribution[1]
                                                       })
            
            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        
        #gather all classified pairwise comparisons of into one parallel sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
        if reconstruct == 'hard':
            log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name))
            ranked_sentence = sentenceset.get_multiranked_sentence(critical_attribute=critical_attribute, 
                                                               new_rank_name=new_rank_name, 
                                                               del_orig_class_att=del_orig_class_att)
        else:
            attribute1 = "prob_-1"
            attribute2 = "prob_1"
            log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name))
            try:
                ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, 
                        critical_attribute, new_rank_name, normalize_ranking=False)
            except:
                raise ValueError("Sentenceset {} from {} caused exception".format(classified_pairwise_parallelsentences, parallelsentence))
        return ranked_sentence, resultvector
Example #34
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet  # ElasticNet has two important parameters: alpha and l1_ratio
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator

bostondata = load_boston()
boston_X = bostondata.data
boston_y = bostondata.target
scale = StandardScaler()
scale.fit(boston_X)
boston_x = scale.transform(boston_X)
pca = PCA(n_components=3)
# boston_x = pca.fit_transform(boston_x)
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# ax.scatter(boston_x[:, 0], boston_x[:, 1], boston_x[:, 2], marker='o', c=boston_y)
x_tran, x_test, y_tran, y_test = train_test_split(boston_x, boston_y, test_size=0.3, random_state=42)
result = []
z = np.zeros(shape=(10, 10))
test_number = len(y_test)
for i in range(1, 11, 1):
    for j in range(1, 11, 1):
        clf = ElasticNet(alpha=i / 10, l1_ratio=j / 10).fit(x_tran, y_tran)
        y_pre = clf.predict(x_test)
        result.append([i, j, clf.score(x_test, y_test)])
        z[i - 1, j - 1] = clf.score(x_test, y_test)
Example #35
def train_and_test(alpha,
                   predictors,
                   predictor_params,
                   x_filename,
                   y_filename,
                   n_users,
                   percTest,
                   featureset_to_use,
                   diff_weighting,
                   phi,
                   force_balanced_classes,
                   do_scaling,
                   optimise_predictors,
                   report,
                   conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")

    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()

    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    #print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X,
                                                        all_y,
                                                        test_size=percTest,
                                                        random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:,mask]
    # X_test = X_test[:,mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl',
                  'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if (force_balanced_classes):
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  #0.118)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())

        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train,
                                      predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open(
                '../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                    p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | |  |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^|  |) ||  |  \ | | *** ** *\n")
    #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):

        report.write(",".join(
            map(str, (all_X.shape[0], str(p).replace(",", ";").replace(
                "\n", ""), force_balanced_classes, diff_weighting, alpha, phi,
                      do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x,y,yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(
                str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")
        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test,
                                                     y_pred,
                                                     labels=classes,
                                                     average=None,
                                                     warn_for=('precision',
                                                               'recall',
                                                               'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            labels=classes,
            average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix],
                                                   s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred)+"\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
Example #36
                  tf.keras.metrics.AUC(name='auc')
              ])
save_best_callback = tf.keras.callbacks.ModelCheckpoint(
    './model-{epoch:02d}-{acc:.2f}.hdf5',
    monitor='acc',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    save_freq=1)
logdir = os.path.join('tflogs',
                      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
tb_train_callback = tf.keras.callbacks.TensorBoard(logdir,
                                                   histogram_freq=1,
                                                   profile_batch=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model.fit(
    X_train_scaled,
    y_train,
    class_weight=class_weight,
    # batch_size=64,
    validation_split=0.1,
    callbacks=[save_best_callback, tb_train_callback],
    epochs=50)

# model = tf.keras.models.load_model('./model-35-0.88.hdf5')
X_test_scaled = scaler.transform(X_test)
model.evaluate(X_test_scaled, y_test)
# print(np.round(model.predict(X_test)))