Example #1
0
def target_correlation(train, target,
                       correlation=['pearson', 'spearman', 'kendall']):
    """Return the correlation of all columns of train with a target feature.

    Parameters
    ----------
    train : array
        n by d training data matrix.
    target : list
        target for correlation.
    correlation : list
        Names of the correlation measures to compute. Supported values
        are 'pearson', 'spearman' and 'kendall'.

    Returns
    -------
    metric : array
        len(correlation) by d matrix of correlation coefficients.

    Raises
    ------
    ValueError
        If an unsupported correlation name is requested.
    """
    # Dispatch table: measure name -> scipy.stats correlation function.
    # The original compared names with `is`, which tests object identity,
    # not equality — fixed here by dict lookup.
    methods = {'pearson': pearsonr,
               'spearman': spearmanr,
               'kendall': kendalltau}

    # Scale and shape the data.
    train_data = standardize(train_matrix=train)['train']
    train_target = target
    output = []
    for name in correlation:
        try:
            corr_func = methods[name]
        except KeyError:
            # Fail loudly instead of silently emitting an empty row.
            raise ValueError(
                "Unknown correlation '{}', expected one of {}".format(
                    name, sorted(methods)))
        # Correlate each feature column with the target; index [0] keeps
        # the coefficient and drops the p-value.
        output.append([corr_func(d, train_target)[0] for d in train_data.T])

    return output
Example #2
0
    def globalscaling(self, globalscaledata, train_features):
        """All sub-groups of traindata are scaled same.

        Parameters
        ----------
        globalscaledata : string
            The data will be scaled globally if requested.
        train_features : array
            Feature matrix to rescale with the global parameters.
        """
        # Restrict the global data to the active feature columns.
        global_subset = globalscaledata[:, :self.features]

        if not self.normalization:
            # Standardization branch: only the global statistics are
            # needed; the features themselves are returned untouched.
            stats = standardize(train_matrix=global_subset)
            return stats['std'], stats['mean'], train_features

        # Normalization branch: derive global parameters first, then
        # apply them to the training features.
        stats = normalize(train_matrix=global_subset)
        s_feat = stats['dif']
        m_feat = stats['mean']

        rescaled = normalize(train_matrix=train_features,
                             test_matrix=None,
                             dif=s_feat,
                             mean=m_feat)

        return s_feat, m_feat, rescaled['train']
Example #3
0
    def scaling_data(self, train_features, train_targets, test_features, s_tar,
                     m_tar, s_feat, m_feat):
        """Scaling the data if requested.

        Parameters
        ----------
        train_features : array
            Independent data used to train model.
        train_targets : array
            Dependent data used to train model.
        test_features : array
            Independent data used to test the model.
        s_tar : array
            Standard deviation or (max-min), for the dependent train_targets.
        m_tar : array
            Mean for the dependent train_targets.
        s_feat : array
            Standard deviation or (max-min), for the independent
            train_features.
        m_feat : array
            Mean for the independent train_features.
        """
        # Restrict both matrices to the active feature columns.
        train_features = train_features[:, :self.features]
        test_features = test_features[:, :self.features]

        if self.scale:
            if self.normalization:
                # Normalize: derive target parameters when not supplied.
                if s_tar is None or m_tar is None:
                    scaled_tar = target_normalize(target=train_targets)
                    s_tar = scaled_tar['dif']
                    m_tar = scaled_tar['mean']
                    train_targets = scaled_tar['target']

                scaled = normalize(train_matrix=train_features,
                                   test_matrix=test_features,
                                   dif=s_feat,
                                   mean=m_feat)
                train_features = scaled['train']
                test_features = scaled['test']
                s_feat = scaled['dif']
                m_feat = scaled['mean']
            else:
                # Standardize: derive target parameters when not supplied.
                if s_tar is None or m_tar is None:
                    scaled_tar = target_standardize(target=train_targets)
                    s_tar = scaled_tar['std']
                    m_tar = scaled_tar['mean']
                    train_targets = scaled_tar['target']

                scaled = standardize(train_matrix=train_features,
                                     test_matrix=test_features,
                                     std=s_feat,
                                     mean=m_feat)
                train_features = scaled['train']
                test_features = scaled['test']
                s_feat = scaled['std']
                m_feat = scaled['mean']

        return (s_tar, m_tar, s_feat, m_feat, train_targets, train_features,
                test_features)
def clean_features(features, scale=False):
    """Remove non-finite rows, then clean and optionally scale features.

    Parameters
    ----------
    features : dict
        Dictionary holding a 'train' feature matrix and optionally a
        'test' feature matrix.
    scale : bool
        If True, standardize the cleaned features.

    Returns
    -------
    features : dict
        The cleaned (and optionally standardized) feature data, as
        returned by the downstream cleaning helpers.
    remove_indices : dict
        Row indices removed from the 'train' and 'test' feature sets.
    """
    remove_indices = {
        'train': np.array([], dtype=int),
        'test': np.array([], dtype=int)
    }
    for key, feature_set in features.items():
        if feature_set is None or len(feature_set) == 0:
            continue
        # Indices of rows containing any non-finite (NaN/inf) entries.
        # The original iterated the tuple returned by np.where and
        # compared with `== False`; this extracts the index array
        # directly, which is what the loop effectively removed.
        bad_structure_indices = \
            np.where(~np.isfinite(feature_set).all(axis=1))[0]
        remove_indices[key] = np.append(remove_indices[key],
                                        bad_structure_indices)

        features[key] = np.delete(feature_set, remove_indices[key], axis=0)

    if 'test' not in features:
        features['test'] = None

    # Finite features
    features = clean_infinite(features['train'], features['test'])

    # Clean variance
    features = clean_variance(features['train'], features['test'])

    # Clean skewness & standardize
    if features['test'] is None or len(features['test']) == 0:
        features = clean_skewness(features['train'], skewness=3)
        if scale:
            features = standardize(features['train'])

    else:
        features = clean_skewness(features['train'],
                                  features['test'],
                                  skewness=3)
        if scale:
            features = standardize(features['train'], features['test'])

    return features, remove_indices
Example #5
0
    def test_scale(self):
        """Test data scaling functions."""
        train_features, train_targets, test_features, _ = self.get_data()

        # Local vs. global standardization should produce different output.
        local_std = standardize(train_matrix=train_features,
                                test_matrix=test_features)
        global_std = standardize(train_matrix=train_features,
                                 test_matrix=test_features,
                                 local=False)
        self.assertFalse(np.allclose(local_std['train'], global_std['train']))

        # Same expectation for normalization.
        local_norm = normalize(train_matrix=train_features,
                               test_matrix=test_features)
        global_norm = normalize(train_matrix=train_features,
                                test_matrix=test_features,
                                local=False)
        self.assertFalse(np.allclose(local_norm['train'],
                                     global_norm['train']))

        # And for min-max scaling.
        local_mm = min_max(train_matrix=train_features,
                           test_matrix=test_features)
        global_mm = min_max(train_matrix=train_features,
                            test_matrix=test_features,
                            local=False)
        self.assertFalse(np.allclose(local_mm['train'], global_mm['train']))

        # Unit-length scaling is per-row, so local and global agree.
        local_ul = unit_length(train_matrix=train_features,
                               test_matrix=test_features)
        global_ul = unit_length(train_matrix=train_features,
                                test_matrix=test_features,
                                local=False)
        self.assertTrue(np.allclose(local_ul['train'], global_ul['train']))

        # Each target scaler must actually change the targets.
        for scaler in (target_standardize, target_normalize, target_center):
            scaled = scaler(train_targets)
            self.assertFalse(np.allclose(scaled['target'], train_targets))
Example #6
0
    def train(self):
        """Scale the training features and targets.

        Returns
        -------
        feature_data : array
            The scaled features for the training data.
        target_data : array
            The scaled targets for the training data.
        """
        # Cache the full scaling dictionaries on the instance so the
        # scaling parameters stay available for later use.
        self.feature_data = standardize(train_matrix=self.train_features)
        self.target_data = target_standardize(target=self.train_targets)

        scaled_features = self.feature_data['train']
        scaled_targets = self.target_data['target']
        return scaled_features, scaled_targets
Example #7
0
    def _standardize_scalar(self, train_features, test_features):
        """Standardize the feature data, caching the scaling parameters.

        Parameters
        ----------
        train_features : array
            The array of training features.
        test_features : array
            The array of test features.
        """
        scaled = standardize(train_features, test_features)

        # Keep the fitted parameters so later data can be scaled the
        # same way.
        self.scale_mean, self.scale_std = scaled['mean'], scaled['std']

        return scaled['train'], scaled['test']
    def clean_standardize(self, df_train, df_test):
        """Standardize train and test matrices on the training scale.

        Parameters
        ----------
        df_train : array
            Training feature matrix.
        df_test : array
            Test feature matrix, scaled with the training parameters.
        """
        # | - clean_standardize
        scaled = standardize(
            df_train,
            test_matrix=df_test,
            mean=None,
            std=None,
            # COMBAK
            local=True)

        out_dict = {
            "df_train": scaled["train"],
            "df_test": scaled["test"],
        }

        return (out_dict)
Example #9
0
# Add random noise from a normal distribution to the target values.
# NOTE(review): assumes target and train are (train_points, 1) column
# vectors — confirm against the data generation above this chunk.
target += noise_magnitude * np.random.randn(train_points, 1)

# Generate test datapoints x.
# Column vector of test_points values spanning slightly beyond the
# range of the training inputs.
test_points = 513
test = np.vstack(
    np.linspace(np.min(train) - 0.1,
                np.max(train) + 0.1, test_points))

# Store standard deviations of the training data and targets.
stdx = np.std(train)
stdy = np.std(target)
# presumably the number of standard deviations used for uncertainty
# bounds later — verify against the plotting code below this chunk.
tstd = 2.

# Standardize the training and test data on the same scale.
std = standardize(train_matrix=train, test_matrix=test)
# Standardize the training targets.
train_targets = target_standardize(target)
# Note that predictions will now be made on the standardized scale.

# Store the known underlying function for plotting.
linex = np.linspace(np.min(test), np.max(test), test_points)
liney = afunc(linex)
# Plotting.
fig = plt.figure(figsize=(15, 8))
# Subplot grid depends on which optional GP backends are available:
# 2x3 with six panels if GPy or GPflow is installed, otherwise 2x2.
if haz_gpy or haz_gpflow:
    grid = 230
    last = 6
else:
    grid = 220
    last = 4
def scale_test(train_matrix, train_targets, test_matrix):
    """Test data scaling functions."""
    # Standardize features (train and test on the training scale) and
    # the targets, returning only the scaled arrays.
    scaled_features = standardize(train_matrix=train_matrix,
                                  test_matrix=test_matrix)
    scaled_targets = target_standardize(train_targets)
    return (scaled_features['train'],
            scaled_targets['target'],
            scaled_features['test'])
# Add noise (normal distributed)
# NOTE(review): assumes y_train is (Ntrain, 1) so the noise term
# broadcasts elementwise — confirm against the data generation above.
y_train = y_train + A_noise * np.random.randn(Ntrain, 1)

# Generate test datapoints (around x_train)
# Column vector of Ntest points spanning slightly beyond the training
# input range.
Ntest = 50
x_test = np.vstack(
    np.linspace(np.min(x_train) - 0.1,
                np.max(x_train) + 0.1, Ntest))

# standard deviations of training data and targets
std_dev_x = np.std(x_train)
std_dev_y = np.std(y_train)
# presumably the number of standard deviations for uncertainty bounds
# used further on — verify downstream.
tstd = 2.0

# Standardize training and test data
# Both matrices are scaled with the training mean/std.
std_data = standardize(train_matrix=x_train, test_matrix=x_test)
# In [8]: std_data.keys()
# Out[8]: dict_keys(['mean', 'std', 'train', 'test'])

# Plot the underlying function at the sorted standardized training inputs.
plt.clf()
x = np.copy(std_data["train"][:, 0])
x.sort()
plt.plot(x, my_func(x), marker="o")
plt.savefig("IMG_train_data.pdf")

# Same plot for the standardized test inputs.
x = np.copy(std_data["test"][:, 0])
x.sort()
plt.plot(x, my_func(x), marker="o")
plt.savefig("IMG_test_data.pdf")

print("Pass here")