def target_correlation(train, target,
                       correlation=('pearson', 'spearman', 'kendall')):
    """Return the correlation of all columns of train with a target feature.

    Parameters
    ----------
    train : array
        n by d training data matrix.
    target : list
        target for correlation.
    correlation : sequence of str
        Correlation measures to compute; any of 'pearson', 'spearman'
        and 'kendall'.

    Returns
    -------
    metric : list
        len(correlation) by d matrix of correlation coefficients.

    Raises
    ------
    ValueError
        If an unknown correlation measure is requested.
    """
    # Dispatch table instead of an if/elif chain comparing strings with
    # `is` (identity), which is implementation-dependent and a bug.
    measures = {'pearson': pearsonr,
                'spearman': spearmanr,
                'kendall': kendalltau}

    # Scale and shape the data.
    train_data = standardize(train_matrix=train)['train']
    train_target = target

    output = []
    for name in correlation:
        try:
            func = measures[name]
        except KeyError:
            # Fail loudly rather than silently producing an empty row.
            raise ValueError('Unknown correlation measure: {}'.format(name))
        # Correlate every feature column with the target.
        output.append([func(d, train_target)[0] for d in train_data.T])

    return output
def globalscaling(self, globalscaledata, train_features):
    """Scale all sub-groups of the training data on the same scale.

    Parameters
    ----------
    globalscaledata : string
        The data from which the global scaling parameters are derived.
    train_features : array
        Feature matrix, rescaled with the global parameters only when
        normalization is enabled.
    """
    global_block = globalscaledata[:, :self.features]

    if not self.normalization:
        # Standardization branch: only derive the scaling statistics;
        # the feature matrix itself is returned unchanged here.
        stats = standardize(train_matrix=global_block)
        s_feat, m_feat = stats['std'], stats['mean']
    else:
        stats = normalize(train_matrix=global_block)
        s_feat, m_feat = stats['dif'], stats['mean']
        # Re-apply the globally derived parameters to the features.
        rescaled = normalize(train_matrix=train_features, test_matrix=None,
                             dif=s_feat, mean=m_feat)
        train_features = rescaled['train']

    return s_feat, m_feat, train_features
def scaling_data(self, train_features, train_targets, test_features,
                 s_tar, m_tar, s_feat, m_feat):
    """Scale the data if requested.

    Parameters
    ----------
    train_features : array
        Independent data used to train the model.
    train_targets : array
        Dependent data used to train the model.
    test_features : array
        Independent data used to test the model.
    s_tar : array
        Standard deviation or (max - min) for the dependent train_targets.
    m_tar : array
        Mean for the dependent train_targets.
    s_feat : array
        Standard deviation or (max - min) for the independent
        train_features.
    m_feat : array
        Mean for the independent train_features.

    Returns
    -------
    tuple
        (s_tar, m_tar, s_feat, m_feat, train_targets, train_features,
        test_features) after any requested scaling.
    """
    # Truncate both feature matrices to the configured feature count.
    n_feat = self.features
    train_features = train_features[:, :n_feat]
    test_features = test_features[:, :n_feat]

    if self.scale:
        # Target statistics are (re)derived only when not supplied.
        need_target_stats = s_tar is None or m_tar is None
        if self.normalization:
            if need_target_stats:
                scaled = target_normalize(target=train_targets)
                s_tar = scaled['dif']
                m_tar = scaled['mean']
                train_targets = scaled['target']
            scaled = normalize(train_matrix=train_features,
                               test_matrix=test_features,
                               dif=s_feat, mean=m_feat)
            train_features = scaled['train']
            test_features = scaled['test']
            s_feat = scaled['dif']
            m_feat = scaled['mean']
        else:
            if need_target_stats:
                scaled = target_standardize(target=train_targets)
                s_tar = scaled['std']
                m_tar = scaled['mean']
                train_targets = scaled['target']
            scaled = standardize(train_matrix=train_features,
                                 test_matrix=test_features,
                                 std=s_feat, mean=m_feat)
            train_features = scaled['train']
            test_features = scaled['test']
            s_feat = scaled['std']
            m_feat = scaled['mean']

    return (s_tar, m_tar, s_feat, m_feat, train_targets, train_features,
            test_features)
def clean_features(features, scale=False):
    """Clean feature matrices and drop rows with multiple bad values.

    Parameters
    ----------
    features : dict
        Dictionary with 'train' (and optionally 'test') feature matrices.
    scale : bool
        Whether to standardize the cleaned features.

    Returns
    -------
    features : dict
        The cleaned (and optionally standardized) feature data.
    remove_indices : dict
        Row indices removed from the 'train' and 'test' matrices.
    """
    remove_indices = {
        'train': np.array([], dtype=int),
        'test': np.array([], dtype=int)
    }
    for key, feature_set in features.items():
        if feature_set is None or len(feature_set) == 0:
            continue
        # Row indices containing at least one non-finite value. The
        # original iterated the tuple returned by np.where, so `b` was
        # the whole index array and the per-row count below never ran.
        bad_rows = np.where(~np.isfinite(feature_set).all(axis=1))[0]
        for b in bad_rows:
            # Remove a row only when MORE than one entry is non-finite;
            # single bad entries are left for clean_infinite below.
            if np.count_nonzero(~np.isfinite(feature_set[b])) > 1:
                remove_indices[key] = np.append(remove_indices[key], b)
        features[key] = np.delete(feature_set, remove_indices[key], axis=0)

    if 'test' not in features:
        features['test'] = None

    # Finite features
    features = clean_infinite(features['train'], features['test'])
    # Clean variance
    features = clean_variance(features['train'], features['test'])
    # Clean skewness & standardize
    if features['test'] is None or len(features['test']) == 0:
        features = clean_skewness(features['train'], skewness=3)
        if scale:
            features = standardize(features['train'])
    else:
        features = clean_skewness(features['train'], features['test'],
                                  skewness=3)
        if scale:
            features = standardize(features['train'], features['test'])

    return features, remove_indices
def test_scale(self):
    """Test data scaling functions."""
    train_features, train_targets, test_features, _ = self.get_data()

    # Local and global scaling should produce different results for
    # these column-wise scalers. Order matches the individual checks:
    # standardize, normalize, min_max.
    for scaler in (standardize, normalize, min_max):
        local = scaler(train_matrix=train_features,
                       test_matrix=test_features)
        global_ = scaler(train_matrix=train_features,
                         test_matrix=test_features, local=False)
        self.assertFalse(np.allclose(local['train'], global_['train']))

    # Unit-length scaling acts per row, so locality makes no difference.
    local = unit_length(train_matrix=train_features,
                        test_matrix=test_features)
    global_ = unit_length(train_matrix=train_features,
                          test_matrix=test_features, local=False)
    self.assertTrue(np.allclose(local['train'], global_['train']))

    # Each target scaler must actually alter the target values.
    for target_scaler in (target_standardize, target_normalize,
                          target_center):
        scaled = target_scaler(train_targets)
        self.assertFalse(np.allclose(scaled['target'], train_targets))
def train(self):
    """Scale the training features and targets.

    Returns
    -------
    feature_data : array
        The scaled features for the training data.
    target_data : array
        The scaled targets for the training data.
    """
    scaled_features = standardize(train_matrix=self.train_features)
    scaled_targets = target_standardize(target=self.train_targets)
    # Keep the full scaling dictionaries so the statistics remain
    # available for transforming new data later.
    self.feature_data = scaled_features
    self.target_data = scaled_targets
    return scaled_features['train'], scaled_targets['target']
def _standardize_scalar(self, train_features, test_features):
    """Standardize the feature data.

    Parameters
    ----------
    train_features : array
        The array of training features.
    test_features : array
        The array of test features.

    Returns
    -------
    tuple
        The standardized (train, test) feature arrays.
    """
    scaled = standardize(train_features, test_features)
    # Store the scaling statistics for later use on new data.
    self.scale_mean, self.scale_std = scaled['mean'], scaled['std']
    return scaled['train'], scaled['test']
def clean_standardize(self, df_train, df_test):
    """Standardize train and test features on the training statistics."""
    # | - clean_standardize
    scaled = standardize(
        df_train,
        test_matrix=df_test,
        mean=None,
        std=None,
        # COMBAK
        local=True)

    return {
        "df_train": scaled["train"],
        "df_test": scaled["test"],
    }
# Add random noise from a normal distribution to the target values. target += noise_magnitude * np.random.randn(train_points, 1) # Generate test datapoints x. test_points = 513 test = np.vstack( np.linspace(np.min(train) - 0.1, np.max(train) + 0.1, test_points)) # Store standard deviations of the training data and targets. stdx = np.std(train) stdy = np.std(target) tstd = 2. # Standardize the training and test data on the same scale. std = standardize(train_matrix=train, test_matrix=test) # Standardize the training targets. train_targets = target_standardize(target) # Note that predictions will now be made on the standardized scale. # Store the known underlying function for plotting. linex = np.linspace(np.min(test), np.max(test), test_points) liney = afunc(linex) # Plotting. fig = plt.figure(figsize=(15, 8)) if haz_gpy or haz_gpflow: grid = 230 last = 6 else: grid = 220 last = 4
def scale_test(train_matrix, train_targets, test_matrix):
    """Standardize features and targets for a train/test split.

    Parameters
    ----------
    train_matrix : array
        Training feature matrix.
    train_targets : array
        Training target values.
    test_matrix : array
        Test feature matrix, scaled with the training statistics.

    Returns
    -------
    tuple
        Scaled (train features, train targets, test features).
    """
    scaled_features = standardize(train_matrix=train_matrix,
                                  test_matrix=test_matrix)
    scaled_targets = target_standardize(train_targets)
    return (scaled_features['train'], scaled_targets['target'],
            scaled_features['test'])
# Add noise (normal distributed) y_train = y_train + A_noise * np.random.randn(Ntrain, 1) # Generate test datapoints (around x_train) Ntest = 50 x_test = np.vstack( np.linspace(np.min(x_train) - 0.1, np.max(x_train) + 0.1, Ntest)) # standard deviations of training data and targets std_dev_x = np.std(x_train) std_dev_y = np.std(y_train) tstd = 2.0 # Standardize training and test data std_data = standardize(train_matrix=x_train, test_matrix=x_test) # In [8]: std_data.keys() # Out[8]: dict_keys(['mean', 'std', 'train', 'test']) plt.clf() x = np.copy(std_data["train"][:, 0]) x.sort() plt.plot(x, my_func(x), marker="o") plt.savefig("IMG_train_data.pdf") x = np.copy(std_data["test"][:, 0]) x.sort() plt.plot(x, my_func(x), marker="o") plt.savefig("IMG_test_data.pdf") print("Pass here")