def _add_extra_fields(self, aws_access_key=None, aws_secret_key=None):
    """Fill in any dataset summary attributes that are still ``None``.

    Loads the training data with ``load_data`` and, only for attributes
    that are currently ``None``, derives:

    * ``n_examples`` -- number of rows in the data.
    * ``k_classes`` -- number of distinct values in the class column.
    * ``d_features`` -- number of non-class columns, plus, for every
      ``object``-dtype feature column, its number of distinct values
      minus one.
    * ``majority`` -- share of rows that carry the most frequent class.
    * ``size_kb`` -- ``nbytes`` of the data converted to a NumPy array,
      divided by 1000 and truncated to an int.

    Attributes that already hold a value are left untouched.

    Args:
        aws_access_key: Forwarded unchanged to ``load_data``.
        aws_secret_key: Forwarded unchanged to ``load_data``.
    """
    # NOTE(review): the result is used like a pandas DataFrame
    # (.shape, .columns, .value_counts()) -- confirm against load_data.
    data = load_data(self.name, self.train_path, aws_access_key, aws_secret_key)

    if self.n_examples is None:
        self.n_examples = len(data)

    if self.k_classes is None:
        self.k_classes = len(np.unique(data[self.class_column]))

    if self.d_features is None:
        # Every column except the class column counts as one feature.
        total_features = data.shape[1] - 1
        for column in data.columns:
            # The class column is a label, not a feature: it was already
            # excluded by the "- 1" above, so its distinct values must not
            # be added back in when the labels happen to be strings.
            if column == self.class_column:
                continue
            if data[column].dtype == 'object':
                # Presumably mirrors one-hot encoding: a column with k
                # distinct values contributes k features instead of 1.
                total_features += len(np.unique(data[column])) - 1

        self.d_features = total_features

    if self.majority is None:
        counts = data[self.class_column].value_counts()
        self.majority = float(max(counts)) / float(sum(counts))

    if self.size_kb is None:
        self.size_kb = int(np.array(data).nbytes / 1000)
def load(self, test_size=0.3, random_state=0, aws_access_key=None, aws_secret_key=None):
    """Load the dataset and return its train and test portions.

    The training data is always loaded with ``load_data`` from
    ``self.train_path``. If ``self.test_path`` is set, the test data is
    loaded from it under a name derived from ``self.name``: a trailing
    ``.csv`` becomes ``_test.csv``, otherwise ``_test`` is appended.
    If ``self.test_path`` is not set, the training data is split with
    ``train_test_split`` instead.

    Args:
        test_size: Forwarded to ``train_test_split``; unused when
            ``self.test_path`` is set.
        random_state: Forwarded to ``train_test_split``; unused when
            ``self.test_path`` is set.
        aws_access_key: Forwarded unchanged to ``load_data``.
        aws_secret_key: Forwarded unchanged to ``load_data``.

    Returns:
        ``(data, test_data)`` when ``self.test_path`` is set, otherwise
        whatever ``train_test_split`` returns for the loaded data.
    """
    data = load_data(self.name, self.train_path, aws_access_key, aws_secret_key)

    if not self.test_path:
        return train_test_split(data, test_size=test_size, random_state=random_state)

    extension = '.csv'
    if self.name.endswith(extension):
        # Rewrite only the trailing extension; str.replace would also
        # alter any earlier ".csv" occurring inside the name.
        test_name = self.name[:-len(extension)] + '_test.csv'
    else:
        test_name = self.name + '_test'

    test_data = load_data(test_name, self.test_path, aws_access_key, aws_secret_key)
    return data, test_data