def process_data(X, y=None, test_size=0.2):
    """Build training (and optionally validation) ``Dataset`` objects from ``X``.

    If ``y`` is not given, pseudo-labels are derived by KMeans clustering of the
    flattened values of ``X``. A per-class sample pool is drawn (two passes over
    the unique labels) and attached to the training dataset as ``samples`` /
    ``samples_labels``.

    Parameters
    ----------
    X : array-like (dask/numpy)
        Input data; passed through ``prepare_dataset`` before splitting.
    y : array-like or None
        Label array; column 0 is used for class identities. When ``None``,
        labels are produced by a 10-cluster KMeans fit.
    test_size : float
        Fraction held out for validation. ``0`` disables the split and the
        sample-pool construction.

    Returns
    -------
    Dataset or (Dataset, Dataset)
        ``train_dataset`` alone when ``test_size == 0``, otherwise
        ``(train_dataset, test_dataset)``.
    """
    if y is None:
        # Derive pseudo-labels by clustering the flattened values.
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        # BUG FIX: labels_ is 1-D; reshape to a column so y[:, 0] below is valid.
        y = km.labels_.reshape(-1, 1)
    y_uniqs = np.unique(y[:, 0])
    len_ = X.shape[0]
    X = prepare_dataset(X)
    shape_ = list(X.shape[1:])  # per-sample shape, restored after the flat split
    if test_size != 0:
        samples = []
        samples_labels = []
        print('Preparing samples ...')
        # Two passes over the classes; from each chunk pick one random member
        # of the current class (best-effort: chunks without members are skipped).
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = []
                label = []
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    candidates = xa[ya == y_uniq]
                    if len(candidates) == 0:
                        # No members of this class in the chunk; previously this
                        # was silently swallowed by a bare `except`.
                        continue
                    sample.append([candidates[random.randrange(len(candidates))]])
                    label.append(y_uniq)
                    if len(sample) >= len(y_uniqs):
                        break
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)
    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)
        train_dataset = Dataset(X, y)
        return train_dataset
    else:
        # Flatten to 2-D for the split, then restore the original sample shape.
        X_train, X_test, y_train, y_test = train_test_split(
            X.flatten().reshape(len_, -1), y,
            test_size=test_size, random_state=config.seeds)
        X_train = X_train.reshape([X_train.shape[0]] + shape_)
        X_test = X_test.reshape([X_test.shape[0]] + shape_)
        print('Training dataset shape: ', X_train.shape)
        print('Validation dataset shape: ', X_test.shape)
        train_dataset = Dataset(X_train, y_train)
        test_dataset = Dataset(X_test, y_test)
        train_dataset.samples = samples
        train_dataset.samples_labels = samples_labels
        print('Sample dataset shape: ', train_dataset.samples.shape)
        return train_dataset, test_dataset
def process_data(X, y=None, test_size=0.20, dummies=False):
    """Split ``X``/``y`` into training and validation ``Dataset`` objects.

    NOTE(review): this re-definition shadows the earlier ``process_data`` in
    this module at import time — confirm both are intended to coexist.

    A sample pool (up to 500 entries per class, over ten passes) is drawn and
    attached to the training dataset as ``samples``.

    Parameters
    ----------
    X : array-like (dask/numpy)
        Input data; passed through ``prepare_dataset`` before splitting.
    y : array-like or None
        1-D label array. When ``None``, a constant all-ones label vector is
        used (single pseudo-class).
    test_size : float
        Fraction held out for validation.
    dummies : bool
        When ``True``, one-hot encode ``y`` via ``dd.get_dummies``.

    Returns
    -------
    (Dataset, Dataset)
        ``(train_dataset, test_dataset)``.
    """
    if y is None:
        y = da.ones(X.shape[0])  # single pseudo-class when no labels supplied
    y_uniqs = np.unique(y)
    len_ = X.shape[0]
    X = prepare_dataset(X)
    if dummies:
        y = dd.get_dummies(y)
    shape_ = list(X.shape[1:])  # per-sample shape, restored after the flat split
    samples = []
    # Ten passes over the classes; from each chunk pick one random member of
    # the current class (best-effort: chunks without members are skipped).
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = []
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                candidates = xa[ya == y_uniq]
                if len(candidates) == 0:
                    # No members of this class in the chunk; previously this
                    # was silently swallowed by a bare `except`.
                    continue
                sample.append([candidates[random.randrange(len(candidates))]])
                if len(sample) >= 500:
                    break
            samples += sample
    samples = da.vstack(samples)
    # Flatten to 2-D for the split, then restore the original sample shape.
    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)
    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)
    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)
    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset