Example #1
import random

import dask.array as da
import dask_ml.cluster
import numpy as np
# The snippet does not show where train_test_split comes from; dask_ml's
# version is assumed here since X is a dask array.
from dask_ml.model_selection import train_test_split

# `prepare_dataset`, `chunks`, `Dataset`, and `config` are project helpers not shown here.


def process_data(X, y=None, test_size=0.2):
    if y is None:
        # No labels given: cluster the flattened values into 10 pseudo-labels.
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_.reshape(-1, 1)  # labels_ is 1-D; keep y 2-D so y[:, 0] below works
    y_uniqs = np.unique(y[:, 0])

    len_ = X.shape[0]
    X = prepare_dataset(X)

    shape_ = list(X.shape[1:])

    if test_size != 0:
        # Build a small balanced sample set: for each label, draw one random
        # representative from each chunk (two passes over the labels).
        samples = list()
        samples_labels = list()
        print('Preparing samples ...')
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = list()
                label = list()
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    try:
                        matches = xa[ya == y_uniq]
                        sample.append([matches[random.randint(0, len(matches) - 1)]])
                        label.append(y_uniq)
                        if len(sample) >= len(y_uniqs):
                            break
                    except (IndexError, ValueError):
                        # Chunk has no rows for this label; skip it.
                        pass
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)

    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)

        train_dataset = Dataset(X, y)
        return train_dataset

    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X.flatten().reshape(len_, -1), y,
            test_size=test_size, random_state=config.seeds)

        X_train = X_train.reshape([X_train.shape[0]] + shape_)
        X_test = X_test.reshape([X_test.shape[0]] + shape_)

        print('Training dataset shape: ', X_train.shape)
        print('Validation dataset shape: ', X_test.shape)

        train_dataset = Dataset(X_train, y_train)
        test_dataset = Dataset(X_test, y_test)

        train_dataset.samples = samples
        train_dataset.samples_labels = samples_labels

        print('Sample dataset shape: ', train_dataset.samples.shape)
        return train_dataset, test_dataset
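
Both examples depend on a few project helpers that the snippets do not define. The stand-ins below are a minimal sketch, purely assumptions made so the code can be read (and tried) end to end: `chunks` as a fixed-count slicer that materializes each slice, `prepare_dataset` as a simple scaler, and `Dataset` as a thin feature/label container. The real project versions may differ.

import dask.array as da
import numpy as np


def chunks(arr, n):
    """Hypothetical helper: yield `arr` in `n` roughly equal, in-memory slices."""
    step = max(1, len(arr) // n)
    for i in range(0, len(arr), step):
        yield np.asarray(arr[i:i + step])  # materialize so boolean masking and len() work


def prepare_dataset(X):
    """Hypothetical helper: cast to float32 and scale values into [0, 1]."""
    X = X.astype('float32')
    return X / da.max(X)


class Dataset:
    """Hypothetical container pairing features with labels."""

    def __init__(self, X, y):
        self.X = X
        self.y = y
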
Example #2

import random

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask_ml.model_selection import train_test_split  # assumed source, as above

# `prepare_dataset`, `chunks`, and `Dataset` are project helpers not shown here.


def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        # No labels given: fall back to a single dummy class.
        y = da.ones(X.shape[0])
    y_uniqs = np.unique(y)

    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        # One-hot encode the labels; note dd.get_dummies expects a
        # categorical dask Series, so y must be categorical at this point.
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])

    # Build a sample set: for each label, draw one random representative from
    # each chunk, repeated over ten passes (capped at 500 per label).
    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    matches = xa[ya == y_uniq]
                    sample.append([matches[random.randint(0, len(matches) - 1)]])
                    if len(sample) >= 500:
                        break
                except (IndexError, ValueError):
                    # Chunk has no rows for this label; skip it.
                    pass
            samples += sample
    samples = da.vstack(samples)

    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)

    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
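
A quick usage sketch for the second variant, assuming the stand-in helpers above; the toy data and shapes are illustrative only, not from the original project.

import dask.array as da
import numpy as np

# Toy data: 1,000 grayscale 28x28 "images" with 10 integer class labels.
X = da.random.random((1000, 28, 28), chunks=(100, 28, 28))
y = da.from_array(np.random.randint(0, 10, size=1000), chunks=100)

train_dataset, test_dataset = process_data(X, y, test_size=0.2)
print(train_dataset.X.shape, test_dataset.X.shape)
print(train_dataset.samples.shape)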