def main():
    dataset = 'italypower'
    train_size = 1000

    D = get_dataset(dataset, path_dataset)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records.
    if len(X_train) > train_size:
        idx = np.random.choice(len(X_train), size=train_size, replace=False)
        X_train = X_train[idx]

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    # Z = rand_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)
    # Z = supp_neighborhood_generation(x_dt, 0.0, n_samples=1000, indpb=0.5)
    # Z = norm_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)

    # Compare the training mean against the neighborhood mean (Z holds record
    # objects, so average their underlying arrays rather than the records).
    print(len(Z), np.mean(X_train), np.mean([z.data for z in Z]))

    # Plot the instance to explain (thick line) against a few neighbors.
    plt.plot(x_dt.data.tolist(), lw=5)
    for i in range(10):
        plt.plot(Z[i * 2].data.tolist())
    plt.show()
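# Hedged illustration of what `window_shape` and `step` presumably control in
# TimeSeriesRecord: a plain sliding-window split using numpy's own helper, not
# the repo's actual implementation.
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

series = np.arange(24, dtype=float)
windows = sliding_window_view(series, window_shape=5)[::2]  # window 5, step 2
print(windows.shape)  # (10, 5)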
def main():
    method = 'dang'
    for dataset in tab_datasets:
        print(datetime.datetime.now(), 'Dataset: %s' % dataset)
        D = get_dataset(dataset, path_dataset, normalize)
        run_experiment(D, dataset, method)
        print('')
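# Minimal entry-point guard, assumed rather than taken from the original file,
# so the driver above runs when the script is executed directly:
if __name__ == '__main__':
    main()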
def main():
    dataset = 'diabetes'
    epochs = 300
    train_size = 0.7

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, normalize=None)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    n_features = D['n_features']
    feature_names = D['feature_names']
    class_name = D['class_name']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)

    # Append the encoded label as the last column (index n_features).
    Xy_train = np.hstack((X_train, y_train.reshape(-1, 1)))

    print(datetime.datetime.now(), 'Training CTGAN')
    ctgan = CTGANSynthesizer(embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256),
                             l2scale=1e-6, batch_size=500)
    ts = time.time()
    # The class label sits at column index n_features; the original code passed
    # n_features + 1, which points one past the last column.
    ctgan.fit(Xy_train, epochs=epochs, discrete_columns=[n_features])
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(Xy_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    Xy_fake = ctgan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    # Debug: compare fake vs. real column statistics.
    # print('F 0', np.mean(Xy_fake[:, 0]), np.min(Xy_fake[:, 0]), np.max(Xy_fake[:, 0]))
    # print('R 0', np.mean(X_train[:, 0]), np.min(X_train[:, 0]), np.max(X_train[:, 0]))

    print(datetime.datetime.now(), 'Storing synthetic data')
    df = pd.DataFrame(data=Xy_fake, columns=feature_names + [class_name])
    df.to_csv(path_syht_dataset + '%s.csv' % dataset, index=False)

    # Build a real-vs-fake discrimination task: label 1 for real rows, 0 for fake.
    X_fake = Xy_fake[:, :-1]
    X_real = X_train
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'tabular.json')
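# `clf_list` is defined elsewhere in the repo; a hypothetical configuration for
# the real-vs-fake discriminators, assuming scikit-learn estimators:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

clf_list_example = {
    'RF': RandomForestClassifier(n_estimators=100),
    'LR': LogisticRegression(max_iter=1000),
}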
def main():
    dataset = 'wdbc'
    train_size = 1000

    D = get_dataset(dataset, path_dataset, normalize)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records.
    if len(X_train) > train_size:
        idx = np.random.choice(len(X_train), size=train_size, replace=False)
        X_train = X_train[idx]

    print(dataset, X_train.shape, data_type)

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)

    print(x_dt.data)
    print('----')
    for i in range(10):
        print(Z[i])
        print('----')
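# A quick, hedged sanity check of a generated neighborhood: how far the mean of
# the neighbors drifts from the training mean. `neighborhood_drift` is a
# hypothetical helper, assuming each record exposes a numeric `.data` array.
def neighborhood_drift(Z, X_train):
    z_mean = np.mean([np.asarray(z.data, dtype=float).mean() for z in Z])
    return abs(z_mean - np.asarray(X_train, dtype=float).mean())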
def main():
    # dataset = 'wdbc'
    # dataset = 'italypower'
    # dataset = 'mnist'
    dataset = '20newsgroups'

    D = get_dataset(dataset, path_dataset, normalize)
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    n_classes = D['n_classes']
    data_type = D['data_type']
    print(X_train.shape, data_type)

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        print(data_type, D['window_sizes'][0], D['window_steps'][0])
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        print(data_type)
        window_shape = (14, 14)  # D['window_sizes'][2]
        step = 7  # D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step)
                      for x in X_train[:1000]]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        print(data_type)
        window_shape = 3
        step = 1
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=100)
                      for x in X_train[:1000]]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step, text_length=100)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    # print(x_dt)
    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    print(len(Z))

    # Debug / visualization alternatives for the other modalities:
    # print(Z[3].data.shape)
    # plt.imshow(x_dt[0].data, cmap='gray'); plt.show()  # images
    # plt.imshow(Z[1].data, cmap='gray'); plt.show()
    # plt.plot(x_dt.data.tolist(), lw=5)                 # time series
    # for i in range(10):
    #     plt.plot(Z[i * 2].data.tolist())
    # plt.show()

    print(' '.join([term for term in x_dt.data if len(term) > 0]))
    print('----')
    for i in range(10):
        print(' '.join([term for term in Z[i].data if len(term) > 0]))
        print('----')
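# The "join the non-empty terms" idiom above recurs in these text demos; a
# small hypothetical helper that does the same thing:
def render_text(record):
    return ' '.join(term for term in record.data if len(term) > 0)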
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'arrowhead'
    epochs = 10100
    train_size = 0.7
    latent_dim = 32
    window = 3

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, normalize='standard')
    X_train, y_train, _, _ = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    n_timestamps = D['n_timestamps']
    n_classes = D['n_classes']

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    # y_test = le.transform(y_test)

    print(datetime.datetime.now(), 'Training CGAN')
    cigan = CGAN(n_timestamps, n_classes, latent_dim, window=window,
                 img_path=path_cgan_images + '/ts/%s_' % dataset, verbose=True)
    ts = time.time()
    cigan.fit(X_train, y_train, epochs=epochs, batch_size=32, sample_interval=100)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake = cigan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    print(datetime.datetime.now(), 'Storing synthetic data')
    np.save(path_syht_dataset + '%s' % dataset, X_fake)

    X_real = X_train
    # Debug: compare fake vs. real value ranges.
    # print('F', np.mean(X_fake), np.min(X_fake), np.max(X_fake))
    # print('R', np.mean(X_real), np.min(X_real), np.max(X_real))

    # Real-vs-fake discrimination task: label 1 for real series, 0 for generated.
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'time_series.json')
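# `normalize='standard'` presumably z-scores each timestamp; the scikit-learn
# equivalent, shown here only as a reference sketch on stand-in data:
import numpy as np
from sklearn.preprocessing import StandardScaler

X_demo = np.random.rand(100, 50)                 # stand-in (n_series, n_timestamps)
X_std = StandardScaler().fit_transform(X_demo)   # zero mean, unit variance per column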
def main():
    dataset = '20newsgroups'
    train_size = 1000
    text_length = 1000

    D = get_dataset(dataset, path_dataset, categories=['alt.atheism', 'talk.religion.misc'])
    X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    data_type = D['data_type']

    if data_type == 'txt':
        X_train, X_test = D['X_train_txt'], D['X_test_txt']

    # Subsample the training set to at most `train_size` records (X_train is a
    # list here, so use a set for O(1) membership tests).
    if len(X_train) > train_size:
        idx = set(np.random.choice(len(X_train), size=train_size, replace=False))
        X_train = [x for i, x in enumerate(X_train) if i in idx]

    # Wrap the raw data in the record type matching the data modality.
    idx = 0
    if data_type == 'tab':
        X_train_dt = [TabularRecord(x) for x in X_train]
        x_dt = TabularRecord(X_test[idx])
    elif data_type == 'ts':
        window_shape = D['window_sizes'][0]
        step = D['window_steps'][0]
        X_train_dt = [TimeSeriesRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = TimeSeriesRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'img':
        window_shape = D['window_sizes'][2]
        step = D['window_steps'][2][0]
        X_train_dt = [ImageRecord(x, window_shape=window_shape, step=step) for x in X_train]
        x_dt = ImageRecord(X_test[idx], window_shape=window_shape, step=step)
    elif data_type == 'txt':
        window_shape = 3
        step = 3
        X_train_dt = [TextRecord(x, window_shape=window_shape, step=step, text_length=text_length)
                      for x in X_train]
        x_dt = TextRecord(X_test[idx], window_shape=window_shape, step=step,
                          text_length=text_length)
    else:
        raise ValueError('Unknown data type %s' % data_type)

    Z = dang_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5,
                                     neighgen_op=neighgen_operators, base=None)
    # Z = rand_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)
    # Z = supp_neighborhood_generation(x_dt, ' ', n_samples=1000, indpb=0.5)
    # Z = mode_neighborhood_generation(x_dt, X_train_dt, n_samples=1000, indpb=0.5)

    print(len(Z))
    print(' '.join([term for term in x_dt.data if len(term) > 0]))
    print('----O------')
    for i in range(10):
        print(' '.join([term for term in Z[i].data if len(term) > 0]))
        print('----')
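# Hedged illustration of token windows with window_shape=3 and step=3
# (non-overlapping trigrams), mirroring what TextRecord presumably does:
tokens = 'the quick brown fox jumps over the lazy dog'.split()
trigrams = [tokens[i:i + 3] for i in range(0, len(tokens) - 2, 3)]
print(trigrams)  # [['the', 'quick', 'brown'], ['fox', 'jumps', 'over'], ['the', 'lazy', 'dog']]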
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'imdb'
    epochs = 100
    train_size = 0.7
    maxlen = 3
    step = 1

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    if dataset == '20newsgroups':
        categories = ['alt.atheism', 'talk.religion.misc']
    else:
        categories = None

    D = get_dataset(dataset, path_dataset, categories=categories)
    # X_train, y_train, X_test, y_test = D['X_train'], D['y_train'], D['X_test'], D['y_test']
    # n_classes = D['n_classes']
    # class_name = D['class_name']
    X_train_txt = D['X_train_txt']
    y_train = D['y_train']

    if dataset == 'imdb':
        X_train_txt, _, y_train, _ = train_test_split(X_train_txt, y_train,
                                                      train_size=1000, stratify=y_train)

    X_train_txt = clean_texts(X_train_txt, min_chars=10)
    text_lengths = [len(x) for x in X_train_txt]
    # X_test_txt = clean_texts(D['X_test_txt'], min_chars=10)

    # Turn the corpus into (context, next word) training pairs.
    X, y, words, words_indices, indices_words = texts2texts_nextword(
        X_train_txt, maxlen=maxlen, step=step)
    nbr_terms = len(words)

    # Seed texts used both for training snapshots and for sampling.
    start_texts_idx = np.random.choice(len(X_train_txt), min(1000, len(X_train_txt)),
                                       replace=False)
    start_texts = [X_train_txt[i] for i in start_texts_idx]

    print(datetime.datetime.now(), 'Training GAN')
    gan = GAN(nbr_terms, maxlen, latent_dim=100, term_indices=words_indices,
              indices_term=indices_words, txt_path=path_cgan_images + '/txt/%s_' % dataset,
              verbose=False, start_texts=start_texts, text_lengths=text_lengths)
    ts = time.time()
    gan.fit(X, y, epochs=epochs, batch_size=128)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train_txt)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake_txt = gan.sample(n=n_fake_instances, t=20, start_texts=start_texts,
                            diversity_list=[0.2, 0.5, 0.7, 1.0, 1.2])
    cgan_gen_time = time.time() - ts

    # Debug: inspect a few fake vs. real texts.
    # for f in X_fake_txt[:5]: print(f)
    # for r in X_train_txt[:5]: print(r)

    print(datetime.datetime.now(), 'Storing synthetic data')
    fout = open(path_syht_dataset + '%s.csv' % dataset, 'w')
    for txt in X_fake_txt:
        fout.write(txt + '\n')
    fout.close()

    # Real-vs-fake discrimination task: label 1 for real texts, 0 for generated.
    X_fake = X_fake_txt
    X_real = X_train_txt
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    # Texts must be vectorized before the discriminators can be fit.
    vectorizer = TfidfVectorizer(max_features=1000, stop_words=stopwords.words('english'))
    X_rf_train = vectorizer.fit_transform(X_rf_train).toarray()
    X_rf_test = vectorizer.transform(X_rf_test).toarray()

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'text.json')
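# Hedged sketch of the next-word training pairs texts2texts_nextword presumably
# builds from each text (maxlen=3, step=1): each X row is maxlen consecutive
# words and y is the word that follows. `make_nextword_pairs` is hypothetical,
# not the repo's actual helper.
def make_nextword_pairs(tokens, maxlen=3, step=1):
    X, y = [], []
    for i in range(0, len(tokens) - maxlen, step):
        X.append(tokens[i:i + maxlen])
        y.append(tokens[i + maxlen])
    return X, y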
def main():
    print(K.tensorflow_backend._get_available_gpus())

    dataset = 'cifar10'
    epochs = 20000
    train_size = 0.7
    categories = None
    latent_dim = 100

    print(datetime.datetime.now(), 'Dataset: %s' % dataset)
    D = get_dataset(dataset, path_dataset, categories=categories)
    X_train, y_train, _, _ = D['X_train'], D['y_train'], D['X_test'], D['y_test']

    # Rescale pixels from [0, 1] to [-1, 1] for the tanh generator output.
    X_train = X_train * 255.0
    X_train = (X_train - 127.5) / 127.5

    n_classes = D['n_classes']
    # n_features = D['n_features']
    # feature_names = D['feature_names']
    # class_name = D['class_name']
    img_rows, img_cols = D['w'], D['h']
    channels = 1  # assumes get_dataset returns single-channel (grayscale) images

    le = LabelEncoder()
    le.fit(y_train)
    y_train = le.transform(y_train)
    # y_test = le.transform(y_test)

    print(datetime.datetime.now(), 'Training CGAN')
    cigan = CGAN(img_rows, img_cols, channels, n_classes, latent_dim,
                 img_path=path_cgan_images + '/imgs/%s_' % dataset, verbose=True)
    ts = time.time()
    cigan.fit(X_train, y_train, epochs=epochs, batch_size=32, sample_interval=100)
    cgan_fit_time = time.time() - ts

    n_fake_instances = len(X_train)
    print(datetime.datetime.now(), 'Generating synthetic data')
    ts = time.time()
    X_fake = cigan.sample(n_fake_instances)
    cgan_gen_time = time.time() - ts

    print(datetime.datetime.now(), 'Storing synthetic data')
    np.save(path_syht_dataset + '%s' % dataset, X_fake)

    # Flatten images so the discriminators below can consume them.
    s0, s1, s2 = X_fake.shape
    X_fake = X_fake.reshape(s0, s1 * s2)
    X_real = X_train.reshape(s0, s1 * s2)
    # print(np.mean(X_fake[0]), np.min(X_fake[0]), np.max(X_fake[0]))
    # print(np.mean(X_real[0]), np.min(X_real[0]), np.max(X_real[0]))

    # Real-vs-fake discrimination task: label 1 for real images, 0 for generated.
    y_real = np.ones(len(X_real))
    y_fake = np.zeros(len(X_fake))
    X_rf = np.concatenate([X_real, X_fake])
    y_rf = np.concatenate([y_real, y_fake])
    X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
        X_rf, y_rf, train_size=train_size, stratify=y_rf)

    res_dict = dict()
    for clf_name, clf in clf_list.items():
        print(datetime.datetime.now(), 'Training %s' % clf_name)
        ts = time.time()
        clf.fit(X_rf_train, y_rf_train)
        disc_fit_time = time.time() - ts
        pickle.dump(clf, open(path_discr + '%s_%s.pickle' % (dataset, clf_name), 'wb'))

        y_pred_train = clf.predict(X_rf_train)
        y_pred_test = clf.predict(X_rf_test)
        acc_train = accuracy_score(y_rf_train, y_pred_train)
        acc_test = accuracy_score(y_rf_test, y_pred_test)
        res_dict['%s_acc_train' % clf_name] = acc_train
        res_dict['%s_acc_test' % clf_name] = acc_test
        res_dict['%s_disc_fit_time' % clf_name] = disc_fit_time
        print(datetime.datetime.now(), '\taccuracy %.3f, %.3f' % (acc_train, acc_test))

    res_dict['dataset'] = dataset
    res_dict['cgan_fit_time'] = cgan_fit_time
    res_dict['cgan_gen_time'] = cgan_gen_time

    print(datetime.datetime.now(), 'Storing evaluation')
    store_result(res_dict, path_ctgan_eval + 'images.json')
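# The two-step rescale above (multiply by 255, then shift/scale by 127.5) maps
# [0, 1] pixels onto [-1, 1], the usual range for a tanh generator output; it
# collapses algebraically to a single line:
import numpy as np

X01 = np.random.rand(4, 32, 32)   # stand-in for pixels in [0, 1]
X_scaled = X01 * 2.0 - 1.0        # identical to (X01 * 255.0 - 127.5) / 127.5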