Beispiel #1
0
def main():
    """Run the command-line workflow: read data, fit a model, write samples.

    Steps, in order:

    1. Parse the command-line arguments with ``_parse_args``.
    2. Read the training data with ``read_tsv`` when ``args.tsv`` is set,
       otherwise with ``read_csv``.
    3. Load a ``CTGANSynthesizer`` from ``args.load`` when that is set,
       otherwise construct a new one; then fit it on the data, passing
       ``args.epochs`` as the third positional argument.
    4. Save the fitted model to ``args.save`` when that is not ``None``.
    5. Sample ``args.num_samples`` rows (or as many rows as the training
       data has when that value is falsy), forwarding the optional
       sampling condition column and value.
    6. Write the samples with ``write_tsv`` when ``args.tsv`` is set,
       otherwise through the sample's ``to_csv`` method without the index.

    Raises:
        ValueError: If ``args.sample_condition_column`` is given while
            ``args.sample_condition_column_value`` is ``None``.
    """
    args = _parse_args()

    # Validate the sampling condition up front with a real exception:
    # ``assert`` is stripped under ``python -O``, and checking only after
    # training would waste the whole fit on an invalid invocation.
    if (
        args.sample_condition_column is not None
        and args.sample_condition_column_value is None
    ):
        raise ValueError(
            'sample_condition_column_value is required when '
            'sample_condition_column is given'
        )

    if args.tsv:
        data, discrete_columns = read_tsv(args.data, args.metadata)
    else:
        data, discrete_columns = read_csv(
            args.data, args.metadata, args.header, args.discrete
        )

    if args.load:
        model = CTGANSynthesizer.load(args.load)
    else:
        model = CTGANSynthesizer()
    model.fit(data, discrete_columns, args.epochs)

    if args.save is not None:
        model.save(args.save)

    # Fall back to the size of the training data when no explicit sample
    # count (or a count of 0) was requested.
    num_samples = args.num_samples or len(data)

    sampled = model.sample(
        num_samples, args.sample_condition_column, args.sample_condition_column_value
    )

    if args.tsv:
        write_tsv(sampled, args.metadata, args.output)
    else:
        sampled.to_csv(args.output, index=False)
import numpy as np
import pandas as pd
import os
import sys
import tqdm
import pickle
import pathlib
from pathlib import Path


def get_domain_dims(DIR='us_import1', base_dir='./generated_data_v1'):
    """Load the pickled ``domain_dims`` object for one data set.

    Args:
        DIR: Name of the data-set sub-directory inside ``base_dir``.
        base_dir: Directory that holds the per-data-set sub-directories.
            Defaults to the previously hard-coded ``./generated_data_v1``,
            so existing callers keep reading the same file.

    Returns:
        The object unpickled from ``<base_dir>/<DIR>/domain_dims.pkl``.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    dims_path = Path(base_dir) / DIR / 'domain_dims.pkl'
    # NOTE: pickle.load can execute arbitrary code while deserialising;
    # only point this at files produced by a trusted pipeline.
    with open(dims_path, 'rb') as fh:
        return pickle.load(fh)


def convert_np_to_pd(data_np, domain_dims):
    """Wrap an array in a DataFrame whose columns are the domain names.

    Args:
        data_np: Array-like data handed to ``pd.DataFrame`` as ``data``.
        domain_dims: Mapping whose keys, in iteration order, become the
            column labels.

    Returns:
        A ``(frame, column_names)`` tuple: the new DataFrame and the list
        of column labels used to build it.
    """
    column_names = [*domain_dims.keys()]
    frame = pd.DataFrame(columns=column_names, data=data_np)
    return frame, column_names


# Training script: read the stored array and its domain description, turn
# them into a DataFrame, fit a CTGAN model on it and save the model.
real_data = np.load('./generated_data_v1/us_import1/pos_data.npy')
domain_dims = get_domain_dims()
data_df, columns = convert_np_to_pd(real_data, domain_dims)

# NOTE(review): CTGANSynthesizer is not imported in the visible part of this
# file -- confirm it is in scope before running.
ctgan_obj = CTGANSynthesizer()
# Fit on the DataFrame built above; the previous code passed the undefined
# module-level name ``data`` and raised NameError. Every column is passed
# as a discrete column.
ctgan_obj.fit(data_df, columns)
ctgan_obj.save('ctgan.pkl')