def main():
    """Sample synthetic female data from saved alive/dead models.

    Command-line arguments:
        sys.argv[1]: epsilon, parsed as float.
        sys.argv[2]: seed, parsed as int.

    The model files are the first entries (in sorted order) of
    ``./female_models/`` whose names contain both ``str(eps)`` and
    ``str(seed)`` and, respectively, ``'alive'`` or ``'dead'``.

    For each (alive, dead) model pair, a noisy alive/dead split of
    ``N_female`` rows is drawn, both groups are sampled with
    ``fast_sample``, concatenated, decoded with ``decode_data`` and written
    to ``./syn_data/female_data_<seed>_<eps rounded to 2>_<i_rep>.csv``.

    Raises:
        FileNotFoundError: if no matching 'alive' or 'dead' model file is
            found.
    """
    eps = float(sys.argv[1])
    seed = int(sys.argv[2])
    model_dir = './female_models/'

    # List the directory in-process instead of `ls | grep >> tmpfile`:
    # no shared scratch file (which was appended to, and could be clobbered
    # by a concurrent run), and the eps string is matched literally rather
    # than as a regex where '.' matches any character.
    eps_tag, seed_tag = str(eps), str(seed)
    candidates = sorted(
        fname for fname in os.listdir(model_dir)
        if eps_tag in fname and seed_tag in fname
    )

    def first_match(tag):
        """Return the first candidate file name containing *tag*."""
        for fname in candidates:
            if tag in fname:
                return fname
        raise FileNotFoundError(
            "no '{}' model file for eps={} seed={} in {}".format(
                tag, eps, seed, model_dir))

    alive_model_fname = first_match('alive')
    dead_model_fname = first_match('dead')

    # NOTE: read_pickle executes arbitrary pickled code; only load model
    # files that come from a trusted source.
    alive_female_models = pd.read_pickle(
        './female_models/{}'.format(alive_model_fname))
    dead_female_models = pd.read_pickle(
        './female_models/{}'.format(dead_model_fname))

    for i_rep, (alive_female_model, dead_female_model) in enumerate(
            zip(alive_female_models, dead_female_models)):
        # Restrict the variable-type table to the variables each model has.
        alive_female_variable_types = {
            key: female_variable_types[key]
            for key in alive_female_model.param_dims.keys()
        }
        dead_female_variable_types = {
            key: female_variable_types[key]
            for key in dead_female_model.param_dims.keys()
        }

        print(i_rep)

        # Perturb the dead count with Laplace noise of scale 1/0.01 = 100
        # (presumably a Laplace mechanism with epsilon = 0.01 -- confirm).
        noisy_dead_proportion = (
            N_female_dead + np.random.laplace(scale=(1. / 0.01))) / N_female
        # The noise can push the proportion outside [0, 1], which would give
        # a negative sample count; clamp it to the valid range.
        noisy_dead_proportion = float(np.clip(noisy_dead_proportion, 0., 1.))
        N_syn_female_alive = int((1. - noisy_dead_proportion) * N_female)
        N_syn_female_dead = int(noisy_dead_proportion * N_female)

        alive_female_syn_data = fast_sample(
            alive_female_model, alive_female_variable_types,
            N_syn_female_alive)
        dead_female_syn_data = fast_sample(
            dead_female_model, dead_female_variable_types, N_syn_female_dead)

        # The alive rows get fixed values for these two columns.
        alive_female_syn_data['ep'] = 0
        alive_female_syn_data['lex.dur'] = 1.0

        female_syn_data = pd.concat(
            [alive_female_syn_data, dead_female_syn_data])
        female_syn_decoded = decode_data(
            female_syn_data, maps, for_poisson=False)
        female_syn_decoded.to_csv(
            './syn_data/female_data_{}_{}_{}.csv'.format(
                seed, np.round(eps, 2), i_rep),
            index=False)
Ejemplo n.º 2
0
def main():
    """Join the per-seed female models and sample synthetic female data.

    Command-line arguments:
        sys.argv[1]: epsilon, parsed as float.

    Runs ``python3 join_models.py <eps>``, then loads
    ``./female_models/alive_female_models_<eps>.p`` and
    ``./female_models/dead_female_models_<eps>.p`` (eps rounded to two
    decimals). For each (alive, dead) model pair it samples a fixed 80/20
    alive/dead split of 208148 rows with ``fast_sample``, decodes the
    concatenated frame with ``decode_data`` and writes it to
    ``./syn_data/female_data_<eps>_<i_rep>.csv``.

    Raises:
        subprocess.CalledProcessError: if ``join_models.py`` exits with a
            non-zero status.
    """
    # Local import: this block cannot see the file's import section.
    import subprocess

    eps = float(sys.argv[1])

    # Fail loudly when the join step fails; previously the exit status was
    # ignored and whatever pickles happened to be on disk were sampled.
    # Passing an argument list also avoids going through a shell.
    subprocess.run(["python3", "join_models.py", str(eps)], check=True)

    eps_label = np.round(eps, 2)
    # NOTE: read_pickle executes arbitrary pickled code; only load model
    # files that come from a trusted source.
    alive_female_models = pd.read_pickle(
        './female_models/alive_female_models_{}.p'.format(eps_label))
    dead_female_models = pd.read_pickle(
        './female_models/dead_female_models_{}.p'.format(eps_label))

    # Hard-coded total row count and alive/dead split used for sampling.
    n_total = 208148
    n_alive = int(n_total * 0.8)
    n_dead = int(n_total * 0.2)

    for i_rep, (alive_female_model, dead_female_model) in enumerate(
            zip(alive_female_models, dead_female_models)):
        # Restrict the variable-type table to the variables each model has.
        alive_female_variable_types = {
            key: female_variable_types[key]
            for key in alive_female_model.param_dims.keys()
        }
        dead_female_variable_types = {
            key: female_variable_types[key]
            for key in dead_female_model.param_dims.keys()
        }

        print(i_rep)
        alive_female_syn_data = fast_sample(
            alive_female_model, alive_female_variable_types, n_alive)
        dead_female_syn_data = fast_sample(
            dead_female_model, dead_female_variable_types, n_dead)

        # The alive rows get fixed values for these two columns.
        alive_female_syn_data['ep'] = 0
        alive_female_syn_data['lex.dur'] = 1.0

        female_syn_data = pd.concat(
            [alive_female_syn_data, dead_female_syn_data])
        female_syn_decoded = decode_data(
            female_syn_data, maps, for_poisson=False)
        female_syn_decoded.to_csv(
            './syn_data/female_data_{}_{}.csv'.format(eps_label, i_rep),
            index=False)
Ejemplo n.º 3
0
def main():
    """Sample synthetic male data from a saved list of models.

    Command-line arguments:
        sys.argv[1]: epsilon, parsed as float.
        sys.argv[2]: seed, parsed as int.

    The model file is the first entry (in sorted order) of
    ``./male_models/`` whose name contains both ``str(eps)`` and
    ``str(seed)``. For each model in it, ``N_male`` rows are sampled with
    ``fast_sample``, decoded with ``decode_data`` and written to
    ``./syn_data/male_data_<seed>_<eps rounded to 2>_<i_rep>.csv``.

    Raises:
        FileNotFoundError: if no matching model file is found.
    """
    eps = float(sys.argv[1])
    seed = int(sys.argv[2])
    model_dir = './male_models/'

    # List the directory in-process instead of `ls | grep >> tmpfile`:
    # no shared scratch file (which was appended to, and could be clobbered
    # by a concurrent run), and the eps string is matched literally rather
    # than as a regex where '.' matches any character.
    eps_tag, seed_tag = str(eps), str(seed)
    candidates = sorted(
        fname for fname in os.listdir(model_dir)
        if eps_tag in fname and seed_tag in fname
    )
    if not candidates:
        raise FileNotFoundError(
            "no model file for eps={} seed={} in {}".format(
                eps, seed, model_dir))
    model_fname = candidates[0]
    print(model_fname)

    # NOTE: read_pickle executes arbitrary pickled code; only load model
    # files that come from a trusted source.
    male_models = pd.read_pickle('./male_models/{}'.format(model_fname))

    for i_rep, male_model in enumerate(male_models):
        # Restrict the variable-type table to the variables this model has.
        male_variable_types = {
            key: male_variable_types_[key]
            for key in male_model.param_dims.keys()
        }

        print(i_rep)
        male_syn_data = fast_sample(male_model, male_variable_types, N_male)
        # Disabled in the original code; kept here for reference:
        #male_syn_data[male_syn_data["ep"] == 0]["lex.dur"] = 1.0
        male_syn_decoded = decode_data(male_syn_data, maps, for_poisson=False)
        male_syn_decoded.to_csv(
            './syn_data/male_data_{}_{}_{}.csv'.format(
                seed, np.round(eps, 2), i_rep),
            index=False)
Ejemplo n.º 4
0
def main():
    """Sample synthetic data from a joint model and split it by sex.

    Command-line arguments:
        sys.argv[1]: epsilon, parsed as float.
        sys.argv[2]: seed, parsed as int.

    The model file is the first entry (in sorted order) of
    ``./train_models/`` whose name contains both ``str(eps)`` and
    ``str(seed)``. For each model in it, ``N`` rows are sampled with
    ``fast_sample`` and split on the ``"is.female"`` column (1 / 0). Each
    part is decoded with ``decode_data`` and written to
    ``./syn_data/female_data_<seed>_<eps rounded to 2>_<i_rep>.csv`` and
    ``./syn_data/male_data_<seed>_<eps rounded to 2>_<i_rep>.csv``.

    Raises:
        FileNotFoundError: if no matching model file is found.
    """
    eps = float(sys.argv[1])
    seed = int(sys.argv[2])
    model_dir = './train_models/'

    # List the directory in-process instead of `ls | grep >> tmpfile`:
    # no shared scratch file (which was appended to, and could be clobbered
    # by a concurrent run), and the eps string is matched literally rather
    # than as a regex where '.' matches any character.
    eps_tag, seed_tag = str(eps), str(seed)
    candidates = sorted(
        fname for fname in os.listdir(model_dir)
        if eps_tag in fname and seed_tag in fname
    )
    if not candidates:
        raise FileNotFoundError(
            "no model file for eps={} seed={} in {}".format(
                eps, seed, model_dir))
    model_fname = candidates[0]

    # NOTE: read_pickle executes arbitrary pickled code; only load model
    # files that come from a trusted source.
    train_models = pd.read_pickle('./train_models/{}'.format(model_fname))

    eps_label = np.round(eps, 2)
    for i_rep, train_model in enumerate(train_models):
        # Restrict the variable-type table to the variables this model has.
        train_variable_types = {
            key: train_variable_types_base[key]
            for key in train_model.param_dims.keys()
        }
        print(i_rep)
        syn_data = fast_sample(train_model, train_variable_types, N)

        # Split the joint sample on the sex indicator column.
        female_syn_data = syn_data[syn_data["is.female"] == 1]
        male_syn_data = syn_data[syn_data["is.female"] == 0]

        female_syn_decoded = decode_data(
            female_syn_data, maps, for_poisson=False)
        male_syn_decoded = decode_data(male_syn_data, maps, for_poisson=False)

        female_syn_decoded.to_csv(
            './syn_data/female_data_{}_{}_{}.csv'.format(
                seed, eps_label, i_rep),
            index=False)
        male_syn_decoded.to_csv(
            './syn_data/male_data_{}_{}_{}.csv'.format(
                seed, eps_label, i_rep),
            index=False)
Ejemplo n.º 5
0
    poor_models = [
        pd.read_pickle('./res/models_poor_2019-04-25_{}_{}.p'.format(
            sigma, seed))[0] for seed in seeds
    ]
    params = [
        pd.read_pickle('./res/params_rich_2019-04-25_{}_{}.p'.format(
            sigma, seed)) for seed in seeds
    ][0]
    i_run = 0
    classifiers = []
    for rich_model, poor_model in zip(rich_models, poor_models):
        ## Generate data
        N_rich = int(N_rich_true +
                     np.random.laplace(scale=100))  ## Epsilon = 0.01
        N_poor = N - N_rich
        syn_rich = fast_sample(rich_model, variable_types, N_rich)
        syn_poor = fast_sample(poor_model, variable_types, N_poor)
        X_syn_dpvi = syn_rich.append(syn_poor)
        y_syn_dpvi = np.concatenate([np.ones(N_rich), np.zeros(N_poor)])

        ## Decode data for classification
        X_syn_dpvi = decode_for_classification(X_syn_dpvi)
        X_syn_dpvi['Sex'] = X_syn_dpvi['Sex'].map({'Female': 0, 'Male': 1})
        continuous_feats = X_syn_dpvi.columns[X_syn_dpvi.dtypes == 'float']
        X_syn_dpvi[continuous_feats] = pd.DataFrame(scaler.fit_transform(X_syn_dpvi[continuous_feats]\
          .astype("float64")), columns=continuous_feats)

        ## Train classifier with syn_dpvi data
        cls_syn_dpvi = linear_model.LogisticRegression()
        cls_syn_dpvi.fit(X_syn_dpvi, y_syn_dpvi)
        missing_cols = [
Ejemplo n.º 6
0
dpvi_err = []
dpvi_times = []
learn = 0
if learn:
    sys.path.append('../../dpvi/')
    from sampler import fast_sample
    for d in ds:
        app_data = pd.read_csv('../../data/subsets/carat_apps_sub{}.dat'.format(d), sep=' ', header=None)\
              .astype('float').values
        N = len(app_data)
        models = pickle.load(
            open(
                '../../dpvi/models_{0}/models_{0}_{1}.p'.format(fname_dpvi, d),
                'rb'))
        for model in models:
            syn_app_data = fast_sample(model, N)

            syn_cov = np.cov(syn_app_data.T)
            orig_cov = np.cov(app_data.T)
            dpvi_err.append(np.linalg.norm(orig_cov - syn_cov))
        log = open('logs_{0}/out_file_{0}_{1}.txt'.format(fname_dpvi, d), 'r')
        wall_time, cpu_time = log.readlines()[-2:]
        log.close()
        wall_time = float(wall_time.strip('Wall time').strip('\n'))
        cpu_time = float(cpu_time.strip('CPU time').strip('\n'))
        dpvi_times.append((wall_time, cpu_time))
    pd.DataFrame(dpvi_err).to_csv('../plot_data/dpvi_cov_err_(8,16,32,64,96)_{}.csv'\
          .format(fname_dpvi), sep=';', header=None, index=False)
    pd.DataFrame(dpvi_times).to_csv('../plot_data/dpvi_times_(8,16,32,64,96)_{}.csv'\
          .format(fname_dpvi), sep=';', header=None, index=False)
else: