def seeker(input_dict: Dict) -> np.ndarray:
    """Solution seeker function.

    Args:
        input_dict (Dict): Dictionary that contains the seeker function inputs, as below:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "generated_data" (np.ndarray of float): Generated dataset from hider data,
              shape [num_examples, max_seq_len, num_features].
            * "enlarged_data" (np.ndarray of float): Enlarged original dataset,
              shape [num_examples_enlarge, max_seq_len, num_features].
            * "generated_data_padding_mask" (np.ndarray of bool): Padding mask of bools, generated dataset,
              same shape as "generated_data".
            * "enlarged_data_padding_mask" (np.ndarray of bool): Padding mask of bools, enlarged dataset,
              same shape as "enlarged_data".

    Returns:
        np.ndarray: The reidentification labels produced by the seeker, expected shape [num_examples_enlarge].

    Raises:
        KeyError: If any of the expected keys is missing from ``input_dict``.
        AssertionError: If TensorFlow 1.15 is not available (required by binary_predictor).
    """
    # Get the inputs.
    seed = input_dict["seed"]  # Not used by this example; kept so a missing key still fails early.
    generated_data = input_dict["generated_data"]
    enlarged_data = input_dict["enlarged_data"]
    generated_data_padding_mask = input_dict["generated_data_padding_mask"]
    enlarged_data_padding_mask = input_dict["enlarged_data_padding_mask"]

    # Get processed and imputed data, if desired:
    generated_data_preproc, generated_data_imputed = preprocess_data(
        generated_data, generated_data_padding_mask)
    enlarged_data_preproc, enlarged_data_imputed = preprocess_data(
        enlarged_data, enlarged_data_padding_mask)

    # TODO: Put your seeker code to replace Example 1 below.
    # Feel free play around with Examples 1 (knn) and 2 (binary_predictor) below.

    # --- Example 1: knn ---
    # from examples.seeker.knn import knn_seeker
    #
    # reidentified_data = knn_seeker.knn_seeker(generated_data_imputed, enlarged_data_imputed)
    # return reidentified_data

    # --- Example 2: binary_predictor ---
    # Imported lazily so that the module can still be loaded when TensorFlow 1.15 is absent.
    from utils.misc import tf115_found
    assert tf115_found is True, "TensorFlow 1.15 not found, which is required to run binary_predictor."
    from examples.seeker.binary_predictor import binary_predictor

    reidentified_data = binary_predictor.binary_predictor(
        generated_data_imputed, enlarged_data_imputed, verbose=True)
    # Return the seeker's labels, not the hider's generated dataset: the caller
    # expects one label per example of the enlarged dataset.
    return reidentified_data
def seeker(input_dict):
    """Run the binary-predictor seeker on the competition inputs.

    Reads the seed, both datasets and both padding masks from ``input_dict``,
    prints the seed, preprocesses each dataset with ``preprocess_data`` and
    returns whatever ``binary_predictor`` produces for the two imputed arrays.
    """
    # Pull every required entry up front (same lookup order as the key tuple),
    # so a missing key raises before anything is printed or processed.
    required_keys = (
        "seed",
        "generated_data",
        "enlarged_data",
        "generated_data_padding_mask",
        "enlarged_data_padding_mask",
    )
    seed, generated, enlarged, generated_mask, enlarged_mask = (
        input_dict[key] for key in required_keys
    )

    print("Competition-provided random seed:", seed)

    # Only the imputed output of the preprocessing step is used below.
    _generated_preproc, generated_imputed = preprocess_data(generated, generated_mask)
    _enlarged_preproc, enlarged_imputed = preprocess_data(enlarged, enlarged_mask)

    labels = binary_predictor(generated_imputed, enlarged_imputed)
    return labels
def hider(input_dict: Dict) -> Union[np.ndarray, Tuple[np.ndarray, Optional[np.ndarray]]]:
    """Solution hider: GAIN imputation followed by a recurrent GAN.

    Args:
        input_dict (Dict): Hider inputs with the following entries:
            * "seed" (int): Random seed provided by the competition.
            * "data" (np.ndarray of float): Input data,
              shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask, same shape as data.

    Returns:
        The samples produced by ``fit_recurrent_gan``. (The declared return
        type also allows a ``(data, padding_mask)`` tuple, but this
        implementation returns the generated samples only.)
    """
    # Read the inputs; the seed is looked up but not used further here.
    _seed = input_dict["seed"]
    raw_data = input_dict["data"]
    mask = input_dict["padding_mask"]

    # Preprocessed and imputed views of the input data.
    preprocessed, imputed = preprocess_data(raw_data, mask)

    print('Starting imputation GAIN ...')
    # The same epoch budget is used for both the imputation and the GAN stage.
    epochs = 4000
    gan_batch_size = 64
    imputed, sequence_length = imputation_gain(preprocessed, imputed, epochs)

    print('Starting recurrent GAN ...', file=sys.stdout)
    gan_result = fit_recurrent_gan(imputed, sequence_length, epochs, gan_batch_size)
    # Exactly four values are expected back; only the samples are returned.
    samples, _model, _discriminator_loss, _generator_loss = gan_result
    return samples
def hider(input_dict):
    """Generate hider data by running ``timegan`` on the imputed input.

    Reads "seed", "data" and "padding_mask" from ``input_dict``, prints the
    seed, preprocesses the data and returns the result of ``timegan`` applied
    to the imputed array.
    """
    # All three lookups happen before any output, in this order.
    seed, raw_data, mask = (
        input_dict[key] for key in ("seed", "data", "padding_mask")
    )

    print("Competition-provided random seed:", seed)

    # The preprocessed (non-imputed) output is not needed by timegan.
    _preprocessed, imputed = preprocess_data(raw_data, mask)
    generated = timegan(imputed)
    return generated
def hider(
    input_dict: Dict
) -> Union[np.ndarray, Tuple[np.ndarray, Optional[np.ndarray]]]:
    """Solution hider function (timegan example).

    Args:
        input_dict (Dict): Hider inputs with the following entries:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "data" (np.ndarray of float): Input data,
              shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask of bools, same shape as data.

    Returns:
        Return format is:
            np.ndarray (of float) [, np.ndarray (of bool)]
        The first value is the hider-generated data, expected shape
        [num_examples, max_seq_len, num_features]; the optional second value
        is the corresponding padding mask, same shape.
        Alternatively a str "rescore" may be returned when a previous
        submission succeeded and only the vs-seekers scoring step should be
        re-run. This implementation returns the generated data only.
    """
    # Inputs: seed (unused by this example), data and its padding mask.
    _seed = input_dict["seed"]
    raw_data = input_dict["data"]
    mask = input_dict["padding_mask"]

    # Processed and imputed data; only the imputed array feeds timegan.
    _preprocessed, imputed = preprocess_data(raw_data, mask)

    # TODO: Put your hider code to replace Example 1 below.
    # Feel free play around with Examples 1 (add_noise) and 2 (timegan) below.

    # --- Example 1: add_noise (disabled) ---
    # from examples.hider.add_noise import add_noise
    #
    # generated_data = add_noise.add_noise(imputed, noise_size=0.1)
    # generated_padding_mask = np.copy(mask)
    # return generated_data, generated_padding_mask

    # --- Example 2: timegan ---
    # The availability check runs before the timegan import on purpose.
    from utils.misc import tf115_found
    assert tf115_found is True, "TensorFlow 1.15 not found, which is required to run timegan."
    from examples.hider.timegan import timegan

    return timegan.timegan(imputed)
def hider(
    input_dict: Dict,
) -> Union[
    np.ndarray,  # return generated_data
    Tuple[np.ndarray, Optional[np.ndarray]],  # return generated_data, generated_padding_mask
    Tuple[np.ndarray, Optional[int]],  # return generated_data, 3
    Tuple[np.ndarray, Optional[np.ndarray], Optional[int]],  # return generated_data, generated_padding_mask, 3
    str,  # return "rescore"
]:
    """Solution hider function (add_noise example).

    Args:
        input_dict (Dict): Hider inputs with the following entries:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "data" (np.ndarray of float): Input data,
              shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask of bools, same shape as data.

    Returns:
        Return format is:
            np.ndarray (of float) [, np.ndarray (of bool), int]
        * First value: the hider-generated data, expected shape
          [num_examples, max_seq_len, num_features].
        * Second (optional) value: the corresponding padding mask, same shape.
        * Third (optional) value: number of seeds to use in the hider
          evaluation step, expected range [0, 5]; 0 skips the evaluation.
        Alternatively a str "rescore" may be returned when a previous
        submission succeeded and only the vs-seekers scoring step should be
        re-run. This implementation returns ``(generated_data, padding_mask)``.
    """
    # Inputs: seed (unused by this example), data and its padding mask.
    _seed = input_dict["seed"]
    raw_data = input_dict["data"]
    mask = input_dict["padding_mask"]

    # Processed and imputed data; only the imputed array is perturbed below.
    _preprocessed, imputed = preprocess_data(raw_data, mask)

    # TODO: Put your hider code to replace Example 1 below.
    # Feel free play around with Examples 1 (add_noise) and 2 (timegan) below.

    # --- Example 1: add_noise ---
    from examples.hider.add_noise import add_noise

    noisy_data = add_noise.add_noise(imputed, noise_size=0.1)
    # Hand back a copy of the mask so the caller never shares the input array.
    mask_copy = np.copy(mask)
    return noisy_data, mask_copy
def train(dim_word=100,  # word vector dimensionality (passed to the model as `de`)
          dim_char=10,  # passed to the model as `de_char` (presumably the character embedding size -- TODO confirm)
          max_char=10,  # passed to add_padding() and to the model as `max_char`
          dim=100,  # the number of hidden units (passed to the model as `nh`)
          win=5,  # context window size (passed to contextwin() and to the model as `cs`)
          bs=5,  # number of backprop through time steps (not referenced in this function body)
          seed=123,
          verbose=1,
          use_model='GRU',  # Choose the model from- LSTM, DEEPLSTM, RNN,
          patience=10,  # early stopping patience (NOTE: overwritten with 2 before the training loop)
          max_epochs=50,
          lrate=0.0005,  # learning rate
          maxlen=100,  # maximum length of the description (not referenced in this function body)
          data_train=['data/qe/train/train.src.lc', 'data/qe/train/train.mt.lc', 'data/qe/train/train.align'],
          data_train_y = 'data/qe/train/train.tags',
          data_valid=['data/qe/dev/dev.src.lc', 'data/qe/dev/dev.mt.lc', 'data/qe/dev/dev.align'],
          data_valid_y = 'data/qe/dev/dev.tags',
          data_test=['data/qe/test/test.src.lc', 'data/qe/test/test.mt.lc', 'data/qe/test/test.align'],
          data_test_y = 'data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json', 'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json', 'data/qe/train/train.mt.lc.dict_char.json'],
          label2index = 'data/qe/train/train.tags.json',
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt', 'data/qe/pretrain/ep_qe.de.vector.txt'],
          use_adadelta=False,
          use_bilingual=False,
          use_pretrain=False,
          use_quest=False,
          use_tag=False,
          use_char=False,
          saveto=False,
          shuffle_each_epoch=True,
          load_data=None,
          ):
    """Train a sequence-labelling model and keep the best checkpoint by validation score.

    This is Python 2 code (print statements, ``xrange``, ``iteritems``,
    ``cPickle``). The function:

    1. snapshots all keyword arguments into ``model_options``;
    2. loads pickled data from ``load_data[0]`` or calls ``preprocess_data``;
    3. instantiates ``select_model[model_name]`` and trains it sentence by
       sentence, grouped into batches;
    4. every ``model_options['patience']`` batches (forced to 2) labels the
       test and validation sets, scores them with ``wmt_eval`` (``use_quest``)
       and/or ``icon_eval`` (``use_tag``), and saves the model plus options
       whenever the validation score improves;
    5. stops once the best epoch is at least 10 epochs behind the current one.

    Nothing is returned; results are printed and written under ``folder``.

    NOTE(review): ``use_model``, ``data_*_y`` and ``label2index`` are indexed
    with ``[0]`` below. With the string defaults that yields only the first
    character, so callers presumably pass lists (e.g. from argparse) -- TODO
    confirm.
    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-flattened source; verify it against the original layout.
    """
    # Must stay the first statement: locals() here contains only the arguments.
    model_options = OrderedDict(sorted(locals().copy().items()))
    print 'Model_Options:', model_options

    # Build the key used to look the model class up in select_model.
    model_name = model_options['use_model'][0]
    if model_options['use_adadelta']: model_name += '_adadelta'
    if model_options['use_char']: model_name += '_char'
    if model_options['use_bilingual']: model_name += '_bilingual'
    if model_options['use_pretrain']: model_name += '_pretrain'
    print 'Using model:', model_name

    processed_data = []
    if load_data:
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        with gzip.open(load_data[0],'rb') as fp:
            processed_data = cPickle.load(fp)
    else:
        processed_data = preprocess_data(data_train=model_options['data_train'],
                                         data_train_y=model_options['data_train_y'][0],
                                         data_valid=model_options['data_valid'],
                                         data_valid_y=model_options['data_valid_y'][0],
                                         data_test=model_options['data_test'],
                                         data_test_y=model_options['data_test_y'][0],
                                         dictionaries=model_options['dictionaries'],
                                         character2index=model_options['character2index'],
                                         label2index = model_options['label2index'][0],
                                         embeddings = model_options['embeddings'],
                                         use_bilingual=model_options['use_bilingual'],
                                         use_char=model_options['use_char'],
                                         use_pretrain=model_options['use_pretrain'])

    """ Savinn the model/data with model_name """
    # Output names depend on the task flag; if both flags are set the
    # 'quest' names win, if neither is set both stay empty strings.
    save_data = folder = ''
    if use_tag:
        save_data = 'tag.data_' + model_name + '.pkl.gz'
        folder = 'tag.' + model_name
    if use_quest:
        save_data = 'quest.data_' + model_name + '.pkl.gz'
        folder = 'quest.' + model_name
    if saveto:
        with gzip.open(save_data,'wb') as fp:
            cPickle.dump(processed_data, fp)
    if not os.path.exists(folder):
        os.mkdir(folder)

    # NOTE: the local name `train` shadows this function inside its own body.
    train, train_y, test, test_y, valid, valid_y, w2idxs, char2idxs, label2idxs, embs=processed_data
    # Invert label -> index into index -> label for decoding predictions.
    idx2label = dict((k,v) for v,k in label2idxs.iteritems())
    #print len(train), len(test), len(valid)

    # Defaults for inputs that a given configuration does not use.
    vocsize_s = vocsize_t = vocsize_schar = vocsize_tchar = 0
    emb_s, emb_t, train_s, train_schar, train_t, train_tchar, test_s, test_schar, test_t, test_tchar, valid_s, valid_schar, valid_t, valid_tchar = ([] for i in range(14))

    # Unpack the data according to which input streams are present.
    if (use_bilingual or len(train) == 4) and use_char:
        # source + target words and source + target characters
        emb_s, emb_t = embs
        train_s, train_t, train_schar, train_tchar = train
        test_s, test_t, test_schar, test_tchar = test
        valid_s, valid_t, valid_schar, valid_tchar = valid
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
        vocsize_schar = len(char2idxs[0])
        vocsize_tchar = len(char2idxs[1])
    elif use_char:
        # target words and target characters only
        emb_t = embs[0]
        train_t, train_tchar = train
        test_t, test_tchar = test
        valid_t, valid_tchar = valid
        vocsize_t = len(w2idxs[0])
        vocsize_tchar = len(char2idxs[0])
    elif use_bilingual or len(train) == 2:
        # source + target words, no characters
        emb_s, emb_t = embs
        train_s, train_t = train
        test_s, test_t = test
        valid_s, valid_t = valid
        vocsize_s = len(w2idxs[0])
        vocsize_t = len(w2idxs[1])
    else :
        # target words only
        emb_t = embs[0]
        train_t = train[0]
        test_t = test[0]
        valid_t = valid[0]
        vocsize_t = len(w2idxs[0])

    nclasses = len(label2idxs)
    nsentences = len(train_t)

    numpy.random.seed(model_options['seed'])
    # instantiate the model
    rnn = select_model[model_name](
                    nh = model_options['dim'],
                    nc = nclasses,
                    de = model_options['dim_word'],
                    cs = model_options['win'],
                    de_char = model_options['dim_char'],
                    ne_char = vocsize_tchar,
                    ne_src = vocsize_s,
                    ne_tgt = vocsize_t,
                    emb_src = emb_s,
                    emb_tgt = emb_t,
                    max_char = model_options['max_char'])

    # train with early stopping on validation set
    best_f1 = -numpy.inf
    # Overrides the `patience` argument: evaluation runs every 2 batches.
    model_options['patience'] = 2
    # With Python 2 integer division (assuming no `from __future__ import
    # division` -- TODO confirm) this is about 10% of the training set. It is
    # 0 when nsentences < 100, in which case the next line raises
    # ZeroDivisionError.
    batch_size = (nsentences/100) * 10
    n_batches = nsentences//batch_size
    print n_batches
    for e in xrange(model_options['max_epochs']):
        model_options['ce'] = e  # 'ce' = current epoch
        #shuffle
        if shuffle_each_epoch:
            # The same seed is passed on every epoch; presumably shuffle()
            # permutes all four lists in unison -- TODO confirm.
            shuffle([train_t, train_s, train_tchar, train_y], model_options['seed'])
        tic = time.time()
        for k in xrange(n_batches):
            #Creating batches
            batch_train_s = []
            batch_train_char = []
            if model_options['use_bilingual']:
                batch_train_s = train_s[k*batch_size:(k+1)*batch_size]
            if model_options['use_char']:
                batch_train_char = train_tchar[k*batch_size:(k+1)*batch_size]
            batch_train_t = train_t[k*batch_size:(k+1)*batch_size]
            batch_train_y = train_y[k*batch_size:(k+1)*batch_size]
            batch_err = 0  # never updated below
            # One training call per sentence of the batch.
            for i in xrange(batch_size):
                cwords_src = []
                padded_chars = []
                if model_options['use_bilingual']:
                    cwords_src = contextwin(batch_train_s[i], model_options['win'])
                if model_options['use_char']:
                    padded_chars = add_padding(batch_train_char[i], model_options['max_char'])
                    #print batch_train_char[0]
                    #print padded_chars
                cwords_tgt = contextwin(batch_train_t[i], model_options['win'])
                labels = batch_train_y[i]
                # Pick the training call whose argument list matches the
                # active input streams.
                if model_options['use_bilingual'] and model_options['use_char']:
                    err = rnn.train_grad_shared(cwords_src, cwords_tgt, padded_chars, labels, model_options['lrate'])
                elif model_options['use_char']:
                    err = rnn.train_grad_shared(cwords_tgt, padded_chars, labels, model_options['lrate'])
                elif model_options['use_bilingual']:
                    err = rnn.train_grad_shared(cwords_src, cwords_tgt, labels, model_options['lrate'])
                elif model_options['use_adadelta']:
                    err = rnn.train_grad_shared(cwords_tgt, labels, model_options['lrate'])
                else:
                    err = rnn.train(cwords_tgt, labels, model_options['lrate'])
                if model_options['use_adadelta']:
                    rnn.train_update(model_options['lrate'])
                rnn.normalize()
                if model_options['verbose']:
                    # Trailing comma + '\r': rewrite the same console line in place.
                    print '[learning] epoch %i batch %i >> %2.2f%%'%(e, k, (i+1)*100./batch_size),'completed in %.2f (sec) <<\r'%(time.time()-tic),
                    sys.stdout.flush()

            # Evaluate every `patience` batches (including the very first one).
            if(k % model_options['patience'] == 0):
                predictions_test, groundtruth_test, predictions_valid, \
                    groundtruth_valid = ([] for i in range(4))
                # Each branch labels every test/valid sentence and maps the
                # predicted and gold indices back to label strings.
                if model_options['use_bilingual'] and model_options['use_char']:
                    predictions_test = [ map(lambda x: idx2label[x],
                                         rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                      numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
                                                      numpy.asarray(add_padding(__x, model_options['max_char'])).astype('int32')))
                                         for x, _x, __x in zip(test_s, test_t, test_tchar) ]
                    groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                    #words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
                    predictions_valid = [ map(lambda x: idx2label[x],
                                          rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                       numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
                                                       numpy.asarray(add_padding(__x, model_options['max_char'])).astype('int32')))
                                          for x, _x, __x in zip(valid_s, valid_t, valid_tchar) ]
                    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                elif model_options['use_bilingual']:
                    #evaluation // back into the real world : idx -> words
                    predictions_test = [ map(lambda x: idx2label[x],
                                         rnn.classify(numpy.asarray(contextwin(x_src, model_options['win'])).astype('int32'),
                                                      numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                         for x_src, x_tgt in zip(test_s, test_t) ]
                    groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                    #words_test = [ map(lambda x: idx2word_de[x], w) for w in test_lex]
                    predictions_valid = [ map(lambda x: idx2label[x],
                                          rnn.classify(numpy.asarray(contextwin(x_src, model_options['win'])).astype('int32'),
                                                       numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                          for x_src, x_tgt in zip(valid_s, valid_t) ]
                    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                    #words_valid = [ map(lambda x: idx2word_de[x], w) for w in valid_lex]
                elif model_options['use_char']:
                    predictions_test = [ map(lambda x: idx2label[x],
                                         rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                      numpy.asarray(add_padding(_x, model_options['max_char'])).astype('int32')))
                                         for x, _x, in zip(test_t, test_tchar) ]
                    groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                    #words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]
                    predictions_valid = [ map(lambda x: idx2label[x],
                                          rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32'),
                                                       numpy.asarray(add_padding(_x, model_options['max_char'])).astype('int32')))
                                          for x, _x, in zip(valid_t, valid_tchar) ]
                    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                else:
                    #evaluation // back into the real world : idx -> words
                    predictions_test = [ map(lambda x: idx2label[x],
                                         rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32')))
                                         for x in test_t ]
                    groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                    #words_test = [ map(lambda x: idx2word[x], w) for w in test_t]
                    predictions_valid = [ map(lambda x: idx2label[x],
                                          rnn.classify(numpy.asarray(contextwin(x, model_options['win'])).astype('int32')))
                                          for x in valid_t ]
                    groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                    #words_valid = [ map(lambda x: idx2word[x], w) for w in valid_t]

                #evaluation // score the predictions with wmt_eval / icon_eval
                # (the original comment mentioned conlleval.pl)
                res_test = []
                res_valid = []
                # Stays 0 when neither use_quest nor use_tag is set.
                current_score = 0
                if model_options['use_quest']:
                    res_test=wmt_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
                    res_valid=wmt_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
                    current_score = res_valid[2][0]
                if model_options['use_tag']:
                    res_test=icon_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
                    res_valid=icon_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
                    current_score = res_valid[1]

                if current_score > best_f1:
                    """ Save the model and model parameters """
                    rnn.save(folder)
                    filename = folder +'/model'
                    # The options are dumped before 'be' is updated below, so
                    # the JSON holds the previous best epoch (or none at all).
                    with open('%s.json'%filename, 'wb') as f:
                        json.dump(model_options, f, indent=2)
                    best_f1 = current_score
                    if model_options['verbose']:
                        print 'NEW BEST: epoch', e, 'valid F1', res_valid, 'test F1' , res_test , ' '*20
                    model_options['be'] = e  # 'be' = best epoch
                    # Promote the current hypothesis files to "best".
                    subprocess.call(['mv', folder + '/current.test.txt.hyp', folder+'/best.test.txt'])
                    subprocess.call(['mv', folder + '/current.valid.txt.hyp', folder+'/best.valid.txt'])
                else:
                    print ''

        #Break if no improvement in 10 epochs
        # NOTE(review): 'be' exists only after a first improvement; since
        # best_f1 starts at -inf the first evaluation normally sets it, but
        # with n_batches == 0 this lookup would raise KeyError -- TODO confirm.
        if abs(model_options['be']-model_options['ce']) >= 10: break
    print 'BEST RESULT: epoch', model_options['be'] , 'valid F1', best_f1 , 'with the model', folder
def main(args):
    """Play the hider from `hider.py` against the seeker from `seeker.py`.

    The script goes through these stages:
        * Load data.
        * Run the hider.
        * Evaluate the hider via feature prediction and one-step-ahead prediction.
        * Run the seeker (on the hider's generated data) and score it.

    Args:
        args (argparse.Namespace): parsed arguments from the command line.

    Raises:
        ValueError: in case there are issues with required files or directories.
    """
    # ------------------------------------------------ System setup ------------------------------------------------
    # Without TensorFlow 1.15 both hider evaluation steps are switched off.
    if not tf115_found:
        args.skip_fp = args.skip_osa = True

    # Fix random seeds.
    # NOTE: fix_all_random_seeds() may not be sufficient to make tensorflow fully deterministic.
    # See, for example: https://github.com/NVIDIA/framework-determinism
    fix_all_random_seeds(args.seed)

    # --------------------------------------------- Prepare directories --------------------------------------------
    # Code directory.
    code_dir = os.path.abspath(".")
    if not os.path.exists(code_dir):
        raise ValueError(f"Code directory not found at {code_dir}.")
    print(f"\nCode directory:\t\t{code_dir}")

    # Data path, split into directory and file name for load_data().
    data_path = os.path.abspath(args.data_path)
    if not os.path.exists(data_path):
        raise ValueError(f"Data file not found at {data_path}.")
    print(f"Data file:\t\t{data_path}")
    data_dir, data_file_name = os.path.dirname(data_path), os.path.basename(data_path)

    # Output directories: the root is created if missing, the hider and seeker
    # sub-directories are always recreated from scratch.
    out_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)
    print(f"Output directory:\t{out_dir}")
    hider_dir = os.path.join(out_dir, "hider")
    seeker_dir = os.path.join(out_dir, "seeker")
    for stale_dir in (hider_dir, seeker_dir):
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)
        os.makedirs(stale_dir, exist_ok=True)
    print(f" ├ Hider output:\t{hider_dir}")
    print(f" └ Seeker output:\t{seeker_dir}\n")

    # -------------------------------------------------- Load data -------------------------------------------------
    # A non-positive debug size means "debug mode off".
    if args.debug_data <= 0:
        args.debug_data = False
    with in_progress("Preprocessing and loading data"):
        loaded = load_data(
            data_dir=data_dir,
            data_file_name=data_file_name,
            max_seq_len=args.max_seq_len,
            seed=args.seed,
            train_rate=args.train_frac,
            force_reprocess=True,  # If True, re-preprocess data every time (rather than reusing).
            debug_data=args.debug_data,
        )
        original_data, original_padding_mask, train_idx, test_idx = loaded
    print(
        f"\nOriginal data preview (original_data[:2, -10:, :2]):\n{original_data[:2, -10:, :2]}\n"
    )

    # ------------------------------------------------ Part I: Hider -----------------------------------------------
    # Set up hider input: the hider only sees the training split.
    train_data = original_data[train_idx]
    train_mask = original_padding_mask[train_idx]
    hider_input = {
        "data": train_data,
        "seed": args.seed,
        "padding_mask": train_mask,
    }

    # Run hider.
    with in_progress("Running Hider"):
        hider_output = hider_module.hider(hider_input)
    generated_data, generated_data_padding_mask = parse_hider_output(hider_output)
    print(
        f"\nGenerated data preview (generated_data[:2, -10:, :2]):\n{generated_data[:2, -10:, :2]}\n"
    )

    # Save hider output; a missing padding mask is stored as an empty list.
    hider_output_file = os.path.join(hider_dir, "data.npz")
    if generated_data_padding_mask is not None:
        mask_to_save = generated_data_padding_mask
    else:
        mask_to_save = []
    np.savez(
        hider_output_file,
        generated_data=generated_data,
        padding_mask=mask_to_save,
    )

    # Evaluate hider.
    # - Prepare data (needed as soon as at least one evaluation step runs).
    if not args.skip_fp or not args.skip_osa:
        with in_progress("Preparing data for hider evaluation"):
            generated_data, generated_data_padding_mask = load_generated_data(hider_output_file)
            _, train_imputed = prp.preprocess_data(train_data, train_mask)
            _, generated_imputed = prp.preprocess_data(generated_data, generated_data_padding_mask)
            _, test_imputed = prp.preprocess_data(
                original_data[test_idx], original_padding_mask[test_idx])

    # - Feature prediction step.
    if args.skip_fp:
        print(
            f"Feature prediction step skipped!{ '' if tf115_found else ' (TensorFlow 1.15 not found)' }\n"
        )
    else:
        num_features = train_data.shape[2]
        # Pick the features to predict reproducibly from the run seed.
        with temp_seed_numpy(args.seed):
            shuffled_features = np.random.permutation(num_features)
            feature_idx = shuffled_features[:args.feature_prediction_no]
        print(f"\nFeature prediction evaluation on IDs: {feature_idx}\n")

        with in_progress("Running feature prediction"):
            with in_progress("Running on [original data]"):
                with tf_fixed_seed_seesion(args.seed):
                    original_fp_scores, original_task_types = feature_prediction(
                        train_data=train_imputed,
                        test_data=test_imputed,
                        index=feature_idx,
                        verbose=args.eval_verbose,
                    )
            with in_progress("Running on [generated data]"):
                with tf_fixed_seed_seesion(args.seed):
                    generated_fp_scores, generated_task_types = feature_prediction(
                        train_data=generated_imputed,
                        test_data=test_imputed,
                        index=feature_idx,
                        verbose=args.eval_verbose,
                    )

        print("\nFeature prediction errors (per feature):")
        print(f"Original data:\t\t{original_fp_scores}")
        print(f"New (hider-generated):\t{generated_fp_scores}\n")

        # - Save results, one score per line.
        fp_scores_path = os.path.join(hider_dir, "feature_prediction_scores.txt")
        with open(fp_scores_path, "w") as f:
            for score in generated_fp_scores:
                print(score.astype(str), file=f)

    # - One-step-ahead prediction step.
    if args.skip_osa:
        print(
            f"One-step-ahead prediction step skipped!{ '' if tf115_found else ' (TensorFlow 1.15 not found)' }\n"
        )
    else:
        with in_progress("Running one-step-ahead prediction"):
            with in_progress("Running on [original data]"):
                with tf_fixed_seed_seesion(args.seed):
                    original_osa_score = one_step_ahead_prediction(
                        train_data=train_imputed,
                        test_data=test_imputed,
                        verbose=args.eval_verbose,
                    )
            with in_progress("Running on [generated data]"):
                with tf_fixed_seed_seesion(args.seed):
                    generated_osa_score = one_step_ahead_prediction(
                        train_data=generated_imputed,
                        test_data=test_imputed,
                        verbose=args.eval_verbose,
                    )

        print("\nOne-step-ahead prediction errors (per feature):")
        print(f"Original data:\t\t{original_osa_score}")
        print(f"New (hider-generated):\t{generated_osa_score}\n")

        # - Save results.
        osa_score_path = os.path.join(hider_dir, "osa_score.txt")
        with open(osa_score_path, "w") as f:
            print(generated_osa_score.astype(str), file=f)

    # The pass/fail benchmark needs the results of both evaluation steps.
    if not (args.skip_fp or args.skip_osa):
        passed = benchmark_hider(
            feat_scores=generated_fp_scores,
            task_types=generated_task_types,
            osa_score=generated_osa_score,
            eval_feat_scores=original_fp_scores,
            eval_task_types=original_task_types,
            eval_osa_score=original_osa_score,
            threshold_auroc=0.85,
            threshold_rmse=5.00,
        )
        print(f'>>> Hider evaluation: {"passed" if passed else "failed"}')

    # Validation of hider results:
    validate_hider_output(
        hider="hider from hider.py",
        hider_dir=hider_dir,
        features=None if args.skip_fp else feature_idx,
        data_shape=train_data.shape,
        raise_exception=True,
        skip_fp=args.skip_fp,
        skip_osa=args.skip_osa,
    )

    # ----------------------------------- Part II: Seeker (vs Part I Hider) ----------------------------------------
    # Set up seeker input: the seeker sees the generated data and the full
    # (train + test) original data.
    seeker_input = {
        "generated_data": generated_data,
        "enlarged_data": original_data,
        "seed": args.seed,
        "generated_data_padding_mask": generated_data_padding_mask,
        "enlarged_data_padding_mask": original_padding_mask,
    }

    # Run seeker.
    with in_progress("Running Seeker"):
        reidentified_labels = seeker_module.seeker(seeker_input)

    # Save seeker output.
    seeker_output_file = os.path.join(seeker_dir, "data.npz")
    np.savez(seeker_output_file, reidentified_data=reidentified_labels)

    # Evaluate seeker (vs hider): an example is a true positive exactly when
    # its index belongs to the training split handed to the hider.
    example_ids = np.arange(original_data.shape[0])
    true_labels = np.isin(example_ids, train_idx)
    reidentified_labels = validate_seeker_output(
        seeker="seeker from seeker.py",
        seeker_output_path=seeker_output_file,
        labels=true_labels,
        raise_exception=True)
    reidentification_score = reidentify_score(true_labels, reidentified_labels)

    print(f"\nTrue labels:\t\t\t\t{true_labels.astype(int)}")
    print(f"Reidentified (by seeker) labels:\t{reidentified_labels}")
    print(f"Reidentification score:\t\t\t{reidentification_score:.4f}\n")