Ejemplo n.º 1
0
def seeker(input_dict: Dict) -> np.ndarray:
    """Solution seeker function.

    Args:
        input_dict (Dict): Dictionary that contains the seeker function inputs, as below:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "generated_data" (np.ndarray of float): Generated dataset from hider data,
                shape [num_examples, max_seq_len, num_features].
            * "enlarged_data" (np.ndarray of float): Enlarged original dataset,
                shape [num_examples_enlarge, max_seq_len, num_features].
            * "generated_data_padding_mask" (np.ndarray of bool): Padding mask of bools, generated dataset,
                same shape as "generated_data".
            * "enlarged_data_padding_mask" (np.ndarray of bool): Padding mask of bools, enlarged dataset,
                same shape as "enlarged_data".

    Returns:
        np.ndarray: The reidentification labels produced by the seeker, expected shape [num_examples_enlarge].

    Raises:
        AssertionError: If ``utils.misc.tf115_found`` is not True (required by the binary_predictor example).
    """

    # Get the inputs.
    seed = input_dict["seed"]  # Read for completeness; not passed to binary_predictor below.
    generated_data = input_dict["generated_data"]
    enlarged_data = input_dict["enlarged_data"]
    generated_data_padding_mask = input_dict["generated_data_padding_mask"]
    enlarged_data_padding_mask = input_dict["enlarged_data_padding_mask"]

    # Get processed and imputed data, if desired:
    generated_data_preproc, generated_data_imputed = preprocess_data(
        generated_data, generated_data_padding_mask)
    enlarged_data_preproc, enlarged_data_imputed = preprocess_data(
        enlarged_data, enlarged_data_padding_mask)

    # TODO: Put your seeker code to replace Example 1 below.
    # Feel free to play around with Examples 1 (knn) and 2 (binary_predictor) below.

    # --- Example 1: knn ---
    # from examples.seeker.knn import knn_seeker
    #
    # reidentified_data = knn_seeker.knn_seeker(generated_data_imputed, enlarged_data_imputed)
    # return reidentified_data

    # --- Example 2: binary_predictor ---
    # Imported lazily so that this module can still be imported when TensorFlow 1.15 is absent.
    from utils.misc import tf115_found
    assert tf115_found is True, "TensorFlow 1.15 not found, which is required to run binary_predictor."
    from examples.seeker.binary_predictor import binary_predictor
    reidentified_data = binary_predictor.binary_predictor(
        generated_data_imputed, enlarged_data_imputed, verbose=True)
    # Return the seeker's labels (one per enlarged example), not the hider's generated dataset.
    return reidentified_data
Ejemplo n.º 2
0
def seeker(input_dict):
    """Run the binary-predictor seeker on the imputed generated and enlarged datasets.

    Reads "seed", "generated_data", "enlarged_data", "generated_data_padding_mask" and
    "enlarged_data_padding_mask" from ``input_dict`` (in that order), prints the seed,
    and returns whatever ``binary_predictor`` yields for the two imputed datasets.
    """
    wanted_keys = (
        "seed",
        "generated_data",
        "enlarged_data",
        "generated_data_padding_mask",
        "enlarged_data_padding_mask",
    )
    seed, generated, enlarged, generated_mask, enlarged_mask = [
        input_dict[key] for key in wanted_keys
    ]

    print("Competition-provided random seed:", seed)

    # Only the imputed half of each preprocess_data result is used.
    _, generated_imputed = preprocess_data(generated, generated_mask)
    _, enlarged_imputed = preprocess_data(enlarged, enlarged_mask)

    labels = binary_predictor(generated_imputed, enlarged_imputed)
    return labels
Ejemplo n.º 3
0
def hider(input_dict: Dict) -> Union[np.ndarray, Tuple[np.ndarray, Optional[np.ndarray]]]:
    """Hider solution: GAIN imputation followed by a recurrent GAN.

    Args:
        input_dict (Dict): Hider inputs. The keys read are:
            * "seed" (int): Random seed provided by the competition.
            * "data" (np.ndarray of float): Input data, shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask of bools, same shape as data.

    Returns:
        The first element of the ``fit_recurrent_gan`` result (the generated samples). No padding
        mask is returned by this implementation.
    """
    seed, data, padding_mask = [input_dict[key] for key in ("seed", "data", "padding_mask")]

    preprocessed, imputed = preprocess_data(data, padding_mask)

    # Stage 1: imputation.
    print('Starting imputation GAIN ...')
    num_epoch = 4000
    batch_size = 64
    imputed, max_length = imputation_gain(preprocessed, imputed, num_epoch)

    # Stage 2: generation. Only the samples are kept; the model and losses are discarded.
    print('Starting recurrent GAN ...', file=sys.stdout)
    gan_result = fit_recurrent_gan(imputed, max_length, num_epoch, batch_size)
    generated_samples, _model, _disc_loss, _gen_loss = gan_result

    return generated_samples
Ejemplo n.º 4
0
def hider(input_dict):
    """Generate hider data by running ``timegan`` on the imputed input data.

    Reads "seed", "data" and "padding_mask" from ``input_dict`` (in that order), prints the
    seed, imputes the data with ``preprocess_data`` and returns the ``timegan`` result.
    """
    seed, raw_data, mask = [input_dict[key] for key in ("seed", "data", "padding_mask")]

    print("Competition-provided random seed:", seed)

    # Only the imputed half of the preprocess_data result is used.
    _, imputed = preprocess_data(raw_data, mask)

    generated = timegan(imputed)
    return generated
Ejemplo n.º 5
0
def hider(
    input_dict: Dict
) -> Union[np.ndarray, Tuple[np.ndarray, Optional[np.ndarray]]]:
    """Hider solution based on the TimeGAN example.

    Args:
        input_dict (Dict): Hider inputs. The keys read are:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "data" (np.ndarray of float): Input data, shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask of bools, same shape as data.

    Returns:
        The general hider return format is:
            np.ndarray (of float) [, np.ndarray (of bool)]
        i.e. the generated data, expected shape [num_examples, max_seq_len, num_features], optionally
        followed by the corresponding padding mask of the same shape. A str "rescore" may be returned
        instead to just re-run the vs-seekers scoring step after a previous successful submission.

        This implementation returns only the result of ``timegan.timegan`` on the imputed data.

    Raises:
        AssertionError: If ``utils.misc.tf115_found`` is not True.
    """
    seed, data, padding_mask = [input_dict[key] for key in ("seed", "data", "padding_mask")]

    # Preprocess first; only the imputed data feeds the generator.
    _, imputed = preprocess_data(data, padding_mask)

    # Alternative example (add_noise), kept for reference:
    #     from examples.hider.add_noise import add_noise
    #     generated_data = add_noise.add_noise(data_imputed, noise_size=0.1)
    #     generated_padding_mask = np.copy(padding_mask)
    #     return generated_data, generated_padding_mask

    # TimeGAN example: the imports are local and happen only after the TF 1.15 check.
    from utils.misc import tf115_found
    assert tf115_found is True, "TensorFlow 1.15 not found, which is required to run timegan."
    from examples.hider.timegan import timegan
    return timegan.timegan(imputed)
Ejemplo n.º 6
0
def hider(
    input_dict: Dict,
) -> Union[
    np.ndarray,  # return generated_data
    Tuple[np.ndarray, Optional[np.ndarray]],  # return generated_data, generated_padding_mask
    Tuple[np.ndarray, Optional[int]],  # return generated_data, 3
    Tuple[np.ndarray, Optional[np.ndarray], Optional[int]],  # return generated_data, generated_padding_mask, 3
    str,  # return "rescore"
]:
    """Hider solution based on the add_noise example.

    Args:
        input_dict (Dict): Hider inputs. The keys read are:
            * "seed" (int): Random seed provided by the competition, use for reproducibility.
            * "data" (np.ndarray of float): Input data, shape [num_examples, max_seq_len, num_features].
            * "padding_mask" (np.ndarray of bool): Padding mask of bools, same shape as data.

    Returns:
        The general hider return format is:
            np.ndarray (of float) [, np.ndarray (of bool), int]
        * First: the hider generated data, expected shape [num_examples, max_seq_len, num_features];
        * Second (optional): the corresponding padding mask, same shape;
        * Third (optional): the number of seeds to use in the hider evaluation step, expected range
            [0, 5], where 0 means that evaluation is skipped.

        A str "rescore" may be returned instead to just re-run the vs-seekers scoring step after a
        previous successful submission.

        This implementation returns the pair (noised data, copy of the input padding mask).
    """
    seed, data, padding_mask = [input_dict[key] for key in ("seed", "data", "padding_mask")]

    # Only the imputed half of the preprocess_data result is used.
    _, imputed = preprocess_data(data, padding_mask)

    # Local import, as in the other examples.
    from examples.hider.add_noise import add_noise

    noisy_data = add_noise.add_noise(imputed, noise_size=0.1)
    # The mask is copied so the caller's array is not shared with the returned value.
    mask_copy = np.copy(padding_mask)
    return noisy_data, mask_copy
Ejemplo n.º 7
0
def train(dim_word=100,  # word vector dimensionality
          dim_char=10,  # passed to the model as de_char
          max_char=10,  # passed to add_padding and to the model as max_char
          dim=100,  # the number of LSTM units
	  win=5, #Window size
	  bs=5, #number of backprop through time steps
	  seed=123,
	  verbose=1,
          use_model='GRU', #Choose the model from- LSTM, DEEPLSTM, RNN, 
          patience=10,  # early stopping patience
          max_epochs=50,
          lrate=0.0005,  # learning rate
          maxlen=100,  # maximum length of the description
          data_train=['data/qe/train/train.src.lc',
              'data/qe/train/train.mt.lc',
              'data/qe/train/train.align'],
          data_train_y = 'data/qe/train/train.tags',
          data_valid=['data/qe/dev/dev.src.lc',
                'data/qe/dev/dev.mt.lc',
                'data/qe/dev/dev.align'],
          data_valid_y = 'data/qe/dev/dev.tags',
          data_test=['data/qe/test/test.src.lc',
                'data/qe/test/test.mt.lc',
                'data/qe/test/test.align'],
          data_test_y = 'data/qe/test/test.tags',
          dictionaries=['data/qe/train/train.src.lc.json',
              'data/qe/train/train.mt.lc.json'],
          character2index=['data/qe/train/train.src.lc.dict_char.json',
              'data/qe/train/train.mt.lc.dict_char.json'],
	  label2index = 'data/qe/train/train.tags.json',
          embeddings=['data/qe/pretrain/ep_qe.en.vector.txt',
              'data/qe/pretrain/ep_qe.de.vector.txt'],
	  use_adadelta=False,
          use_bilingual=False,
          use_pretrain=False,
          use_quest=False,
          use_tag=False,
          use_char=False,
          saveto=False,
          shuffle_each_epoch=True,
	  load_data=None,
    ):
	"""Train the selected model one sentence at a time, evaluating periodically.

	This is Python 2 code (print statements, xrange, iteritems, cPickle).

	All keyword arguments are captured into ``model_options`` via ``locals()``
	and are mostly read back from that dict. The function:

	* builds ``model_name`` from ``use_model[0]`` plus suffixes for the
	  ``use_adadelta`` / ``use_char`` / ``use_bilingual`` / ``use_pretrain`` flags
	  and uses it as the key into ``select_model``;
	* obtains the processed data either from the gzip pickle ``load_data[0]`` or
	  from ``preprocess_data``;
	* optionally dumps the processed data (``saveto``) and creates ``folder``;
	* loops over epochs and batches, calling the model's train functions per
	  sentence, and every ``model_options['patience']`` batches runs
	  ``rnn.classify`` on the test and validation sets, scores the result with
	  ``wmt_eval`` (``use_quest``) and/or ``icon_eval`` (``use_tag``), and saves the
	  model when the validation score improves;
	* stops once the best epoch and the current epoch differ by 10 or more.

	NOTE(review): ``use_model``, ``data_train_y``, ``data_valid_y``, ``data_test_y``
	and ``label2index`` are indexed with ``[0]`` below, so they appear to be
	expected as sequences (e.g. one-element lists); with the plain-string
	defaults above ``[0]`` yields only the first character. Confirm against the caller.

	Nothing is returned; results are printed and written under ``folder``.
	"""

	# Snapshot of every argument; must run before any other local is created.
	model_options = OrderedDict(sorted(locals().copy().items()))
	print 'Model_Options:', model_options

	model_name = model_options['use_model'][0]
	if model_options['use_adadelta']:
		model_name += '_adadelta'
	if model_options['use_char']:
		model_name += '_char'
	if model_options['use_bilingual']:
		model_name += '_bilingual'
	if model_options['use_pretrain']:
		model_name += '_pretrain'

	print 'Using model:', model_name

	processed_data = []
	if load_data:
	    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
	    with gzip.open(load_data[0],'rb') as fp:
			processed_data = cPickle.load(fp)
	else:
	    processed_data = preprocess_data(data_train=model_options['data_train'], 
		data_train_y=model_options['data_train_y'][0],
		data_valid=model_options['data_valid'], data_valid_y=model_options['data_valid_y'][0], 
		data_test=model_options['data_test'], data_test_y=model_options['data_test_y'][0], 
		dictionaries=model_options['dictionaries'],
		character2index=model_options['character2index'],
		label2index = model_options['label2index'][0],
		embeddings = model_options['embeddings'],
		use_bilingual=model_options['use_bilingual'], 
		use_pretrain=model_options['use_pretrain'])

	"""
	Savinn the model/data with model_name
	"""
	# Output names depend on the task flag; if both flags are set, use_quest wins.
	# If neither is set, both names stay empty strings.
	save_data = folder = ''
	if use_tag:
		save_data = 'tag.data_' + model_name + '.pkl.gz'
		folder = 'tag.' + model_name
	if use_quest:
		save_data = 'quest.data_' + model_name + '.pkl.gz'
		folder = 'quest.' + model_name

	if saveto:
		with gzip.open(save_data,'wb') as fp:
       			cPickle.dump(processed_data, fp)
    	if not os.path.exists(folder): os.mkdir(folder)

	# processed_data must unpack into exactly these ten items.
	train, train_y, test, test_y, valid, valid_y, w2idxs, char2idxs, label2idxs, embs=processed_data
	# Invert the label -> index mapping so predictions can be mapped back to labels.
	idx2label = dict((k,v) for v,k in label2idxs.iteritems())
	#print len(train), len(test), len(valid)

	vocsize_s = vocsize_t = vocsize_schar = vocsize_tchar = 0
        emb_s, emb_t, train_s, train_schar, train_t, train_tchar, test_s, test_schar, test_t, test_tchar, valid_s, valid_schar, valid_t, valid_tchar = ([] for i in range(14))
		
	# Unpack the data according to which inputs are present:
	# source+target+chars, target+chars, source+target, or target only.
	if (use_bilingual or len(train) == 4) and use_char:
		emb_s, emb_t = embs
		train_s, train_t, train_schar, train_tchar = train
		test_s, test_t, test_schar, test_tchar = test
		valid_s, valid_t, valid_schar, valid_tchar = valid
    		vocsize_s = len(w2idxs[0])
    		vocsize_t = len(w2idxs[1])
		vocsize_schar = len(char2idxs[0])
		vocsize_tchar = len(char2idxs[1])

	elif use_char:
		emb_t = embs[0]
		train_t, train_tchar = train
		test_t, test_tchar = test
		valid_t, valid_tchar = valid
    		vocsize_t = len(w2idxs[0])
		vocsize_tchar = len(char2idxs[0])

	elif use_bilingual or len(train) == 2:
		emb_s, emb_t = embs
		train_s, train_t = train
		test_s, test_t = test
		valid_s, valid_t = valid
    		vocsize_s = len(w2idxs[0])
    		vocsize_t = len(w2idxs[1])
	else :
		emb_t = embs[0]
		train_t = train[0]
		test_t = test[0]
		valid_t = valid[0]
    		vocsize_t = len(w2idxs[0])

    	nclasses = len(label2idxs)
    	nsentences = len(train_t)

    	numpy.random.seed(model_options['seed'])
    	# instantiate the model
    	rnn = select_model[model_name]( nh = model_options['dim'],
                    nc = nclasses,
                    de = model_options['dim_word'],
                    cs = model_options['win'],
                    de_char = model_options['dim_char'],
		    ne_char = vocsize_tchar,
		    ne_src = vocsize_s,
		    ne_tgt = vocsize_t,
		    emb_src = emb_s,
		    emb_tgt = emb_t,
		    max_char = model_options['max_char'])

    	# train with early stopping on validation set
    	best_f1 = -numpy.inf
    	# NOTE(review): this overrides the `patience` argument; below it is used as
    	# the evaluation interval, measured in batches.
    	model_options['patience'] = 2
    	# Python 2 integer division. NOTE(review): this is 0 when nsentences < 100,
    	# which makes the next line raise ZeroDivisionError.
    	batch_size = (nsentences/100) * 10
    	n_batches = nsentences//batch_size
    	print n_batches
    	for e in xrange(model_options['max_epochs']):
	  # 'ce' = current epoch, compared with 'be' (best epoch) at the end of the loop body.
	  model_options['ce'] = e
      	  #shuffle
	  if shuffle_each_epoch:
      	  	# presumably shuffles the four lists in unison using the seed -- confirm in shuffle()
      	  	shuffle([train_t, train_s, train_tchar, train_y], model_options['seed'])

      	  tic = time.time()
      	  for k in xrange(n_batches):
            #Creating batches
	    batch_train_s = []
	    batch_train_char = []

	    if model_options['use_bilingual']:
            	batch_train_s = train_s[k*batch_size:(k+1)*batch_size]
	    if model_options['use_char']:
            	batch_train_char = train_tchar[k*batch_size:(k+1)*batch_size]

            batch_train_t = train_t[k*batch_size:(k+1)*batch_size]
            batch_train_y = train_y[k*batch_size:(k+1)*batch_size]
            batch_err = 0
            # One training call per sentence in the batch.
            for i in xrange(batch_size):
		cwords_src = []
		padded_chars = []
		if model_options['use_bilingual']:
                	cwords_src = contextwin(batch_train_s[i], model_options['win'])
		if model_options['use_char']:
			padded_chars = add_padding(batch_train_char[i], model_options['max_char'])

		#print batch_train_char[0]
		#print padded_chars
                cwords_tgt = contextwin(batch_train_t[i], model_options['win'])
                labels = batch_train_y[i]

		# The argument list of the training call depends on which inputs are enabled.
		if model_options['use_bilingual'] and model_options['use_char']:
                     err = rnn.train_grad_shared(cwords_src, cwords_tgt, padded_chars, labels, model_options['lrate'])
		elif model_options['use_char']:
                     err = rnn.train_grad_shared(cwords_tgt, padded_chars, labels, model_options['lrate'])
		elif model_options['use_bilingual']:
                     err = rnn.train_grad_shared(cwords_src, cwords_tgt, labels, model_options['lrate'])
		elif model_options['use_adadelta']:
                     err = rnn.train_grad_shared(cwords_tgt, labels, model_options['lrate'])
		else:
		     err = rnn.train(cwords_tgt, labels, model_options['lrate'])
                
		# With adadelta, train_update is called as a separate step after train_grad_shared.
		if model_options['use_adadelta']:
		     rnn.train_update(model_options['lrate'])

                rnn.normalize()
                
                if model_options['verbose']:
                    # Trailing comma and '\r': the progress line is rewritten in place.
                    print '[learning] epoch %i batch %i >> %2.2f%%'%(e, k, (i+1)*100./batch_size),'completed in %.2f (sec) <<\r'%(time.time()-tic),
		    sys.stdout.flush()

	    # Evaluate on test and validation every `patience` batches (including batch 0).
	    if(k % model_options['patience'] == 0):

		predictions_test, groundtruth_test, predictions_valid, \
			groundtruth_valid = ([] for i in range(4))

		# Each branch passes rnn.classify the inputs matching the enabled options
		# and maps the predicted and gold indices back to labels through idx2label.
		if model_options['use_bilingual'] and model_options['use_char']:
			predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(__x, 
				model_options['max_char'])).astype('int32')))
				for x, _x, __x in zip(test_s, test_t, test_tchar) ]
                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

                	predictions_valid = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, 
				 model_options['win'])).astype('int32'),
				 numpy.asarray(contextwin(_x, model_options['win'])).astype('int32'),
				 numpy.asarray(add_padding(__x, 
				 model_options['max_char'])).astype('int32')))
                                 for x, _x, __x in zip(valid_s, valid_t, valid_tchar) ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]

		elif model_options['use_bilingual']:
			#evaluation // back into the real world : idx -> words
            		predictions_test = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x_src, 
				 model_options['win'])).astype('int32'),
                                 numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                 for x_src, x_tgt in zip(test_s, test_t) ]
            		groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
           		#words_test = [ map(lambda x: idx2word_de[x], w) for w in test_lex]

            		predictions_valid = [ map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x_src, 
				 model_options['win'])).astype('int32'),
                                 numpy.asarray(contextwin(x_tgt,model_options['win'])).astype('int32')))
                                 for x_src, x_tgt in zip(valid_s, valid_t) ]
            		groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
            		#words_valid = [ map(lambda x: idx2word_de[x], w) for w in valid_lex]


		elif model_options['use_char']:
			predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(_x, 
				model_options['max_char'])).astype('int32')))
                                for x, _x, in zip(test_t, test_tchar) ]
                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

                	predictions_valid = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'),
				numpy.asarray(add_padding(_x,
				model_options['max_char'])).astype('int32')))
                                for x, _x, in zip(valid_t, valid_tchar) ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
		else:
                	#evaluation // back into the real world : idx -> words
                	predictions_test = [ map(lambda x: idx2label[x],
				rnn.classify(numpy.asarray(contextwin(x,
				model_options['win'])).astype('int32'))) for x in test_t ]

                	groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
                	#words_test = [ map(lambda x: idx2word[x], w) for w in test_t]

                	predictions_valid = [ map(lambda x: idx2label[x], 
				rnn.classify(numpy.asarray(contextwin(x, 
				model_options['win'])).astype('int32'))) for x in valid_t ]
                	groundtruth_valid = [ map(lambda x: idx2label[x], y) for y in valid_y ]
                	#words_valid = [ map(lambda x: idx2word[x], w) for w in valid_t]

                #evaluation // compute the accuracy using conlleval.pl
		res_test = []
		res_valid = []
		# Stays 0 when neither use_quest nor use_tag is set; if both are set,
		# the use_tag results overwrite the use_quest ones.
		current_score = 0
		if model_options['use_quest']:
                   res_test=wmt_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
               	   res_valid=wmt_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
		   current_score = res_valid[2][0]
		if model_options['use_tag']:
                  res_test=icon_eval(predictions_test, groundtruth_test, folder+'/current.test.txt')
                  res_valid=icon_eval(predictions_valid, groundtruth_valid, folder+'/current.valid.txt')
		  current_score = res_valid[1]

                if current_score > best_f1:

		    """
			Save the model and model parameters
		    """
                    rnn.save(folder)
		    filename = folder +'/model'
		    with open('%s.json'%filename, 'wb') as f:
			  json.dump(model_options, f, indent=2)

                    best_f1 = current_score
                    if model_options['verbose']:
                        print 'NEW BEST: epoch', e, 'valid F1', res_valid, 'test F1' , res_test , ' '*20
                    # 'be' = best epoch; it is only set here, when a new best score is recorded.
                    model_options['be'] = e
		    # Keep the hypothesis files of the best run under a stable name.
		    subprocess.call(['mv', folder + '/current.test.txt.hyp', folder+'/best.test.txt'])
                    subprocess.call(['mv', folder + '/current.valid.txt.hyp', folder+'/best.valid.txt'])
                else:
                    print ''
          #Break if no improvement in 10 epochs
          # NOTE(review): raises KeyError if 'be' was never set (e.g. when n_batches is 0).
          if abs(model_options['be']-model_options['ce']) >= 10:  break
        print 'BEST RESULT: epoch', model_options['be'] , 'valid F1', best_f1 , 'with the model', folder
Ejemplo n.º 8
0
def main(args):
    """The main script - hider from `hider.py` and the seeker from `seeker.py` will be imported and played against
    each other.

    Stages of the script:
        * Load data.
        * Run the hider.
        * Evaluate hider via feature prediction and one-step-ahead prediction.
        * Run the seeker (on the hider's generated data).

    Side effects visible in this function: `args.skip_fp`, `args.skip_osa` and `args.debug_data` may be modified
    in place; the "hider" and "seeker" sub-directories of `args.output_dir` are deleted and recreated; results are
    written into them and progress is printed to stdout.

    Args:
        args (argparse.Namespace): parsed arguments from the command line.

    Raises:
        ValueError: in case there are issues with required files or directories.
    """

    # ================================================= System setup. ==================================================

    # If no TensorFlow 1.15 found on the system, skip parts of the script.
    if not tf115_found:
        args.skip_fp = True
        args.skip_osa = True

    # Fix random seeds.
    fix_all_random_seeds(args.seed)
    # NOTE:
    # The fix_all_random_seeds() call may not be sufficient to make tensorflow fully deterministic.
    # See, for example: https://github.com/NVIDIA/framework-determinism

    # ============================================== Prepare directories. ==============================================
    # Code directory.
    code_dir = os.path.abspath(".")
    if not os.path.exists(code_dir):
        raise ValueError(f"Code directory not found at {code_dir}.")
    print(f"\nCode directory:\t\t{code_dir}")

    # Data path.
    data_path = os.path.abspath(args.data_path)
    if not os.path.exists(data_path):
        raise ValueError(f"Data file not found at {data_path}.")
    print(f"Data file:\t\t{data_path}")
    # load_data() below takes the directory and the file name separately.
    data_dir = os.path.dirname(data_path)
    data_file_name = os.path.basename(data_path)

    # Output directories.
    out_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)
    print(f"Output directory:\t{out_dir}")
    # The hider and seeker sub-directories are wiped so each run starts with empty output folders.
    hider_dir = os.path.join(out_dir, "hider")
    if os.path.exists(hider_dir):
        shutil.rmtree(hider_dir)
    os.makedirs(hider_dir, exist_ok=True)
    seeker_dir = os.path.join(out_dir, "seeker")
    if os.path.exists(seeker_dir):
        shutil.rmtree(seeker_dir)
    os.makedirs(seeker_dir, exist_ok=True)
    print(f"  ├ Hider output:\t{hider_dir}")
    print(f"  └ Seeker output:\t{seeker_dir}\n")

    # =================================================== Load data. ===================================================
    # A non-positive debug_data value is normalised to False before being handed to load_data().
    if args.debug_data <= 0:
        args.debug_data = False
    with in_progress("Preprocessing and loading data"):
        original_data, original_padding_mask, train_idx, test_idx = load_data(
            data_dir=data_dir,
            data_file_name=data_file_name,
            max_seq_len=args.max_seq_len,
            seed=args.seed,
            train_rate=args.train_frac,
            force_reprocess=
            True,  # If True, re-preprocess data every time (rather than reusing).
            debug_data=args.debug_data,
        )
    print(
        f"\nOriginal data preview (original_data[:2, -10:, :2]):\n{original_data[:2, -10:, :2]}\n"
    )

    # ================================================= Part I: Hider. =================================================
    # Set up hider input.
    # The hider only sees the training split; the seeker later receives the full dataset.
    original_data_train = original_data[train_idx]
    original_padding_mask_train = original_padding_mask[train_idx]
    hider_input = {
        "data": original_data_train,
        "seed": args.seed,
        "padding_mask": original_padding_mask_train
    }

    # Run hider.
    with in_progress("Running Hider"):
        hider_output = hider_module.hider(hider_input)
        generated_data, generated_data_padding_mask = parse_hider_output(
            hider_output)
    print(
        f"\nGenerated data preview (generated_data[:2, -10:, :2]):\n{generated_data[:2, -10:, :2]}\n"
    )

    # Save hider output.
    hider_output_file = os.path.join(hider_dir, "data.npz")
    # A missing padding mask is stored as an empty list.
    np.savez(
        hider_output_file,
        generated_data=generated_data,
        padding_mask=generated_data_padding_mask
        if generated_data_padding_mask is not None else [],
    )

    # Evaluate hider.

    # - Prepare data
    # Imputed datasets are only needed when at least one of the two evaluation steps runs.
    if not (args.skip_fp and args.skip_osa):
        with in_progress("Preparing data for hider evaluation"):
            # The generated data is re-read from the file that was just saved.
            generated_data, generated_data_padding_mask = load_generated_data(
                hider_output_file)
            _, original_data_train_imputed = prp.preprocess_data(
                original_data_train, original_padding_mask_train)
            _, generated_data_imputed = prp.preprocess_data(
                generated_data, generated_data_padding_mask)
            _, original_data_test_imputed = prp.preprocess_data(
                original_data[test_idx], original_padding_mask[test_idx])

    # - Feature prediction step.
    if not args.skip_fp:
        num_features = original_data_train.shape[2]
        # Choose the features to predict: the first `feature_prediction_no` entries of a permutation
        # drawn inside the temp_seed_numpy(args.seed) context.
        with temp_seed_numpy(args.seed):
            feature_idx = np.random.permutation(
                num_features)[:args.feature_prediction_no]
        print(f"\nFeature prediction evaluation on IDs: {feature_idx}\n")

        # The same test data is used for both runs; only the training data differs.
        with in_progress("Running feature prediction"):
            with in_progress("Running on [original data]"):
                with tf_fixed_seed_seesion(args.seed):
                    original_feature_prediction_accuracy, ori_task_types = feature_prediction(
                        train_data=original_data_train_imputed,
                        test_data=original_data_test_imputed,
                        index=feature_idx,
                        verbose=args.eval_verbose,
                    )
            with in_progress("Running on [generated data]"):
                with tf_fixed_seed_seesion(args.seed):
                    new_feature_prediction_accuracy, new_task_types = feature_prediction(
                        train_data=generated_data_imputed,
                        test_data=original_data_test_imputed,
                        index=feature_idx,
                        verbose=args.eval_verbose,
                    )

        print("\nFeature prediction errors (per feature):")
        print(f"Original data:\t\t{original_feature_prediction_accuracy}")
        print(f"New (hider-generated):\t{new_feature_prediction_accuracy}\n")

        # - Save results.
        # One score per line, for the generated data only.
        with open(os.path.join(hider_dir, "feature_prediction_scores.txt"),
                  "w") as f:
            for score in new_feature_prediction_accuracy:
                print(score.astype(str), file=f)

    else:
        print(
            f"Feature prediction step skipped!{ '' if tf115_found else ' (TensorFlow 1.15 not found)' }\n"
        )

    # - One-step-ahead prediction step.
    if not args.skip_osa:
        with in_progress("Running one-step-ahead prediction"):
            with in_progress("Running on [original data]"):
                with tf_fixed_seed_seesion(args.seed):
                    original_osa_perf = one_step_ahead_prediction(
                        train_data=original_data_train_imputed,
                        test_data=original_data_test_imputed,
                        verbose=args.eval_verbose,
                    )
            with in_progress("Running on [generated data]"):
                with tf_fixed_seed_seesion(args.seed):
                    new_osa_perf = one_step_ahead_prediction(
                        train_data=generated_data_imputed,
                        test_data=original_data_test_imputed,
                        verbose=args.eval_verbose,
                    )

        print("\nOne-step-ahead prediction errors (per feature):")
        print(f"Original data:\t\t{original_osa_perf}")
        print(f"New (hider-generated):\t{new_osa_perf}\n")

        # - Save results.
        with open(os.path.join(hider_dir, "osa_score.txt"), "w") as f:
            print(new_osa_perf.astype(str), file=f)

    else:
        print(
            f"One-step-ahead prediction step skipped!{ '' if tf115_found else ' (TensorFlow 1.15 not found)' }\n"
        )

    # The benchmark needs the scores of both evaluation steps, so it runs only if neither was skipped.
    if not args.skip_fp and not args.skip_osa:
        passed = benchmark_hider(
            feat_scores=new_feature_prediction_accuracy,
            task_types=new_task_types,
            osa_score=new_osa_perf,
            eval_feat_scores=original_feature_prediction_accuracy,
            eval_task_types=ori_task_types,
            eval_osa_score=original_osa_perf,
            threshold_auroc=0.85,
            threshold_rmse=5.00,
        )
        print(f'>>> Hider evaluation: {"passed" if passed else "failed"}')

    # Validation of hider results:
    # feature_idx only exists when the feature prediction step ran, hence the conditional.
    validate_hider_output(
        hider="hider from hider.py",
        hider_dir=hider_dir,
        features=feature_idx if not args.skip_fp else None,
        data_shape=original_data_train.shape,
        raise_exception=True,
        skip_fp=args.skip_fp,
        skip_osa=args.skip_osa,
    )

    # ======================================= Part II: Seeker (vs Part I Hider). =======================================
    # Set up seeker input.
    # The "enlarged" dataset given to the seeker is the full original dataset (train and test examples).
    seeker_input = {
        "generated_data": generated_data,
        "enlarged_data": original_data,
        "seed": args.seed,
        "generated_data_padding_mask": generated_data_padding_mask,
        "enlarged_data_padding_mask": original_padding_mask,
    }

    # Run seeker.
    with in_progress("Running Seeker"):
        reidentified_labels = seeker_module.seeker(seeker_input)

    # Save seeker output.
    seeker_output_file = os.path.join(seeker_dir, "data.npz")
    np.savez(seeker_output_file, reidentified_data=reidentified_labels)

    # Evaluate seeker (vs hider).
    # Ground truth: True for every example index that is in train_idx (i.e. was given to the hider).
    true_labels = np.isin(np.arange(original_data.shape[0]), train_idx)
    # The labels are replaced by what validate_seeker_output returns for the saved file.
    reidentified_labels = validate_seeker_output(
        seeker="seeker from seeker.py",
        seeker_output_path=seeker_output_file,
        labels=true_labels,
        raise_exception=True)
    reidentification_score = reidentify_score(true_labels, reidentified_labels)

    print(f"\nTrue labels:\t\t\t\t{true_labels.astype(int)}")
    print(f"Reidentified (by seeker) labels:\t{reidentified_labels}")
    print(f"Reidentification score:\t\t\t{reidentification_score:.4f}\n")