def _dataset_fn(input_context):
    replica_batch_size = input_context.get_per_replica_batch_size(args.global_batch_size)
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=True,
                               repeat=1)
    # Each worker has its own data source, so there is no need to shard the dataset.
    return dataset
def _dataset_fn(input_context):
    replica_batch_size = input_context.get_per_replica_batch_size(args.global_batch_size)
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=True,
                               repeat=1,
                               args=args)
    # All workers read the same data source here, so shard it across input pipelines.
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)
    return dataset
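# A minimal usage sketch (not part of the original tests) for `_dataset_fn` closures
# like the ones above, which capture `args` and `random_samples` from an enclosing
# scope: tf.distribute builds one input pipeline per worker/replica by calling the
# closure with an InputContext. `strategy`, `dataset_fn`, and `step_fn` here are
# illustrative placeholders, not names from the original code.
import tensorflow as tf

def run_with_distributed_input(strategy, dataset_fn, step_fn):
    # Hand the closure to the strategy; it supplies the InputContext per pipeline.
    dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)
    for inputs, labels in dist_dataset:
        # Run one training step on every replica with its local batch.
        strategy.run(step_fn, args=(inputs, labels))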
def test_tf_dense_model(args, init_tensors, *random_samples):
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=args.global_batch_size,
                               to_sparse_tensor=False,
                               repeat=1)
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    tf_dense_demo = TfDenseDemo(init_tensors, args.global_batch_size, args.slot_num,
                                args.nnz_per_slot, args.embedding_vec_size)
    optimizer = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer,
                                                                initial_scale=1024)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = tf_dense_demo(inputs, training=True)
            loss = loss_fn(labels, logit)
            if args.mixed_precision:
                _loss = optimizer.get_scaled_loss(loss)
            else:
                _loss = loss
        grads = tape.gradient(_loss, tf_dense_demo.trainable_variables)
        if args.mixed_precision:
            grads = optimizer.get_unscaled_gradients(grads)
        optimizer.apply_gradients(zip(grads, tf_dense_demo.trainable_variables))
        return loss, embedding_vector

    tf_results = list()
    for i, (input_tensors, labels) in enumerate(dataset):
        print("-" * 30, str(i), "-" * 30)
        loss, embedding_vector = _train_step(input_tensors, labels)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        tf_results.append(embedding_vector.numpy())

    if not hasattr(args, "task_id"):
        args.task_id = 0
    if 1 == args.save_params and args.task_id == 0:
        filepath = r"./embedding_variables/"
        utils.save_to_file(os.path.join(filepath, r"tf_variable.file"),
                           tf_dense_demo.params.numpy())
    return tf_results
def draw_adv(model, sk_func, num_examples_draw, margin, phase):
    batch_size_draw = 1000
    dataset = tf_dataset(num_examples_draw // batch_size_draw, batch_size_draw, sk_func)
    inf_x, sup_x, inf_y, sup_y = plotex.get_limits(sk_func(100, noise=0.1)[0], 0.6)
    plt.xlim(inf_x, sup_x)
    plt.ylim(inf_y, sup_y)
    for x, _ in dataset:
        w_adv, hinge_adv = generate_adversarial(model, x, margin, phase)
        plt.scatter(x[:, 0], x[:, 1], c='red', alpha=0.1, marker='.')
        plt.scatter(hinge_adv[:, 0], hinge_adv[:, 1], c='green', alpha=0.2, marker='x')
        plt.scatter(w_adv[:, 0], w_adv[:, 1], c='blue', alpha=0.2, marker='+')
    plt.xlabel('X')
    plt.ylabel('Y')
    patch_1 = mpatches.Patch(color='red', label='support')
    patch_2 = mpatches.Patch(color='green', label='x + delta')
    patch_3 = mpatches.Patch(color='blue', label='x - delta')
    plt.legend(handles=[patch_1, patch_2, patch_3])
def draw_adv(model, sk_func, X, num_examples_draw, batch_size_draw, fig, index):
    scale = gin.query_parameter('one_class_wasserstein.scale')
    margin = gin.query_parameter('one_class_wasserstein.margin')
    if X.shape[1] == 2:
        fig.add_subplot(index)
    else:
        plt.twinx()
    dataset = tf_dataset(num_examples_draw // batch_size_draw, batch_size_draw, sk_func)
    inf_x, sup_x, inf_y, sup_y = plotex.get_limits(X)
    xs, advs = [], []
    for x, _ in dataset:
        adv = complement_distribution(model, x, scale, margin)
        xs.append(x)
        advs.append(adv)
    xs = np.concatenate(xs)
    advs = np.concatenate(advs)
    if X.shape[1] == 2:
        plt.xlim(inf_x, sup_x)
        plt.ylim(inf_y, sup_y)
        plt.scatter(xs[:, 0], xs[:, 1], c='red', alpha=0.1, marker='.')
        plt.scatter(advs[:, 0], advs[:, 1], c='green', alpha=0.2, marker='x')
        plt.xlabel('X')
        plt.ylabel('Y')
    else:
        plt.hist(xs[:, 0], bins=100, fc=(1, 0, 0, 0.5), histtype='stepfilled', density=True)
        plt.hist(advs[:, 0], bins=100, fc=(0, 1, 0, 0.5), histtype='stepfilled', density=True)
    patch_1 = mpatches.Patch(color='red', label='support')
    patch_2 = mpatches.Patch(color='green', label='adv')
    plt.legend(handles=[patch_1, patch_2])
def plot_levels_lines(sk_func):
    input_shape = (2,)
    seed_dispatcher(None)
    model = models.get_mlp_baseline(input_shape)
    num_batchs = 500
    batch_size = 100
    lbda = 1.
    alpha = 10.
    phase = 'symmetric'
    margin = 0.2
    dilatation = 1.
    sk_func = dilated_func(sk_func, dilatation)
    dataset = tf_dataset(num_batchs, batch_size, sk_func)
    X, Y = sk_func(100, noise=0.1)
    num_examples_draw = 1000
    fig = plt.figure(figsize=(20, 14))
    plt.subplot(2, 3, 1)
    plotex.plot_levels(X, Y, model)
    plt.subplot(2, 3, 2)
    draw_adv(model, sk_func, num_examples_draw, margin, phase)
    plt.subplot(2, 3, 3)
    penalties = train_OOD_detector(model, dataset, num_batchs, lbda, alpha, margin, phase)
    iterations = np.arange(len(penalties))
    plt.plot(iterations, np.log10(tf.reduce_mean(penalties, axis=1).numpy()))
    plt.plot(iterations, np.log10(tf.reduce_min(penalties, axis=1).numpy()))
    plt.plot(iterations, np.log10(tf.reduce_max(penalties, axis=1).numpy()))
    plt.title(r'Log Gradient Norm $\|\nabla_x f\|_2$')
    plt.subplot(2, 3, 4)
    plotex.plot_levels(X, Y, model)
    plt.subplot(2, 3, 5)
    draw_adv(model, sk_func, num_examples_draw, margin, phase)
    plt.show()
def _dataset_fn(input_context):
    replica_batch_size = input_context.get_per_replica_batch_size(args.global_batch_size)
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=False,
                               repeat=1)
    return dataset
def plot_levels_lines(sk_func_name=gin.REQUIRED,
                      num_batchs=gin.REQUIRED,
                      batch_size=gin.REQUIRED,
                      k_lip=gin.REQUIRED,
                      num_examples_draw=gin.REQUIRED,
                      batch_size_draw=gin.REQUIRED,
                      proj1D=gin.REQUIRED,
                      init_landscape=gin.REQUIRED):
    seed_dispatcher(None)
    scale = gin.query_parameter('one_class_wasserstein.scale')
    if sk_func_name == 'make_moons':
        sk_func = lambda n: make_moons(n, shuffle=True, noise=0.05)
        if proj1D:
            sk_func = projected(sk_func, [1, 0])
    elif sk_func_name == 'make_circles':
        sk_func = lambda n: make_circles(n, shuffle=True, noise=0.05)
        if proj1D:
            sk_func = projected(sk_func, [1, 0])
    elif sk_func_name == 'make_blobs':
        dim = 1 if proj1D else 2
        seed = random.randint(1, 1000)
        sk_func = lambda n: make_blobs(n, centers=3, cluster_std=1. * scale,
                                       n_features=dim, shuffle=True, random_state=seed)
    sk_func = dilated_func(sk_func, scale)
    X, _ = sk_func(num_examples_draw)
    dataset = tf_dataset(num_batchs, batch_size, sk_func)
    input_shape = X.shape[1:]
    model = models.get_mlp_baseline(input_shape, k_lip)

    fig = plt.figure(figsize=(22, 15))
    plotex.plot_levels(X, model, fig, 121 if proj1D else 231)
    draw_adv(model, sk_func, X, num_examples_draw, batch_size_draw, fig, 232)
    if not proj1D and init_landscape:
        plotex.plot3d(X, model, fig, 233)

    try:
        penalties = train_OOD_detector(model, dataset, num_batchs)
    except tf.python.framework.errors_impl.InvalidArgumentError as e:
        from deel.lip.normalizers import bjorck_normalization, spectral_normalization
        for layer in model.layers:
            W_bar, _u, sigma = spectral_normalization(layer.kernel, layer.u,
                                                      niter=layer.niter_spectral)
            norm = tf.reduce_sum(W_bar ** 2.)
            W_bar = bjorck_normalization(W_bar, niter=layer.niter_bjorck)
            print('############################################')
            print(norm, sigma, _u, layer.bias, W_bar)
            print('\n\n\n')
        raise e

    if not proj1D and not init_landscape:
        fig.add_subplot(233)
        iterations = np.arange(len(penalties))
        plt.plot(iterations, np.log10(tf.reduce_mean(penalties, axis=1).numpy()))
        plt.plot(iterations, np.log10(tf.reduce_min(penalties, axis=1).numpy()))
        plt.plot(iterations, np.log10(tf.reduce_max(penalties, axis=1).numpy()))
        plt.title(r'Log Gradient Norm $\log_{10}{\|\nabla_x f\|_2}$')

    plotex.plot_levels(X, model, fig, 122 if proj1D else 234)
    draw_adv(model, sk_func, X, num_examples_draw, batch_size_draw, fig, 235)
    if not proj1D:
        plotex.plot3d(X, model, fig, 236)
    plt.show()
def test_tf_multi_dense_emb(args):
    dataset_filenames = [
        args.file_prefix + str(task_id) + ".file"
        for task_id in range(args.worker_num)
    ]

    samples_total = [list() for _ in range(args.dataset_iter_num)]
    labels_total = [list() for _ in range(args.dataset_iter_num)]
    replica_batch_size = args.global_batch_size // args.worker_num
    for worker_id in range(args.worker_num):
        samples, labels = utils.restore_from_file(dataset_filenames[worker_id])
        for i in range(args.dataset_iter_num):
            samples_total[i].extend(samples[i * replica_batch_size:(i + 1) * replica_batch_size])
            labels_total[i].extend(labels[i * replica_batch_size:(i + 1) * replica_batch_size])
    samples_total = np.concatenate(samples_total, axis=0)
    labels_total = np.concatenate(labels_total, axis=0)

    dataset = utils.tf_dataset(samples_total, labels_total,
                               batchsize=args.global_batch_size,
                               to_sparse_tensor=False,
                               repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    model = TFDenseModel(
        vocabulary_size=args.max_vocabulary_size_per_gpu * args.worker_num,
        embedding_vec_size_list=args.embedding_vec_size_list,
        slot_num_list=args.slot_num_list,
        nnz_per_slot_list=[args.nnz_per_slot for _ in range(len(args.slot_num_list))],
        num_dense_layers=args.num_dense_layers)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    if args.mixed_precision:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer,
                                                                initial_scale=1024)

    # Set initial values for the embedding variables.
    for i, param in enumerate(model.embedding_params):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu * args.worker_num,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=1)
        param.assign(init_tensors[0])

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            loss = loss_fn(labels, logit)
            if args.mixed_precision:
                _loss = optimizer.get_scaled_loss(loss)
            else:
                _loss = loss
        grads = tape.gradient(_loss, model.trainable_variables)
        if args.mixed_precision:
            grads = optimizer.get_unscaled_gradients(grads)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss, all_vectors

    # Save the embedding vectors produced at each iteration.
    tf_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break
        loss, all_vectors = _train_step(inputs, labels)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))
        with tf.device("CPU:0"):
            tf_results.append(all_vectors)
    return tf_results
def get_sok_results(args, init_tensors, *random_samples):
    if args.distributed_tool == "onedevice":
        strategy = strategy_wrapper.OneDeviceStrategy()
    elif args.distributed_tool == "horovod":
        import horovod.tensorflow as hvd
        hvd.init()
        strategy = strategy_wrapper.HorovodStrategy()
    else:
        raise ValueError(f"{args.distributed_tool} is not supported.")

    with strategy.scope():
        sok_init_op = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = tf.keras.initializers.Ones() if args.use_tf_initializer else None

        sok_dense_demo = SOKDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable,
            dynamic_input=args.dynamic_input,
            num_of_dense_layers=0,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = sok.tf.keras.mixed_precision.LossScaleOptimizer(emb_opt, 1024)

    sok_saver = sok.Saver()
    restore_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [restore_op[-1]] if restore_op else None
        with tf.control_dependencies(control_inputs):
            if args.restore_params:
                filepath = r"./embedding_variables"
                op = sok_saver.restore_from_file(embedding_layer.embedding_variable, filepath)
            else:
                if not args.use_tf_initializer:
                    op = sok_saver.load_embedding_values(embedding_layer.embedding_variable,
                                                         init_tensors[i])
                else:
                    op = tf.constant(1.0)
        restore_op.append(op)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    def _train_step(inputs, labels, training):
        def _step_fn(inputs, labels):
            logit, embedding_vector = sok_dense_demo(inputs, training=training)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
            emb_var, other_var = sok.split_embedding_variable_from_others(
                sok_dense_demo.trainable_variables)
            grads = tf.gradients(_loss, emb_var + other_var,
                                 colocate_gradients_with_ops=True,
                                 unconnected_gradients=tf.UnconnectedGradients.NONE)
            emb_grads, other_grads = grads[:len(emb_var)], grads[len(emb_var):]
            if args.mixed_precision:
                other_grads = emb_opt.get_unscaled_gradients(other_grads)
                emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            if "plugin" in args.optimizer:
                emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))
            else:
                with sok.OptimizerScope(emb_var):
                    emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))
            with tf.control_dependencies([*emb_grads]):
                # in case NCCL runs concurrently via SOK and Horovod
                other_grads = strategy.reduce("sum", other_grads)
            other_train_op = dense_opt.apply_gradients(zip(other_grads, other_var))
            with tf.control_dependencies([emb_train_op, other_train_op]):
                total_loss = strategy.reduce("sum", loss)
                total_loss = tf.identity(total_loss)
                return total_loss, embedding_vector

        return strategy.run(_step_fn, inputs, labels)

    replica_batch_size = args.global_batch_size // args.gpu_num
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=False,
                               repeat=1,
                               args=args)
    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer

    inputs, labels = train_iterator.get_next()
    graph_results = _train_step(inputs, labels, training=True)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    if "plugin" in args.optimizer:
        init_op = tf.group(init_op, emb_opt.initializer)

    save_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [save_op[-1]] if save_op else None
        with tf.control_dependencies(control_inputs):
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                op = sok_saver.dump_to_file(embedding_layer.embedding_variable, filepath)
            else:
                op = tf.constant(1.0)
        save_op.append(op)

    sok_results = list()

    config = tf.ConfigProto()
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, emb_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"Step: {step}, loss: {loss_v}")  # ", embedding_vector:\n{emb_vector_v}")
            sok_results.append(emb_vector_v)

        sess.run(save_op)

    name = list()
    for embedding_layer in sok_dense_demo.embedding_layers:
        name.append(embedding_layer.embedding_variable.m_var_name)

    return sok_results, name
def get_tf_results(args, init_tensors, *random_samples):
    graph = tf.Graph()
    with graph.as_default():
        tf_dense_demo = TFDemo(
            vocabulary_size=args.max_vocabulary_size_per_gpu * args.gpu_num,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            embedding_vec_size=args.embedding_vec_size,
            num_of_dense_layers=0,
            use_hashtable=False,
            dynamic_input=False)

        optimizer = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            optimizer = sok.tf.keras.mixed_precision.LossScaleOptimizer(optimizer, 1024)

        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

        def _train_step(inputs, labels, training):
            logit, embedding_vector = tf_dense_demo(inputs, training=training)
            loss = loss_fn(labels, logit)
            if args.mixed_precision:
                _loss = optimizer.get_scaled_loss(loss)
            else:
                _loss = loss
            grads = tf.gradients(_loss, tf_dense_demo.trainable_variables,
                                 colocate_gradients_with_ops=True,
                                 unconnected_gradients=tf.UnconnectedGradients.NONE)
            if args.mixed_precision:
                grads = optimizer.get_unscaled_gradients(grads)
            train_op = optimizer.apply_gradients(zip(grads, tf_dense_demo.trainable_variables))
            with tf.control_dependencies([train_op]):
                loss = tf.identity(loss)
                return loss, embedding_vector

        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=args.global_batch_size,
                                   to_sparse_tensor=False,
                                   repeat=1)
        train_iterator = dataset.make_initializable_iterator()
        iterator_init = train_iterator.initializer

        inputs, labels = train_iterator.get_next()
        graph_results = _train_step(inputs, labels, training=True)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        restore_op = list()
        for i, embedding_layer in enumerate(tf_dense_demo.embedding_layers):
            restore_op.append(
                embedding_layer.embeddings.assign(tf.concat(init_tensors[i], axis=0)))

        emb_values = list()
        for embedding_layer in tf_dense_demo.embedding_layers:
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                emb_values.append(embedding_layer.embeddings.read_value())
            else:
                emb_values = tf.constant(1.0)

    tf_results = list()
    with tf.Session(graph=graph) as sess:
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, embedding_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"step: {step}, loss: {loss_v}")  # ", embedding_vector:\n{embedding_vector_v}")
            tf_results.append(embedding_vector_v)

        emb_values_v = sess.run(emb_values)
        if args.save_params:
            for i, value in enumerate(emb_values_v):
                utils.save_to_file(
                    os.path.join(filepath, r"tf_variable_" + str(i) + r".file"), value)

    name = list()
    for embedding_layer in tf_dense_demo.embedding_layers:
        name.append(embedding_layer.embeddings.name)

    return tf_results, name