def test_control_inputs(self): features = np.random.randn(20, 10) labels = np.random.randint(0, 10, size=[20]) with tf.Graph().as_default(): dataset = TFDataset.from_ndarrays((features, labels), batch_size=4, val_tensors=(features, labels)) is_training = tf.placeholder(dtype=tf.bool, shape=()) feature_tensor, label_tensor = dataset.tensors features = tf.layers.dense(feature_tensor, 8) features = tf.layers.dropout(features, training=is_training) output = tf.layers.dense(features, 10) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor)) optimizer = TFOptimizer.from_loss( loss, Adam(), val_outputs=[output], val_labels=[label_tensor], val_method=Accuracy(), tensor_with_value={is_training: (True, False)}, metrics={"loss": loss}) optimizer.optimize(end_trigger=MaxEpoch(1)) optimizer.sess.close()
def test_tfdataset_with_tfrecord(self): train_path = os.path.join(resource_path, "tfrecord/mnist_train.tfrecord") test_path = os.path.join(resource_path, "tfrecord/mnist_test.tfrecord") dataset = TFDataset.from_tfrecord_file(self.sc, train_path, batch_size=16, validation_file_path=test_path) raw_bytes = dataset.tensors[0] images, labels = parse_fn(raw_bytes) flat = tf.layers.flatten(images) logits = tf.layers.dense(flat, 10) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) opt = TFOptimizer.from_loss(loss, Adam()) opt.optimize()
def test_tfdataset_with_tf_data_dataset_which_contains_bool(self): dataset = tf.data.Dataset.from_tensor_slices( (np.random.randn(102, 28, 28, 1), np.random.randint(0, 10, size=(102, )), np.ones(shape=(102, 28, 28, 1), dtype=np.bool))) dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16) feature, labels, mask = dataset.tensors float_mask = tf.to_float(mask) masked_feature = tf.to_float(feature) * float_mask flatten = tf.layers.flatten(masked_feature) logits = tf.layers.dense(flatten, 10) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) opt = TFOptimizer.from_loss(loss, Adam()) opt.optimize()
def test_tf_optimizer_with_sparse_gradient(self): ids = np.random.randint(0, 10, size=[40]) labels = np.random.randint(0, 5, size=[40]) id_rdd = self.sc.parallelize(ids) label_rdd = self.sc.parallelize(labels) training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]]) with tf.Graph().as_default(): dataset = TFDataset.from_rdd(training_rdd, names=["ids", "labels"], shapes=[[], []], types=[tf.int32, tf.int32], batch_size=8) id_tensor, label_tensor = dataset.tensors embedding_table = tf.get_variable(name="word_embedding", shape=[10, 5]) embedding = tf.nn.embedding_lookup(embedding_table, id_tensor) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=embedding, labels=label_tensor)) optimizer = TFOptimizer.from_loss(loss, Adam(1e-3)) optimizer.optimize(end_trigger=MaxEpoch(1)) optimizer.sess.close()
def test_tf_optimizer_metrics(self): features = np.random.randn(20, 10) labels = np.random.randint(0, 10, size=[20]) with tf.Graph().as_default(): dataset = TFDataset.from_ndarrays((features, labels), batch_size=4, val_tensors=(features, labels)) feature_tensor, label_tensor = dataset.tensors features = tf.layers.dense(feature_tensor, 8) output = tf.layers.dense(features, 10) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor)) optimizer = TFOptimizer.from_loss(loss, { "dense/": Adam(1e-3), "dense_1/": SGD(0.0) }, val_outputs=[output], val_labels=[label_tensor], val_method=Accuracy(), metrics={"loss": loss}) initial_weights = optimizer.tf_model.training_helper_layer.get_weights( ) optimizer.optimize(end_trigger=MaxEpoch(1)) updated_weights = optimizer.tf_model.training_helper_layer.get_weights( ) for i in [ 0, 1 ]: # weights and bias combined with "dense/" should be updated assert not np.allclose(initial_weights[i], updated_weights[i]) for i in [ 2, 3 ]: # weights and bias combined with "dense_1" should be unchanged assert np.allclose(initial_weights[i], updated_weights[i]) optimizer.sess.close()
def test_tf_optimizer_with_sparse_gradient_using_keras(self): import tensorflow as tf ids = np.random.randint(0, 10, size=[40]) labels = np.random.randint(0, 5, size=[40]) id_rdd = self.sc.parallelize(ids) label_rdd = self.sc.parallelize(labels) training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]]) dataset = TFDataset.from_rdd(training_rdd, features=(tf.int32, []), labels=(tf.int32, []), batch_size=8) words_input = tf.keras.layers.Input(shape=(), name='words_input') embedding_layer = tf.keras.layers.Embedding(input_dim=10, output_dim=5, name='word_embedding') word_embeddings = embedding_layer(words_input) embedding = tf.keras.layers.Flatten()(word_embeddings) output = tf.keras.layers.Dense(5, activation="softmax")(embedding) model = tf.keras.models.Model(inputs=[words_input], outputs=[output]) model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy") optimizer = TFOptimizer.from_keras(model, dataset) optimizer.optimize()
def test_checkpoint(self): features = np.random.randn(20, 10) labels = np.random.randint(0, 10, size=[20]) with tf.Graph().as_default(): dataset = TFDataset.from_ndarrays((features, labels), batch_size=4, val_tensors=(features, labels)) feature_tensor, label_tensor = dataset.tensors features = tf.layers.dense(feature_tensor, 8) output = tf.layers.dense(features, 10) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=output, labels=label_tensor)) model_dir = tempfile.mkdtemp() try: optimizer = TFOptimizer.from_loss(loss, Adam(), val_outputs=[output], val_labels=[label_tensor], val_method=Accuracy(), metrics={"loss": loss}, model_dir=model_dir) optimizer.optimize(end_trigger=MaxEpoch(1)) first_weights = optimizer.sess.run(tf.trainable_variables()[0]) import re ckpt_path = None versions = [] for (root, dirs, files) in os.walk(model_dir, topdown=True): temp_versions = [] for file_name in files: if re.match("^optimMethod-TFParkTraining\.[0-9]+$", file_name) is not None: version = int(file_name.split(".")[1]) temp_versions.append(version) if temp_versions: ckpt_path = root versions = temp_versions break assert ckpt_path is not None, "Cannot fine checkpoint file" optimizer.sess.run( tf.global_variables_initializer()) # reset variable optimizer_load = TFOptimizer.from_loss( loss, Adam(), session=optimizer.sess, val_outputs=[output], val_labels=[label_tensor], val_method=Accuracy(), metrics={"loss": loss}, model_dir=model_dir) optimizer_load.load_checkpoint(ckpt_path, max(versions)) loaded_first_weights_before_train = optimizer.sess.run( tf.trainable_variables()[0]) assert np.allclose(first_weights, loaded_first_weights_before_train) # max epoch still 1, should not train optimizer_load.optimize(end_trigger=MaxEpoch(1)) loaded_first_weights = optimizer.sess.run( tf.trainable_variables()[0]) assert np.allclose(first_weights, loaded_first_weights) # max epoch increase 1, should train 1 epoch optimizer_load.optimize(end_trigger=MaxEpoch(2)) loaded_first_weights_2 = optimizer.sess.run( tf.trainable_variables()[0]) assert not np.allclose(first_weights, loaded_first_weights_2) optimizer_load.sess.close() finally: import shutil shutil.rmtree(model_dir)
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, label_cols=None, validation_data=None, session_config=None, checkpoint_trigger=None, auto_shard_files=True): """ Train this keras model with train data. :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset. If data is XShards, each partition can be a Pandas DataFrame or a dictionary of {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of numpy arrays. If data is tf.data.Dataset, each element is [feature tensor tuple, label tensor tuple] :param epochs: number of epochs to train. :param batch_size: total batch size for each iteration. :param feature_cols: feature column names if train data is Spark DataFrame or XShards of Pandas DataFrame. :param label_cols: label column names if train data is Spark DataFrame or XShards of Pandas DataFrame. :param validation_data: validation data. Validation data type should be the same as train data. :param session_config: tensorflow session configuration for training. Should be object of tf.ConfigProto :param checkpoint_trigger: when to trigger checkpoint during training. Should be a bigdl.orca.learn.trigger, like EveryEpoch(), SeveralIteration( num_iterations),etc. :param auto_shard_files: whether to automatically detect if the dataset is file-based and and apply sharding on files, otherwise sharding on records. Default is False. """ if isinstance(data, DataFrame): assert feature_cols is not None, \ "feature columns is None; it should not be None in training" assert label_cols is not None, \ "label columns is None; it should not be None in training" if isinstance(data, tf.data.Dataset): assert isinstance(data.element_spec, tuple), \ "If data is tf.data.Dataset, each element should be " \ "(feature tensors, label tensor), where each feature/label tensor can be " \ "either a single tensor or a tuple of tensors" if validation_data is not None: assert isinstance(validation_data, tf.data.Dataset), \ "train data and validation data should be both tf.data.Dataset" assert isinstance(validation_data.element_spec, tuple), \ "If validation_data is tf.data.Dataset, each element should be " \ "(feature tensors, label tensor), where each feature/label tensor can be " \ "either a single tensor or a tuple of tensors" if isinstance(data, SparkXShards): if data._get_class_name() == 'pandas.core.frame.DataFrame': assert feature_cols is not None, \ "feature columns is None; it should not be None in training" assert label_cols is not None, \ "label columns is None; it should not be None in training" data, validation_data = process_xshards_of_pandas_dataframe( data, feature_cols, label_cols, validation_data, "fit") if checkpoint_trigger is not None: checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger) if is_tf_data_dataset(data): data = data.map(_standardize_keras_target_data) validation_data = validation_data.map( _standardize_keras_target_data) memory_type = OrcaContext.train_data_store dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1, validation_data=validation_data, feature_cols=feature_cols, label_cols=label_cols, hard_code_batch_size=False, sequential_order=False, shuffle=True, auto_shard_files=auto_shard_files, memory_type=memory_type) self.tf_optimizer = TFOptimizer.from_keras( self.model.model, dataset, model_dir=self.model.model_dir, session_config=session_config, metrics=self.metrics, optimizer=self.optimizer) if self.clip_norm: self.tf_optimizer.set_gradient_clipping_by_l2_norm( clip_norm=self.clip_norm) if self.clip_min and self.clip_max: self.tf_optimizer.set_constant_gradient_clipping( self.clip_min, self.clip_max) if self.load_checkpoint: self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version) if self.log_dir and self.app_name: self.tf_optimizer.estimator.set_tensorboard( self.log_dir, self.app_name) self.tf_optimizer.optimize(MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger) return self
def fit(self, data, epochs=1, batch_size=32, feature_cols=None, label_cols=None, validation_data=None, session_config=None, checkpoint_trigger=None, auto_shard_files=False, feed_dict=None): """ Train this graph model with train data. :param data: train data. It can be XShards, Spark DataFrame, tf.data.Dataset. If data is XShards, each partition can be a Pandas DataFrame or a dictionary of {'x': feature, 'y': label}, where feature(label) is a numpy array or a tuple of numpy arrays. If data is tf.data.Dataset, each element is a tuple of input tensors. :param epochs: number of epochs to train. :param batch_size: total batch size for each iteration. :param feature_cols: feature column names if train data is Spark DataFrame or XShards of Pandas DataFrame. :param label_cols: label column names if train data is Spark DataFrame or XShards of Pandas DataFrame. :param validation_data: validation data. Validation data type should be the same as train data. :param auto_shard_files: whether to automatically detect if the dataset is file-based and and apply sharding on files, otherwise sharding on records. Default is False. :param session_config: tensorflow session configuration for training. Should be object of tf.ConfigProto :param feed_dict: a dictionary. The key is TensorFlow tensor, usually a placeholder, the value of the dictionary is a tuple of two elements. The first one of the tuple is the value to feed to the tensor in training phase and the second one is the value to feed to the tensor in validation phase. :param checkpoint_trigger: when to trigger checkpoint during training. Should be a bigdl.orca.learn.trigger, like EveryEpoch(), SeveralIteration( num_iterations),etc. """ assert self.labels is not None, \ "labels is None; it should not be None in training" assert self.loss is not None, \ "loss is None; it should not be None in training" assert self.optimizer is not None, \ "optimizer is None; it should not be None in training" if isinstance(data, DataFrame): assert feature_cols is not None, \ "feature columns is None; it should not be None in training" assert label_cols is not None, \ "label columns is None; it should not be None in training" if isinstance(data, SparkXShards): if data._get_class_name() == 'pandas.core.frame.DataFrame': assert feature_cols is not None, \ "feature columns is None; it should not be None in training" assert label_cols is not None, \ "label columns is None; it should not be None in training" data, validation_data = process_xshards_of_pandas_dataframe( data, feature_cols, label_cols, validation_data, "fit") if checkpoint_trigger is not None: checkpoint_trigger = Trigger.convert_trigger(checkpoint_trigger) memory_type = OrcaContext.train_data_store dataset = to_dataset(data, batch_size=batch_size, batch_per_thread=-1, validation_data=validation_data, feature_cols=feature_cols, label_cols=label_cols, hard_code_batch_size=False, sequential_order=False, shuffle=True, auto_shard_files=auto_shard_files, memory_type=memory_type) if feed_dict is not None: tensor_with_value = { key: (value[0], value[1]) for key, value in feed_dict.items() } else: tensor_with_value = None if self.use_bigdl_optim: self.tf_optimizer = TFOptimizer.from_loss( self.loss, self.optimizer, session=self.sess, inputs=(self.inputs, self.labels), dataset=dataset, clip_norm=self.clip_norm, clip_value=self.clip_value, metrics=self.metrics, tensor_with_value=tensor_with_value, session_config=session_config, model_dir=self.model_dir, updates=self.updates) else: self.tf_optimizer = TFOptimizer.from_train_op( train_op=self.train_op, loss=self.loss, inputs=self.inputs, labels=self.labels, dataset=dataset, metrics=self.metrics, updates=self.updates, sess=self.sess, tensor_with_value=tensor_with_value, session_config=session_config, model_dir=self.model_dir) if self.load_checkpoint: self.tf_optimizer.load_checkpoint(self.checkpoint_path, self.checkpoint_version) if self.log_dir and self.app_name: self.tf_optimizer.estimator.set_tensorboard( self.log_dir, self.app_name) self.tf_optimizer.optimize(end_trigger=MaxEpoch(epochs), checkpoint_trigger=checkpoint_trigger) return self
def train(self, input_fn, end_trigger): with tf.Graph().as_default() as g: dataset = input_fn() generator_inputs = dataset.tensors[0] real_data = dataset.tensors[1] counter = tf.train.get_or_create_global_step() period = self._discriminator_steps + self._generator_steps is_discriminator_phase = tf.less(tf.mod(counter, period), self._discriminator_steps) with tf.variable_scope("Generator"): gen_data = self._call_fn_maybe_with_counter( self._generator_fn, counter, generator_inputs) with tf.variable_scope("Discriminator"): fake_d_outputs = self._call_fn_maybe_with_counter( self._discriminator_fn, counter, gen_data, generator_inputs) with tf.variable_scope("Discriminator", reuse=True): real_d_outputs = self._call_fn_maybe_with_counter( self._discriminator_fn, counter, real_data, generator_inputs) with tf.name_scope("Generator_loss"): generator_loss = self._call_fn_maybe_with_counter( self._generator_loss_fn, counter, fake_d_outputs) gen_reg_loss = tf.losses.get_regularization_loss("Generator") generator_loss = generator_loss + gen_reg_loss with tf.name_scope("Discriminator_loss"): discriminator_loss = self._call_fn_maybe_with_counter( self._discriminator_loss_fn, counter, real_d_outputs, fake_d_outputs) dis_reg_loss = tf.losses.get_regularization_loss( "Discriminator") discriminator_loss = discriminator_loss + dis_reg_loss generator_variables = tf.trainable_variables("Generator") discriminator_variables = tf.trainable_variables("Discriminator") def run_gen_compute(): gen_grads_vars = self._gen_opt.compute_gradients( generator_loss, var_list=generator_variables) gen_grads = [grad for grad, var in gen_grads_vars] dis_grads = [ tf.zeros_like(var) for var in discriminator_variables ] return gen_grads + dis_grads def run_dis_compute(): dis_grads_vars = self._gen_opt.compute_gradients( discriminator_loss, var_list=discriminator_variables) dis_grads = [grad for grad, var in dis_grads_vars] gen_gards = [tf.zeros_like(var) for var in generator_variables] return gen_gards + dis_grads grads = tf.cond(is_discriminator_phase, run_dis_compute, run_gen_compute) grads_vars = list( zip(grads, generator_variables + discriminator_variables)) gen_grads_vars = grads_vars[:len(generator_variables)] dis_grads_vars = grads_vars[len(generator_variables):] grads = [grad for grad, var in grads_vars] _train_op = tf.cond( is_discriminator_phase, lambda: self._dis_opt.apply_gradients(dis_grads_vars), lambda: self._gen_opt.apply_gradients(gen_grads_vars)) variables = generator_variables + discriminator_variables loss = tf.cond(is_discriminator_phase, lambda: discriminator_loss, lambda: generator_loss) with tf.control_dependencies([_train_op]): increase_counter = tf.assign_add(counter, 1) with tf.control_dependencies([increase_counter]): train_op = tf.no_op() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() kpt = tf.train.latest_checkpoint(self.model_dir) if kpt is not None: saver.restore(sess, kpt) opt = TFOptimizer._from_grads( loss, sess, inputs=nest.flatten(dataset._original_tensors), labels=[], grads=grads, variables=variables, dataset=dataset, optim_method=FakeOptimMethod(), session_config=self._session_config, model_dir=os.path.join(self.model_dir, "tmp"), train_op=train_op) opt.optimize(end_trigger) saver = tf.train.Saver() saver.save(sess, self.checkpoint_path, global_step=counter)
def main(max_epoch, data_num): args = parser.parse_args() cluster_mode = args.cluster_mode if cluster_mode.startswith("yarn"): hadoop_conf = os.environ.get("HADOOP_CONF_DIR") assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ "set the environment variable HADOOP_CONF_DIR" spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ .set("spark.executor.cores", 2) \ .set("spark.executor.instances", 2) \ .set("spark.driver.memory", "2g") if cluster_mode == "yarn-client": sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) else: sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) else: sc = init_nncontext() # get data, pre-process and create TFDataset (train_images_data, train_labels_data) = mnist.read_data_sets("/tmp/mnist", "train") (test_images_data, test_labels_data) = mnist.read_data_sets("/tmp/mnist", "train") train_images_data = (train_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD train_labels_data = train_labels_data[:data_num].astype(np.int) test_images_data = (test_images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD test_labels_data = (test_labels_data[:data_num]).astype(np.int) dataset = TFDataset.from_ndarrays( (train_images_data, train_labels_data), batch_size=360, val_tensors=(test_images_data, test_labels_data)) # construct the model from TFDataset images, labels = dataset.tensors with slim.arg_scope(lenet.lenet_arg_scope()): logits, end_points = lenet.lenet(images, num_classes=10, is_training=True) loss = tf.reduce_mean( tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) acc = accuracy(logits, labels) # create a optimizer optimizer = TFOptimizer.from_loss(loss, Adam(1e-3), metrics={"acc": acc}, model_dir="/tmp/lenet/") # kick off training optimizer.optimize(end_trigger=MaxEpoch(max_epoch)) saver = tf.train.Saver() saver.save(optimizer.sess, "/tmp/lenet/model")