def main(_):
    """Toy linear-regression example run under AutoDist's AllReduce strategy.

    Builds y = W*x + b on synthetic data inside ``autodist.scope()`` and trains
    for a fixed number of epochs through AutoDist's distributed session.
    Relies on module-level ``resource_spec_file``, ``tf`` and ``np``.
    """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    # Ground-truth parameters used to synthesize the training data.
    TRUE_W = 3.0
    TRUE_b = 2.0
    NUM_EXAMPLES = 1000
    EPOCHS = 10
    inputs = np.random.randn(NUM_EXAMPLES)
    noises = np.random.randn(NUM_EXAMPLES)
    outputs = inputs * TRUE_W + TRUE_b + noises

    class MyIterator:
        # Minimal stand-in for a dataset iterator: initialize() is a no-op
        # tensor and get_next() always yields the full input array.

        def initialize(self):
            return tf.zeros(1)

        def get_next(self):
            # a fake one
            return inputs

    inputs_iterator = MyIterator()
    print('I am going to a scope.')
    # All graph construction must happen inside autodist.scope() so AutoDist
    # can capture and transform it for distribution.
    with tf.Graph().as_default() as g, autodist.scope():
        # x = placeholder(shape=[NUM_EXAMPLES], dtype=tf.float32)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def train_step(input):
            # Builds loss + apply-gradients op for one step on `input`.

            def y(x):
                return W * x + b

            def l(predicted_y, desired_y):
                # Mean squared error.
                return tf.reduce_mean(tf.square(predicted_y - desired_y))

            # Pick the optimizer API matching the installed TF major version.
            major_version, _, _ = tf.version.VERSION.split('.')
            if major_version == '1':
                optimizer = tf.train.GradientDescentOptimizer(0.01)
            else:
                optimizer = tf.optimizers.SGD(0.01)

            # NOTE(review): `tape` is never used — gradients come from the
            # graph-mode tf.gradients call below, not tape.gradient.
            with tf.GradientTape() as tape:
                loss = l(y(input), outputs)
                vs = [W, b]
                # gradients = tape.gradient(target=loss, sources=vs)
                gradients = tf.gradients(loss, vs)
                train_op = optimizer.apply_gradients(zip(gradients, vs))
            return loss, train_op, b

        fetches = train_step(inputs_iterator.get_next())
        # AutoDist session replaces tf.Session for distributed execution.
        session = autodist.create_distributed_session()
        for epoch in range(EPOCHS):
            l, t, b = session.run(fetches)
            print('node: {}, loss: {}\nb:{}'.format(
                autodist._cluster.get_local_address(), l, b))
    print('I am out of scope')
def train_main(args):
    """Train a graph model on partitioned node data under AutoDist/Parallaxx.

    Args:
        args: parsed CLI options; reads ``args.path`` (directory containing
            meta.yml), ``args.hidden_size``, ``args.num_epoch``,
            ``args.num_local_worker`` and ``args.batch_size``.

    Relies on module-level ``resource_spec_file``, ``ad`` (worker communicator),
    ``distributed``, ``prepare_data`` and ``model``.
    """
    autodist = AutoDist(resource_spec_file, Parallaxx())
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size  # NOTE(review): unused in this function
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker  # NOTE(review): unused in this function
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    # Number of mini-graphs assigned to this worker.
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
        sparse_feature = tf.placeholder(tf.int32, [graph_len, meta["feature"] - 1])
        y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
        train_mask = tf.placeholder(tf.float32, [graph_len], name="train_mask")
        loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        sess = autodist.create_distributed_session()
        acc_stat = []
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # last feature column is held out (see meta["feature"] - 1 above)
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            print("Before training")
            loss_val = sess.run([loss, y, y_, train_op], feed_dict=feed_dict)
            print(loss_val)
            pred_val = loss_val[1]
            true_val = loss_val[2]
            # FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
            # np.float64 preserves the original dtype exactly.
            acc_val = np.equal(np.argmax(pred_val, 1), true_val).astype(np.float64)
            acc_stat.append(acc_val)
            nnodes += mask.sum() + mask_eval.sum()
            # Once a full partition's worth of nodes has been processed,
            # count it as one epoch and report accuracy.
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Acc : ", np.mean(acc_stat), "Time : ", time.time() - start)
                start = time.time()
                acc_stat = []
                if epoch >= num_epoch:
                    break
def train_and_save():
    """Train a sentiment model on the IMDB dataset under PartitionedPS and
    checkpoint it with the AutoDist saver.

    Relies on module-level ``vocab_size``, ``batch_size``, ``max_steps``,
    ``log_frequency``, ``checkpoint_dir``, ``SimpleModel`` and
    ``autodist_saver``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
    autodist = AutoDist(resource_spec_file, PartitionedPS())
    (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(
        num_words=vocab_size)
    # Pad/truncate every review to a fixed length of 256 tokens.
    train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data,
                                                               value=0,
                                                               padding='post',
                                                               maxlen=256)
    test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data,
                                                              value=0,
                                                              padding='post',
                                                              maxlen=256)
    train_labels = train_labels.astype(np.float32)
    with tf.Graph().as_default(), autodist.scope():  # AutoDist code
        # One-shot iterator over an endlessly repeated, shuffled dataset.
        my_iterator = tf.compat.v1.data.Dataset.from_tensor_slices((train_data, train_labels)) \
            .shuffle(25000).batch(batch_size).repeat().make_one_shot_iterator().get_next()
        # my_iterator = MyIterator().get_next()
        model = SimpleModel()
        prev_time = time.time()
        # fetch train_op and loss
        loss_fn, train_op, gradients = model.train_fn(my_iterator)
        # Saver must be created before the distributed session is wrapped.
        saver = autodist_saver()
        sess = autodist.create_distributed_session()
        for local_step in range(max_steps):
            loss, _ = sess.run(fetches=[loss_fn, train_op],
                               options=config_pb2.RunOptions(
                                   trace_level=config_pb2.RunOptions.NO_TRACE))
            if local_step % log_frequency == 0:
                # Report words-per-second throughput over the last window.
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_sentences = batch_size * log_frequency
                wps = float(num_sentences) / elapsed_time
                print(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" %
                    (local_step, cur_time - prev_time, wps, loss))
                prev_time = cur_time
        saver.save(sess, checkpoint_dir + "sentiment", global_step=local_step)
        print(sess.run(model.emb))
        print(sess.run(gradients[0]))
        print('ending...')
gradients = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients(zip(gradients, model.trainable_variables)) #calcualte metrics train_loss(loss) train_accuracy(batch_y, predictions) #define test procedure @d.function def test_step(batch_x, batch_y): predictions = model(batch_x) t_loss = loss_object(y_true=batch_y, y_pred=predictions) test_loss(t_loss) test_accuracy(batch_y, predictions) session = autodist.create_distributed_session() #run train for epoch in range(max_epoch): batch_steps = int(len(partial_x_train) / batch_size) for i in range(batch_steps): batch_x = partial_x_train[i * batch_size:(i + 1) * batch_size] batch_y = partial_y_train[i * batch_size:(i + 1) * batch_size] fetches_train = train_step(batch_x, batch_y) session.run(fetches_train) #check validation accuracy fetches_test = test_step(x_val, y_val) session.run(fetches_test) template = 'Epoch {}, loss: {} - acc: {} - val_loss: {} - val_acc: {}'
def train_criteo(model, args):
    """Train a CTR model on Criteo data under AutoDist's Parallaxx strategy.

    Args:
        model: callable building the graph; given (dense_input, sparse_input,
            labels, embed_partitioner) it returns (loss, predictions, optimizer).
        args: parsed CLI options; reads ``args.all`` (use the full dataset and
            log to file) and ``args.model`` (name used in the log filename).
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings',
                                      'plx_dist_spec.yml')
    # resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings', 'plx_local_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)
    if args.all:
        from models.load_data import process_all_criteo_data
        dense, sparse, all_labels = process_all_criteo_data()
        dense_feature, val_dense = dense
        sparse_feature, val_sparse = sparse
        labels, val_labels = all_labels
    else:
        from models.load_data import process_sampled_criteo_data
        dense_feature, sparse_feature, labels = process_sampled_criteo_data()
    # autodist will split the feeding data
    batch_size = 128
    with tf.Graph().as_default() as g, autodist.scope():
        dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
        sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
        # FIX: original read `y_ = y_ = tf.compat.v1.placeholder(...)` — a
        # duplicated assignment target; single assignment is equivalent.
        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
        # Partition the embedding table only when more than one node exists.
        embed_partitioner = tf.fixed_size_partitioner(
            len(respec.nodes), 0) if len(respec.nodes) > 1 else None
        loss, y, opt = model(dense_input, sparse_input, y_, embed_partitioner)
        train_op = opt.minimize(loss)
        sess = autodist.create_distributed_session()
        # Reusable feed buffers; batch slices are copied in place each step.
        my_feed_dict = {
            dense_input: np.empty(shape=(batch_size, 13)),
            sparse_input: np.empty(shape=(batch_size, 26)),
            y_: np.empty(shape=(batch_size, 1)),
        }
        if args.all:
            raw_log_file = os.path.join(
                os.path.split(os.path.abspath(__file__))[0], 'logs',
                'tf_plx_%s.log' % (args.model))
            print('Processing all data, log to', raw_log_file)
            log_file = open(raw_log_file, 'w')
            iterations = dense_feature.shape[0] // batch_size
            total_epoch = 32
            start_index = 0
            for ep in range(total_epoch):
                # print("iters: %d" % (lp * 1000))
                print("epoch %d" % ep)
                st_time = time.time()
                train_loss, train_acc, train_auc = [], [], []
                # Each "epoch" covers ~1/10 of the data; every 10th epoch
                # also consumes the remainder.
                for it in tqdm(
                        range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index + batch_size]
                    start_index += batch_size
                    if start_index + batch_size > dense_feature.shape[0]:
                        start_index = 0
                    loss_val = sess.run([loss, y, y_, train_op],
                                        feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    # Binary accuracy at a 0.5 decision threshold.
                    acc_val = np.equal(true_val, pred_val > 0.5)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                    train_auc.append(metrics.roc_auc_score(true_val, pred_val))
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                tra_auc = np.mean(train_auc)
                en_time = time.time()
                train_time = en_time - st_time
                printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                    % (tra_loss, tra_accuracy, tra_auc, train_time)
                print(printstr)
                log_file.write(printstr + '\n')
                log_file.flush()
            # FIX: the log file was never closed in the original.
            log_file.close()
        else:
            iteration = dense_feature.shape[0] // batch_size
            epoch = 50
            for ep in range(epoch):
                print('epoch', ep)
                if ep == 5:
                    # Timing baseline starts after 5 warm-up epochs.
                    start = time.time()
                ep_st = time.time()
                train_loss = []
                train_acc = []
                for idx in range(iteration):
                    start_index = idx * batch_size
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index + batch_size]
                    loss_val = sess.run([loss, y, y_, train_op],
                                        feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    if pred_val.shape[1] == 1:
                        # for criteo case: binary accuracy at 0.5 threshold
                        acc_val = np.equal(true_val, pred_val > 0.5)
                    else:
                        # FIX: `np.float` was removed in NumPy 1.24; np.float64
                        # preserves the original dtype exactly.
                        acc_val = np.equal(np.argmax(pred_val, 1),
                                           np.argmax(true_val, 1)).astype(np.float64)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                ep_en = time.time()
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" %
                      (tra_loss, tra_accuracy, ep_en - ep_st))
            print('all time:', (time.time() - start))
# update = optimizer.apply_gradients(zip(new_grads, all_vars)) return loss, optimizer.iterations, update def test_step(x, y): y_hat = model(x, training=True) return y, y_hat fetches = train_step(x, y) test_fetches = test_step(x, y) e_losses = [] e_train_accuracy = [] e_test_accuracy = [] with autodist.create_distributed_session() as sess: for epoch in range(EPOCHS): j = 0 for _ in range(train_steps_per_epoch): loss, i, _ = sess.run( fetches, { x: train_images[j:j + BATCH_SIZE], y: train_labels[j:j + BATCH_SIZE] }) #print(f"step: {i}, train_loss: {loss}") j += BATCH_SIZE e_losses.append(loss) yy, prediction = sess.run(test_fetches, { x: train_images[0:400], y: train_labels[0:400]
def train_and_save():
    """ Train the model and save the serialized model and its weights. """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    print('I am going to a scope.')
    # Graph construction must happen inside autodist.scope() so AutoDist can
    # rewrite it for distributed execution.
    with tf.Graph().as_default() as g, autodist.scope():
        x = tf.compat.v1.placeholder(shape=[NUM_EXAMPLES], dtype=tf.float64)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def y():
            # Linear model prediction over the placeholder input.
            return W * x + b

        def l(predicted_y, desired_y):
            # Mean squared error.
            return tf.reduce_mean(tf.square(predicted_y - desired_y))

        # Pick the optimizer API matching the installed TF major version.
        major_version, _, _ = tf.version.VERSION.split('.')
        if major_version == '1':
            optimizer = tf.train.GradientDescentOptimizer(0.01)
        else:
            optimizer = tf.optimizers.SGD(0.01)

        # NOTE(review): `tape` is unused — gradients come from the graph-mode
        # tf.gradients call, not tape.gradient.
        with tf.GradientTape() as tape:
            prediction = y()
            loss = l(prediction, outputs)
            vs = [W, b]
            gradients = tf.gradients(loss, vs)
            train_op = optimizer.apply_gradients(zip(gradients, vs))
        ops.add_to_collection(TRAIN_OP_KEY, train_op)
        fetches = [loss, train_op, b, prediction]
        feeds = [x]
        # NOTE: The AutoDist saver should be declared before the wrapped session.
        saver = autodist_saver()
        session = autodist.create_distributed_session()
        for _ in range(EPOCHS):
            l, _, b, _ = session.run(fetches, feed_dict={feeds[0]: inputs})
            print('node: {}, loss: {}\nb:{}'.format(autodist._cluster.get_local_address(), l, b))
        print('I am out of scope')

        # Build the SavedModel serving signature: one named input tensor and
        # two named outputs (loss and prediction).
        inputs_info = {
            "input_data": saved_model.utils.build_tensor_info(feeds[0])
        }
        outputs_info = {
            "loss": saved_model.utils.build_tensor_info(fetches[0]),
            "prediction": saved_model.utils.build_tensor_info(fetches[3])
        }
        serving_signature = saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=outputs_info,
            method_name=saved_model.signature_constants.PREDICT_METHOD_NAME
        )
        signature_map = {
            saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                serving_signature,
        }
        # Start from a clean export directory, then write the SavedModel with
        # the AutoDist-aware saver so variable values come from this session.
        if os.path.exists(EXPORT_DIR):
            shutil.rmtree(EXPORT_DIR)
        builder = SavedModelBuilder(EXPORT_DIR)
        builder.add_meta_graph_and_variables(
            sess=session,
            tags=[TAG_NAME],
            saver=saver,
            signature_def_map=signature_map)
        builder.save()
def train_main(args):
    """Train a graph model on partitioned node data, pinned to one local GPU,
    under AutoDist's Parallaxx strategy.

    Args:
        args: parsed CLI options; reads ``args.path`` (directory containing
            meta.yml), ``args.hidden_size``, ``args.num_epoch``,
            ``args.num_local_worker`` and ``args.batch_size``.

    Relies on module-level ``ad`` (worker communicator), ``distributed``,
    ``prepare_data`` and ``model``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../../examples/ctr/settings',
                                      'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size  # NOTE(review): unused in this function
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker  # NOTE(review): unused in this function
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    # Number of mini-graphs assigned to this worker.
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    worker_device = "gpu:0"
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        with tf.device(worker_device):
            norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
            sparse_feature = tf.placeholder(tf.int32, [graph_len, meta["feature"] - 1])
            y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
            train_mask = tf.placeholder(tf.float32, [graph_len], name="train_mask")
            loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        # init=tf.global_variables_initializer()
        # gpu_options = tf.GPUOptions(allow_growth=True)
        # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess = autodist.create_distributed_session()
        # sess.run(init)
        acc_cnt, total_cnt = 0, 0
        train_acc, train_cnt = 0, 0
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # last feature column is held out (see meta["feature"] - 1 above)
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            loss_val = sess.run([loss, y, train_op], feed_dict=feed_dict)
            pred_val = loss_val[1]
            # FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
            # np.float64 preserves the original dtype exactly.
            acc_val = np.equal(np.argmax(pred_val, 1), g_sample.y).astype(np.float64)
            # Separate accuracy tallies for eval-masked and train-masked nodes.
            acc_cnt += (acc_val * mask_eval).sum()
            total_cnt += mask_eval.sum()
            nnodes += mask.sum() + mask_eval.sum()
            train_acc += (acc_val * mask).sum()
            train_cnt += mask.sum()
            # Report every ~1/10 of this worker's partition.
            if nnodes > meta["partition"]["nodes"][rank] // 10:
                nnodes = 0
                epoch += 1
                print("Acc : ", acc_cnt / total_cnt, train_acc / train_cnt,
                      "Time : ", time.time() - start)
                # print(pred_val)
                start = time.time()
                acc_cnt, total_cnt = 0, 0
                train_acc, train_cnt = 0, 0
                if epoch >= num_epoch:
                    break
def main():
    """Train a neural matrix-factorization model on MovieLens-25m under
    AutoDist's Parallaxx strategy, logging per-epoch loss and timing.

    Relies on module-level ``neural_mf``, ``Logging``, ``getHitRatio`` and
    ``getNDCG``.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__),
                                      '../ctr/settings', 'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)  # NOTE(review): unused here

    def validate():
        # validate phase: each test user has 100 candidate items; rank them
        # by predicted score and compute hit-ratio / NDCG of the top-K list.
        hits, ndcgs = [], []
        for idx in range(num_users):
            start_index = idx * 100
            my_feed_dict = {
                user_input: testUserInput[start_index:start_index + 100],
                item_input: testItemInput[start_index:start_index + 100],
            }
            predictions = sess.run([y], feed_dict=my_feed_dict)
            map_item_score = {
                testItemInput[start_index + i]: predictions[0][i]
                for i in range(100)
            }
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score,
                                      key=map_item_score.get)
            hr = getHitRatio(ranklist, testItemInput[start_index])
            ndcg = getNDCG(ranklist, testItemInput[start_index])
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    from movielens import getdata
    trainData, testData = getdata('ml-25m', 'datasets')
    # 100 candidate items per test user (first one per group is the positive).
    testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
    testItemInput = testData.reshape((-1, ))
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    batch_size = 1024
    num_negatives = 4  # NOTE(review): unused here
    topK = 10
    with tf.Graph().as_default() as g, autodist.scope():
        user_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        item_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        y_ = tf.compat.v1.placeholder(tf.float32, [
            None,
        ])
        loss, y, opt = neural_mf(user_input, item_input, y_, num_users,
                                 num_items)
        train_op = opt.minimize(loss)
        # init = tf.compat.v1.global_variables_initializer()
        # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        sess = autodist.create_distributed_session()
        # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
        # sess.run(init)
        log = Logging(
            path=os.path.join(os.path.dirname(__file__), 'logs', 'tfplx.txt'))
        epoch = 7
        iterations = trainData['user_input'].shape[0] // batch_size
        start = time.time()
        for ep in range(epoch):
            ep_st = time.time()
            log.write('epoch %d' % ep)
            train_loss = []
            for idx in range(iterations):
                start_index = idx * batch_size
                my_feed_dict = {
                    user_input:
                        trainData['user_input'][start_index:start_index + batch_size],
                    item_input:
                        trainData['item_input'][start_index:start_index + batch_size],
                    y_:
                        trainData['labels'][start_index:start_index + batch_size],
                }
                loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
                train_loss.append(loss_val[0])
                # if idx % 10000 == 0:
                #     hr, ndcg = validate()
                #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
                #     log.write(printstr)
            tra_loss = np.mean(train_loss)
            ep_en = time.time()
            # validate phase
            # hr, ndcg = validate()
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss,
                                                               ep_en - ep_st)
            log.write(printstr)
        # FIX: the original called log.write('all time:', elapsed) with two
        # positional arguments, unlike every other log.write(single_string)
        # call in this function — format into one string instead.
        log.write('all time: %f' % (time.time() - start))
def run(flags_obj):
    """ Run ResNet ImageNet training and eval loop using native Keras APIs.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'
    # Per-model AllReduce chunk size (merging granularity for gradients).
    if flags_obj.cnn_model == 'vgg16':
        chunk = 25
    elif flags_obj.cnn_model == 'resnet101':
        chunk = 200
    elif flags_obj.cnn_model == 'inceptionv3':
        chunk = 30
    else:
        chunk = 512
    # Instantiate the requested AutoDist distribution strategy.
    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file,
            PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file,
            PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )
    #########################################################################
    # Configure mixed precision (fp16 requires TF2; bfloat16 needs no loss scale).
    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)

    input_fn = imagenet_preprocessing.input_fn
    drop_remainder = flags_obj.enable_xla
    # Base learning rate; VGG uses a smaller one.
    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )
        # Select the Keras application model; weights are trained from scratch.
        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError('Other Model Undeveloped')
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                             beta_1=0.9,
                                             beta_2=0.999,
                                             epsilon=1e-08)
        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(
            train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()
        steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                           flags_obj.batch_size)
        train_epochs = flags_obj.train_epochs
        if flags_obj.enable_checkpoint_and_export:
            # NOTE(review): ckpt_full_path is assigned but never used below.
            ckpt_full_path = os.path.join(flags_obj.model_dir,
                                          'model.ckpt-{epoch:04d}')
        # Smoke-test mode: cap steps and run a single epoch.
        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1
        num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                          flags_obj.batch_size)
        # Build the explicit graph-mode training step: forward pass, loss,
        # gradients over all trainable variables, apply-gradients op.
        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scc_loss(train_target, train_output)
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))
        #####################################################################
        # Create distributed session.
        # Instead of using the original TensorFlow session for graph execution,
        # let's use AutoDist's distributed session, in which a computational
        # graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        sess = autodist.create_distributed_session()
        #####################################################################
        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()
    return