def main(_):
    """Toy linear-regression example run under AutoDist's AllReduce strategy.

    Fits y = W*x + b on synthetic noisy data inside a TF1-style graph,
    using AutoDist's distributed session. `resource_spec_file`, `np` and
    `tf` come from module scope (not visible in this chunk).
    """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    # Ground-truth parameters used to synthesize the training data.
    TRUE_W = 3.0
    TRUE_b = 2.0
    NUM_EXAMPLES = 1000
    EPOCHS = 10
    inputs = np.random.randn(NUM_EXAMPLES)
    noises = np.random.randn(NUM_EXAMPLES)
    outputs = inputs * TRUE_W + TRUE_b + noises

    class MyIterator:
        # Minimal stand-in for a dataset iterator: initialize() is a no-op
        # tensor and get_next() just returns the full numpy input array.
        def initialize(self):
            return tf.zeros(1)

        def get_next(self):
            # a fake one
            return inputs

    inputs_iterator = MyIterator()
    print('I am going to a scope.')
    # All graph construction must happen inside autodist.scope().
    with tf.Graph().as_default() as g, autodist.scope():
        # x = placeholder(shape=[NUM_EXAMPLES], dtype=tf.float32)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        def train_step(input):
            # Model: linear prediction.
            def y(x):
                return W * x + b

            # Loss: mean squared error.
            def l(predicted_y, desired_y):
                return tf.reduce_mean(tf.square(predicted_y - desired_y))

            # Pick an optimizer API compatible with the installed TF major version.
            major_version, _, _ = tf.version.VERSION.split('.')
            if major_version == '1':
                optimizer = tf.train.GradientDescentOptimizer(0.01)
            else:
                optimizer = tf.optimizers.SGD(0.01)
            with tf.GradientTape() as tape:
                loss = l(y(input), outputs)
                vs = [W, b]
                # gradients = tape.gradient(target=loss, sources=vs)
                # Graph-mode gradients are used instead of the tape here.
                gradients = tf.gradients(loss, vs)
                train_op = optimizer.apply_gradients(zip(gradients, vs))
            return loss, train_op, b

        fetches = train_step(inputs_iterator.get_next())
        # AutoDist session replaces a plain tf.compat.v1.Session().
        session = autodist.create_distributed_session()
        for epoch in range(EPOCHS):
            # NOTE: this rebinds the local name `b` to the fetched numpy value.
            l, t, b = session.run(fetches)
            print('node: {}, loss: {}\nb:{}'.format(
                autodist._cluster.get_local_address(), l, b))
    print('I am out of scope')
def train_main(args):
    """Distributed GNN training loop using AutoDist with the Parallaxx strategy.

    Reads partition metadata from ``<args.path>/meta.yml``, initializes the
    parameter-server runtime for this worker's rank, builds the graph model
    once, then trains until ``args.num_epoch`` epochs worth of nodes have
    been consumed, printing mean accuracy and wall time per epoch.

    Args:
        args: parsed CLI namespace; reads ``path``, ``hidden_size``,
            ``num_epoch``, ``num_local_worker`` and ``batch_size``.
    """
    autodist = AutoDist(resource_spec_file, Parallaxx())
    # Partition metadata produced by the preprocessing step.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size  # NOTE(review): unused below — confirm intent
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker  # NOTE(review): unused below
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    # Number of mini-batch graphs assigned to this worker's partition.
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
        sparse_feature = tf.placeholder(tf.int32, [graph_len, meta["feature"] - 1])
        y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
        train_mask = tf.placeholder(tf.float32, [graph_len], name="train_mask")
        loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        sess = autodist.create_distributed_session()
        acc_stat = []
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # Drop the last feature column (presumably an auxiliary
                # field — TODO confirm against prepare_data).
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            print("Before training")
            loss_val = sess.run([loss, y, y_, train_op], feed_dict=feed_dict)
            print(loss_val)
            pred_val = loss_val[1]
            true_val = loss_val[2]
            # FIX: `np.float` was removed in NumPy 1.24; it was an alias for
            # the builtin float, so this cast is behavior-identical.
            acc_val = np.equal(np.argmax(pred_val, 1), true_val).astype(float)
            acc_stat.append(acc_val)
            nnodes += mask.sum() + mask_eval.sum()
            # One "epoch" = this worker's full node allotment consumed.
            if nnodes > meta["partition"]["nodes"][rank]:
                nnodes = 0
                epoch += 1
                print("Acc : ", np.mean(acc_stat), "Time : ", time.time() - start)
                start = time.time()
                acc_stat = []
                if epoch >= num_epoch:
                    break
def train_and_save():
    """Train a sentiment model on IMDB under AutoDist (PartitionedPS) and checkpoint it.

    Builds an input pipeline over padded IMDB reviews, runs `max_steps`
    training iterations while logging words-per-second, then saves a
    checkpoint with the AutoDist saver. `vocab_size`, `batch_size`,
    `max_steps`, `log_frequency`, `checkpoint_dir`, `SimpleModel` and
    `autodist_saver` come from module scope.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml')
    autodist = AutoDist(resource_spec_file, PartitionedPS())
    (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data(
        num_words=vocab_size)
    # Pad/truncate every review to a fixed length of 256 tokens.
    train_data = tf.keras.preprocessing.sequence.pad_sequences(
        train_data, value=0, padding='post', maxlen=256)
    test_data = tf.keras.preprocessing.sequence.pad_sequences(
        test_data, value=0, padding='post', maxlen=256)
    train_labels = train_labels.astype(np.float32)
    with tf.Graph().as_default(), autodist.scope():  # AutoDist code
        my_iterator = tf.compat.v1.data.Dataset.from_tensor_slices((train_data, train_labels)) \
            .shuffle(25000).batch(batch_size).repeat().make_one_shot_iterator().get_next()
        # my_iterator = MyIterator().get_next()
        model = SimpleModel()
        prev_time = time.time()
        # fetch train_op and loss
        loss_fn, train_op, gradients = model.train_fn(my_iterator)
        # The AutoDist saver must be created before the distributed session.
        saver = autodist_saver()
        sess = autodist.create_distributed_session()
        for local_step in range(max_steps):
            loss, _ = sess.run(
                fetches=[loss_fn, train_op],
                options=config_pb2.RunOptions(
                    trace_level=config_pb2.RunOptions.NO_TRACE))
            # Periodically report throughput in words (sentences) per second.
            if local_step % log_frequency == 0:
                cur_time = time.time()
                elapsed_time = cur_time - prev_time
                num_sentences = batch_size * log_frequency
                wps = float(num_sentences) / elapsed_time
                print(
                    "Iteration %d, time = %.2fs, wps = %.0f, train loss = %.4f" %
                    (local_step, cur_time - prev_time, wps, loss))
                prev_time = cur_time
        saver.save(sess, checkpoint_dir + "sentiment", global_step=local_step)
        # Debug output: embedding table and the first gradient tensor.
        print(sess.run(model.emb))
        print(sess.run(gradients[0]))
    print('ending...')
def train_criteo(model, args):
    """Train a CTR model on Criteo data under AutoDist (Parallaxx strategy).

    Args:
        model: callable building the graph; returns ``(loss, y, opt)`` given
            dense input, sparse input, label placeholder and an optional
            embedding partitioner.
        args: parsed CLI namespace; reads ``all`` (use the full dataset and
            log AUC per epoch) and ``model`` (name used in the log filename).

    Side effects: builds a TF graph, trains via AutoDist's distributed
    session, and (with ``args.all``) appends per-epoch metrics to a log file.
    """
    resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings', 'plx_dist_spec.yml')
    # resource_spec_file = os.path.join(os.path.dirname(__file__), 'settings', 'plx_local_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)
    if args.all:
        from models.load_data import process_all_criteo_data
        dense, sparse, all_labels = process_all_criteo_data()
        dense_feature, val_dense = dense
        sparse_feature, val_sparse = sparse
        labels, val_labels = all_labels
    else:
        from models.load_data import process_sampled_criteo_data
        dense_feature, sparse_feature, labels = process_sampled_criteo_data()
    # autodist will split the feeding data
    batch_size = 128
    with tf.Graph().as_default() as g, autodist.scope():
        # Criteo layout: 13 dense features, 26 categorical (sparse) features.
        dense_input = tf.compat.v1.placeholder(tf.float32, [batch_size, 13])
        sparse_input = tf.compat.v1.placeholder(tf.int32, [batch_size, 26])
        # FIX: was `y_ = y_ = tf.compat.v1.placeholder(...)` — duplicated
        # assignment target (harmless but clearly a typo).
        y_ = tf.compat.v1.placeholder(tf.float32, [batch_size, 1])
        # Partition the embedding table across nodes only in multi-node runs.
        embed_partitioner = tf.fixed_size_partitioner(
            len(respec.nodes), 0) if len(respec.nodes) > 1 else None
        loss, y, opt = model(dense_input, sparse_input, y_, embed_partitioner)
        train_op = opt.minimize(loss)
        sess = autodist.create_distributed_session()
        # Buffers reused every step; filled in-place below.
        my_feed_dict = {
            dense_input: np.empty(shape=(batch_size, 13)),
            sparse_input: np.empty(shape=(batch_size, 26)),
            y_: np.empty(shape=(batch_size, 1)),
        }
        if args.all:
            raw_log_file = os.path.join(
                os.path.split(os.path.abspath(__file__))[0], 'logs',
                'tf_plx_%s.log' % (args.model))
            print('Processing all data, log to', raw_log_file)
            log_file = open(raw_log_file, 'w')
            iterations = dense_feature.shape[0] // batch_size
            total_epoch = 32
            start_index = 0
            for ep in range(total_epoch):
                # print("iters: %d" % (lp * 1000))
                print("epoch %d" % ep)
                st_time = time.time()
                train_loss, train_acc, train_auc = [], [], []
                # Each "epoch" covers a tenth of the data; every 10th epoch
                # also picks up the remainder.
                for it in tqdm(
                        range(iterations // 10 + (ep % 10 == 9) * (iterations % 10))):
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index + batch_size]
                    start_index += batch_size
                    if start_index + batch_size > dense_feature.shape[0]:
                        start_index = 0
                    loss_val = sess.run([loss, y, y_, train_op], feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    acc_val = np.equal(true_val, pred_val > 0.5)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                    train_auc.append(metrics.roc_auc_score(true_val, pred_val))
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                tra_auc = np.mean(train_auc)
                en_time = time.time()
                train_time = en_time - st_time
                printstr = "train_loss: %.4f, train_acc: %.4f, train_auc: %.4f, train_time: %.4f"\
                    % (tra_loss, tra_accuracy, tra_auc, train_time)
                print(printstr)
                log_file.write(printstr + '\n')
                log_file.flush()
            # FIX: the log file was opened but never closed (resource leak).
            log_file.close()
        else:
            iteration = dense_feature.shape[0] // batch_size
            epoch = 50
            for ep in range(epoch):
                print('epoch', ep)
                # Timing excludes the first 5 warm-up epochs.
                if ep == 5:
                    start = time.time()
                ep_st = time.time()
                train_loss = []
                train_acc = []
                for idx in range(iteration):
                    start_index = idx * batch_size
                    my_feed_dict[dense_input][:] = dense_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[sparse_input][:] = sparse_feature[
                        start_index:start_index + batch_size]
                    my_feed_dict[y_][:] = labels[start_index:start_index + batch_size]
                    loss_val = sess.run([loss, y, y_, train_op], feed_dict=my_feed_dict)
                    pred_val = loss_val[1]
                    true_val = loss_val[2]
                    if pred_val.shape[1] == 1:  # for criteo case
                        acc_val = np.equal(true_val, pred_val > 0.5)
                    else:
                        # FIX: `np.float` was removed in NumPy 1.24; it was an
                        # alias for the builtin float (behavior-identical).
                        acc_val = np.equal(np.argmax(pred_val, 1),
                                           np.argmax(true_val, 1)).astype(float)
                    train_loss.append(loss_val[0])
                    train_acc.append(acc_val)
                tra_accuracy = np.mean(train_acc)
                tra_loss = np.mean(train_loss)
                ep_en = time.time()
                print("train_loss: %.4f, train_acc: %.4f, train_time: %.4f" %
                      (tra_loss, tra_accuracy, ep_en - ep_st))
            print('all time:', (time.time() - start))
# Script-level setup: load a small Fashion-MNIST subset and build a simple
# CNN classifier graph under the AutoDist scope. `tf`, `np` and `autodist`
# come from earlier in the file (outside this chunk).
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
# Keep only the first 512 samples and add a trailing channel axis (HWC -> HWC1).
train_images = train_images[:512, :, :, None]
test_images = test_images[:512, :, :, None]
train_labels = train_labels[:512]
test_labels = test_labels[:512]
# Scale pixel values from [0, 255] to [0, 1].
train_images = train_images / np.float32(255)
test_images = test_images / np.float32(255)
BATCH_SIZE = 64
EPOCHS = 1
#############################################################
# Step 2: Build with Graph mode, and put it under AutoDist scope
with tf.Graph().as_default(), autodist.scope():
#############################################################
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_images, train_labels)).repeat(EPOCHS).shuffle(len(train_images)//2).batch(BATCH_SIZE)
    train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next()
    # Small CNN: conv -> pool -> flatten -> dropout -> 10-way softmax.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
def train_and_save():
    """Train the linear model, then export it as a TF SavedModel for serving.

    Builds y = W*x + b in graph mode under AutoDist's AllReduce strategy,
    trains for `EPOCHS` steps, and writes a SavedModel (with the serving
    signature mapping "input_data" -> loss/prediction) to `EXPORT_DIR`.
    `NUM_EXAMPLES`, `EPOCHS`, `inputs`, `outputs`, `TRAIN_OP_KEY`,
    `EXPORT_DIR`, `TAG_NAME`, `autodist_saver` and `SavedModelBuilder`
    come from module scope (outside this chunk).
    """
    autodist = AutoDist(resource_spec_file, AllReduce(128))
    print('I am going to a scope.')
    with tf.Graph().as_default() as g, autodist.scope():
        x = tf.compat.v1.placeholder(shape=[NUM_EXAMPLES], dtype=tf.float64)
        W = tf.Variable(5.0, name='W', dtype=tf.float64)
        b = tf.Variable(0.0, name='b', dtype=tf.float64)

        # Model: linear prediction over the fed placeholder.
        def y():
            return W * x + b

        # Loss: mean squared error.
        def l(predicted_y, desired_y):
            return tf.reduce_mean(tf.square(predicted_y - desired_y))

        # Pick an optimizer API compatible with the installed TF major version.
        major_version, _, _ = tf.version.VERSION.split('.')
        if major_version == '1':
            optimizer = tf.train.GradientDescentOptimizer(0.01)
        else:
            optimizer = tf.optimizers.SGD(0.01)
        with tf.GradientTape() as tape:
            prediction = y()
            loss = l(prediction, outputs)
            vs = [W, b]
            # Graph-mode gradients are used instead of the tape here.
            gradients = tf.gradients(loss, vs)
            train_op = optimizer.apply_gradients(zip(gradients, vs))
        # Register the train op so it is restorable from the SavedModel.
        ops.add_to_collection(TRAIN_OP_KEY, train_op)
        fetches = [loss, train_op, b, prediction]
        feeds = [x]
        # NOTE: The AutoDist saver should be declared before the wrapped session.
        saver = autodist_saver()
        session = autodist.create_distributed_session()
        for _ in range(EPOCHS):
            # NOTE: this rebinds the local names `l` and `b` to fetched values.
            l, _, b, _ = session.run(fetches, feed_dict={feeds[0]: inputs})
            print('node: {}, loss: {}\nb:{}'.format(
                autodist._cluster.get_local_address(), l, b))
        print('I am out of scope')
        # Build the serving signature: one input tensor, two output tensors.
        inputs_info = {
            "input_data": saved_model.utils.build_tensor_info(feeds[0])
        }
        outputs_info = {
            "loss": saved_model.utils.build_tensor_info(fetches[0]),
            "prediction": saved_model.utils.build_tensor_info(fetches[3])
        }
        serving_signature = saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=outputs_info,
            method_name=saved_model.signature_constants.PREDICT_METHOD_NAME
        )
        signature_map = {
            saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                serving_signature,
        }
        # Export is not incremental: clear any previous SavedModel first.
        if os.path.exists(EXPORT_DIR):
            shutil.rmtree(EXPORT_DIR)
        builder = SavedModelBuilder(EXPORT_DIR)
        builder.add_meta_graph_and_variables(
            sess=session,
            tags=[TAG_NAME],
            saver=saver,
            signature_def_map=signature_map)
        builder.save()
def run_ncf(FLAGS):
    """Run NCF training and eval with Keras."""
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    resource_spec_file = os.path.join(
        os.path.dirname(__file__), '../resource_spec.yml')
    resource_info = yaml.safe_load(open(resource_spec_file, 'r'))
    # NOTE(review): a missing 'nodes'/'gpus' key raises KeyError, which these
    # `except ValueError` clauses do not catch — confirm the intended guard.
    try:
        node_num = len(resource_info['nodes'])
    except ValueError:
        print("nodes need to be set in specficiation file")
    try:
        gpu_num = len(resource_info['nodes'][0]['gpus'])
    except ValueError:
        print("gpus need to be set in specficiation file")
    # AutoDist reads this environment variable to decide whether to patch TF.
    if FLAGS.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'
    if FLAGS.proxy:
        local_proxy_variable = True
    else:
        local_proxy_variable = False
    # Select the distribution strategy from the CLI flag.
    if FLAGS.autodist_strategy == 'PS':
        autodist = AutoDist(
            resource_spec_file, PS(
                local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(resource_spec_file, PSLoadBalancing(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(resource_spec_file, PartitionedPS(
            local_proxy_variable=local_proxy_variable))
    elif FLAGS.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=256))
    elif FLAGS.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(
                chunk_size=256,
                local_proxy_variable=local_proxy_variable))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax')
    #########################################################################
    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)
    model_helpers.apply_clean(FLAGS)
    # Optional Keras-level mixed precision (fp16) policy.
    if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16",
            loss_scale=flags_core.get_loss_scale(
                FLAGS, default_for_fp16="dynamic"))
        tf.keras.mixed_precision.experimental.set_policy(policy)
    params = ncf_common.parse_flags(FLAGS)
    params["distribute_strategy"] = None
    batch_size = params["batch_size"]
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]
    producer, input_meta_data = None, None
    # Either synthesize input online via a producer, or read a pre-generated
    # dataset plus its metadata from disk.
    generate_input_online = params["train_dataset_path"] is None
    if generate_input_online:
        num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]
    params["num_users"], params["num_items"] = num_users, num_items
    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)
    # All graph construction must happen inside autodist.scope().
    with tf.Graph().as_default(), autodist.scope():
        (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = (
            ncf_input_pipeline.create_ncf_input_data(params, producer, input_meta_data, None))
        steps_per_epoch = None if generate_input_online else num_train_steps
        keras_model = _get_keras_model(params)
        # Optimizer selection from the CLI flag.
        if FLAGS.optimizer == 'adam':
            optimizer = tf.keras.optimizers.Adam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.keras.optimizers.SGD(
                learning_rate=params["learning_rate"])
        elif FLAGS.optimizer == 'lazyadam':
            optimizer = LazyAdam(
                learning_rate=params["learning_rate"],
                beta_1=params["beta1"],
                beta_2=params["beta2"],
                epsilon=params["epsilon"])
        else:
            raise ValueError('Do not support other optimizers...')
        # Graph-rewrite mixed precision wraps the optimizer itself.
        if FLAGS.fp16_implementation == "graph_rewrite":
            optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer,
                loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
        elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                optimizer,
                tf.keras.mixed_precision.experimental.global_policy().loss_scale)
        return run_ncf_custom_training(
            params,
            autodist,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online,
            return_simulation=FLAGS.simulation_strategy_id is not None)
def train_main(args):
    """Distributed GNN training loop (AutoDist + Parallaxx) with per-mask accuracy.

    Like the simpler variant, but pins graph construction to ``gpu:0`` and
    tracks separate train/eval accuracy using the train mask and the eval
    mask. An "epoch" here is a tenth of this worker's node allotment.

    Args:
        args: parsed CLI namespace; reads ``path``, ``hidden_size``,
            ``num_epoch``, ``num_local_worker`` and ``batch_size``.
    """
    resource_spec_file = os.path.join(
        os.path.dirname(__file__), '../../examples/ctr/settings', 'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    # Partition metadata produced by the preprocessing step.
    with open(os.path.join(args.path, "meta.yml"), 'rb') as f:
        meta = yaml.load(f.read(), Loader=yaml.FullLoader)
    hidden_layer_size = args.hidden_size  # NOTE(review): unused below — confirm intent
    num_epoch = args.num_epoch
    rank = ad.get_worker_communicate().rank()
    device_id = rank % args.num_local_worker  # NOTE(review): unused below
    nrank = ad.get_worker_communicate().nrank()
    distributed.ps_init(rank, nrank)
    # Number of mini-batch graphs assigned to this worker's partition.
    ngraph = meta["partition"]["nodes"][rank] // args.batch_size
    graphs = prepare_data(ngraph)
    idx, epoch, nnodes = 0, 0, 0
    worker_device = "gpu:0"
    graph_len = graphs[0][0].y.shape[0]
    with tf.Graph().as_default() as g, autodist.scope():
        with tf.device(worker_device):
            norm_adj = tf.compat.v1.sparse.placeholder(tf.float32, name="norm_adj")
            sparse_feature = tf.placeholder(tf.int32, [graph_len, meta["feature"] - 1])
            y_ = tf.placeholder(tf.int32, [graph_len], name="y_")
            train_mask = tf.placeholder(tf.float32, [graph_len], name="train_mask")
            loss, y, train_op = model(norm_adj, sparse_feature, y_, train_mask)
        # init=tf.global_variables_initializer()
        # gpu_options = tf.GPUOptions(allow_growth=True)
        # sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess = autodist.create_distributed_session()
        # sess.run(init)
        acc_cnt, total_cnt = 0, 0
        train_acc, train_cnt = 0, 0
        start = time.time()
        while True:
            g_sample, mp_val, mask, mask_eval = graphs[idx]
            idx = (idx + 1) % ngraph
            feed_dict = {
                norm_adj: mp_val,
                # Drop the last feature column (presumably an auxiliary
                # field — TODO confirm against prepare_data).
                sparse_feature: g_sample.x[:, 0:-1],
                y_: g_sample.y,
                train_mask: mask
            }
            loss_val = sess.run([loss, y, train_op], feed_dict=feed_dict)
            pred_val = loss_val[1]
            # FIX: `np.float` was removed in NumPy 1.24; it was an alias for
            # the builtin float, so this cast is behavior-identical.
            acc_val = np.equal(np.argmax(pred_val, 1), g_sample.y).astype(float)
            # Accumulate eval-masked and train-masked accuracy separately.
            acc_cnt += (acc_val * mask_eval).sum()
            total_cnt += mask_eval.sum()
            nnodes += mask.sum() + mask_eval.sum()
            train_acc += (acc_val * mask).sum()
            train_cnt += mask.sum()
            if nnodes > meta["partition"]["nodes"][rank] // 10:
                nnodes = 0
                epoch += 1
                print("Acc : ", acc_cnt / total_cnt, train_acc / train_cnt,
                      "Time : ", time.time() - start)
                # print(pred_val)
                start = time.time()
                acc_cnt, total_cnt = 0, 0
                train_acc, train_cnt = 0, 0
                if epoch >= num_epoch:
                    break
def main():
    """Train a neural matrix-factorization (NCF) model on MovieLens-25M under AutoDist.

    Loads train/test interactions, builds the NeuMF graph with Parallaxx
    strategy, trains for a fixed number of epochs and logs per-epoch loss.
    The inner `validate()` computes HR/NDCG over leave-one-out test lists
    of 100 candidate items per user (currently only invoked via the
    commented-out calls below).
    """
    resource_spec_file = os.path.join(
        os.path.dirname(__file__), '../ctr/settings', 'plx_dist_spec.yml')
    autodist = AutoDist(resource_spec_file, Parallaxx())
    respec = ResourceSpec(resource_spec_file)

    def validate():
        # validate phase
        hits, ndcgs = [], []
        # Each test user has a block of 100 consecutive candidate items;
        # the first item in the block is the held-out positive.
        for idx in range(num_users):
            start_index = idx * 100
            my_feed_dict = {
                user_input: testUserInput[start_index:start_index + 100],
                item_input: testItemInput[start_index:start_index + 100],
            }
            predictions = sess.run([y], feed_dict=my_feed_dict)
            map_item_score = {
                testItemInput[start_index + i]: predictions[0][i]
                for i in range(100)
            }
            # Evaluate top rank list
            ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
            hr = getHitRatio(ranklist, testItemInput[start_index])
            ndcg = getNDCG(ranklist, testItemInput[start_index])
            hits.append(hr)
            ndcgs.append(ndcg)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        return hr, ndcg

    from movielens import getdata
    trainData, testData = getdata('ml-25m', 'datasets')
    # Repeat each test user id 100 times to align with its candidate items.
    testUserInput = np.repeat(np.arange(testData.shape[0], dtype=np.int32), 100)
    testItemInput = testData.reshape((-1, ))
    num_users, num_items = {
        'ml-1m': (6040, 3706),
        'ml-20m': (138493, 26744),
        'ml-25m': (162541, 59047),
    }['ml-25m']
    batch_size = 1024
    num_negatives = 4
    topK = 10
    with tf.Graph().as_default() as g, autodist.scope():
        user_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        item_input = tf.compat.v1.placeholder(tf.int32, [
            None,
        ])
        y_ = tf.compat.v1.placeholder(tf.float32, [
            None,
        ])
        loss, y, opt = neural_mf(user_input, item_input, y_, num_users, num_items)
        train_op = opt.minimize(loss)
        # init = tf.compat.v1.global_variables_initializer()
        # gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        sess = autodist.create_distributed_session()
        # sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
        # sess.run(init)
        log = Logging(
            path=os.path.join(os.path.dirname(__file__), 'logs', 'tfplx.txt'))
        epoch = 7
        iterations = trainData['user_input'].shape[0] // batch_size
        start = time.time()
        for ep in range(epoch):
            ep_st = time.time()
            log.write('epoch %d' % ep)
            train_loss = []
            for idx in range(iterations):
                start_index = idx * batch_size
                my_feed_dict = {
                    user_input: trainData['user_input'][start_index:start_index + batch_size],
                    item_input: trainData['item_input'][start_index:start_index + batch_size],
                    y_: trainData['labels'][start_index:start_index + batch_size],
                }
                loss_val = sess.run([loss, train_op], feed_dict=my_feed_dict)
                train_loss.append(loss_val[0])
                # if idx % 10000 == 0:
                #     hr, ndcg = validate()
                #     printstr = "HR: %.4f, NDCF: %.4f" % (hr, ndcg)
                #     log.write(printstr)
            tra_loss = np.mean(train_loss)
            ep_en = time.time()
            # validate phase
            # hr, ndcg = validate()
            printstr = "train_loss: %.4f, train_time: %.4f" % (tra_loss, ep_en - ep_st)
            log.write(printstr)
        # NOTE(review): this passes two arguments to Logging.write, unlike the
        # single-argument calls above — verify Logging.write accepts *args.
        log.write('all time:', (time.time() - start))
def run(flags_obj):
    """
    Run ResNet ImageNet training and eval loop using native Keras APIs.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    #########################################################################
    # Construct AutoDist with ResourceSpec for Different Strategies
    if flags_obj.autodist_patch_tf:
        os.environ['AUTODIST_PATCH_TF'] = '1'
    else:
        os.environ['AUTODIST_PATCH_TF'] = '0'
    # Per-model AllReduce chunk sizes (tuned per architecture).
    if flags_obj.cnn_model == 'vgg16':
        chunk = 25
    elif flags_obj.cnn_model == 'resnet101':
        chunk = 200
    elif flags_obj.cnn_model == 'inceptionv3':
        chunk = 30
    else:
        chunk = 512
    if flags_obj.autodist_strategy == 'PS':
        autodist = AutoDist(resource_spec_file,
                            PS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PSLoadBalancing':
        autodist = AutoDist(
            resource_spec_file,
            PSLoadBalancing(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'PartitionedPS':
        autodist = AutoDist(
            resource_spec_file,
            PartitionedPS(local_proxy_variable=flags_obj.proxy))
    elif flags_obj.autodist_strategy == 'AllReduce':
        autodist = AutoDist(resource_spec_file, AllReduce(chunk_size=chunk))
    elif flags_obj.autodist_strategy == 'Parallax':
        autodist = AutoDist(
            resource_spec_file,
            Parallax(chunk_size=chunk, local_proxy_variable=flags_obj.proxy))
    else:
        raise ValueError(
            'the strategy can be only from PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax'
        )
    #########################################################################
    # Mixed-precision policy selection.
    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v1.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v1.keras.mixed_precision.experimental.set_policy(policy)
    input_fn = imagenet_preprocessing.input_fn
    drop_remainder = flags_obj.enable_xla
    # Base learning rate depends on architecture; optionally replaced by a
    # piecewise-constant warmup schedule.
    if 'vgg' in flags_obj.cnn_model:
        lr_schedule = 0.01
    else:
        lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
    #########################################################################
    # Build with Graph mode, and put all under AutoDist scope.
    with tf.Graph().as_default(), autodist.scope():
        ##########################################################################
        train_input_dataset = input_fn(
            is_training=True,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads,
            dtype=dtype,
            drop_remainder=drop_remainder,
            tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
            training_dataset_cache=flags_obj.training_dataset_cache,
        )
        if flags_obj.cnn_model == 'resnet101':
            model = tf.keras.applications.ResNet101(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'vgg16':
            model = tf.keras.applications.VGG16(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'inceptionv3':
            model = tf.keras.applications.InceptionV3(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        elif flags_obj.cnn_model == 'densenet121':
            model = tf.keras.applications.DenseNet121(
                weights=None, classes=imagenet_preprocessing.NUM_CLASSES)
        else:
            raise ValueError('Other Model Undeveloped')
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08)
        train_input_iterator = tf.compat.v1.data.make_one_shot_iterator(
            train_input_dataset)
        train_input, train_target = train_input_iterator.get_next()
        steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                           flags_obj.batch_size)
        train_epochs = flags_obj.train_epochs
        if flags_obj.enable_checkpoint_and_export:
            ckpt_full_path = os.path.join(flags_obj.model_dir,
                                          'model.ckpt-{epoch:04d}')
        # Short-run mode: cap steps and force a single epoch.
        if train_epochs <= 1 and flags_obj.train_steps:
            steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
            train_epochs = 1
        num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                          flags_obj.batch_size)
        train_output = model(train_input, training=True)
        scc_loss = tf.keras.losses.SparseCategoricalCrossentropy()
        loss = scc_loss(train_target, train_output)
        # Include both trainable variables and trainable resource variables.
        var_list = variables.trainable_variables() + \
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        grad = optimizer.get_gradients(loss, var_list)
        train_op = optimizer.apply_gradients(zip(grad, var_list))
        #####################################################################
        # Create distributed session.
        #   Instead of using the original TensorFlow session for graph execution,
        #   let's use AutoDist's distributed session, in which a computational
        #   graph for distributed training is constructed.
        #
        # [original line]
        # >>> sess = tf.compat.v1.Session()
        #
        # FIX: this assignment was commented out while `sess.run` is used
        # below, which raised NameError at runtime; restore it.
        sess = autodist.create_distributed_session()
        #####################################################################
        summary = TimeHistory(flags_obj.batch_size, steps_per_epoch)
        for epoch_id in range(train_epochs):
            summary.on_epoch_begin(epoch_id)
            for batch_id in range(steps_per_epoch):
                summary.on_batch_begin(batch_id)
                loss_v, _ = sess.run([loss, train_op])
                summary.on_batch_end(batch_id, loss_v)
            summary.on_epoch_end(epoch_id)
        summary.on_train_end()
    return