def main(*args, **kwargs):
    """Train a two-layer MLP on MNIST with MoXing and export it for TF Serving."""
    check_dataset()
    mnist = input_data.read_data_sets(flags.data_url, one_hot=True)

    def input_fn(run_mode, **kwargs):
        """Return one (image, label) batch tensor pair from the training split."""

        def generate_batches():
            # Endless stream of batches; MoXing stops via max_number_of_steps.
            while True:
                yield mnist.train.next_batch(flags.batch_size)

        dataset = tf.data.Dataset.from_generator(
            generate_batches,
            output_types=(tf.float32, tf.int64),
            output_shapes=(tf.TensorShape([None, 784]),
                           tf.TensorShape([None, 10])))
        return dataset.make_one_shot_iterator().get_next()

    def model_fn(inputs, run_mode, **kwargs):
        """Build the MLP, its loss/accuracy, and the serving export spec."""
        x, y_ = inputs
        hidden = tf.keras.layers.Dense(128, activation='relu')(x)
        logits = tf.keras.layers.Dense(10)(hidden)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
        is_correct = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
        export_spec = mox.ExportSpec(inputs_dict={'images': x},
                                     outputs_dict={'logits': logits},
                                     version='model')
        return mox.ModelSpec(loss=loss,
                             log_info={'loss': loss, 'accuracy': accuracy},
                             export_spec=export_spec)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn('adam', learning_rate=0.01),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=flags.batch_size,
            auto_batch=False,
            log_dir=flags.train_url,
            max_number_of_steps=1000,
            log_every_n_steps=10,
            export_model=mox.ExportKeys.TF_SERVING)
def main(*args):
    """Train a single-layer softmax regression on MNIST and export it."""
    mnist = input_data.read_data_sets(flags.data_url, one_hot=True)

    def input_fn(run_mode, **kwargs):
        """Return one (image, label) batch tensor pair (batch size 50)."""

        def generate_batches():
            # Endless stream; training length is bounded by max_number_of_steps.
            while True:
                yield mnist.train.next_batch(50)

        dataset = tf.data.Dataset.from_generator(
            generate_batches,
            output_types=(tf.float32, tf.int64),
            output_shapes=(tf.TensorShape([None, 784]),
                           tf.TensorShape([None, 10])))
        return dataset.make_one_shot_iterator().get_next()

    def model_fn(inputs, run_mode, **kwargs):
        """Linear model y = xW + b with cross-entropy loss and accuracy."""
        x, y_ = inputs
        W = tf.get_variable(name='W', initializer=tf.zeros([784, 10]))
        b = tf.get_variable(name='b', initializer=tf.zeros([10]))
        y = tf.matmul(x, W) + b
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
        predictions = tf.argmax(y, 1)
        hits = tf.equal(predictions, tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
        export_spec = mox.ExportSpec(inputs_dict={'images': x},
                                     outputs_dict={'predictions': predictions},
                                     version='model')
        return mox.ModelSpec(loss=loss,
                             log_info={'loss': loss, 'accuracy': accuracy},
                             export_spec=export_spec)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=50,
            auto_batch=False,
            log_dir=flags.train_url,
            max_number_of_steps=1000,
            log_every_n_steps=10,
            export_model=mox.ExportKeys.TF_SERVING)
def main(*args):
    """Dispatch to training, or to evaluation + prediction with CSV output.

    Relies on module-level `input_fn`, `model_fn`, `output_fn`, `submission`,
    and the NUM_SAMPLES_* constants defined elsewhere in this file.
    """
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    # Global batch = per-device batch * devices * workers.
    steps_per_epoch = int(
        round(
            math.ceil(
                float(NUM_SAMPLES_TRAIN) /
                (flags.batch_size * num_gpus * num_workers))))
    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120,
                export_model=mox.ExportKeys.TF_SERVING)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)
        # Write results to file. tf.gfile allow writing file to EBS/s3
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
batch_size = 100 num_batches = mnist.test.num_examples // batch_size def gen(): for _ in range(num_batches): yield mnist.test.next_batch(batch_size) ds = tf.data.Dataset.from_generator( gen, output_types=(tf.float32, tf.int64), output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None, 10]))) return ds.make_one_shot_iterator().get_next() def model_fn(inputs, run_mode, **kwargs): x, y_ = inputs W = tf.get_variable(name='W', initializer=tf.zeros([784, 10])) b = tf.get_variable(name='b', initializer=tf.zeros([10])) y = tf.matmul(x, W) + b correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) return mox.ModelSpec(log_info={'accuracy': accuracy}) mox.run(input_fn=input_fn, model_fn=model_fn, run_mode=mox.ModeKeys.EVAL, checkpoint_path=flags.train_url, max_number_of_steps=sys.maxint)
x, y_ = inputs W = tf.get_variable(name='W', initializer=tf.zeros([784, 10])) b = tf.get_variable(name='b', initializer=tf.zeros([10])) y = tf.matmul(x, W) + b cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) predictions = tf.argmax(y, 1) correct_predictions = tf.equal(predictions, tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) export_spec = mox.ExportSpec(inputs_dict={'images': x}, outputs_dict={'predictions': predictions}) return mox.ModelSpec(loss=cross_entropy, log_info={ 'loss': cross_entropy, 'accuracy': accuracy }, export_spec=export_spec) if __name__ == '__main__': mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01), run_mode=mox.ModeKeys.TRAIN, batch_size=50, auto_batch=False, log_dir=flags.train_url, max_number_of_steps=1000, log_every_n_steps=10, export_model=mox.ExportKeys.TF_SERVING)
def main(*args):
    """End-to-end iceberg classifier: stage data to local cache, then
    train, or evaluate + predict and write a Kaggle-style submission CSV."""
    _data_url = flags.data_url
    _train_url = flags.train_url
    if not mox.file.is_directory(_train_url):
        mox.file.make_dirs(_train_url)
    # Stage inputs/outputs on local disk; sync outputs back on exit.
    mox.file.make_dirs('/cache/data_url')
    mox.file.make_dirs('/cache/train_url')
    mox.file.copy_parallel(_data_url, '/cache/data_url')
    mox.file.copy_parallel(_train_url, '/cache/train_url')
    flags.data_url = '/cache/data_url'
    flags.train_url = '/cache/train_url'
    atexit.register(
        lambda: mox.file.copy_parallel('/cache/train_url', _train_url))

    # Drop any pre-installed logging handlers.
    logger = logging.getLogger()
    while logger.handlers:
        logger.handlers.pop()

    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))
    submission = pd.DataFrame(columns=['id', 'is_iceberg'])

    def input_fn(run_mode, **kwargs):
        """Decode TFRecords into (image, id-or-label, angle) tensors."""
        if run_mode == mox.ModeKeys.TRAIN:
            num_samples = NUM_SAMPLES_TRAIN
            num_epochs = None
            shuffle = True
            file_pattern = 'iceberg-train-*.tfrecord'
        else:
            num_epochs = 1
            shuffle = False
            if run_mode == mox.ModeKeys.EVAL:
                num_samples = NUM_SAMPLES_EVAL
                file_pattern = 'iceberg-eval-*.tfrecord'
            else:
                num_samples = NUM_SAMPLES_TEST
                file_pattern = 'iceberg-test-*.tfrecord'

        keys_to_features = {
            'band_1':
                tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'band_2':
                tf.FixedLenFeature((75 * 75, ), tf.float32, default_value=None),
            'angle':
                tf.FixedLenFeature([1], tf.float32, default_value=None),
        }
        items_to_handlers = {
            'band_1': slim.tfexample_decoder.Tensor('band_1', shape=[75, 75]),
            'band_2': slim.tfexample_decoder.Tensor('band_2', shape=[75, 75]),
            'angle': slim.tfexample_decoder.Tensor('angle', shape=[])
        }
        # Test records carry an `id`, train/eval records carry a `label`.
        if run_mode == mox.ModeKeys.PREDICT:
            keys_to_features['id'] = tf.FixedLenFeature([1],
                                                        tf.string,
                                                        default_value=None)
            items_to_handlers['id'] = slim.tfexample_decoder.Tensor('id',
                                                                    shape=[])
        else:
            keys_to_features['label'] = tf.FixedLenFeature([1],
                                                           tf.int64,
                                                           default_value=None)
            items_to_handlers['label'] = slim.tfexample_decoder.Tensor(
                'label', shape=[])

        dataset = mox.get_tfrecord(dataset_dir=flags.data_url,
                                   file_pattern=file_pattern,
                                   num_samples=num_samples,
                                   keys_to_features=keys_to_features,
                                   items_to_handlers=items_to_handlers,
                                   num_epochs=num_epochs,
                                   shuffle=shuffle)

        if run_mode == mox.ModeKeys.PREDICT:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'id', 'angle'])
            # Non-DMA safe string cannot tensor may not be copied to a GPU.
            # So we encode string to a list of integer.
            id_or_label = tf.py_func(
                lambda s: np.array([ord(ch) for ch in s]), [id_or_label],
                tf.int64)
            # We know `id` is a string of 8 alphabets.
            id_or_label = tf.reshape(id_or_label, shape=(8, ))
        else:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'label', 'angle'])

        band_3 = band_1 + band_2

        def rescale(*bands):
            """Min-max normalize each band to [0, 1]."""
            normalized = []
            for image in bands:
                image = tf.cast(image, tf.float32)
                image_min = tf.reduce_min(image)
                image_max = tf.reduce_max(image)
                image = (image - image_min) / (image_max - image_min)
                normalized.append(image)
            return normalized

        band_1, band_2, band_3 = rescale(band_1, band_2, band_3)
        image = tf.stack([band_1, band_2, band_3], axis=2)

        # Data augmentation: random flips and a random 0/90/180 rotation.
        if run_mode == mox.ModeKeys.TRAIN:
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_flip_up_down(image)
            image = tf.image.rot90(image,
                                   k=tf.random_uniform(shape=(),
                                                       maxval=3,
                                                       minval=0,
                                                       dtype=tf.int32))
        return image, id_or_label, angle

    def model_v1(images, angles, run_mode):
        """Four conv blocks + angle feature + two dense layers -> 2-way logits."""
        is_training = (run_mode == mox.ModeKeys.TRAIN)
        # Conv block 1
        x = Conv2D(64,
                   kernel_size=(3, 3),
                   activation='relu',
                   input_shape=(75, 75, 3))(images)
        x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv block 2
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv block 3
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv block 4
        x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Flatten and append the incidence angle as an extra feature.
        x = Flatten()(x)
        x = Concatenate()([x, angles])
        # Dense block 1
        x = Dense(512)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Dense block 2
        x = Dense(256)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Output logits (2 classes).
        logits = Dense(2)(x)
        return logits

    def model_fn(inputs, run_mode, **kwargs):
        # In train or eval, id_or_labels represents labels. In predict,
        # id_or_labels represents id.
        images, id_or_labels, angles = inputs
        # Reshape angles from [batch_size] to [batch_size, 1].
        angles = tf.expand_dims(angles, 1)
        logits = model_v1(images, angles, run_mode)
        if run_mode == mox.ModeKeys.PREDICT:
            logits = tf.nn.softmax(logits)
            # Clip probabilities to reduce log-loss penalty on confident errors.
            logits = tf.clip_by_value(logits,
                                      clip_value_min=0.05,
                                      clip_value_max=0.95)
            model_spec = mox.ModelSpec(output_info={
                'id': id_or_labels,
                'logits': logits
            })
        else:
            labels_one_hot = slim.one_hot_encoding(id_or_labels, 2)
            loss = tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels_one_hot,
                label_smoothing=0.0,
                weights=1.0)
            model_spec = mox.ModelSpec(loss=loss, log_info={'loss': loss})
        return model_spec

    def output_fn(outputs):
        """Accumulate prediction rows into the submission DataFrame."""
        # NOTE(review): `global` rebinds a module-level name; `submission` is
        # assigned in main() above — confirm it is also declared at module level.
        global submission
        for output in outputs:
            for sample_id, logits in zip(output['id'], output['logits']):
                # Decode id from integer list back to string.
                sample_id = ''.join([chr(ch) for ch in sample_id])
                # Probability of label == 1 (iceberg).
                is_iceberg = logits[1]
                df = pd.DataFrame([[sample_id, is_iceberg]],
                                  columns=['id', 'is_iceberg'])
                submission = submission.append(df)

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)
        # Write results to file. tf.gfile allow writing file to EBS/s3
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
# Decode id from integer list to string. id = ''.join([chr(ch) for ch in id]) # Get the probability of label==1 is_iceberg = logits[1] df = pd.DataFrame([[id, is_iceberg]], columns=['id', 'is_iceberg']) submission = submission.append(df) if __name__ == '__main__': if flags.is_training: mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=mox.get_optimizer_fn(name='adam', learning_rate=0.001), run_mode=mox.ModeKeys.TRAIN, batch_size=flags.batch_size, log_dir=flags.train_url, max_number_of_steps=steps_per_epoch * 150, log_every_n_steps=20, save_summary_steps=50, save_model_secs=120) else: mox.run(input_fn=input_fn, model_fn=model_fn, run_mode=mox.ModeKeys.EVAL, batch_size=5, log_every_n_steps=1, max_number_of_steps=int(NUM_SAMPLES_EVAL / 5), checkpoint_path=flags.train_url) mox.run(input_fn=input_fn, output_fn=output_fn,
gen, output_types=(tf.float32, tf.int64), output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None, 10]))) return ds.make_one_shot_iterator().get_next() def model_fn(inputs, run_mode, **kwargs): x, y_ = inputs W = tf.get_variable(name='W', initializer=tf.zeros([784, 10])) b = tf.get_variable(name='b', initializer=tf.zeros([10])) y = tf.matmul(x, W) + b cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) predictions = tf.argmax(y, 1) correct_predictions = tf.equal(predictions, tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32)) export_spec = mox.ExportSpec(inputs_dict={'images': x}, outputs_dict={'predictions': predictions}) return mox.ModelSpec(loss=cross_entropy, log_info={'loss': cross_entropy, 'accuracy': accuracy}, export_spec=export_spec) if __name__ == '__main__': mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=mox.get_optimizer_fn('sgd', learning_rate=0.01), run_mode=mox.ModeKeys.TRAIN, batch_size=50, auto_batch=False, log_dir=flags.train_url, max_number_of_steps=1000, log_every_n_steps=10, export_model=mox.ExportKeys.TF_SERVING)
def main(*args, **kwargs):
    """Progressive-resizing ImageNet training with MoXing.

    Stages the dataset to local cache when requested, builds a batch-size /
    image-size schedule, and trains with a dynamic-momentum optimizer.
    """
    if flags.use_controller:
        convert_ps_to_controller()
    job_name = mox.get_flag('job_name')
    task_index = mox.get_flag('task_index')

    if flags.local_cache == 'hard':
        if flags.use_controller:
            # In all-reduce mode, worker-0 does not download dataset
            # (controller-0 will download).
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'worker' and task_index == 0))
        else:
            # PS does not download dataset.
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'ps'))
        log_dir = '/cache/cache-outputs'
    else:
        imagenet_data = flags.data_url
        imagenet_160_data = flags.data_url_160
        log_dir = flags.train_url
    print('download dataset finish at %s' % time.time())

    # Only the chief (or a single-process job) writes logs/checkpoints.
    if (not job_name or
            (job_name == 'worker' and task_index == 0)) and flags.train_url:
        if not mox.file.is_directory(log_dir):
            mox.file.make_dirs(log_dir)
    else:
        log_dir = None

    model_meta = mox.get_model_meta(flags.model_name)
    labels_offset = model_meta.default_labels_offset
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    assert flags.bs_and_ims_strategy is not None
    # Schedule entries are (last_step, image_size, batch_size) tuples.
    schedule = config_bs_ims(flags.bs_and_ims_strategy)
    max_step = int(schedule[-1][0])

    def input_fn(mode, **kwargs):
        """Build the progressive dataset, or synthetic tensors for benchmarks."""
        if not flags.synthetic:
            ds_strategy_spec = []
            ds_switch_steps = []
            if flags.split_dataset_like_mxnet and mox.get_flag('job_name'):
                if num_workers == 4:
                    file_pattern = 'train-*-of-*-node-%d-*-*' % task_index
                elif num_workers == 8:
                    file_pattern = 'train-*-of-*-node-*-%d-*' % task_index
                elif num_workers == 16:
                    file_pattern = 'train-*-of-*-node-*-*-%d' % task_index
                else:
                    raise ValueError('num_workers should be 4, 8, 16')
            else:
                file_pattern = flags.file_pattern

            for step, ims, bs in schedule:
                # switch to next dataset 2 steps earlier because there are
                # 2 pipeline prefetch queue
                ds_switch_steps.append(step - 2)
                if ims == 128:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_160_data, file_pattern), bs,
                         ims, 0.08))
                elif ims == 224:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data, file_pattern), bs, ims,
                         0.087))
                elif ims == 288:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data, file_pattern), bs, ims,
                         0.5))
                else:
                    raise ValueError('image is not in [128, 224, 288]')
            # The last stage of dataset does not need to be switched
            ds_switch_steps.pop(-1)
            tf.logging.info('Dataset will be switched at step: %s' %
                            ds_switch_steps)

            dataset = ProgressiveImagenetDataset(
                num_samples=flags.num_samples,
                strategy_spec=ds_strategy_spec,
                ds_switch_steps=ds_switch_steps,
                shuffle=True,
                num_parallel=flags.num_readers,
                labels_offset=labels_offset,
                private_num_threads=flags.private_num_threads,
                shuffle_buffer_size=512 * 8 * 2)
            image, label = dataset.get(['image', 'label'])
            image_shape = tf.shape(image)[2]
            batch_size = tf.shape(label)[0]
            tf.summary.scalar(name='image_shape', tensor=image_shape)
            tf.summary.scalar(name='batch_size', tensor=batch_size)
        else:
            import numpy as np
            image = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            label = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.split_to_device:
            input_spec = mox.InputSpec(split_to_device=True)
            input_spec.new_input([image, label])
            return input_spec
        else:
            return image, label

    def model_fn(inputs, mode, **kwargs):
        """Build the classification model, loss, and top-1/top-5 metrics."""
        if not flags.gpu_synthetic:
            if flags.split_to_device:
                images, labels = inputs.get_input(0)
            else:
                images, labels = inputs
        else:
            import numpy as np
            images = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            labels = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))

        if flags.fp16:
            images = tf.cast(images, tf.float16)

        def preprocess_fn(images, run_mode, *args):
            # Normalize with per-channel mean/std (defined at module level),
            # then optionally transpose NHWC -> NCHW.
            images = images / 255.0
            channels = tf.split(axis=3, num_or_size_splits=3, value=images)
            for i in range(3):
                channels[i] = (channels[i] - mean[i]) / std[i]
            images = tf.concat(axis=3, values=channels)
            if flags.data_format == 'NCHW':
                images = tf.transpose(images, perm=(0, 3, 1, 2))
            return images

        model_kwargs = {}
        if flags.model_name == 'resnet_v1_50_8k':
            if flags.official_stride:
                model_kwargs['official'] = True
            if flags.fastai_initializer:
                model_kwargs['weights_initializer_params'] = {
                    'factor': 2.0 / 1.3,
                    'mode': 'FAN_OUT'
                }

        mox_model_fn = mox.get_model_fn(name=flags.model_name,
                                        run_mode=mode,
                                        num_classes=1000,
                                        preprocess_fn=preprocess_fn,
                                        weight_decay=flags.weight_decay,
                                        data_format=flags.data_format,
                                        batch_norm_fused=True,
                                        batch_renorm=False,
                                        **model_kwargs)
        logits, end_points = mox_model_fn(images)

        labels_one_hot = slim.one_hot_encoding(labels, 1000)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot,
                                               label_smoothing=0.0,
                                               weights=1.0)
        # Metrics are computed in fp32 even when the model runs in fp16.
        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy_top_1 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        accuracy_top_5 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 5), tf.float32))
        log_info = {
            'ent_loss': loss,
            'top-1': accuracy_top_1,
            'top-5': accuracy_top_5
        }

        # With 'dymomentumw' the optimizer applies decoupled weight decay, so
        # regularization losses are not added to the training loss.
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(regularization_losses
               ) > 0 and flags.use_optimizer != 'dymomentumw':
            regularization_loss = tf.add_n(regularization_losses)
            log_info['reg_loss'] = regularization_loss
            loss = loss + regularization_loss
        log_info['total_loss'] = loss
        return mox.ModelSpec(loss=loss, log_info=log_info)

    if flags.strict_sync_replicas:
        mox.set_flag('sync_replicas', False)
        mox.set_flag('chief_inc_global_step', True)

    def optimizer_fn():
        """Create the LR schedule and dynamic-momentum optimizer."""
        global_step = tf.train.get_or_create_global_step()
        decay_end = 1.0 - flags.cooldown

        if flags.use_lr_schedule == 'lcd':
            lr = linear_cosine_decay(flags.max_lr, flags.min_lr, global_step,
                                     max_step, flags.warmup, decay_end)
            print("Using Linear Cosine Decay Schedule")
        elif flags.use_lr_schedule == 'poly':
            lr = polynomial_decay(flags.max_lr, flags.min_lr, global_step,
                                  max_step, flags.warmup, decay_end)
            print("Using Polynomial Decay Schedule")
        else:
            raise ValueError("lr schedule not provided")

        if flags.use_optimizer == 'dymomentum':
            opt = DyMomentumOptimizer(lr,
                                      flags.max_lr,
                                      flags.min_lr,
                                      max_mom=flags.max_mom,
                                      min_mom=flags.min_mom,
                                      global_step=global_step,
                                      max_iteration=max_step,
                                      use_nesterov=flags.use_nesterov,
                                      cooldown=flags.cooldown,
                                      use_lars=flags.use_lars,
                                      weight_decay=flags.weight_decay)
            print("Using Dynamic Momentum Optimizer")
        elif flags.use_optimizer == 'dymomentumw':
            opt = DyMomentumWOptimizer(lr,
                                       flags.max_lr,
                                       flags.min_lr,
                                       max_mom=flags.max_mom,
                                       min_mom=flags.min_mom,
                                       global_step=global_step,
                                       max_iteration=max_step,
                                       use_nesterov=flags.use_nesterov,
                                       cooldown=flags.cooldown,
                                       use_lars=flags.use_lars,
                                       weight_decay=flags.weight_decay)
            print("Using Dynamic MomentumW Optimizer")
        else:
            raise ValueError("Optimizer not provided")

        tf.summary.scalar(name='momentum', tensor=opt.get_momentum())

        if flags.strict_sync_replicas:
            from moxing.tensorflow.optimizer.simple_sync_optimizer import SimpleSyncOptimizer
            opt = SimpleSyncOptimizer(opt,
                                      num_workers=num_workers,
                                      task_index=task_index)
        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=flags.run_mode,
            batch_size=flags.batch_size,
            max_number_of_steps=max_step,
            log_every_n_steps=flags.log_every_n_steps,
            log_dir=log_dir,
            auto_batch=False,
            save_summary_steps=flags.save_summary_steps,
            checkpoint_path=flags.checkpoint_url,
            save_model_secs=flags.save_model_secs)
    print('upload model finish at %s' % time.time())

    if flags.local_cache == 'hard' and log_dir:
        mox.file.copy_parallel(log_dir, flags.train_url)
    print('Training job finish at: %s' % time.time())
def main(_):
    """Train ResNet-50 on a raw image-classification dataset with MoXing.

    Bug fix: `sys.maxint` was removed in Python 3; use `sys.maxsize` to keep
    the "run until the dataset is exhausted" behavior. Comments translated
    to English.
    """
    # Number of GPUs per worker and number of workers in the cluster.
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)

    def input_fn(mode):
        # Data augmentation following the resnet50 paper.
        augmentation_fn = mox.get_data_augmentation_fn(name='resnet_v1_50',
                                                       run_mode=mode,
                                                       output_height=224,
                                                       output_width=224)
        # Raw-image dataset reader with the augmentation applied;
        # reads at most 20 epochs.
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=augmentation_fn)
        image, label = dataset.get(['image', 'label'])
        return image, label

    def model_fn(inputs, mode):
        images, labels = inputs
        # Build resnet_v1_50 on `images`; returns (logits, end_points) —
        # end_points are not needed here, only logits.
        logits, _ = mox.get_model_fn(name='resnet_v1_50',
                                     run_mode=mode,
                                     num_classes=data_meta.num_classes,
                                     weight_decay=0.00004)(images)
        # Cross-entropy loss.
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot)
        # Add regularization losses. mox.get_collection must be used here
        # instead of tf.get_collection.
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss
        # Top-1 classification accuracy.
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))
        # Return the MoXing-TensorFlow model definition class ModelSpec.
        return mox.ModelSpec(loss=loss,
                             log_info={
                                 'loss': loss,
                                 'accuracy': accuracy
                             })

    def optimizer_fn():
        # Piecewise learning rate: 0.01 for epochs 0-10, 0.001 for epochs 10-20.
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        return tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            # sys.maxint does not exist in Python 3 (would raise
            # AttributeError); sys.maxsize keeps the intended effect of
            # training until the 20-epoch dataset runs out.
            max_number_of_steps=sys.maxsize,
            log_dir=flags.train_url)
global submission for output in outputs: for id, logits in zip(output['id'], output['logits']): # Decode id from integer list to string. id = ''.join([chr(ch) for ch in id]) # Get the probability of label==1 is_iceberg = logits[1] df = pd.DataFrame([[id, is_iceberg]], columns=['id', 'is_iceberg']) submission = submission.append(df) if __name__ == '__main__': if flags.is_training: mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=mox.get_optimizer_fn(name='adam', learning_rate=0.001), run_mode=mox.ModeKeys.TRAIN, batch_size=flags.batch_size, log_dir=flags.log_dir, max_number_of_steps=steps_per_epoch * 150, log_every_n_steps=20, save_summary_steps=50, save_model_secs=120) else: mox.run(input_fn=input_fn, model_fn=model_fn, run_mode=mox.ModeKeys.EVAL, batch_size=5, log_every_n_steps=1, max_number_of_steps=int(NUM_SAMPLES_EVAL / 5), checkpoint_path=flags.log_dir) mox.run(input_fn=input_fn, output_fn=output_fn, model_fn=model_fn,
ds = tf.data.Dataset.from_generator( gen, output_types=(tf.float32, tf.int64), output_shapes=(tf.TensorShape([None, 784]), tf.TensorShape([None, 10]))) x, y_ = ds.make_one_shot_iterator().get_next() return x, y_ def model_fn(inputs, run_mode, **kwargs): x, y_ = inputs W = tf.get_variable(name='W', initializer=tf.zeros([784, 10])) b = tf.get_variable(name='b', initializer=tf.zeros([10])) y = tf.matmul(x, W) + b cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) return mox.ModelSpec(loss=cross_entropy, log_info={'loss': cross_entropy}) def optimizer_fn(): return tf.train.GradientDescentOptimizer(0.5) mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=optimizer_fn, run_mode=mox.ModeKeys.TRAIN, log_dir=flags.train_url, max_number_of_steps=sys.maxint)
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)) from efficient_ai.config import CompressorSpec compressor_spec = CompressorSpec(logits=y) return mox.ModelSpec(loss=cross_entropy, compressor_spec=compressor_spec, log_info={'loss': cross_entropy}) def optimizer_fn(): return tf.train.GradientDescentOptimizer(0.5) mox.run(input_fn=input_fn, model_fn=model_fn, optimizer_fn=optimizer_fn, run_mode=mox.ModeKeys.TRAIN, log_dir=flags.train_url, max_number_of_steps=500, batch_size=12) def dcp(): from efficient_ai.config import DCPCompressorConfig run_mode = mox.ModeKeys.TRAIN log_dir = flags.train_url max_number_of_steps = 500 num_classes = 10 batch_size = None new_log_dir = os.path.join(log_dir, 'dcp') config = DCPCompressorConfig(
def main(*args, **kwargs):
    """Fine-tune a classification model in fp16, export it for TF Serving,
    and write a labels.txt next to the exported model.

    Bug fix: `sys.maxint` was removed in Python 3; use `sys.maxsize` so the
    job trains until the 20-epoch dataset is exhausted instead of crashing
    with AttributeError at startup.
    """
    import time
    st = time.time()
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    # Restore from checkpoint while skipping the global step and the final
    # logits layer (class count differs from the pretrained model).
    exclude_list = ['global_step']
    model_meta = mox.get_model_meta(flags.model_name)
    exclude_list.append(model_meta.default_logits_pattern)
    checkpoint_exclude_patterns = ','.join(exclude_list)
    mox.set_flag('checkpoint_exclude_patterns', checkpoint_exclude_patterns)

    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)
    labels_list = data_meta.labels_list
    # Static loss scaling for fp16 training.
    mox.set_flag('loss_scale', 1024.0)

    def input_fn(mode, **kwargs):
        """Read raw images with the model's default augmentation, 20 epochs."""
        data_augmentation_fn = mox.get_data_augmentation_fn(
            name=flags.model_name, run_mode=mode)
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=data_augmentation_fn,
            reader_class=mox.AsyncRawGenerator)
        images, labels = dataset.get(['image', 'label'])
        return images, labels

    def model_fn(inputs, mode, **kwargs):
        """Build the fp16 model, loss, accuracy, and the serving export spec."""
        images, labels = inputs
        # cpu cannot support model infer with `NCHW`, gpu support both
        if mode == mox.ModeKeys.EXPORT:
            data_format = 'NHWC'
        else:
            data_format = 'NCHW'
        mox_model_fn = mox.get_model_fn(name=flags.model_name,
                                        run_mode=mode,
                                        num_classes=data_meta.num_classes,
                                        weight_decay=0.00004,
                                        data_format=data_format,
                                        batch_norm_fused=True)
        # Run the network in fp16 while keeping variables in fp32.
        images_fp16 = tf.cast(images, tf.float16)
        with mox.var_scope(force_dtype=tf.float32):
            logits, _ = mox_model_fn(images_fp16)
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(labels_one_hot, logits=logits)
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss
        # Metrics and the exported logits are computed in fp32.
        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        export_spec = mox.ExportSpec(inputs_dict={'images': images},
                                     outputs_dict={'logits': logits_fp32},
                                     version='model')
        return mox.ModelSpec(loss=loss,
                             log_info={
                                 'loss': loss,
                                 'accuracy': accuracy
                             },
                             export_spec=export_spec)

    def optimizer_fn():
        # Piecewise LR: 0.01 for epochs 0-10, 0.001 for epochs 10-20.
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            log_dir=flags.train_url,
            checkpoint_path=flags.checkpoint_url,
            # sys.maxint does not exist in Python 3; sys.maxsize preserves
            # the "train until the dataset is exhausted" intent.
            max_number_of_steps=sys.maxsize,
            export_model=mox.ExportKeys.TF_SERVING)

    # for model infer in ModelArts console
    with mox.file.File(os.path.join(flags.train_url, 'model', 'labels.txt'),
                       'w') as f:
        f.write('\n'.join(labels_list))
    print(time.time() - st)