def main(*args):
    """Train the model, or evaluate it and write a prediction CSV.

    When ``flags.is_training`` is set, a training job runs for 150 epochs'
    worth of steps and a TF-Serving model is exported. Otherwise the
    checkpoint under ``flags.train_url`` is evaluated, the test set is
    predicted through ``output_fn``, and the ``submission`` frame defined
    outside this function is written to ``submission.csv``.
    """
    num_gpus = mox.get_flag('num_gpus')
    worker_hosts = mox.get_flag('worker_hosts').split(',')
    num_workers = len(worker_hosts)

    # Samples consumed per global step across every GPU of every worker.
    global_batch_size = flags.batch_size * num_gpus * num_workers
    steps_per_epoch = int(
        round(math.ceil(float(NUM_SAMPLES_TRAIN) / global_batch_size)))

    if flags.is_training:
        mox.run(
            input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=mox.get_optimizer_fn(name='adam',
                                              learning_rate=0.001),
            run_mode=mox.ModeKeys.TRAIN,
            batch_size=flags.batch_size,
            log_dir=flags.train_url,
            max_number_of_steps=steps_per_epoch * 150,
            log_every_n_steps=20,
            save_summary_steps=50,
            save_model_secs=120,
            export_model=mox.ExportKeys.TF_SERVING,
        )
        return

    # Evaluation pass over the held-out split, five samples per step.
    eval_steps = int(NUM_SAMPLES_EVAL / 5)
    mox.run(
        input_fn=input_fn,
        model_fn=model_fn,
        run_mode=mox.ModeKeys.EVAL,
        batch_size=5,
        log_every_n_steps=1,
        max_number_of_steps=eval_steps,
        checkpoint_path=flags.train_url,
    )

    # Prediction pass; outputs are handed to output_fn once, after the
    # final step.
    predict_steps = int(NUM_SAMPLES_TEST / 24)
    mox.run(
        input_fn=input_fn,
        output_fn=output_fn,
        model_fn=model_fn,
        run_mode=mox.ModeKeys.PREDICT,
        batch_size=24,
        max_number_of_steps=predict_steps,
        log_every_n_steps=50,
        output_every_n_steps=predict_steps,
        checkpoint_path=flags.train_url,
    )

    # Write results to file. tf.gfile allows writing files to EBS/s3.
    csv_path = os.path.join(flags.train_url, 'submission.csv')
    csv_text = submission.to_csv(path_or_buf=None, index=False)
    with tf.gfile.Open(csv_path, 'w') as out:
        out.write(csv_text)
def config_bs_ims(strategy):
    """Parse a batch-size / image-size strategy string.

    Two formats are understood:

    * ``"<image_size>-<batch_size>"`` (no ``:`` anywhere): returns a
      one-element list ``[(image_size, batch_size)]`` with both values as
      floats. Note that these are 2-tuples, unlike the staged form below.
    * ``"<epoch>:<image_size>-<batch_size>,<epoch>:..."``: returns one
      ``(end_step, image_size, batch_size)`` tuple of ints per stage, where
      ``end_step`` is the cumulative global step at which the stage ends.

    Args:
        strategy: The strategy string in one of the formats above.

    Returns:
        A list of tuples as described above.

    Raises:
        ValueError: If a field is not numeric or a stage does not split into
            the expected number of parts.
    """
    if ":" not in strategy:
        # The simple form needs no cluster information, so the MoXing flags
        # are deliberately not touched on this path.
        image_size, batch_size = strategy.split('-')
        return [(float(image_size), float(batch_size))]

    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    schedule = []
    last_steps, last_epoch = 0, 0
    for stage in strategy.split(","):
        cur_epoch, value = stage.strip().split(':')
        image_size, batch_size = value.strip().split('-')
        cur_epoch = float(cur_epoch)
        image_size = float(image_size)
        batch_size = float(batch_size)

        # Global batch = per-GPU batch * GPUs per worker * workers.
        cur_batch_tot = batch_size * num_gpus * num_workers
        steps_per_epoch = int(
            math.ceil(flags.num_samples / float(cur_batch_tot)))

        # Epoch boundaries may be fractional, so the running step count is
        # kept as a float and only truncated when stored in the schedule.
        cur_steps = steps_per_epoch * (cur_epoch - last_epoch) + last_steps
        schedule.append((int(cur_steps), int(image_size), int(batch_size)))
        last_steps, last_epoch = cur_steps, cur_epoch
    return schedule
def convert_ps_to_controller():
    """Remap a ps/worker cluster layout onto a worker/controller layout.

    The MoXing flags of the current process are rewritten as follows:

        ps-N            -> worker-N
        worker-0        -> controller (GPUs hidden from the process)
        everything else -> logs a message and sleeps
    """
    role = mox.get_flag('job_name')
    index = mox.get_flag('task_index')
    former_ps_hosts = mox.get_flag('ps_hosts')
    former_worker_hosts = mox.get_flag('worker_hosts')

    # The old parameter servers become the workers; the first old worker
    # becomes the controller.
    mox.set_flag('ps_hosts', '')
    mox.set_flag('worker_hosts', former_ps_hosts)
    mox.set_flag('controller_host', former_worker_hosts.split(',')[0])

    if role == 'ps':
        tf.logging.info('convert ps to worker')
        mox.set_flag('job_name', 'worker')
        return

    if role == 'worker' and index == 0:
        tf.logging.info('convert worker-0 to controller')
        mox.set_flag('job_name', 'controller')
        # Hide every GPU from this process.
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        return

    # Any remaining server has no role in the new layout.
    tf.logging.info('sleep unused server')
    time.sleep(9999999)
def main(*args):
    """Stage data locally, then train or evaluate/predict the iceberg model.

    The dataset and output directories are mirrored under ``/cache`` and
    ``flags.data_url`` / ``flags.train_url`` are redirected there; the local
    output directory is copied back to the original ``train_url`` at
    interpreter exit.

    With ``flags.is_training`` set, the model is trained for 150 epochs'
    worth of steps. Otherwise the checkpoint is evaluated, the test set is
    predicted and ``submission.csv`` is written to ``flags.train_url``.

    Predictions are accumulated in a list owned by this function. An earlier
    revision used ``global submission`` inside ``output_fn`` while
    ``submission`` was a local of ``main``, so the predictions never reached
    the frame that was written out.
    """
    _data_url = flags.data_url
    _train_url = flags.train_url
    if not mox.file.is_directory(_train_url):
        mox.file.make_dirs(_train_url)
    mox.file.make_dirs('/cache/data_url')
    mox.file.make_dirs('/cache/train_url')
    mox.file.copy_parallel(_data_url, '/cache/data_url')
    mox.file.copy_parallel(_train_url, '/cache/train_url')
    flags.data_url = '/cache/data_url'
    flags.train_url = '/cache/train_url'
    # Copy local outputs back to the original train_url when the process
    # exits.
    atexit.register(
        lambda: mox.file.copy_parallel('/cache/train_url', _train_url))

    # Drop every handler currently attached to the root logger.
    logger = logging.getLogger()
    while logger.handlers:
        logger.handlers.pop()

    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    steps_per_epoch = int(
        math.ceil(
            float(NUM_SAMPLES_TRAIN) /
            (flags.batch_size * num_gpus * num_workers)))

    # One [id, is_iceberg] row per predicted sample; filled by output_fn.
    submission_rows = []

    def input_fn(run_mode, **kwargs):
        """Build the (image, id_or_label, angle) input tensors for a mode."""
        if run_mode == mox.ModeKeys.TRAIN:
            num_samples = NUM_SAMPLES_TRAIN
            num_epochs = None
            shuffle = True
            file_pattern = 'iceberg-train-*.tfrecord'
        else:
            num_epochs = 1
            shuffle = False
            if run_mode == mox.ModeKeys.EVAL:
                num_samples = NUM_SAMPLES_EVAL
                file_pattern = 'iceberg-eval-*.tfrecord'
            else:
                num_samples = NUM_SAMPLES_TEST
                file_pattern = 'iceberg-test-*.tfrecord'

        keys_to_features = {
            'band_1': tf.FixedLenFeature((75 * 75, ), tf.float32,
                                         default_value=None),
            'band_2': tf.FixedLenFeature((75 * 75, ), tf.float32,
                                         default_value=None),
            'angle': tf.FixedLenFeature([1], tf.float32, default_value=None),
        }
        items_to_handlers = {
            'band_1': slim.tfexample_decoder.Tensor('band_1', shape=[75, 75]),
            'band_2': slim.tfexample_decoder.Tensor('band_2', shape=[75, 75]),
            'angle': slim.tfexample_decoder.Tensor('angle', shape=[]),
        }
        # Prediction records carry an id; train/eval records carry a label.
        if run_mode == mox.ModeKeys.PREDICT:
            keys_to_features['id'] = tf.FixedLenFeature(
                [1], tf.string, default_value=None)
            items_to_handlers['id'] = slim.tfexample_decoder.Tensor(
                'id', shape=[])
        else:
            keys_to_features['label'] = tf.FixedLenFeature(
                [1], tf.int64, default_value=None)
            items_to_handlers['label'] = slim.tfexample_decoder.Tensor(
                'label', shape=[])

        dataset = mox.get_tfrecord(dataset_dir=flags.data_url,
                                   file_pattern=file_pattern,
                                   num_samples=num_samples,
                                   keys_to_features=keys_to_features,
                                   items_to_handlers=items_to_handlers,
                                   num_epochs=num_epochs,
                                   shuffle=shuffle)

        if run_mode == mox.ModeKeys.PREDICT:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'id', 'angle'])
            # A string tensor is not DMA-safe and may not be copied to a
            # GPU, so the id is encoded as a list of integer code points.
            id_or_label = tf.py_func(
                lambda raw_id: np.array([ord(ch) for ch in raw_id]),
                [id_or_label], tf.int64)
            # We know `id` is a string of 8 characters.
            id_or_label = tf.reshape(id_or_label, shape=(8, ))
        else:
            band_1, band_2, id_or_label, angle = dataset.get(
                ['band_1', 'band_2', 'label', 'angle'])

        band_3 = band_1 + band_2

        def rescale(*args):
            """Min-max rescale each image to [0, 1] independently.

            There is no guard for a constant image (max == min).
            """
            ret_images = []
            for image in args:
                image = tf.cast(image, tf.float32)
                image_min = tf.reduce_min(image)
                image_max = tf.reduce_max(image)
                image = (image - image_min) / (image_max - image_min)
                ret_images.append(image)
            return ret_images

        band_1, band_2, band_3 = rescale(band_1, band_2, band_3)
        image = tf.stack([band_1, band_2, band_3], axis=2)

        # Data augmentation (training only): random flips and rotation.
        if run_mode == mox.ModeKeys.TRAIN:
            image = tf.image.random_flip_left_right(image)
            image = tf.image.random_flip_up_down(image)
            image = tf.image.rot90(image,
                                   k=tf.random_uniform(shape=(),
                                                       maxval=3,
                                                       minval=0,
                                                       dtype=tf.int32))
        return image, id_or_label, angle

    def model_v1(images, angles, run_mode):
        """Four conv blocks, angle concatenation, two dense layers, logits."""
        is_training = (run_mode == mox.ModeKeys.TRAIN)
        # Conv Layer 1
        x = Conv2D(64, kernel_size=(3, 3), activation='relu',
                   input_shape=(75, 75, 3))(images)
        x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 2
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 3
        x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Conv Layer 4
        x = Conv2D(64, kernel_size=(3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
        x = Dropout(0.2)(x, training=is_training)
        # Flatten the data for the upcoming dense layers and append the angle.
        x = Flatten()(x)
        x = Concatenate()([x, angles])
        # Dense Layer 1
        x = Dense(512)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Dense Layer 2
        x = Dense(256)(x)
        x = Activation('relu')(x)
        x = Dropout(0.2)(x, training=is_training)
        # Output layer: two-class logits (softmax is applied by the caller).
        logits = Dense(2)(x)
        return logits

    def model_fn(inputs, run_mode, **kwargs):
        """Return a ModelSpec with predictions (PREDICT) or a loss."""
        # In train or eval, id_or_labels represents labels.
        # In predict, id_or_labels represents the encoded id.
        images, id_or_labels, angles = inputs
        # Reshape angles from [batch_size] to [batch_size, 1]
        angles = tf.expand_dims(angles, 1)
        # Apply your version of model
        logits = model_v1(images, angles, run_mode)

        if run_mode == mox.ModeKeys.PREDICT:
            logits = tf.nn.softmax(logits)
            # Clip the probabilities to get a lower loss value.
            logits = tf.clip_by_value(logits, clip_value_min=0.05,
                                      clip_value_max=0.95)
            model_spec = mox.ModelSpec(output_info={
                'id': id_or_labels,
                'logits': logits
            })
        else:
            labels_one_hot = slim.one_hot_encoding(id_or_labels, 2)
            loss = tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels_one_hot,
                label_smoothing=0.0,
                weights=1.0)
            model_spec = mox.ModelSpec(loss=loss, log_info={'loss': loss})
        return model_spec

    def output_fn(outputs):
        """Collect one [id, is_iceberg] row per predicted sample."""
        for output in outputs:
            for encoded_id, probs in zip(output['id'], output['logits']):
                # Decode id from integer list back to a string.
                sample_id = ''.join([chr(ch) for ch in encoded_id])
                # Probability of label == 1.
                submission_rows.append([sample_id, probs[1]])

    if flags.is_training:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                optimizer_fn=mox.get_optimizer_fn(name='adam',
                                                  learning_rate=0.001),
                run_mode=mox.ModeKeys.TRAIN,
                batch_size=flags.batch_size,
                log_dir=flags.train_url,
                max_number_of_steps=steps_per_epoch * 150,
                log_every_n_steps=20,
                save_summary_steps=50,
                save_model_secs=120)
    else:
        mox.run(input_fn=input_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.EVAL,
                batch_size=5,
                log_every_n_steps=1,
                max_number_of_steps=int(NUM_SAMPLES_EVAL / 5),
                checkpoint_path=flags.train_url)
        mox.run(input_fn=input_fn,
                output_fn=output_fn,
                model_fn=model_fn,
                run_mode=mox.ModeKeys.PREDICT,
                batch_size=24,
                max_number_of_steps=int(NUM_SAMPLES_TEST / 24),
                log_every_n_steps=50,
                output_every_n_steps=int(NUM_SAMPLES_TEST / 24),
                checkpoint_path=flags.train_url)
        # Write results to file. tf.gfile allows writing files to EBS/s3.
        submission = pd.DataFrame(submission_rows,
                                  columns=['id', 'is_iceberg'])
        submission_file = os.path.join(flags.train_url, 'submission.csv')
        result = submission.to_csv(path_or_buf=None, index=False)
        with tf.gfile.Open(submission_file, 'w') as f:
            f.write(result)
_data_url = flags.data_url _train_url = flags.train_url if not mox.file.is_directory(_train_url): mox.file.make_dirs(_train_url) mox.file.make_dirs('/cache/data_url') mox.file.make_dirs('/cache/train_url') mox.file.copy_parallel(_data_url, '/cache/data_url') mox.file.copy_parallel(_train_url, '/cache/train_url') flags.data_url = '/cache/data_url' flags.train_url = '/cache/train_url' atexit.register(lambda: mox.file.copy_parallel('/cache/train_url', _train_url)) logger = logging.getLogger() while logger.handlers: logger.handlers.pop() num_gpus = mox.get_flag('num_gpus') num_workers = len(mox.get_flag('worker_hosts').split(',')) steps_per_epoch = int( math.ceil( float(NUM_SAMPLES_TRAIN) / (flags.batch_size * num_gpus * num_workers))) submission = pd.DataFrame(columns=['id', 'is_iceberg']) def input_fn(run_mode, **kwargs): if run_mode == mox.ModeKeys.TRAIN: num_samples = NUM_SAMPLES_TRAIN num_epochs = None shuffle = True file_pattern = 'iceberg-train-*.tfrecord' else:
def input_fn(mode, **kwargs):
    """Return the training inputs, either synthetic or from the dataset.

    With ``flags.synthetic`` set, constant random uint8 images and int64
    labels are produced. Otherwise a ``ProgressiveImagenetDataset`` is built
    from the schedule in ``schduler``, one dataset spec per stage.

    Returns:
        A ``mox.InputSpec`` when ``flags.split_to_device`` is set, otherwise
        the ``(image, label)`` pair.
    """
    if flags.synthetic:
        import numpy as np
        image = tf.constant(
            np.random.randint(low=0, high=255,
                              size=[flags.batch_size, 128, 128, 3],
                              dtype=np.uint8))
        label = tf.constant(
            np.random.randint(low=0, high=999,
                              size=[flags.batch_size],
                              dtype=np.int64))
    else:
        if flags.split_dataset_like_mxnet and mox.get_flag('job_name'):
            # The position of the task index in the file name depends on
            # the cluster size.
            node_patterns = {
                4: 'train-*-of-*-node-%d-*-*',
                8: 'train-*-of-*-node-*-%d-*',
                16: 'train-*-of-*-node-*-*-%d',
            }
            if num_workers not in node_patterns:
                raise ValueError('num_workers should be 4, 8, 16')
            file_pattern = node_patterns[num_workers] % task_index
        else:
            file_pattern = flags.file_pattern

        # image size -> (dataset directory, fourth field of the stage spec).
        # NOTE(review): the meaning of the fourth field is defined by
        # ProgressiveImagenetDataset -- confirm there.
        stage_sources = {
            128: (imagenet_160_data, 0.08),
            224: (imagenet_data, 0.087),
            288: (imagenet_data, 0.5),
        }

        ds_strategy_spec = []
        ds_switch_steps = []
        for step, ims, bs in schduler:
            # Switch to the next dataset 2 steps earlier because there are
            # 2 pipeline prefetch queues.
            ds_switch_steps.append(step - 2)
            if ims not in stage_sources:
                raise ValueError('image is not in [128, 224, 288]')
            data_dir, stage_param = stage_sources[ims]
            ds_strategy_spec.append(
                (os.path.join(data_dir, file_pattern), bs, ims, stage_param))

        # The last stage of the dataset does not need to be switched.
        ds_switch_steps.pop()
        tf.logging.info('Dataset will be switched at step: %s' %
                        ds_switch_steps)

        dataset = ProgressiveImagenetDataset(
            num_samples=flags.num_samples,
            strategy_spec=ds_strategy_spec,
            ds_switch_steps=ds_switch_steps,
            shuffle=True,
            num_parallel=flags.num_readers,
            labels_offset=labels_offset,
            private_num_threads=flags.private_num_threads,
            shuffle_buffer_size=512 * 8 * 2)
        image, label = dataset.get(['image', 'label'])

        # Record the current image size and batch size as summaries.
        image_shape = tf.shape(image)[2]
        batch_size = tf.shape(label)[0]
        tf.summary.scalar(name='image_shape', tensor=image_shape)
        tf.summary.scalar(name='batch_size', tensor=batch_size)

    if not flags.split_to_device:
        return image, label

    input_spec = mox.InputSpec(split_to_device=True)
    input_spec.new_input([image, label])
    return input_spec
def main(*args, **kwargs):
    """Run progressive-resizing ImageNet training with MoXing.

    Resolves the cluster role, optionally downloads the datasets to local
    cache, builds the step schedule from ``flags.bs_and_ims_strategy``,
    defines the input, model and optimizer functions and hands them to
    ``mox.run``. When local caching is 'hard' and this process owns a log
    directory, that directory is copied to ``flags.train_url`` afterwards.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost; verify the extent of each ``if`` block against
    the original file.
    """
    if flags.use_controller:
        convert_ps_to_controller()
    # Read the role after the optional conversion above, which may rewrite it.
    job_name = mox.get_flag('job_name')
    task_index = mox.get_flag('task_index')
    if flags.local_cache == 'hard':
        if flags.use_controller:
            # In all-reduce mode, worker-0 does not download dataset (controller-0 will download).
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'worker' and task_index == 0))
        else:
            # PS does not download dataset.
            imagenet_data, imagenet_160_data = download_dataset(
                flags.data_url,
                flags.data_url_160,
                skip_download=(job_name == 'ps'))
        log_dir = '/cache/cache-outputs'
    else:
        imagenet_data = flags.data_url
        imagenet_160_data = flags.data_url_160
        log_dir = flags.train_url
    print('download dataset finish at %s' % time.time())
    # Only a single-process run (no job_name) or worker-0 keeps a log
    # directory, and only when train_url is set; everyone else gets None.
    if (not job_name or
        (job_name == 'worker' and task_index == 0)) and flags.train_url:
        if not mox.file.is_directory(log_dir):
            mox.file.make_dirs(log_dir)
    else:
        log_dir = None
    model_meta = mox.get_model_meta(flags.model_name)
    labels_offset = model_meta.default_labels_offset
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    assert flags.bs_and_ims_strategy is not None
    # Each schedule entry is unpacked below as (step, image_size, batch_size);
    # the first field of the last entry is used as the total step count.
    schduler = config_bs_ims(flags.bs_and_ims_strategy)
    max_step = int(schduler[-1][0])

    def input_fn(mode, **kwargs):
        """Return (image, label) or an InputSpec, real or synthetic."""
        if not flags.synthetic:
            ds_strategy_spec = []
            ds_switch_steps = []
            if flags.split_dataset_like_mxnet and mox.get_flag('job_name'):
                # The position of the task index in the file name depends
                # on the number of workers.
                if num_workers == 4:
                    file_pattern = 'train-*-of-*-node-%d-*-*' % task_index
                elif num_workers == 8:
                    file_pattern = 'train-*-of-*-node-*-%d-*' % task_index
                elif num_workers == 16:
                    file_pattern = 'train-*-of-*-node-*-*-%d' % task_index
                else:
                    raise ValueError('num_workers should be 4, 8, 16')
            else:
                file_pattern = flags.file_pattern
            for step, ims, bs in schduler:
                # switch to next dataset 2 steps earlier because there are 2 pipeline prefetch queue
                ds_switch_steps.append(step - 2)
                # NOTE(review): the fourth tuple field (0.08 / 0.087 / 0.5)
                # is interpreted by ProgressiveImagenetDataset -- confirm
                # its meaning there.
                if ims == 128:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_160_data,
                                      file_pattern), bs, ims, 0.08))
                elif ims == 224:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.087))
                elif ims == 288:
                    ds_strategy_spec.append(
                        (os.path.join(imagenet_data,
                                      file_pattern), bs, ims, 0.5))
                else:
                    raise ValueError('image is not in [128, 224, 288]')
            # The last stage of dataset does not need to be switched
            ds_switch_steps.pop(-1)
            tf.logging.info('Dataset will be switched at step: %s' %
                            ds_switch_steps)
            dataset = ProgressiveImagenetDataset(
                num_samples=flags.num_samples,
                strategy_spec=ds_strategy_spec,
                ds_switch_steps=ds_switch_steps,
                shuffle=True,
                num_parallel=flags.num_readers,
                labels_offset=labels_offset,
                private_num_threads=flags.private_num_threads,
                shuffle_buffer_size=512 * 8 * 2)
            image, label = dataset.get(['image', 'label'])
            # Log the current image size and batch size as summaries.
            image_shape = tf.shape(image)[2]
            batch_size = tf.shape(label)[0]
            tf.summary.scalar(name='image_shape', tensor=image_shape)
            tf.summary.scalar(name='batch_size', tensor=batch_size)
        else:
            # Synthetic input: constant random images and labels.
            import numpy as np
            image = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            label = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))
        if flags.split_to_device:
            input_spec = mox.InputSpec(split_to_device=True)
            input_spec.new_input([image, label])
            return input_spec
        else:
            return image, label

    def model_fn(inputs, mode, **kwargs):
        """Build the classifier and return a ModelSpec with loss and metrics."""
        if not flags.gpu_synthetic:
            if flags.split_to_device:
                images, labels = inputs.get_input(0)
            else:
                images, labels = inputs
        else:
            # Ignore the input pipeline and use constant random data.
            import numpy as np
            images = tf.constant(
                np.random.randint(low=0,
                                  high=255,
                                  size=[flags.batch_size, 128, 128, 3],
                                  dtype=np.uint8))
            labels = tf.constant(
                np.random.randint(low=0,
                                  high=999,
                                  size=[flags.batch_size],
                                  dtype=np.int64))
        if flags.fp16:
            images = tf.cast(images, tf.float16)

        def preprocess_fn(images, run_mode, *args):
            """Scale to [0, 1], normalize per channel, optionally go NCHW."""
            images = images / 255.0
            channels = tf.split(axis=3, num_or_size_splits=3, value=images)
            # `mean` and `std` are not defined in this function; they are
            # expected to come from an enclosing or module scope.
            for i in range(3):
                channels[i] = (channels[i] - mean[i]) / std[i]
            images = tf.concat(axis=3, values=channels)
            if flags.data_format == 'NCHW':
                images = tf.transpose(images, perm=(0, 3, 1, 2))
            return images

        # Extra model arguments that apply only to 'resnet_v1_50_8k'.
        model_kwargs = {}
        if flags.model_name == 'resnet_v1_50_8k':
            if flags.official_stride:
                model_kwargs['official'] = True
            if flags.fastai_initializer:
                model_kwargs['weights_initializer_params'] = {
                    'factor': 2.0 / 1.3,
                    'mode': 'FAN_OUT'
                }
        mox_model_fn = mox.get_model_fn(name=flags.model_name,
                                        run_mode=mode,
                                        num_classes=1000,
                                        preprocess_fn=preprocess_fn,
                                        weight_decay=flags.weight_decay,
                                        data_format=flags.data_format,
                                        batch_norm_fused=True,
                                        batch_renorm=False,
                                        **model_kwargs)
        logits, end_points = mox_model_fn(images)
        labels_one_hot = slim.one_hot_encoding(labels, 1000)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot,
                                               label_smoothing=0.0,
                                               weights=1.0)
        # Accuracy is computed on float32 logits.
        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy_top_1 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        accuracy_top_5 = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 5), tf.float32))
        log_info = {
            'ent_loss': loss,
            'top-1': accuracy_top_1,
            'top-5': accuracy_top_5
        }
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        # The regularization term is not added to the loss when the
        # 'dymomentumw' optimizer is selected.
        if len(regularization_losses
               ) > 0 and flags.use_optimizer != 'dymomentumw':
            regularization_loss = tf.add_n(regularization_losses)
            log_info['reg_loss'] = regularization_loss
            loss = loss + regularization_loss
            log_info['total_loss'] = loss
        return mox.ModelSpec(loss=loss, log_info=log_info)

    if flags.strict_sync_replicas:
        # The optimizer is wrapped in SimpleSyncOptimizer (see optimizer_fn),
        # so MoXing's own sync_replicas flag is switched off here.
        mox.set_flag('sync_replicas', False)
        mox.set_flag('chief_inc_global_step', True)

    def optimizer_fn():
        """Create the optimizer with the configured learning-rate schedule."""
        global_step = tf.train.get_or_create_global_step()
        decay_end = 1.0 - flags.cooldown
        if flags.use_lr_schedule == 'lcd':
            lr = linear_cosine_decay(flags.max_lr, flags.min_lr, global_step,
                                     max_step, flags.warmup, decay_end)
            print("Using Linear Cosine Decay Schedule")
        elif flags.use_lr_schedule == 'poly':
            lr = polynomial_decay(flags.max_lr, flags.min_lr, global_step,
                                  max_step, flags.warmup, decay_end)
            print("Using Polynomial Decay Schedule")
        else:
            raise ValueError("lr schedule not provided")
        # Both optimizer variants receive the same arguments.
        if flags.use_optimizer == 'dymomentum':
            opt = DyMomentumOptimizer(lr,
                                      flags.max_lr,
                                      flags.min_lr,
                                      max_mom=flags.max_mom,
                                      min_mom=flags.min_mom,
                                      global_step=global_step,
                                      max_iteration=max_step,
                                      use_nesterov=flags.use_nesterov,
                                      cooldown=flags.cooldown,
                                      use_lars=flags.use_lars,
                                      weight_decay=flags.weight_decay)
            print("Using Dynamic Momentum Optimizer")
        elif flags.use_optimizer == 'dymomentumw':
            opt = DyMomentumWOptimizer(lr,
                                       flags.max_lr,
                                       flags.min_lr,
                                       max_mom=flags.max_mom,
                                       min_mom=flags.min_mom,
                                       global_step=global_step,
                                       max_iteration=max_step,
                                       use_nesterov=flags.use_nesterov,
                                       cooldown=flags.cooldown,
                                       use_lars=flags.use_lars,
                                       weight_decay=flags.weight_decay)
            print("Using Dynamic MomentumW Optimizer")
        else:
            raise ValueError("Optimizer not provided")
        tf.summary.scalar(name='momentum', tensor=opt.get_momentum())
        if flags.strict_sync_replicas:
            from moxing.tensorflow.optimizer.simple_sync_optimizer import SimpleSyncOptimizer
            opt = SimpleSyncOptimizer(opt,
                                      num_workers=num_workers,
                                      task_index=task_index)
        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=flags.run_mode,
            batch_size=flags.batch_size,
            max_number_of_steps=max_step,
            log_every_n_steps=flags.log_every_n_steps,
            log_dir=log_dir,
            auto_batch=False,
            save_summary_steps=flags.save_summary_steps,
            checkpoint_path=flags.checkpoint_url,
            save_model_secs=flags.save_model_secs)
    print('upload model finish at %s' % time.time())
    # Copy locally cached outputs to train_url; log_dir is None on processes
    # that do not own the log directory.
    if flags.local_cache == 'hard' and log_dir:
        mox.file.copy_parallel(log_dir, flags.train_url)
    print('Training job finish at: %s' % time.time())
def main(_):
    """Train resnet_v1_50 on a raw image-classification dataset.

    Builds the input, model and optimizer functions and runs MoXing in
    TRAIN mode, logging to ``flags.train_url``. The step limit is
    effectively unbounded; the dataset reader is configured for 20 epochs.

    ``sys.maxsize`` is used for the step limit because ``sys.maxint`` does
    not exist on Python 3, while ``sys.maxsize`` exists on both 2 and 3.
    """
    # Get the number of GPUs and worker nodes currently in use.
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))
    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)

    def input_fn(mode):
        """Return one batch of (image, label) tensors."""
        # Create a data augmentation function, implemented following the
        # ResNet-50 paper.
        augmentation_fn = mox.get_data_augmentation_fn(name='resnet_v1_50',
                                                       run_mode=mode,
                                                       output_height=224,
                                                       output_width=224)
        # Create the dataset reader and pass in the augmentation function;
        # at most 20 epochs are read.
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=augmentation_fn)
        image, label = dataset.get(['image', 'label'])
        return image, label

    def model_fn(inputs, mode):
        """Build the model and return its ModelSpec (loss and accuracy)."""
        images, labels = inputs
        # Get a resnet50 model: it takes images and outputs logits and
        # end_points. end_points are not needed here, only logits.
        logits, _ = mox.get_model_fn(name='resnet_v1_50',
                                     run_mode=mode,
                                     num_classes=data_meta.num_classes,
                                     weight_decay=0.00004)(images)
        # Compute the cross-entropy loss.
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(logits=logits,
                                               onehot_labels=labels_one_hot)
        # Get the regularization losses and add them to the loss.
        # mox.get_collection must be used here instead of tf.get_collection.
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss
        # Compute the classification accuracy.
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits, labels, 1), tf.float32))
        # Return the ModelSpec that MoXing-TensorFlow uses to define a model.
        return mox.ModelSpec(loss=loss,
                             log_info={
                                 'loss': loss,
                                 'accuracy': accuracy
                             })

    def optimizer_fn():
        """Momentum optimizer with a piecewise-constant learning rate."""
        # Piecewise learning rate: 0.01 for epochs 0-10, 0.001 for 10-20.
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        return tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            max_number_of_steps=sys.maxsize,
            log_dir=flags.train_url)
import numpy as np import pandas as pd import tensorflow as tf import moxing.tensorflow as mox from tensorflow.python.keras.layers import Conv2D, MaxPooling2D, Dense from tensorflow.python.keras.layers import Dropout, Flatten, Activation, Concatenate slim = tf.contrib.slim NUM_SAMPLES_TRAIN = 1176 NUM_SAMPLES_EVAL = 295 NUM_SAMPLES_TEST = 8424 tf.flags.DEFINE_integer('batch_size', 16, 'Mini-batch size') tf.flags.DEFINE_string('data_url', 's3://zxy/model/zzy', 'Dir of dataset') tf.flags.DEFINE_string('log_dir', 's3://zxy/model/zzy/log', 'Dir of log') tf.flags.DEFINE_boolean('is_training', True, 'True for train. False for eval and predict.') flags = tf.flags.FLAGS num_gpus = mox.get_flag('num_gpus') num_workers = len(mox.get_flag('worker_hosts').split(',')) steps_per_epoch = int(math.ceil(float(NUM_SAMPLES_TRAIN) / (flags.batch_size * num_gpus * num_workers))) submission = pd.DataFrame(columns=['id', 'is_iceberg']) def input_fn(run_mode, **kwargs): if run_mode == mox.ModeKeys.TRAIN: num_samples = NUM_SAMPLES_TRAIN num_epochs = None shuffle = True file_pattern = 'iceberg-train-*.tfrecord' else: num_epochs = 1 shuffle = False if run_mode == mox.ModeKeys.EVAL: num_samples = NUM_SAMPLES_EVAL file_pattern = 'iceberg-eval-*.tfrecord'
def main(*args, **kwargs):
    """Fine-tune a MoXing model in fp16 and export it for TF-Serving.

    The global step and the model's default logits pattern are excluded
    when restoring from ``flags.checkpoint_url``. After training, the label
    names are written to ``<train_url>/model/labels.txt`` and the elapsed
    wall-clock time in seconds is printed.

    ``sys.maxsize`` is used for the step limit because ``sys.maxint`` does
    not exist on Python 3, while ``sys.maxsize`` exists on both 2 and 3.
    """
    import time
    st = time.time()
    num_gpus = mox.get_flag('num_gpus')
    num_workers = len(mox.get_flag('worker_hosts').split(','))

    # Variables matching these patterns are skipped when loading the
    # checkpoint: the step counter and the model's logits layer.
    exclude_list = ['global_step']
    model_meta = mox.get_model_meta(flags.model_name)
    exclude_list.append(model_meta.default_logits_pattern)
    checkpoint_exclude_patterns = ','.join(exclude_list)
    mox.set_flag('checkpoint_exclude_patterns', checkpoint_exclude_patterns)

    data_meta = mox.ImageClassificationRawMetadata(base_dir=flags.data_url)
    labels_list = data_meta.labels_list
    mox.set_flag('loss_scale', 1024.0)

    def input_fn(mode, **kwargs):
        """Return one batch of (images, labels) tensors."""
        data_augmentation_fn = mox.get_data_augmentation_fn(
            name=flags.model_name, run_mode=mode)
        dataset = mox.ImageClassificationRawDataset(
            data_meta,
            batch_size=flags.batch_size,
            num_epochs=20,
            augmentation_fn=data_augmentation_fn,
            reader_class=mox.AsyncRawGenerator)
        images, labels = dataset.get(['image', 'label'])
        return images, labels

    def model_fn(inputs, mode, **kwargs):
        """Build the model and return its ModelSpec, including export info."""
        images, labels = inputs
        # CPU cannot run model inference with `NCHW`; GPU supports both.
        if mode == mox.ModeKeys.EXPORT:
            data_format = 'NHWC'
        else:
            data_format = 'NCHW'
        mox_model_fn = mox.get_model_fn(
            name=flags.model_name,
            run_mode=mode,
            num_classes=data_meta.num_classes,
            weight_decay=0.00004,
            data_format=data_format,
            batch_norm_fused=True)
        # The network is fed float16 images; the variable scope is created
        # with force_dtype=tf.float32.
        images_fp16 = tf.cast(images, tf.float16)
        with mox.var_scope(force_dtype=tf.float32):
            logits, _ = mox_model_fn(images_fp16)
        labels_one_hot = slim.one_hot_encoding(labels, data_meta.num_classes)
        loss = tf.losses.softmax_cross_entropy(labels_one_hot, logits=logits)
        regularization_losses = mox.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses)
        loss = loss + regularization_loss
        # Accuracy and the exported output use float32 logits.
        logits_fp32 = tf.cast(logits, tf.float32)
        accuracy = tf.reduce_mean(
            tf.cast(tf.nn.in_top_k(logits_fp32, labels, 1), tf.float32))
        export_spec = mox.ExportSpec(inputs_dict={'images': images},
                                     outputs_dict={'logits': logits_fp32},
                                     version='model')
        return mox.ModelSpec(loss=loss,
                             log_info={'loss': loss, 'accuracy': accuracy},
                             export_spec=export_spec)

    def optimizer_fn():
        """Momentum optimizer with a piecewise-constant learning rate."""
        lr = learning_rate_scheduler.piecewise_lr(
            '10:0.01,20:0.001',
            num_samples=data_meta.total_num_samples,
            global_batch_size=flags.batch_size * num_gpus * num_workers)
        opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
        return opt

    mox.run(input_fn=input_fn,
            model_fn=model_fn,
            optimizer_fn=optimizer_fn,
            run_mode=mox.ModeKeys.TRAIN,
            log_dir=flags.train_url,
            checkpoint_path=flags.checkpoint_url,
            max_number_of_steps=sys.maxsize,
            export_model=mox.ExportKeys.TF_SERVING)

    # For model inference in the ModelArts console.
    with mox.file.File(os.path.join(flags.train_url, 'model', 'labels.txt'),
                       'w') as f:
        f.write('\n'.join(labels_list))
    print(time.time() - st)