def main():
    """Run YOLO training once, or as a chain of repeated runs.

    All settings are read through ``stparams.get_value``:

    * ``repeat`` - optional number of chained runs. When unset (or otherwise
      falsy) a single ``train_run`` is executed.
    * ``output_dir`` - directory that receives the trained weights. The
      misspelled legacy key ``outpu_dir`` and then ``backup`` are still
      honoured as fallbacks so existing parameter sets keep working.
    * ``weights_file`` - optional pretrained weights for the first run.

    In repeat mode run ``i`` writes ``<output_dir>/mls_final_run_<i>.weights``
    and every run after the first starts from the file written by the
    previous run.
    """
    repeat = stparams.get_value('repeat', None)

    # The original code only looked up 'outpu_dir' (a typo), so a correctly
    # spelled 'output_dir' parameter was ignored. Prefer the correct key and
    # fall back to the legacy spelling, then 'backup', then the default.
    legacy_output_dir = stparams.get_value(
        'outpu_dir',
        stparams.get_value('backup',
                           '/mlsteam/data/yolo/model_weights/trained'))
    output_dir = stparams.get_value('output_dir', legacy_output_dir)
    pretrained_weights = stparams.get_value('weights_file', None)

    # exist_ok=True already covers the "directory is there" case, so no
    # separate existence check is needed.
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    if not repeat:
        train_run(pretrained_weights, output_dir)
        return

    # Each run starts from the weights produced by the run before it; the
    # very first run starts from the user-supplied pretrained weights.
    pretrained = pretrained_weights
    for i in range(1, int(repeat) + 1):
        trained = '%s/mls_final_run_%d.weights' % (output_dir, i)
        train_run(pretrained, output_dir, trained)
        pretrained = trained
def _experiment_fn(run_config, hparams):
    """Assemble and return the ``tf.contrib.learn.Experiment`` for this job.

    ``data_dir``, ``num_gpus``, ``variable_strategy`` and
    ``use_distortion_for_training`` are not parameters of this function; they
    are resolved from the surrounding scope.

    Args:
      run_config: passed to the estimator as ``config``; its
        ``num_worker_replicas`` (or 1 when falsy) is forwarded to
        ``get_model_fn``.
      hparams: hyperparameters; ``train_batch_size``, ``eval_batch_size`` and
        ``train_steps`` are read here and the object is passed to the
        estimator as ``params``.

    Returns:
      The configured Experiment.

    Raises:
      ValueError: if the eval set size is not a multiple of
        ``hparams.eval_batch_size``.
    """
    # Input pipelines for the two phases.
    make_train_input = functools.partial(
        input_fn,
        data_dir,
        subset='train',
        num_shards=num_gpus,
        batch_size=hparams.train_batch_size,
        use_distortion_for_training=use_distortion_for_training)
    make_eval_input = functools.partial(
        input_fn,
        data_dir,
        subset='eval',
        batch_size=hparams.eval_batch_size,
        num_shards=num_gpus)

    # Evaluation must consume the eval set in whole batches.
    eval_example_count = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
    whole_eval_batches, leftover = divmod(eval_example_count,
                                          hparams.eval_batch_size)
    if leftover != 0:
        raise ValueError(
            'validation set size must be multiple of eval_batch_size')

    # The 'train_steps' job parameter overrides the hparams value when set.
    max_train_steps = stparams.get_value('train_steps', hparams.train_steps)

    # Estimator wrapping the model function.
    model_fn = get_model_fn(num_gpus, variable_strategy,
                            run_config.num_worker_replicas or 1)
    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params=hparams)

    return tf.contrib.learn.Experiment(estimator,
                                       train_input_fn=make_train_input,
                                       eval_input_fn=make_eval_input,
                                       train_steps=max_train_steps,
                                       eval_steps=whole_eval_batches)
trainer.test() if __name__ == "__main__": parser = ArgumentParser() # PROGRAM level args parser.add_argument("--data_dir", type=str, default="/mlsteam/data/cifar10") parser.add_argument("--download_weights", type=int, default=0, choices=[0, 1]) parser.add_argument("--test_phase", type=int, default=0, choices=[0, 1]) parser.add_argument("--dev", type=int, default=0, choices=[0, 1]) parser.add_argument( "--logger", type=str, default="tensorboard", choices=["tensorboard", "wandb"] ) # TRAINER args parser.add_argument("--classifier", type=str, default=stparams.get_value("network", "mobilenet_v2")) parser.add_argument("--pretrained", type=int, default=0, choices=[0, 1]) parser.add_argument("--precision", type=int, default=32, choices=[16, 32]) parser.add_argument("--batch_size", type=int, default=128) parser.add_argument("--max_epochs", type=int, default=stparams.get_value("num_epochs", 100)) parser.add_argument("--num_workers", type=int, default=8) parser.add_argument("--gpu_id", type=str, default="0") parser.add_argument("--learning_rate", type=float, default=1e-2) parser.add_argument("--weight_decay", type=float, default=1e-2) args = parser.parse_args() main(args)
from keras import optimizers from keras.models import Sequential from keras.preprocessing.image import ImageDataGenerator import numpy as np ###import mlsteam function ##### from mlsteam import stparams # step 1: load data img_width = 150 img_height = 150 ######Prarms code###### train_data_dir = '/mlsteam/input/train' valid_data_dir = '/mlsteam/input/validation' batch_size=stparams.get_value("batch_size", 128) validation_batch_size=stparams.get_value("validation_batch_size", 128) num_epochs=stparams.get_value("num_epochs", 30) ######## datagen = ImageDataGenerator(rescale = 1./255) train_generator = datagen.flow_from_directory(directory=train_data_dir, target_size=(img_width,img_height), classes=['dogs','cats'], class_mode='binary', batch_size=batch_size) validation_generator = datagen.flow_from_directory(directory=valid_data_dir, target_size=(img_width,img_height), classes=['dogs','cats'],
def _resnet_model_fn(features, labels, mode, params):
    """Resnet model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    The names ``num_gpus``, ``variable_strategy`` and ``num_workers`` are not
    defined in this function; they are read from the surrounding scope.

    The per-tower losses are averaged into one loss, per-variable gradients
    are averaged across towers, and a single grouped train op (gradient
    application plus the update ops collected from tower 0) is returned.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning. Attributes read here:
        weight_decay, momentum, data_format, num_layers, batch_norm_decay,
        batch_norm_epsilon, train_batch_size, learning_rate, sync, is_chief.

    Returns:
      A EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum

    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
    # on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
        if num_gpus == 0:
            data_format = 'channels_last'
        else:
            data_format = 'channels_first'

    # With no GPUs a single CPU "tower" is built.
    if num_gpus == 0:
        num_devices = 1
        device_type = 'cpu'
    else:
        num_devices = num_gpus
        device_type = 'gpu'

    # Build one tower per device; variables are shared through the 'resnet'
    # variable scope (reuse is enabled for every tower after the first).
    for i in range(num_devices):
        worker_device = '/{}:{}'.format(device_type, i)
        # NOTE(review): device_setter is only assigned for 'CPU' and 'GPU';
        # any other variable_strategy value leaves it unbound on the first
        # iteration - confirm callers validate the value.
        if variable_strategy == 'CPU':
            device_setter = cifar10_utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = cifar10_utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))
        with tf.variable_scope('resnet', reuse=bool(i != 0)):
            with tf.name_scope('tower_%d' % i) as name_scope:
                with tf.device(device_setter):
                    loss, gradvars, preds = _tower_fn(
                        is_training, weight_decay, tower_features[i],
                        tower_labels[i], data_format, params.num_layers,
                        params.batch_norm_decay, params.batch_norm_epsilon)
                    tower_losses.append(loss)
                    tower_gradvars.append(gradvars)
                    tower_preds.append(preds)
                    if i == 0:
                        # Only trigger batch_norm moving mean and variance update from
                        # the 1st tower. Ideally, we should grab the updates from all
                        # towers but these stats accumulate extremely fast so we can
                        # ignore the other stats from the other towers without
                        # significant detriment.
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        # Group the gradients of every tower by the variable they belong to.
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in six.iteritems(all_grads):
            # Average gradients on the same device as the variables
            # to which they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
    with tf.device(consolidation_device):
        # Suggested learning rate scheduling from
        # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
        # The 'train_bs' job parameter, when set, takes the place of
        # params.train_batch_size in the batches-per-epoch computation.
        num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
            'train') // (stparams.get_value(
                'train_bs', params.train_batch_size) * num_workers)
        # Three step boundaries (epochs 82, 123 and 300 converted to batches)
        # delimit the four learning-rate stages listed in staged_lr.
        boundaries = [
            num_batches_per_epoch * x
            for x in np.array([82, 123, 300], dtype=np.int64)
        ]
        staged_lr = [
            params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]
        ]

        learning_rate = tf.train.piecewise_constant(
            tf.train.get_global_step(), boundaries, staged_lr)

        loss = tf.reduce_mean(tower_losses, name='loss')

        # Same 'train_bs' override as above, here for the throughput hook.
        examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
            stparams.get_value('train_bs', params.train_batch_size),
            every_n_steps=10)

        # Both logging hooks below receive the same tensors.
        tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
        log_hook = cifar10_utils.LogHook(tensors_to_log)

        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=100)

        train_hooks = [logging_hook, examples_sec_hook, log_hook]

        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=momentum)

        # In sync mode the optimizer is wrapped so that num_workers replicas
        # are aggregated, and the matching session hook is registered.
        if params.sync:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer, replicas_to_aggregate=num_workers)
            sync_replicas_hook = optimizer.make_session_run_hook(
                params.is_chief)
            train_hooks.append(sync_replicas_hook)

        # Create single grouped train op
        train_op = [
            optimizer.apply_gradients(
                gradvars, global_step=tf.train.get_global_step())
        ]
        train_op.extend(update_ops)
        train_op = tf.group(*train_op)

        # Concatenate the per-tower outputs along axis 0.
        predictions = {
            'classes':
            tf.concat([p['classes'] for p in tower_preds], axis=0),
            'probabilities':
            tf.concat([p['probabilities'] for p in tower_preds], axis=0)
        }
        stacked_labels = tf.concat(labels, axis=0)
        metrics = {
            'accuracy':
            tf.metrics.accuracy(stacked_labels, predictions['classes'])
        }

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      training_hooks=train_hooks,
                                      eval_metric_ops=metrics)
def write_model_cfg(output_cfg=DEFAULT_MODEL_CFG, **kwargs):
    """Write the model cfg from the template selected by the 'version' parameter.

    The tiny template is used when ``stparams.get_value('version')`` equals
    ``'tiny'``; every other value selects the original template.

    Args:
      output_cfg: destination passed to ``write_cfg``.
      **kwargs: forwarded unchanged to ``write_cfg``.

    Returns:
      Whatever ``write_cfg`` returns.
    """
    use_tiny = stparams.get_value('version') == 'tiny'
    template = MODEL_CFG_TINY_TEMPLATE if use_tiny else MODEL_CFG_ORIG_TEMPLATE
    return write_cfg(template, output_cfg, **kwargs)
def train_run(pretrained_weights, output_dir, output_weights=None):
    """Prepare list/cfg files for one YOLO training run and launch it.

    Steps, in order: resolve the training image directory and names file,
    make sure the train (and, when possible, validation) image lists exist,
    write the model and data cfg files, write an inference cfg plus a copy of
    the names file under ``<output_dir>/cfg``, then call ``train``.

    Job parameters are read through ``stparams.get_value`` (``train_dir``,
    ``names``, ``max_batches``, ``image_exts``, ``valid_dir``, ``batch``,
    ``subdivisions``, ``learning_rate``, ``steps``, ``scales``, ``eval``).

    Args:
      pretrained_weights: forwarded to ``train`` as ``pretrained_weights``.
      output_dir: used as the ``backup`` entry of the data cfg and as the
        parent of the ``cfg`` directory.
      output_weights: optional destination; when truthy, the file
        ``<output_dir>/<model cfg basename>_final.weights`` is copied there
        after training.
    """
    # Read obj names to get the number of classes.
    # The default previously read '/mlstea/...' (missing 'm'); every other
    # default path in this file lives under '/mlsteam/'.
    train_dir = stparams.get_value('train_dir',
                                   '/mlsteam/data/yolo/training_data/yolo')
    param_names = stparams.get_value('names', '/mlsteam/data/yolo/obj.names')
    image_dir, names_file = get_input_dirs(train_dir, param_names)
    max_batches = int(stparams.get_value('max_batches', 500))
    number_classes = num_classes(names_file)
    filters = int((number_classes + 5) * 3)
    num_epoch_save = '1,000' if max_batches < 10000 else '10,000'
    log_summary(f"Number of classes: {number_classes}")
    log_summary(f"Output weight policy: every {num_epoch_save} batch")
    log_summary("  if max_batcheds < 10,000, save weights every 1,000 batch")
    log_summary(
        "  if max_batcheds >= 10,000, save weights every 10,000 batch")

    # Prepare train_list.txt
    train_list = os.path.join(train_dir, TRAIN_LIST_FILENAME)
    ensure_img_list(train_list, image_dir,
                    stparams.get_value('image_exts', 'jpg;png').split(';'))

    # Prepare valid_list.txt if valid_dir present
    valid_dir = stparams.get_value('valid_dir', None)
    valid_list = os.path.join(train_dir, VALID_LIST_FILENAME)
    if not os.path.exists(valid_list):
        print("Validate list not exist, try to scan valid_dir {}".format(
            valid_dir))
        if valid_dir in ['', None]:
            print("Parameter valid_dir not specify! skip validation")
            valid_list = 'no_valid_list'
        else:
            # Scan valid_dir (the original passed the list-file path here)
            # and keep the result in separate names so the training
            # names_file used further down is not overwritten.
            valid_image_dir, _ = get_input_dirs(valid_dir, param_names)
            # NOTE(review): unlike the training list, the return value is
            # used as the list path and no 'image_exts' are passed; this
            # mirrors the original call - confirm against ensure_img_list.
            valid_list = ensure_img_list(valid_list, valid_image_dir)

    cfg_dir = os.path.join(output_dir, 'cfg')
    os.makedirs(cfg_dir, exist_ok=True)

    model_cfg = write_model_cfg(
        output_cfg=os.path.join(cfg_dir, MODEL_CFG_NAME),
        batch=stparams.get_value('batch', 64),
        subdivisions=stparams.get_value('subdivisions', 64),
        learning_rate=stparams.get_value('learning_rate', 0.001),
        max_batches=max_batches,
        steps=stparams.get_value('steps', '400, 450').replace(
            ';', ','),  # comma(,) is mlsteam parameter reserved character
        scales=stparams.get_value('scales', '.1, .1').replace(
            ';', ','),  # comma(,) is mlsteam parameter reserved character
        num_classes=number_classes,
        filters=filters,  # num_mask *(num_class+5) = 3*(1+5)
    )
    data_cfg = os.path.join(train_dir, DATA_CFG_NAME)
    data_cfg = write_data_cfg(output_cfg=data_cfg,
                              num_classes=number_classes,
                              train_list=train_list,
                              valid_list=valid_list,
                              names=names_file,
                              backup=output_dir,
                              eval=stparams.get_value('eval', 'coco'))

    # Prepare config for inferencing: the names file is referenced relative
    # to 'cfg' and copied next to the inference cfg.
    names_basename = os.path.basename(names_file)
    write_inf_cfg(output_cfg=os.path.join(cfg_dir, DATA_CFG_NAME),
                  num_classes=number_classes,
                  names=os.path.join('cfg', names_basename))
    copyfile(names_file, os.path.join(cfg_dir, names_basename))

    train(data_cfg, model_cfg, pretrained_weights=pretrained_weights)

    if output_weights:
        # The final weights file is named after the model cfg file (text
        # before the first '.').
        weights_path = '%s/%s_final.weights' % (
            output_dir, os.path.basename(model_cfg).split('.')[0])
        copyfile(weights_path, output_weights)