def validation_run(valid, filepath, i, epoch, first_run, opts):
    """Evaluate the validation graph and log mean top-1 accuracy and throughput.

    Args:
        valid: Object exposing ``saver``, ``session`` and ``ops``; ``ops`` is
            run once per validation iteration and element 0 of the result is
            taken as that iteration's accuracy.
        filepath: Checkpoint to restore into ``valid.session`` first. A falsy
            value skips the restore.
        i: Training iteration reported in the statistics.
        epoch: Epoch reported in the statistics.
        first_run: Forwarded unchanged to ``logging.write_to_csv``.
        opts: Options dict; reads ``validation_iterations``,
            ``validation_batches_per_step`` and
            ``validation_total_batch_size``.

    Raises:
        tf.errors.ResourceExhaustedError: Raised in place of any
            ``tf.errors.OpError`` coming out of ``session.run``.
    """
    if filepath:
        valid.saver.restore(valid.session, filepath)

    num_iterations = opts["validation_iterations"]

    # Gather accuracy statistics
    accuracy = 0.0
    started_at = time.time()
    for _ in range(num_iterations):
        try:
            step_accuracy = valid.session.run(valid.ops)[0]
        except tf.errors.OpError as err:
            raise tf.errors.ResourceExhaustedError(err.node_def, err.op,
                                                   err.message)
        accuracy += step_accuracy
    val_time = time.time() - started_at
    accuracy /= num_iterations

    images_seen = (num_iterations *
                   opts["validation_batches_per_step"] *
                   opts['validation_total_batch_size'])

    # Insertion order is kept deliberately: it is the column order handed to
    # the CSV writer.
    stats = OrderedDict()
    stats['iteration'] = i
    stats['epoch'] = epoch
    stats['val_acc'] = accuracy
    stats['val_time'] = val_time
    stats['img_per_sec'] = images_seen / val_time

    valid_format = (
        "Validation top-1 accuracy (iteration: {iteration:6d}, epoch: {epoch:6.2f}, img/sec: {img_per_sec:6.2f},"
        " time: {val_time:8.6f}): {val_acc:6.3f}%")
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
def train_process(model, LR_Class, opts):
    """Run the training loop, then validate every recorded checkpoint.

    The loop advances ``iterations_per_step`` iterations per call to
    ``training_step``. On selected steps it logs running averages, saves a
    checkpoint and/or records a validation point. Validation itself is
    deferred until training has finished.

    Args:
        model: Model handed to ``training_graph`` and
            ``validation.initialise_validation``.
        LR_Class: Learning-rate schedule class, instantiated as
            ``LR_Class(opts, iterations)``; its ``feed_dict_lr(i)`` result is
            passed to every training step.
        opts: Options dict. Reads ``epochs``, ``dataset``,
            ``total_batch_size``, ``iterations``, ``logs_per_epoch``,
            ``log_freq``, ``batches_per_step``, ``ckpts_per_epoch``,
            ``gradients_to_accumulate``, ``validation``, ``checkpoint_path``,
            ``logs_path`` and (optionally) ``restoring``.

    Raises:
        tf.errors.ResourceExhaustedError: Raised in place of any
            ``tf.errors.OpError`` coming out of a training step.
    """
    # --------------- OPTIONS ---------------------
    epochs = opts["epochs"]
    num_images = DATASET_CONSTANTS[opts['dataset']]['NUM_IMAGES']
    iterations_per_epoch = num_images // opts["total_batch_size"]
    if not opts['iterations']:
        iterations = epochs * iterations_per_epoch
        log_freq = iterations_per_epoch // opts['logs_per_epoch']
    else:
        iterations = opts['iterations']
        log_freq = opts['log_freq']

    if log_freq < opts['batches_per_step']:
        iterations_per_step = log_freq
    else:
        # Pick a step size close to batches_per_step that divides log_freq
        # into a whole number of steps.
        iterations_per_step = log_freq // int(
            round(log_freq / opts['batches_per_step']))

    iterations_per_valid = iterations_per_epoch
    # np.inf makes the periodic checkpoint test below always false when
    # checkpointing is disabled.
    iterations_per_ckpt = (iterations_per_epoch // opts['ckpts_per_epoch']
                           if opts['ckpts_per_epoch'] else np.inf)

    LR = LR_Class(opts, iterations)

    # Rolling windows covering at most one epoch worth of steps.
    window = iterations_per_epoch // iterations_per_step
    batch_accs = deque(maxlen=window)
    batch_losses = deque(maxlen=window)
    batch_times = deque(maxlen=window)

    # -------------- BUILD TRAINING GRAPH ----------------
    train = training_graph(
        model, opts, iterations_per_step * opts["gradients_to_accumulate"])
    train.session.run(train.init)
    train.session.run(train.iterator.initializer)

    # -------------- BUILD VALIDATION GRAPH ----------------
    if opts['validation']:
        valid = validation.initialise_validation(model, opts)

    # -------------- SAVE AND RESTORE --------------
    # NOTE(review): `filepath` is only bound here when ckpts_per_epoch is
    # truthy, or later on a checkpoint step. A validation point recorded
    # before either happens would hit a NameError - confirm whether that
    # combination (restoring with checkpoints disabled) is reachable.
    if opts['ckpts_per_epoch']:
        filepath = train.saver.save(train.session,
                                    opts["checkpoint_path"],
                                    global_step=0)
        print("Saved checkpoint to {}".format(filepath))

    if opts.get('restoring'):
        ckpt_pattern = re.compile(r".*ckpt-([0-9]+)$")
        index_suffix = ".index"
        # A checkpoint is identified by its "<prefix>.index" file; order the
        # candidates by the numeric step embedded in the name.
        stems = [
            f[:-len(index_suffix)]
            for f in os.listdir(opts['logs_path'])
            if f.endswith(index_suffix)
        ]
        filenames = sorted(
            (os.path.join(opts['logs_path'], stem)
             for stem in stems if ckpt_pattern.match(stem)),
            key=lambda x: int(ckpt_pattern.match(x).groups()[0]))
        latest_checkpoint = filenames[-1]
        logging.print_to_file_and_screen(
            "Restoring training from latest checkpoint: {}".format(
                latest_checkpoint), opts)
        i = int(ckpt_pattern.match(latest_checkpoint).groups()[0]) + 1
        train.saver.restore(train.session, latest_checkpoint)
    else:
        i = 0

    # ------------- TRAINING LOOP ----------------
    print_format = (
        "step: {step:6d}, iteration: {iteration:6d}, epoch: {epoch:6.2f}, lr: {lr:6.4g}, loss: {loss_avg:6.3f}, accuracy: {train_acc_avg:6.3f}%"
        ", img/sec: {img_per_sec:6.2f}, time: {it_time:8.6f}, total_time: {total_time:8.1f}"
    )

    # Created up front so the validation phase never sees an unbound name,
    # even when the loop body does not record a single point.
    validation_points = []

    step = 0
    start_all = time.time()
    while i < iterations:
        step += opts["gradients_to_accumulate"]
        next_i = i + iterations_per_step
        # Fires on the very first iteration and for the last step(s) of
        # training, in addition to each periodic boundary.
        first_or_last = (i == 0) or ((i + (2 * iterations_per_step)) >= iterations)
        log_this_step = ((i // log_freq) < (next_i // log_freq)
                         or first_or_last)
        ckpt_this_step = ((i // iterations_per_ckpt) < (next_i // iterations_per_ckpt)
                          or first_or_last)
        valid_this_step = (opts['validation'] and (
            (i // iterations_per_valid) < (next_i // iterations_per_valid)
            or first_or_last))

        # Run Training
        try:
            batch_loss, batch_acc, batch_time, current_lr, scaled_lr = training_step(
                train, i + 1, LR.feed_dict_lr(i))
        except tf.errors.OpError as e:
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op, e.message)

        batch_time /= iterations_per_step

        # Calculate Stats
        batch_accs.append([batch_acc])
        batch_losses.append([batch_loss])

        # The first step's time is kept out of the running average.
        if i != 0:
            batch_times.append([batch_time])

        # Refreshed on every step (not only logging steps) so a validation
        # point recorded on a non-logging step carries its own epoch.
        epoch = float(opts["total_batch_size"] * next_i) / num_images

        # Print loss
        if log_this_step:
            train_acc = np.mean(batch_accs)
            train_loss = np.mean(batch_losses)

            avg_batch_time = np.mean(batch_times) if batch_times else batch_time

            # flush times every time it is reported
            batch_times.clear()

            total_time = time.time() - start_all
            stats = OrderedDict([
                ('step', step),
                ('iteration', next_i),
                ('epoch', epoch),
                ('lr', current_lr),
                ('scaled_lr', scaled_lr),
                ('loss_batch', batch_loss),
                ('loss_avg', train_loss),
                ('train_acc_batch', batch_acc),
                ('train_acc_avg', train_acc),
                ('it_time', avg_batch_time),
                ('img_per_sec', opts['total_batch_size'] / avg_batch_time),
                ('total_time', total_time),
            ])
            logging.print_to_file_and_screen(print_format.format(**stats), opts)
            logging.write_to_csv(stats, i == 0, True, opts)

        if ckpt_this_step:
            filepath = train.saver.save(train.session,
                                        opts["checkpoint_path"],
                                        global_step=next_i)
            print("Saved checkpoint to {}".format(filepath))

        # Eval: only remember the point; validation runs after training.
        if valid_this_step:
            validation_points.append((next_i, epoch, i == 0, filepath))

        i = next_i

    # ------------ RUN VALIDATION ------------
    if opts['validation']:
        for iteration, epoch, first_run, filepath in validation_points:
            validation.validation_run(valid, filepath, iteration, epoch,
                                      first_run, opts)

    # --------------- CLEANUP ----------------
    train.session.close()
args['lr_schedule']) except ImportError: raise ValueError("LR_Schedules/{}.py not found".format( args['lr_schedule'])) # Large number of deprecation warnings that cannot be resolved yet. tf.logging.set_verbosity(tf.logging.ERROR) parser = create_parser(model, lr_schedule, parser) opts = vars(parser.parse_args()) if opts['help']: parser.print_help() else: if opts['gradients_to_accumulate'] > 1 and opts['pipeline_depth'] > 1: raise ValueError( "gradients-to-accumulate can't be specified when using --pipeline-depth > 1" ) if opts['pipeline_depth'] > 1 and opts['shards'] == 1: raise ValueError( "--pipeline-depth can only be used if --shards > 1") opts["command"] = ' '.join(sys.argv) set_defaults(model, lr_schedule, opts) logging.print_to_file_and_screen("Command line: " + opts["command"], opts) logging.print_to_file_and_screen(opts["summary_str"].format(**opts), opts) opts["summary_str"] = "" logging.print_to_file_and_screen(opts, opts) train_process(model, lr_schedule.LearningRate, opts)
def inference_run(exec_filename, ckpt_name, iteration, epoch, first_run, opts):
    """Run inference for multiple iterations and collect latency values.

    Starts an embedded-runtime engine for ``exec_filename`` and runs one
    warm-up call. Then ``opts['num_inference_thread']`` threads each run
    ``opts['iterations']`` timed calls against a shared session. Per-call
    latencies (and accuracies, unless generated data is used) are merged and
    logged.

    Args:
        exec_filename: Executable passed to
            ``embedded_runtime.embedded_runtime_start``.
        ckpt_name: Reported as ``name`` in the statistics.
        iteration: Reported as ``iteration`` in the statistics.
        epoch: Reported as ``epoch``; also rounded for the MLPerf log
            metadata.
        first_run: Forwarded unchanged to ``logging.write_to_csv``.
        opts: Options dict. Reads ``micro_batch_size``, ``image_size``,
            ``iterations``, ``generated_data``, ``num_inference_thread``,
            ``validation_total_batch_size``, ``wandb`` and
            ``distributed_worker_index``.

    Returns:
        OrderedDict with keys ``name``, ``iteration``, ``epoch``, ``val_acc``
        (percent, or -1 with generated data), ``val_time``, ``val_size``,
        ``img_per_sec`` and ``latency`` (mean, in milliseconds).
    """
    logging.mlperf_logging(key="EVAL_START", log_type="start",
                           metadata={"epoch_num": round(epoch)})
    engine_name = "my_engine"
    ctx = embedded_runtime.embedded_runtime_start(exec_filename, [],
                                                  engine_name, timeout=1000)

    input_placeholder = tf.placeholder(
        tf.uint8,
        (opts['micro_batch_size'], opts['image_size'], opts['image_size'], 3))
    num_iters = opts['iterations']
    if opts['generated_data']:
        placeholders = [input_placeholder]
        images = np.random.normal(size=(opts['micro_batch_size'],
                                        opts['image_size'],
                                        opts['image_size'],
                                        3)).astype(np.uint8)
        labels = None
    else:
        label_placeholder = tf.placeholder(tf.int32, (opts['micro_batch_size']))
        placeholders = [input_placeholder, label_placeholder]
        # The dataset is materialised inside its own throwaway graph.
        with tf.Graph().as_default():
            inference_dataset = dataset.data(
                opts, is_training=False).map(lambda x: {'data_dict': x})
            images, labels = dataset_to_list(
                inference_dataset, num_iters * opts['micro_batch_size'])

    call_result = embedded_runtime.embedded_runtime_call(placeholders, ctx)

    ipu.config.reset_ipu_configuration()
    gc.collect()

    thread_queue = Queue()
    with tf.Session() as session:
        # do not include time of the first iteration in stats
        initial_feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], 0)
        session.run(call_result, initial_feed_dict)

        def runner(session, thread_idx):
            """Run ``num_iters`` timed calls and queue this thread's results."""
            thread_channel = pvti.createTraceChannel(f"Thread {thread_idx}")
            latencies = []
            accuracies = []
            for iter_idx in range(num_iters):
                feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], iter_idx)
                with pvti.Tracepoint(thread_channel, f"Iteration {iter_idx}"):
                    start_iter = time.time()
                    _, predictions = session.run(call_result, feed_dict)
                    end_iter = time.time()
                latencies.append(end_iter - start_iter)
                if not opts['generated_data']:
                    expected = feed_dict[label_placeholder]
                    accuracy = np.mean(
                        np.equal(predictions, expected).astype(np.float32))
                    accuracies.append(accuracy)
            thread_queue.put((latencies, accuracies), timeout=10)

        thp = [
            Thread(target=runner, args=(session, thread_idx))
            for thread_idx in range(opts['num_inference_thread'])
        ]
        inference_start = time.time()
        for idx, _thread in enumerate(thp):
            _thread.start()
            print(f"Thread {idx} started")

        for idx, _thread in enumerate(thp):
            _thread.join()
            print(f"Thread {idx} joined")
        val_time = time.time() - inference_start

    # Every worker has been joined, so the queue holds one entry per thread
    # that reached its final put().
    latencies, accuracies = [], []
    while not thread_queue.empty():
        lat_acc = thread_queue.get()
        latencies.extend(lat_acc[0])
        accuracies.extend(lat_acc[1])

    if opts['generated_data']:
        # No labels with generated data; -1 marks "accuracy not measured".
        total_accuracy = -1
    else:
        total_accuracy = sum(accuracies) / len(accuracies)
        total_accuracy *= 100

    # convert latencies to milliseconds
    latencies = [1000 * latency_s for latency_s in latencies]
    max_latency = max(latencies)
    mean_latency = np.mean(latencies)
    perc_99 = np.percentile(latencies, 99)
    perc_99_9 = np.percentile(latencies, 99.9)
    print(
        f"Latencies - avg: {mean_latency:8.4f}, 99th percentile: {perc_99:8.4f}, "
        f"99.9th percentile: {perc_99_9:8.4f}, max: {max_latency:8.4f}")

    # The parenthesis opened before "iteration:" is closed after the latency
    # field, matching the line printed by validation_run.
    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, "
        "img/sec: {img_per_sec:6.2f}, time: {val_time:8.6f}, "
        "latency (ms): {latency:8.4f}): {val_acc:6.3f}%")

    val_size = (num_iters *
                opts['num_inference_thread'] *
                opts['validation_total_batch_size'])

    stats = OrderedDict([
        ('name', ckpt_name),
        ('iteration', iteration),
        ('epoch', epoch),
        ('val_acc', total_accuracy),
        ('val_time', val_time),
        ('val_size', val_size),
        ('img_per_sec', val_size / val_time),
        ('latency', mean_latency),
    ])
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts['wandb'] and opts['distributed_worker_index'] == 0:
        logging.log_to_wandb(stats)
    logging.mlperf_logging(key="EVAL_STOP", log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats['val_acc']) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats
else: # backwards compatibility if opts['batch_size'] and opts['micro_batch_size']: raise ValueError( 'Both --batch-size and --micro-batch-size arguments were given, ' 'use --micro-batch-size, as --batch-size is deprecated and kept ' 'for backwards compatibility.') elif opts['batch_size']: opts['micro_batch_size'] = opts['batch_size'] opts['use_popdist'] = False opts['total_replicas'] = opts['replicas'] opts['command'] = ' '.join(sys.argv) set_defaults(model, opts) if opts['dataset'] == 'imagenet': if opts['image_size'] is None: opts['image_size'] = 224 elif 'cifar' in opts['dataset']: opts['image_size'] = 32 if opts['wandb'] and opts['distributed_worker_index'] == 0: logging.initialise_wandb(opts) logging.print_to_file_and_screen("Command line: " + opts['command'], opts) logging.print_to_file_and_screen(opts['summary_str'].format(**opts), opts) opts['summary_str'] = "" logging.print_to_file_and_screen(opts, opts) inference_only_process(model, opts)
def validation_run(valid, filepath, i, epoch, first_run, opts, latency_thread):
    """Validate one checkpoint and log accuracy, throughput and latency.

    If ``filepath`` is given, the checkpoint is restored and its base name is
    looked up in ``<logs_path>/validation.csv``. A checkpoint already listed
    there is skipped.

    Args:
        valid: Object exposing ``saver``, ``session`` and an ``ops`` mapping
            with ``'accuracy'`` (and ``'broadcast_weights'`` when
            ``opts['use_popdist']`` is set).
        filepath: Checkpoint to restore. A falsy value skips both the restore
            and the already-validated lookup.
        i: Training iteration reported in the statistics.
        epoch: Epoch reported in the statistics; also rounded for the MLPerf
            log metadata.
        first_run: Forwarded unchanged to ``logging.write_to_csv``.
        opts: Options dict. Reads ``logs_path``, ``use_popdist``,
            ``validation_iterations``, ``validation_batches_per_step``,
            ``validation_global_batch_size``, ``dataset``, ``compile_only``,
            ``wandb`` and ``distributed_worker_index``.
        latency_thread: Object providing ``start()``, ``join()`` and
            ``get_latency()``; started just before the timed loop.

    Returns:
        OrderedDict of validation statistics, or ``None`` when the checkpoint
        was skipped.

    Raises:
        tf.errors.ResourceExhaustedError: Raised in place of any
            ``tf.errors.OpError`` from ``session.run``, except for the
            compile-only case, which exits the process with status 0.
    """
    run = True
    if filepath:
        valid.saver.restore(valid.session, filepath)
        name = filepath.split('/')[-1]
        csv_path = os.path.join(opts['logs_path'], 'validation.csv')
        if os.path.exists(csv_path):
            # The 'U' mode flag was removed in Python 3.11 (ValueError), so
            # open in plain 'r' with newline='' as the csv module recommends.
            with open(csv_path, 'r', newline='') as infile:
                # read the file as a dictionary for each row ({header : value})
                reader = csv.DictReader(infile)
                for row in reader:
                    if row['name'] == name:
                        run = False
                        print(
                            'Skipping validation run on checkpoint: {}'.format(
                                name))
                        break
    else:
        name = None

    if not run:
        return None

    if opts['use_popdist']:
        # synchronise the model weights across all instances
        valid.session.run(valid.ops['broadcast_weights'])

    logging.mlperf_logging(key="EVAL_START", log_type="start",
                           metadata={"epoch_num": round(epoch)})
    # Gather accuracy statistics
    accuracy = 0.0

    # start latency thread
    latency_thread.start()

    start = relative_timer.now()
    for __ in range(opts["validation_iterations"]):
        try:
            a = valid.session.run(valid.ops['accuracy'])
        except tf.errors.OpError as e:
            if opts['compile_only'] and 'compilation only' in e.message:
                print("Validation graph successfully compiled")
                print("Exiting...")
                sys.exit(0)
            raise tf.errors.ResourceExhaustedError(e.node_def, e.op, e.message)

        accuracy += a
    val_time = relative_timer.now() - start
    accuracy /= opts["validation_iterations"]

    # wait for all dequeues and latency computation
    latency_thread.join()
    latency = latency_thread.get_latency()

    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, img/sec: {img_per_sec:6.2f},"
        " time: {val_time:8.6f}, latency (ms): {latency:8.4f}): {val_acc:6.3f}%"
    )

    val_size = (opts["validation_iterations"] *
                opts["validation_batches_per_step"] *
                opts["validation_global_batch_size"])

    count = int(DATASET_CONSTANTS[opts['dataset']]['NUM_VALIDATION_IMAGES'])

    raw_accuracy = accuracy
    if count < val_size:
        # More samples were processed than the dataset holds, so rescale the
        # mean to the real image count.
        # NOTE(review): presumably the surplus samples are padding that never
        # scores as correct - confirm against the input pipeline.
        accuracy = accuracy * val_size / count

    stats = OrderedDict([
        ('name', name),
        ('iteration', i),
        ('epoch', epoch),
        ('val_acc', accuracy),
        ('raw_acc', raw_accuracy),
        ('val_time', val_time),
        ('val_size', val_size),
        ('img_per_sec', val_size / val_time),
        ('latency', latency * 1000),
    ])

    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts["wandb"] and opts["distributed_worker_index"] == 0:
        logging.log_to_wandb(stats)
    logging.mlperf_logging(key="EVAL_STOP", log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats["val_acc"]) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats