def testNonParallelModel(self):
    workspace.ResetWorkspace()

    model = model_helper.ModelHelper(name="test")
    old_seq_id = data_workers.global_coordinator._fetcher_id_seq
    coordinator = data_workers.init_data_input_workers(
        model,
        ["data", "label"],
        dummy_fetcher,
        32,
        2,
        input_source_name="unittest",
    )
    new_seq_id = data_workers.global_coordinator._fetcher_id_seq
    self.assertEqual(new_seq_id, old_seq_id + 2)

    coordinator.start()
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    for _i in range(500):
        with timeout_guard.CompleteInTimeOrDie(5):
            workspace.RunNet(model.net.Proto().name)

        data = workspace.FetchBlob("data")
        labels = workspace.FetchBlob("label")

        self.assertEqual(data.shape[0], labels.shape[0])
        self.assertEqual(data.shape[0], 32)

        for j in range(32):
            self.assertEqual(labels[j], data[j, 0])
            self.assertEqual(labels[j], data[j, 1])
            self.assertEqual(labels[j], data[j, 2])

    coordinator.stop_coordinator("unittest")
    self.assertEqual(coordinator._coordinators, [])
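# `dummy_fetcher` is referenced above but not defined in this section.
# Below is a minimal sketch consistent with the test's assertions (each
# row of `data` carries its label value in every column); the
# (fetcher_id, batch_size) signature matches the fetch functions used
# elsewhere in this section, but the body itself is an assumption.
import numpy as np

def dummy_fetcher(fetcher_id, batch_size):
    labels = np.random.randint(0, 100, size=batch_size).astype(np.int32)
    # Broadcast each label across a 3-column data row, so that
    # labels[j] == data[j, k] holds for k in 0..2.
    data = np.tile(labels.reshape(batch_size, 1), (1, 3)).astype(np.float32)
    return [data, labels]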
def _test_create_blobs_queue_db(self, add_blobs_fun):
    num_samples = 10000
    batch_size = 10
    init_net = core.Net('init_net')
    net = core.Net('test_create_blobs_queue_db')
    queue = init_net.CreateBlobsQueue([], 'queue', capacity=num_samples)
    reader = init_net.CreateBlobsQueueDB(
        [queue],
        'blobs_queue_db_reader',
        value_blob_index=0,
        timeout_secs=0.1,
    )
    workspace.RunNetOnce(init_net)
    add_blobs_fun(queue, num_samples)

    net.TensorProtosDBInput(
        [reader], ['image', 'label'], batch_size=batch_size)
    workspace.CreateNet(net)

    close_net = core.Net('close_net')
    close_net.CloseBlobsQueue([queue], [])

    for i in range(int(num_samples / batch_size)):
        print("Running net, iteration {}".format(i))
        with timeout_guard.CompleteInTimeOrDie(2.0):
            workspace.RunNet(net)

        images = workspace.FetchBlob('image')
        labels = workspace.FetchBlob('label')
        self.assertEqual(batch_size, len(images))
        self.assertEqual(batch_size, len(labels))

        for idx, item in enumerate(images):
            self.assertEqual(
                "foo{}".format(i * batch_size + idx).encode('utf-8'), item)
        for item in labels:
            self.assertEqual(1, item)

    workspace.RunNetOnce(close_net)
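# One plausible `add_blobs_fun` for the test above: each sample is a
# serialized TensorProtos holding a "foo{i}" string and a constant
# label of 1, enqueued through the SafeEnqueueBlobs operator, which is
# what the assertions in the loop expect. The helper and blob names
# here are assumptions for illustration.
from caffe2.proto import caffe2_pb2

def add_blobs(queue, num_samples):
    blob = core.BlobReference("enqueue_blob")
    status = core.BlobReference("enqueue_status")
    for i in range(num_samples):
        protos = caffe2_pb2.TensorProtos()
        image = protos.protos.add()
        image.data_type = caffe2_pb2.TensorProto.STRING
        image.string_data.append("foo{}".format(i).encode('utf-8'))
        label = protos.protos.add()
        label.data_type = caffe2_pb2.TensorProto.INT32
        label.int32_data.append(1)
        # Feed the serialized sample and push it onto the queue.
        workspace.FeedBlob(blob, protos.SerializeToString())
        workspace.RunOperatorOnce(core.CreateOperator(
            "SafeEnqueueBlobs", [queue, blob], [blob, status]))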
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
    best_accuracy,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix, train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )
    test_accuracy = 0
    if test_model is not None:
        # Run args.test_iters iterations of testing
        ntests = 0
        for _ in range(0, args.test_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g) + '/accuracy'
                ))
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
            'best_accuracy': best_accuracy,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1, best_accuracy
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    batch_size,
    num_shards,
    expname,
    explog,
):
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / batch_size / num_shards)
    if args.multi_label:
        accumulated_prob = np.empty(shape=[0, args.num_labels], dtype=float)
        accumulated_label = np.empty(
            shape=[0, args.num_labels], dtype=np.int32)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 6000.0 if i == 0 else 600.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        if args.multi_label:
            prefix = "gpu_{}".format(train_model._devices[0])
            prob = workspace.FetchBlob(prefix + '/prob')
            label = workspace.FetchBlob(prefix + '/label')
            accumulated_prob = np.concatenate(
                (accumulated_prob, prob), axis=0)
            accumulated_label = np.concatenate(
                (accumulated_label, label), axis=0)

        if i % args.display_iter == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} clips/sec)"
            log.info(fmt.format(i, epoch_iters, epoch, batch_size / dt))
            prefix = "gpu_{}".format(train_model._devices[0])
            loss = workspace.FetchBlob(prefix + '/loss')
            if args.multi_label:
                mean_auc, mean_ap, _, _ = \
                    metric.mean_ap_metric(accumulated_prob, accumulated_label)
                train_msg = "Training loss: {}, AUC: {}, mAP: {}".format(
                    np.mean(loss), mean_auc, mean_ap)
                # Keep only the most recent 4096 samples for the
                # running training metric.
                if accumulated_label.shape[0] > 4096:
                    accumulated_prob = accumulated_prob[-4096:, :]
                    accumulated_label = accumulated_label[-4096:, :]
            else:
                accuracy = workspace.FetchBlob(prefix + '/accuracy')
                train_msg = "Training loss: {}, accuracy: {}".format(
                    loss, accuracy)
            log.info(train_msg)

    num_clips = epoch * epoch_iters * batch_size
    prefix = "gpu_{}".format(train_model._devices[0])
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(prefix + '/LR')
    if args.multi_label:
        accuracy = -1
        loss = np.mean(loss)
    else:
        mean_ap = -1
        mean_auc = -1

    if test_model is not None:
        # Run 100 iterations of testing
        ntests = 0
        test_accuracy = 0
        test_mean_auc = 0
        test_mean_ap = 0
        all_prob = np.empty(shape=[0, args.num_labels], dtype=float)
        all_label = np.empty(shape=[0, args.num_labels], dtype=np.int32)
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                prefix = "gpu_{}".format(g)
                if args.multi_label:
                    prob = workspace.FetchBlob(prefix + '/prob')
                    label = workspace.FetchBlob(prefix + '/label')
                    all_prob = np.concatenate((all_prob, prob), axis=0)
                    all_label = np.concatenate((all_label, label), axis=0)
                else:
                    accuracy = workspace.FetchBlob(prefix + '/accuracy')
                    test_accuracy += np.asscalar(accuracy)
                ntests += 1

        if args.multi_label:
            test_mean_auc, test_mean_ap, _, _ = \
                metric.mean_ap_metric(all_prob, all_label)
            log.info("Test AUC: {}, mAP: {}".format(
                test_mean_auc, test_mean_ap))
        else:
            test_accuracy /= ntests
            log.info("Test accuracy: {}".format(test_accuracy))
    else:
        test_accuracy = -1
        test_mean_auc = -1
        test_mean_ap = -1

    explog.log(
        input_count=num_clips,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'train_AUC': mean_auc,
            'train_mAP': mean_ap,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
            'test_mean_auc': test_mean_auc,
            'test_mean_ap': test_mean_ap,
        })
    assert loss < 40, "Exploded gradients :("
    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    epoch_iters = int(args.epoch_size / total_batch_size)
    for i in range(epoch_iters):
        log.info("Start iteration {}/{} of epoch {}".format(
            i, epoch_iters, epoch,
        ))

        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            workspace.RunNet(train_model.net.Proto().name)

        num_images = (i + epoch * epoch_iters) * total_batch_size
        record_freq = total_batch_size * 20

        # Report progress, compute train and test accuracies.
        if num_images % record_freq == 0 and i > 0:
            prefix = "gpu_{}".format(train_model._devices[0])
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            learning_rate = workspace.FetchBlob(prefix + '/LR')
            test_accuracy = 0
            ntests = 0
            if test_model is not None:
                # Run 5 iterations of testing
                for t in range(0, 5):
                    workspace.RunNet(test_model.net.Proto().name)
                    for g in test_model._devices:
                        test_accuracy += np.asscalar(
                            workspace.FetchBlob(
                                "gpu_{}".format(g) + '/accuracy'))
                        ntests += 1
                test_accuracy /= ntests
            else:
                test_accuracy = -1

            explog.log(
                input_count=num_images,
                batch_count=(i + epoch * epoch_iters),
                additional_values={
                    'accuracy': accuracy,
                    'loss': loss,
                    'learning_rate': learning_rate,
                    'epoch': epoch,
                    'test_accuracy': test_accuracy,
                })
            assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    for i in range(epoch_iters):
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        prefix = "{}_{}".format(
            train_model._device_prefix, train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    test_accuracy = 0
    if test_model is not None:
        # Run 100 iterations of testing
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(
                    workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy'))
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("
    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    ts = time.time()
    drop = 10  # ignore the first `drop` iterations when timing
    max_speed = 0.0
    spans = []
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 3600.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            if i > drop:
                spans.append(dt)

        update_every = args.notify_frequency
        if i == drop:
            # Warm-up iterations are done; reset the timer.
            ts = time.time()
        if (i - drop) % update_every == 0 and i > drop:
            fmt = ("Finished iteration {}/{} of epoch {} "
                   "({:.2f} images/sec), max = {:.2f}, avg span = {:.2f}, "
                   "median span = {}, median thru = {}, avg thru = {}")
            te = time.time()
            td = te - ts
            curr_speed = update_every * total_batch_size / td
            if max_speed < curr_speed:
                max_speed = curr_speed
            log.info(fmt.format(
                i + 1, epoch_iters, epoch, curr_speed, max_speed,
                np.mean(spans), np.median(spans),
                1. / np.median(spans), 1. / np.mean(spans)))
            ts = time.time()

        prefix = "{}_{}".format(
            train_model._device_prefix, train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    test_accuracy = 0
    if test_model is not None:
        # Run 100 iterations of testing
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(
                    workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy'))
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    print("accuracy = %s. test_acc = %s. loss = %s"
          % (accuracy, test_accuracy, loss))
    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    batch_size,
    num_shards,
    expname,
    explog,
):
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / batch_size / num_shards)
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 6000.0 if i == 0 else 600.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        if i % args.display_iter == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} clips/sec)"
            log.info(fmt.format(i, epoch_iters, epoch, batch_size / dt))
            prefix = "gpu_{}".format(train_model._devices[0])
            loss = workspace.FetchBlob(prefix + '/loss')
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            learning_rate = workspace.FetchBlob(prefix + '/LR')
            train_msg = "Training loss: {}, lr: {}, accuracy: {}".format(
                loss, learning_rate, accuracy)
            log.info(train_msg)

    num_clips = epoch * epoch_iters * batch_size
    prefix = "gpu_{}".format(train_model._devices[0])
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(prefix + '/LR')
    if test_model is not None:
        # Run 100 iterations of testing
        ntests = 0
        test_accuracy = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                prefix = "gpu_{}".format(g)
                accuracy = workspace.FetchBlob(prefix + '/accuracy')
                test_accuracy += np.asscalar(accuracy)
                ntests += 1
        test_accuracy /= ntests
        log.info("Test accuracy: {}".format(test_accuracy))
    else:
        test_accuracy = -1

    explog.log(
        input_count=num_clips,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("
    return epoch + 1
def RunEpoch(args, epoch, train_model, test_model, total_batch_size,
             num_shards, explog, plt_kernel):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    epoch_loss = []
    epoch_accuracy = []
    if args.test_data_type == 'VAL':
        log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
        epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
        for i in range(epoch_iters):
            # This timeout is required (temporarily) since CUDA-NCCL
            # operators might deadlock when synchronizing between GPUs.
            timeout = 600.0 if i == 0 else 60.0
            with timeout_guard.CompleteInTimeOrDie(timeout):
                t1 = time.time()
                workspace.RunNet(train_model.net.Proto().name)
                t2 = time.time()
                dt = t2 - t1

            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
            prefix = "{}_{}".format(
                train_model._device_prefix, train_model._devices[0])
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(loss, accuracy))
            epoch_loss.append(loss)
            epoch_accuracy.append(accuracy)

        num_images = epoch * epoch_iters * total_batch_size
        prefix = "{}_{}".format(
            train_model._device_prefix, train_model._devices[0])
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        learning_rate = workspace.FetchBlob(
            data_parallel_model.GetLearningRateBlobNames(train_model)[0]
        )
        test_accuracy = 0
        if test_model is not None:
            # Run 100 iterations of testing
            ntests = 0
            for _ in range(0, 100):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    test_accuracy += np.asscalar(workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy'))
                    ntests += 1
            test_accuracy /= ntests
        else:
            test_accuracy = -1

        explog.log(
            input_count=num_images,
            batch_count=(i + epoch * epoch_iters),
            additional_values={
                'accuracy': accuracy,
                'loss': loss,
                'learning_rate': learning_rate,
                'epoch': epoch,
                'test_accuracy': test_accuracy,
            }
        )
        assert loss < 40, "Exploded gradients :("

        if DEBUG_TRAINING:
            device_name = "{}_{}".format(
                test_model._device_prefix, test_model._devices[0])
            display_activation_map(
                plt_kernel, channel=0, batch_num=16, device_name=device_name)
            plt.pause(0.001)

    # LFW verification test
    elif args.test_data_type == 'LFW' and args.load_model_path is not None:
        lfw_pairs = os.path.join(
            os.path.abspath('../dataset'), 'lfw_pairs.txt')
        if not os.path.exists(lfw_pairs):
            log.error('There is no lfw_pairs.txt in the dataset/lfw folder!')
        else:
            actual_issame = lfw.get_issame_list(lfw.read_pairs(lfw_pairs))
            num_test_images = len(actual_issame) * 2
            assert num_test_images % total_batch_size == 0, \
                'The number of LFW test images must be an integer ' \
                'multiple of the test batch size'
            num_batches = num_test_images // total_batch_size
            emb_array = np.zeros((num_test_images, args.feature_dim))
            for _ in range(0, num_batches):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    label = workspace.FetchBlob('{}_{}'.format(
                        test_model._device_prefix, g) + '/label')
                    embedding = workspace.FetchBlob('{}_{}'.format(
                        test_model._device_prefix, g) + '/fc5')
                    emb_array[label] = embedding

            _, _, test_accuracy, test_val, val_std, far = lfw.evaluate(
                emb_array, actual_issame, nrof_folds=10)
            log.info('Accuracy: %1.3f+-%1.3f'
                     % (np.mean(test_accuracy), np.std(test_accuracy)))
            log.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f'
                     % (test_val, val_std, far))

    # MegaFace verification test
    elif args.test_data_type == 'MEGAFACE' and args.load_model_path is not None:
        pass

    return epoch + 1, epoch_loss, epoch_accuracy
def run_training_net(self):
    timeout = 2000.0
    with timeout_guard.CompleteInTimeOrDie(timeout):
        workspace.RunNet(self.train_model.net.Proto().name)
def run_testing_net(self):
    if self.test_model is None:
        return
    timeout = 2000.0
    with timeout_guard.CompleteInTimeOrDie(timeout):
        workspace.RunNet(self.test_model.net.Proto().name)
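# Both helpers above lean on timeout_guard.CompleteInTimeOrDie, which
# aborts the whole process if the wrapped net does not finish in time
# (a deadlocked CUDA/NCCL op cannot be interrupted from Python). Below
# is a minimal sketch of that watchdog pattern, offered as an
# illustration only; it is not Caffe2's actual implementation.
import contextlib
import os
import threading

@contextlib.contextmanager
def complete_in_time_or_die(timeout_secs):
    # Arm a watchdog that hard-exits the process on expiry.
    watchdog = threading.Timer(timeout_secs, lambda: os._exit(1))
    watchdog.daemon = True
    watchdog.start()
    try:
        yield
    finally:
        # The body finished in time: disarm the watchdog.
        watchdog.cancel()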
def buildModelAndTrain(self, opts):
    log.info('in buildModelAndTrain, trainer_input: {}'.format(str(opts)))
    log.info("check type self: {}".format(type(self)))
    log.info("check self dir: {}".format(dir(self)))
    log.info("check self get_input_dataset methods: {}".format(
        inspect.getsource(self.get_input_dataset)))
    log.info("check self gen_input_builder_fun method: {}".format(
        inspect.getsource(self.gen_input_builder_fun)))
    log.info("check self gen_forward_pass_builder_fun method: {}".format(
        inspect.getsource(self.gen_forward_pass_builder_fun)))
    if self.gen_param_update_builder_fun is not None:
        log.info(
            "check self gen_param_update_builder_fun method: {}".format(
                inspect.getsource(self.gen_param_update_builder_fun)))
    else:
        log.info("check self gen_optimizer_fun method: {}".format(
            inspect.getsource(self.gen_optimizer_fun)))
    log.info("check self assembleAllOutputs method: {}".format(
        inspect.getsource(self.assembleAllOutputs)))

    self.get_model_input_fun()
    self.init_model()
    self.planning_output()
    self.prep_data_parallel_models()
    self.loadCheckpoint()

    for epoch in self.list_of_epochs():
        log.info("start training epoch {}".format(epoch))
        self.fun_per_epoch_b4RunNet(epoch)

        for epoch_iter in self.list_of_epoch_iters():
            self.iter_start_time = time.time()
            self.fun_per_iter_b4RunNet(epoch, epoch_iter)
            self.run_training_net()
            self.fun_per_iter_aftRunNetB4Test(epoch, epoch_iter)
            self.iter_end_time = time.time()

            if (epoch_iter %
                    opts['epoch_iter']['num_train_iteration_per_test'] == 0):
                secs_per_train = self.iter_end_time - self.iter_start_time
                self.secs_per_train.append(secs_per_train)

                sample_trained = self.total_batch_size
                samples_per_sec = sample_trained / secs_per_train
                self.samples_per_sec.append(samples_per_sec)

                self.fract_epoch = (
                    epoch + float(epoch_iter) / self.epoch_iterations)
                self.record_epochs.append(self.fract_epoch)

                for key in self.metrics:
                    metric = self.metrics[key]
                    if not metric['is_train']:
                        continue
                    metric['calculator'].Add()
                    metric['output'].append(metric['calculator'].Compute())

                self.test_loop_start_time = time.time()
                for _test_iter in range(
                        0, opts['epoch_iter']['num_test_iter']):
                    timeout = 2000.0
                    with timeout_guard.CompleteInTimeOrDie(timeout):
                        workspace.RunNet(self.test_model.net.Proto().name)
                    for key in self.metrics:
                        metric = self.metrics[key]
                        if metric['is_train']:
                            continue
                        metric['calculator'].Add()
                self.test_loop_end_time = time.time()
                self.sec_per_test_loop = \
                    self.test_loop_end_time - self.test_loop_start_time

                for metric in self.metrics.values():
                    if metric['is_train']:
                        continue
                    metric['output'].append(metric['calculator'].Compute())

                logStr = 'epoch:{}/{} iter:{}/{} secs_per_train:{} '.format(
                    self.fract_epoch, self.opts['epoch_iter']['num_epochs'],
                    epoch_iter, self.epoch_iterations, secs_per_train)
                logStr += 'samples_per_sec:{} loop {} tests takes {} sec'.format(
                    samples_per_sec, opts['epoch_iter']['num_test_iter'],
                    self.sec_per_test_loop)
                for metric, value in self.metrics.items():
                    logStr += ' {}:{} '.format(metric, value['output'][-1])
                log.info('Iter Stats: {}'.format(logStr))

            self.fun_per_iter_aftRunNetAftTest(epoch, epoch_iter)

        self.checkpoint(epoch)
        self.fun_per_epoch_aftRunNet(epoch)

    self.fun_conclude_operator()
    self.createMetricsPlotsModelsOutputs()
    return self.assembleAllOutputs()
def testInputOrder(self):
    #
    # Create two models (train and validation) with the same input blob
    # names and ensure that both get the data in the correct order
    #
    workspace.ResetWorkspace()
    self.counters = {0: 0, 1: 1}

    def dummy_fetcher_rnn_ordered1(fetcher_id, batch_size):
        # Hardcoding some input blobs
        T = 20
        N = batch_size
        D = 33
        data = np.zeros((T, N, D))
        data[0][0][0] = self.counters[fetcher_id]
        label = np.random.randint(N, size=(T, N))
        label[0][0] = self.counters[fetcher_id]
        seq_lengths = np.random.randint(N, size=(N))
        seq_lengths[0] = self.counters[fetcher_id]
        self.counters[fetcher_id] += 1
        return [data, label, seq_lengths]

    workspace.ResetWorkspace()
    model = model_helper.ModelHelper(name="rnn_test_order")

    coordinator = data_workers.init_data_input_workers(
        model,
        input_blob_names=["data2", "label2", "seq_lengths2"],
        fetch_fun=dummy_fetcher_rnn_ordered1,
        batch_size=32,
        max_buffered_batches=1000,
        num_worker_threads=1,
        dont_rebatch=True,
        input_source_name='train',
    )
    coordinator.start()

    val_model = model_helper.ModelHelper(name="rnn_test_order_val")
    coordinator1 = data_workers.init_data_input_workers(
        val_model,
        input_blob_names=["data2", "label2", "seq_lengths2"],
        fetch_fun=dummy_fetcher_rnn_ordered1,
        batch_size=32,
        max_buffered_batches=1000,
        num_worker_threads=1,
        dont_rebatch=True,
        input_source_name='val',
    )
    coordinator1.start()

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.CreateNet(val_model.net)

    while coordinator._coordinators[0]._state._inputs < 900:
        time.sleep(0.01)

    with timeout_guard.CompleteInTimeOrDie(5):
        for m in (model, val_model):
            print(m.net.Proto().name)
            workspace.RunNet(m.net.Proto().name)
            last_data = workspace.FetchBlob('data2')[0][0][0]
            last_lab = workspace.FetchBlob('label2')[0][0]
            last_seq = workspace.FetchBlob('seq_lengths2')[0]

            # Run a few rounds
            for _i in range(10):
                workspace.RunNet(m.net.Proto().name)
                data = workspace.FetchBlob('data2')[0][0][0]
                lab = workspace.FetchBlob('label2')[0][0]
                seq = workspace.FetchBlob('seq_lengths2')[0]
                self.assertEqual(data, last_data + 1)
                self.assertEqual(lab, last_lab + 1)
                self.assertEqual(seq, last_seq + 1)
                last_data = data
                last_lab = lab
                last_seq = seq

    time.sleep(0.2)
    self.assertTrue(coordinator.stop())
def RunEpoch(args, epoch, train_model, test_model, explog,
             elapsed_training_time):
    """
    Run a training epoch on the train model, then compute the accuracy
    on the test model.
    :param args: the script's parameters
    :param epoch: the current epoch count
    :param train_model: the model on which training will be performed
    :param test_model: the model on which testing will be performed
    :param explog: the log object wrapping the file
    :param elapsed_training_time: total training time so far, in seconds
    """
    log.info("Starting epoch {}/{}".format(epoch + 1, args.epoch_count))
    epoch_iters = int(args.epoch_size / args.batch_size / args.num_shards)
    test_epoch_iters = int(
        args.test_epoch_size / args.batch_size / args.num_shards)
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])

    total_time = 0.
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600 if i == 0 else 300
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            total_time += dt

        # Log the time it took to run the current batch
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(
            fmt.format(i + 1, epoch_iters, epoch + 1, args.batch_size / dt))

        # Get the accuracy and loss for this particular device
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')

        # Write the training loss and accuracy for this batch
        log.info("Training loss: {}, accuracy: {}".format(loss, accuracy))

    # Compute the total number of images processed for this epoch;
    # get the accuracy and the loss
    num_images = (epoch + 1) * epoch_iters * args.batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')

    try:
        learning_rate = workspace.FetchBlob(
            (prefix if args.per_device_optimization else '') +
            data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    except AttributeError:
        log.error(
            "The learning rate could not be found on this peer; this is "
            "likely due to the --per_device_optimization=True option.")
        learning_rate = 'unknown'

    # Prepare the parameters required for testing
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)

            # Aggregate the accuracy across all the devices involved
            # in testing
            for g in test_model._devices:
                test_accuracy += np.asscalar(
                    workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy'))
                test_accuracy_top5 += np.asscalar(
                    workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy_top5'))
                ntests += 1

        # Compute the average test accuracy and the average top-5 test
        # accuracy across a test epoch, and across all devices involved
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests

    # Log the results to stdout, update total training time
    elapsed_training_time += total_time
    on_target = test_accuracy >= args.target_accuracy
    log.info(
        "Finished testing on epoch {}. Obtained:\n"
        "Accuracy (Local - Training): {}\n"
        "Loss (Local - Training): {}\n"
        "Top-1 Acc: {}\nTop-5 Acc: {}\nOn target: {}\n"
        "Elapsed training time: {}".format(
            epoch + 1, accuracy, loss, test_accuracy, test_accuracy_top5,
            on_target, elapsed_training_time))

    # Log this epoch's results
    explog.log(
        input_count=num_images,
        batch_count=((epoch + 1) * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch + 1,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
            'target_accuracy': args.target_accuracy,
            'on_target': on_target,
            'elapsed_training_time': elapsed_training_time,
        })

    assert loss < 40, "Exploded gradients"

    return elapsed_training_time, on_target
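# A hypothetical driver for the RunEpoch variant above (the loop
# structure is an assumption for illustration): run epochs until the
# target accuracy is reached or the epoch budget is exhausted.
elapsed_training_time = 0.0
for epoch in range(args.epoch_count):
    elapsed_training_time, on_target = RunEpoch(
        args, epoch, train_model, test_model, explog,
        elapsed_training_time)
    if on_target:
        log.info("Target accuracy reached after {} epochs".format(epoch + 1))
        break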
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer.
    TODO: add checkpointing here.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
    train_accuracy = 0
    train_loss = 0
    display_count = 20
    prefix = "gpu_{}".format(train_model._devices[0])
    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        train_accuracy += workspace.FetchBlob(prefix + '/accuracy')
        train_loss += workspace.FetchBlob(prefix + '/loss')
        if (i + 1) % display_count == 0:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(
                train_loss / display_count, train_accuracy / display_count))
            r_train_accuracy.append(train_accuracy / display_count)
            r_loss.append(train_loss / display_count)
            train_accuracy = 0
            train_loss = 0

            test_accuracy = 0
            ntests = 0
            for _ in range(0, 20):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    test_accuracy += np.asscalar(
                        workspace.FetchBlob(
                            "gpu_{}".format(g) + '/accuracy'))
                    ntests += 1
            test_accuracy /= ntests
            r_test_accuracy.append(test_accuracy)

    num_images = epoch * epoch_iters * total_batch_size
    prefix = "gpu_{}".format(train_model._devices[0])
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob('SgdOptimizer_0_lr_gpu0')
    test_accuracy = 0
    if test_model is not None:
        # Run 100 iterations of testing
        ntests = 0
        for _ in range(0, 100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                test_accuracy += np.asscalar(
                    workspace.FetchBlob("gpu_{}".format(g) + '/accuracy'))
                ntests += 1
        test_accuracy /= ntests
        r_test_accuracy.append(test_accuracy)
    else:
        test_accuracy = -1

    test_fmt = "Testing accuracy: {}"
    log.info(test_fmt.format(test_accuracy))

    explog.log(
        input_count=num_images,
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1