def time(solver, nccl):
    """Attach per-layer timing instrumentation to a multi-GPU solver.

    Registers caffe Timers around each layer's forward and backward pass,
    around the whole solver step, and around the NCCL allreduce, then logs
    a timing report every ``solver.param.display`` iterations.

    Args:
        solver: a caffe solver exposing ``net``, ``param`` and callbacks.
        nccl: the NCCL callback object inserted between step and allreduce.
    """
    layer_count = len(solver.net.layers)
    fwd_timers = [caffe.Timer() for _ in range(layer_count)]
    bwd_timers = [caffe.Timer() for _ in range(layer_count)]
    step_timer = caffe.Timer()
    allreduce_timer = caffe.Timer()
    display = solver.param.display

    def show_time():
        # Only report on display iterations.
        if solver.iter % display != 0:
            return
        report = ['']
        for i in range(layer_count):
            report.append('forw %3d %8s : %.2f' %
                          (i, solver.net._layer_names[i], fwd_timers[i].ms))
        # Backward timings are reported in reverse layer order, matching
        # the order the backward pass actually runs.
        for i in reversed(range(layer_count)):
            report.append('back %3d %8s : %.2f' %
                          (i, solver.net._layer_names[i], bwd_timers[i].ms))
        report.append('solver total: %.2f' % step_timer.ms)
        report.append('allreduce: %.2f' % allreduce_timer.ms)
        caffe.log('\n'.join(report) + '\n')

    # Per-layer hooks: the net passes the layer index into each callback.
    solver.net.before_forward(lambda layer: fwd_timers[layer].start())
    solver.net.after_forward(lambda layer: fwd_timers[layer].stop())
    solver.net.before_backward(lambda layer: bwd_timers[layer].start())
    solver.net.after_backward(lambda layer: bwd_timers[layer].stop())
    # The allreduce window runs from the end of the step until after nccl.
    solver.add_callback(lambda: step_timer.start(),
                        lambda: (step_timer.stop(), allreduce_timer.start()))
    solver.add_callback(nccl)
    solver.add_callback(lambda: '',
                        lambda: (allreduce_timer.stop(), show_time()))
def time(net, iters):
    """Benchmark per-layer forward/backward times of a net.

    Runs ``iters`` full forward/backward passes, timing each layer via the
    net's before/after callbacks, then logs a per-layer timing report.

    Args:
        net: a caffe.Net to benchmark.
        iters: number of forward/backward iterations to run.
    """
    fprop = []
    bprop = []
    total = caffe.Timer()
    for _ in range(len(net.layers)):
        fprop.append(caffe.Timer())
        bprop.append(caffe.Timer())

    def show_time():
        s = '\n'
        for i in range(len(net.layers)):
            s += 'forw %3d %8s ' % (i, net._layer_names[i])
            s += ': %.2f\n' % fprop[i].ms
        # Backward timings reported in reverse layer order.
        for i in range(len(net.layers) - 1, -1, -1):
            s += 'back %3d %8s ' % (i, net._layer_names[i])
            s += ': %.2f\n' % bprop[i].ms
        s += 'solver total: %.2f\n' % total.ms
        caffe.log(s)

    # The net invokes these hooks with the layer index for each pass.
    net.before_forward(lambda layer: fprop[layer].start())
    net.after_forward(lambda layer: fprop[layer].stop())
    net.before_backward(lambda layer: bprop[layer].start())
    net.after_backward(lambda layer: bprop[layer].stop())
    total.start()
    # Fix: was `xrange` (Python 2 only); `range` is behaviorally identical
    # for iteration and keeps the function runnable on Python 3.
    for _ in range(iters):
        net.forward()
        net.backward()
    total.stop()
    show_time()
def run_train(solver, max_epoch, max_tol, output_blob, label_blob):
    """Run the training loop with per-epoch validation and early stopping.

    Steps the solver one iteration at a time, tracking a running mean of the
    weighted training loss. When the data layer reports a new epoch, runs
    validation, snapshots the model (renamed to ``epoch{N}.*``), and applies
    early stopping: training ends after ``max_tol`` consecutive epochs
    without improvement or after ``max_epoch`` epochs.

    Args:
        solver: a caffe solver; its net's layer 0 is assumed to be a data
            layer exposing ``epoch``, ``processed_num``, ``num_samples`` and
            ``au_names`` — TODO confirm against the data layer definition.
        max_epoch: maximum number of epochs to train.
        max_tol: early-stopping patience, in epochs.
        output_blob: prediction blob name, forwarded to ``run_validation``.
        label_blob: label blob name, forwarded to ``run_validation``.

    Side effects: writes ``train.log`` and ``info.txt`` under the snapshot
    directory, renames solver snapshots, prints progress to stdout.
    """
    timer = caffe.Timer()
    # Time each solver step so we can report seconds/iteration.
    solver.add_callback(lambda: timer.start(), lambda: timer.stop())
    # Fix: was `iteritems()` (Python 2 only); `.items()` yields the same
    # pairs and works on both Python 2 and 3.
    loss_weights = [(n, w)
                    for n, w in solver.net.blob_loss_weights.items()
                    if w > 0.0]
    snapshot_prefix = solver.param.snapshot_prefix
    model_dir = os.path.dirname(snapshot_prefix)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(model_dir, 'train.log'),
                                  mode='w')
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Fix: the info file previously leaked on any exception in the loop;
    # `with` guarantees it is closed. Normal-path behavior is unchanged.
    with open(os.path.join(model_dir, 'info.txt'), 'w') as f_info:
        f_info.write('AUs:' +
                     ','.join([au[2:]
                               for au in solver.net.layers[0].au_names]) +
                     '\n')
        epoch = 0
        it = 0
        last_it = 0          # first iteration of the current epoch
        tol = 0              # consecutive epochs without improvement
        train_loss = 0.0     # running mean loss over the current epoch
        best_epoch = -1
        best = -np.inf
        while True:
            solver.step(1)
            # Weighted sum of all positive-weight loss blobs.
            loss = np.sum([
                solver.net.blobs[n].data[...].copy() * w
                for n, w in loss_weights
            ])
            # Incremental mean over the iterations of this epoch.
            train_loss = ((train_loss * (it - last_it) + loss) /
                          (it + 1 - last_it))
            message = ('===Epoch<{}>...prog: {}/{}, train_loss: {}, '
                       'speed: {:.3f}s/iter \r')
            sys.stdout.flush()
            sys.stdout.write(
                message.format(solver.net.layers[0].epoch,
                               solver.net.layers[0].processed_num,
                               solver.net.layers[0].num_samples, train_loss,
                               timer.ms / 1000.0))
            # The data layer bumps its `epoch` counter when a pass completes.
            if epoch < solver.net.layers[0].epoch:
                average, results, au_names = run_validation(
                    solver, output_blob, label_blob)
                solver.snapshot()
                # Rename caffe's iteration-named snapshots to epoch names.
                os.rename(
                    snapshot_prefix + '_iter_{}.solverstate'.format(it + 1),
                    os.path.join(model_dir,
                                 'epoch{}.solverstate'.format(epoch)))
                os.rename(
                    snapshot_prefix + '_iter_{}.caffemodel'.format(it + 1),
                    os.path.join(model_dir,
                                 'epoch{}.caffemodel'.format(epoch)))
                if average > best:
                    print('GOOD!\n')
                    best = average
                    best_epoch = epoch
                    tol = 0
                else:
                    tol += 1
                # One "<name> <score>" line per AU, names and results
                # interleaved positionally.
                val_string = '{:<5} {:<.3f}\n' * len(au_names)
                val_list = [e for t in zip(au_names, results) for e in t]
                val_string = val_string.format(*val_list)
                print('Validation average: {}, tol: {}/{}'.format(
                    average, tol, max_tol))
                print(val_string)
                logger.info('Epoch<{}> ends'.format(epoch))
                message = 'train_loss: {}, validation_average: {}, tol: {}/{}'
                logger.info(message.format(train_loss, average, tol, max_tol))
                logger.info('Validation results: \n' + val_string)
                # Early stopping: patience exhausted or epoch budget reached.
                if tol >= max_tol or epoch == (max_epoch - 1):
                    print('\n\n\n===== Training finish =====')
                    message = 'End at epoch {}, best: {}, best_epoch: {}\n'
                    print(message.format(epoch, best, best_epoch))
                    logger.info('Training process ends')
                    logger.info(message.format(epoch, best, best_epoch))
                    break
                epoch += 1
                train_loss = 0.0
                last_it = it + 1
                handler.flush()
            it += 1
        f_info.write('Best epoch:{}\n'.format(best_epoch))
        f_info.write('Best validation average:{}\n'.format(best))
    return