def train(epoch):
    # Globals assumed from the enclosing script: model, optimizer, args, train_loader,
    # train_sampler, verbose, log_writer, model_logger, log_time, Metric, accuracy,
    # adjust_learning_rate; requires time, json, math, tqdm, torch.nn.functional as F.
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)

            # Limit the number of batches per epoch.
            if batch_idx >= 3000:
                return

            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", 0):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()

            # Split data into sub-batches of size batch_size.
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]

                # sync_e()
                lobj = {"ph": "X", "name": "forward", "ts": time.time(),
                        "pid": 0, "dur": 0}
                output = model(data_batch).logits
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {"ph": "X", "name": "compute-loss", "ts": time.time(),
                        "pid": 0, "dur": 0}
                with log_time(model_logger, "horovod-acc-comp", 0):
                    _acc = accuracy(output, target_batch)
                with log_time(model_logger, "horovod-acc-update", 0):
                    train_accuracy.update(_acc)
                with log_time(model_logger, "torch-loss-comp", 0):
                    loss = F.cross_entropy(output, target_batch)
                with log_time(model_logger, "horovod-loss-update", 0):
                    train_loss.update(loss)
                # Average gradients among sub-batches.
                with log_time(model_logger, "avg-sub-batches-loss", 0):
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                # sync_e()
                lobj = {"ph": "X", "name": "backward", "ts": time.time(),
                        "pid": 0, "dur": 0}
                loss.backward()
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks.
            lobj = {"ph": "X", "name": "update-gradients", "ts": time.time(),
                    "pid": 0, "dur": 0}
            optimizer.step()
            # time_batch.append(step14.elapsed_time(step1))
            # step1.record()
            # if batch_idx == 3:
            #     file = open("correct.log", "w")
            #     for n, p in model.named_parameters():
            #         print(p, file=file)
            #     assert(False)
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({'loss': train_loss.avg.item(),
                           'accuracy': 100. * train_accuracy.avg.item()})
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
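# The variants above and below rely on a `log_time` context manager whose implementation
# is not shown. The sketch below is an assumption that matches how it is called
# (logger, event name, pid-like tag) and the Chrome-trace "X" events emitted by the
# inline `lobj` blocks; it is not the project's actual helper.
import contextlib
import json
import time


@contextlib.contextmanager
def log_time(logger, name, pid):
    # Log a wall-clock "complete" event covering the body of the `with` block.
    start = time.time()
    try:
        yield
    finally:
        logger.info(json.dumps({
            "ph": "X",            # Chrome-trace "complete" event
            "name": name,
            "ts": start,
            "pid": pid,
            "dur": time.time() - start,
        }))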
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)

            # Limit the number of batches per epoch.
            if batch_idx >= 500:
                return

            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", 0):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()

            # Split data into sub-batches of size batch_size.
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]

                # sync_e()
                lobj = {"ph": "X", "name": "forward", "ts": time.time(),
                        "pid": 0, "dur": 0}
                output = model(data_batch)
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {"ph": "X", "name": "compute-loss", "ts": time.time(),
                        "pid": 0, "dur": 0}
                with log_time(model_logger, "horovod-acc-comp", 0):
                    _acc = accuracy(output, target_batch)
                with log_time(model_logger, "horovod-acc-update", 0):
                    train_accuracy.update(_acc)
                with log_time(model_logger, "torch-loss-comp", 0):
                    loss = F.cross_entropy(output, target_batch)
                with log_time(model_logger, "horovod-loss-update", 0):
                    train_loss.update(loss)
                # Average gradients among sub-batches.
                with log_time(model_logger, "avg-sub-batches-loss", 0):
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                # sync_e()
                lobj = {"ph": "X", "name": "backward", "ts": time.time(),
                        "pid": 0, "dur": 0}
                loss.backward()

                # Periodically dump per-parameter gradient-ready times.
                # Requires `from pprint import pprint`; allEvent, Flag_event and fout
                # are created outside this function.
                if batch_idx % 100 == 50:
                    Flag_event.record()
                    torch.cuda.synchronize()
                    time_dict = {n: allEvent[n].elapsed_time(Flag_event)
                                 for n, p in model.named_parameters()}
                    time_dict = sorted(time_dict.items(),
                                       key=lambda x: x[1], reverse=True)
                    pprint("new step", stream=fout)
                    pprint(time_dict, stream=fout)

                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks.
            lobj = {"ph": "X", "name": "update-gradients", "ts": time.time(),
                    "pid": 0, "dur": 0}
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({'loss': train_loss.avg.item(),
                           'accuracy': 100. * train_accuracy.avg.item()})
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
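# The per-parameter timing in the variant above reads `allEvent` (one CUDA event per
# parameter) and `Flag_event`, which are created elsewhere. Below is a minimal sketch of
# one way they could be populated, assuming each event is recorded when that parameter's
# gradient becomes ready during backward(); the hook-based wiring and the log file name
# are assumptions, not the project's actual setup.
import torch

allEvent = {}
Flag_event = torch.cuda.Event(enable_timing=True)
fout = open("grad_ready_times.log", "w")


def register_grad_ready_events(model):
    for name, param in model.named_parameters():
        allEvent[name] = torch.cuda.Event(enable_timing=True)

        # Record the event on the current CUDA stream when this parameter's
        # gradient is produced; elapsed_time(Flag_event) then measures how long
        # before the end-of-step marker the gradient was ready.
        def _record(grad, event=allEvent[name]):
            event.record()
            return grad

        param.register_hook(_record)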
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)

            # if batch_idx >= 50:
            #     return

            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", hvd):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()

            # Split data into sub-batches of size batch_size.
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]

                lobj = {"ph": "X", "name": "forward", "ts": time.time(),
                        "pid": hvd.rank(), "dur": 0}
                output = model(data_batch)
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {"ph": "X", "name": "compute-loss", "ts": time.time(),
                        "pid": hvd.rank(), "dur": 0}
                train_accuracy.update(accuracy(output, target_batch))
                loss = F.cross_entropy(output, target_batch)
                train_loss.update(loss)
                # Average gradients among sub-batches.
                loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {"ph": "X", "name": "backward", "ts": time.time(),
                        "pid": hvd.rank(), "dur": 0}
                loss.backward()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))
                # Only the first sub-batch is processed in this variant.
                break

            # Gradient is applied across all ranks.
            lobj = {"ph": "X", "name": "update-gradients", "ts": time.time(),
                    "pid": hvd.rank(), "dur": 0}
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({'loss': train_loss.avg.item(),
                           'accuracy': 100. * train_accuracy.avg.item()})
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
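# `Metric` and `accuracy` are used but not defined in these snippets. The sketch below
# follows the pattern of Horovod's PyTorch examples (an allreduce-averaged scalar metric
# and top-1 accuracy); treat it as an assumption about the helpers, not a copy of this
# repo's versions.
import torch
import horovod.torch as hvd


class Metric(object):
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        # Average the scalar across ranks, then accumulate locally.
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n


def accuracy(output, target):
    # Fraction of correct top-1 predictions in this sub-batch.
    pred = output.max(1, keepdim=True)[1]
    return pred.eq(target.view_as(pred)).cpu().float().mean()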
def train(epoch):
    # Fixed-input benchmark loop: data, data_batch and target_batch are prepared
    # outside this function and reused on every iteration.
    model.train()
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx in range(500):
            optimizer.zero_grad()

            # sync_e()
            lobj = {"ph": "X", "name": "forward", "ts": time.time(),
                    "pid": 0, "dur": 0}
            output = model(data_batch)
            # sync_e()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            lobj = {"ph": "X", "name": "compute-loss", "ts": time.time(),
                    "pid": 0, "dur": 0}
            with log_time(model_logger, "torch-loss-comp", 0):
                loss = F.cross_entropy(output, target_batch)
            # Average gradients among sub-batches.
            with log_time(model_logger, "avg-sub-batches-loss", 0):
                loss.div_(math.ceil(float(len(data)) / args.batch_size))
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            # sync_e()
            lobj = {"ph": "X", "name": "backward", "ts": time.time(),
                    "pid": 0, "dur": 0}
            loss.backward()
            # sync_e()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks.
            lobj = {"ph": "X", "name": "update-gradients", "ts": time.time(),
                    "pid": 0, "dur": 0}
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            # Note: train_loss / train_accuracy are never updated in this variant.
            t.set_postfix({'loss': train_loss.avg.item(),
                           'accuracy': 100. * train_accuracy.avg.item()})
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
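# All four variants log one JSON object per profiling event in Chrome-trace "complete"
# ("ph": "X") form, with "ts" and "dur" in seconds. Below is a small post-processing
# sketch that turns such a log into a file loadable by chrome://tracing; the log path and
# the assumption that `model_logger` writes exactly one JSON object per line are
# hypothetical.
import json


def logfile_to_chrome_trace(log_path, trace_path):
    events = []
    with open(log_path) as f:
        for line in f:
            line = line.strip()
            if not line.startswith("{"):
                continue  # skip non-JSON logger lines
            evt = json.loads(line)
            # chrome://tracing expects timestamps and durations in microseconds.
            evt["ts"] = evt["ts"] * 1e6
            evt["dur"] = evt["dur"] * 1e6
            evt.setdefault("tid", 0)
            events.append(evt)
    with open(trace_path, "w") as f:
        json.dump({"traceEvents": events}, f)


# Example usage (hypothetical file names):
# logfile_to_chrome_trace("model_timing.log", "trace.json")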