Example #1
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch     #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)
            # limit the number of batches
            if batch_idx >= 3000:
                return

            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", 0):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Split data into sub-batches of size batch_size
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]
                # sync_e()
                lobj = {
                    "ph": "X",
                    "name": "foward",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                output = model(data_batch).logits
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {
                    "ph": "X",
                    "name": "compute-loss",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                with log_time(model_logger, "horovod-acc-comp", 0):
                    _acc = accuracy(output, target_batch)
                with log_time(model_logger, "horovod-acc-update", 0):
                    train_accuracy.update(_acc)
                with log_time(model_logger, "torch-loss-comp", 0):
                    loss = F.cross_entropy(output, target_batch)
                with log_time(model_logger, "horovod-loss-update", 0):
                    train_loss.update(loss)
                # Average gradients among sub-batches
                with log_time(model_logger, "avg-sub-batches-loss", 0):
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                # sync_e()
                lobj = {
                    "ph": "X",
                    "name": "backward",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                loss.backward()
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks
            lobj = {
                "ph": "X",
                "name": "update-gradients",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            optimizer.step()
            # time_batch.append(step14.elapsed_time(step1))
            #  step1.record()
            # if batch_idx == 3:
            #     file = open("correct.log", "w")
            #     for n, p in model.named_parameters():
            #         print(p, file=file)
            #     assert(False)
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({
                'loss': train_loss.avg.item(),
                'accuracy': 100. * train_accuracy.avg.item()
            })
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
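Each phase in the function above is timed by hand-building a Chrome-trace style "complete" event ("ph": "X"), running the phase, then filling in "dur" and logging the JSON line. The log_time context manager used for the finer-grained phases is not shown in these snippets; a minimal sketch of an equivalent helper (the name trace_event and its exact signature are assumptions, not the original code) could look like this:

import contextlib
import json
import time

@contextlib.contextmanager
def trace_event(logger, name, pid=0):
    # Emit one Chrome-trace style "complete" event around a code block,
    # matching the hand-written lobj dicts above (times are in seconds).
    lobj = {"ph": "X", "name": name, "ts": time.time(), "pid": pid, "dur": 0}
    try:
        yield
    finally:
        lobj["dur"] = time.time() - lobj["ts"]
        logger.info(json.dumps(lobj))

With such a helper the forward pass could be timed as "with trace_event(model_logger, 'forward'): output = model(data_batch).logits". The next variant adds per-parameter gradient timing on top of the same structure.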
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch     #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)
            # limit the number of batches
            if batch_idx >= 500:
                return

            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", 0):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Split data into sub-batches of size batch_size
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]
                # sync_e()
                lobj = {
                    "ph": "X",
                    "name": "foward",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                output = model(data_batch)
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {
                    "ph": "X",
                    "name": "compute-loss",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                with log_time(model_logger, "horovod-acc-comp", 0):
                    _acc = accuracy(output, target_batch)
                with log_time(model_logger, "horovod-acc-update", 0):
                    train_accuracy.update(_acc)
                with log_time(model_logger, "torch-loss-comp", 0):
                    loss = F.cross_entropy(output, target_batch)
                with log_time(model_logger, "horovod-loss-update", 0):
                    train_loss.update(loss)
                # Average gradients among sub-batches
                with log_time(model_logger, "avg-sub-batches-loss", 0):
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                # sync_e()
                lobj = {
                    "ph": "X",
                    "name": "backward",
                    "ts": time.time(),
                    "pid": 0,
                    "dur": 0
                }
                loss.backward()
                if batch_idx % 100 == 50:
                    # Flag_event, allEvent and fout are defined outside this
                    # snippet (see the sketch after this example).
                    Flag_event.record()
                    torch.cuda.synchronize()
                    time_dict = {
                        n: allEvent[n].elapsed_time(Flag_event)
                        for n, p in model.named_parameters()
                    }
                    time_dict = sorted(time_dict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
                    pprint("new step", stream=fout)
                    pprint(time_dict, stream=fout)
                # sync_e()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks
            lobj = {
                "ph": "X",
                "name": "update-gradients",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({
                'loss': train_loss.avg.item(),
                'accuracy': 100. * train_accuracy.avg.item()
            })
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
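The batch_idx % 100 == 50 block in this variant reports, for every parameter, the elapsed time between an event stored in allEvent and the marker Flag_event; allEvent, Flag_event and fout are created outside the snippet. A plausible setup, sketched here purely as an assumption (register_grad_events is a hypothetical helper), records one CUDA event per parameter from its gradient hook during backward:

import torch

def register_grad_events(model):
    # Hypothetical construction of allEvent: one CUDA timing event per
    # parameter, recorded from the parameter's gradient hook.
    all_event = {}

    def make_hook(name):
        def hook(grad):
            all_event[name].record()  # stamp when this gradient is produced
            return grad
        return hook

    for n, p in model.named_parameters():
        all_event[n] = torch.cuda.Event(enable_timing=True)
        if p.requires_grad:
            p.register_hook(make_hook(n))
    return all_event

Under this assumption Flag_event would be a single torch.cuda.Event(enable_timing=True) recorded after loss.backward(), and fout an open log file. The next variant drops the log_time wrappers around the loss and accuracy updates, tags events with hvd.rank(), and processes only the first sub-batch of every batch.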
def train(epoch):
    model.train()
    train_sampler.set_epoch(epoch)
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch     #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)
            # if batch_idx >= 50:
            #     return
            if args.cuda:
                with log_time(model_logger, "batch-data-tocuda", hvd):
                    data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Split data into sub-batches of size batch_size
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]
                lobj = {
                    "ph": "X",
                    "name": "foward",
                    "ts": time.time(),
                    "pid": hvd.rank(),
                    "dur": 0
                }
                output = model(data_batch)
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))

                lobj = {
                    "ph": "X",
                    "name": "compute-loss",
                    "ts": time.time(),
                    "pid": hvd.rank(),
                    "dur": 0
                }
                train_accuracy.update(accuracy(output, target_batch))
                loss = F.cross_entropy(output, target_batch)
                train_loss.update(loss)
                # Average gradients among sub-batches
                loss.div_(math.ceil(float(len(data)) / args.batch_size))
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))
                lobj = {
                    "ph": "X",
                    "name": "backward",
                    "ts": time.time(),
                    "pid": hvd.rank(),
                    "dur": 0
                }
                loss.backward()
                lobj["dur"] = time.time() - lobj["ts"]
                model_logger.info(json.dumps(lobj))
                break  # only the first sub-batch of each loader batch is processed

            # Gradient is applied across all ranks
            lobj = {
                "ph": "X",
                "name": "update-gradients",
                "ts": time.time(),
                "pid": hvd.rank(),
                "dur": 0
            }
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({
                'loss': train_loss.avg.item(),
                'accuracy': 100. * train_accuracy.avg.item()
            })
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
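train_loss and train_accuracy are Metric objects whose class is not included in these snippets. In the standard Horovod PyTorch examples, Metric keeps a running average that is allreduced across ranks; the sketch below assumes that same definition and should be read as an assumption rather than the exact code used here:

import torch
import horovod.torch as hvd

class Metric(object):
    # Running average of a scalar, averaged over all Horovod ranks.
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        # hvd.allreduce averages the value across ranks by default.
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n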
Example #4
def train(epoch):
    model.train()
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    with tqdm(total=len(train_loader),
              desc='Train Epoch     #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for batch_idx in range(500):
            # data_batch, target_batch and data are not defined in this
            # snippet; they are presumably fixed tensors prepared outside
            # the loop (this variant benchmarks 500 identical iterations).
            optimizer.zero_grad()

            # sync_e()
            lobj = {
                "ph": "X",
                "name": "foward",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            output = model(data_batch)
            # sync_e()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            lobj = {
                "ph": "X",
                "name": "compute-loss",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            with log_time(model_logger, "torch-loss-comp", 0):
                loss = F.cross_entropy(output, target_batch)
            # Average gradients among sub-batches
            with log_time(model_logger, "avg-sub-batches-loss", 0):
                loss.div_(math.ceil(float(len(data)) / args.batch_size))
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            # sync_e()
            lobj = {
                "ph": "X",
                "name": "backward",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            loss.backward()
            # sync_e()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            # Gradient is applied across all ranks
            lobj = {
                "ph": "X",
                "name": "update-gradients",
                "ts": time.time(),
                "pid": 0,
                "dur": 0
            }
            optimizer.step()
            lobj["dur"] = time.time() - lobj["ts"]
            model_logger.info(json.dumps(lobj))

            t.set_postfix({
                'loss': train_loss.avg.item(),
                'accuracy': 100. * train_accuracy.avg.item()
            })
            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)
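Stripped of the tracing code, all four variants share the same gradient-accumulation pattern: the batch delivered by train_loader is split into sub-batches of args.batch_size, each sub-batch loss is divided by the number of sub-batches so that the accumulated gradients approximate those of the full batch, and optimizer.step() runs once per full batch (with Horovod, gradients are averaged across ranks before the update). A minimal sketch of that core loop, with illustrative names:

import math
import torch.nn.functional as F

def accumulate_step(model, optimizer, data, target, sub_batch_size):
    # One optimizer step over a large batch, accumulated in sub-batches.
    optimizer.zero_grad()
    num_sub = math.ceil(len(data) / sub_batch_size)
    for i in range(0, len(data), sub_batch_size):
        output = model(data[i:i + sub_batch_size])
        loss = F.cross_entropy(output, target[i:i + sub_batch_size])
        # Scale each sub-batch loss so the summed gradients approximate
        # the mean gradient over the whole batch.
        (loss / num_sub).backward()
    # With Horovod's DistributedOptimizer, step() applies gradients that
    # have been averaged across all ranks.
    optimizer.step()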