# Example 1
def train(config):
    """Run the WaveFlow training loop described by ``config``.

    Sets up the (possibly multi-process) training environment, creates the
    checkpoint and log directories, fixes the random seeds, then alternates
    training steps with periodic validation.  Only the rank-0 process writes
    VisualDL logs and saves checkpoints.
    """
    env = dg.parallel.Env()
    proc_rank, world_size = env.local_rank, env.nranks
    is_chief = proc_rank == 0

    if is_chief:
        # Print the whole config setting.
        pprint(vars(config))

    # Make checkpoint directory.
    run_dir = os.path.join("runs", config.model, config.name)
    checkpoint_dir = os.path.join(run_dir, "checkpoint")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Only the chief process creates a VisualDL writer.
    vdl = LogWriter(os.path.join(run_dir, "logs")) if is_chief else None

    # Configurate device.
    place = fluid.CUDAPlace(proc_rank) if config.use_gpu else fluid.CPUPlace()

    with dg.guard(place):
        # Fix random seed for reproducibility.
        seed = config.seed
        random.seed(seed)
        np.random.seed(seed)
        fluid.default_startup_program().random_seed = seed
        fluid.default_main_program().random_seed = seed
        print("Random Seed: ", seed)

        # Build model; ``build`` returns the iteration to resume from.
        model = WaveFlow(config, checkpoint_dir, world_size > 1, proc_rank,
                         world_size, vdl)
        iteration = model.build()

        while iteration < config.max_iterations:
            # Run one single training step.
            model.train_step(iteration)
            iteration += 1

            if iteration % config.test_every == 0:
                # Run validation step.
                model.valid_step(iteration)

            if is_chief and iteration % config.save_every == 0:
                # Save parameters.
                model.save(iteration)

    # Close the VisualDL writer.
    if is_chief:
        vdl.close()
# Example 2
def synthesis(text_input, args):
    """Synthesize a waveform for ``text_input`` with a trained FastSpeech model.

    The mel spectrogram predicted by FastSpeech is vocoded with either
    Griffin-Lim or WaveFlow (selected via ``args.vocoder``), logged to
    VisualDL, and written to ``<args.output>/samples/<vocoder>.wav``.

    Raises:
        ValueError: if ``args.vocoder`` is not 'griffin-lim' or 'waveflow'.
    """
    # Fail fast on an unsupported vocoder.  The original code only printed a
    # message and fell through, which then crashed with a NameError on the
    # undefined ``wav`` below.
    if args.vocoder not in ('griffin-lim', 'waveflow'):
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, but received %s.'
            % args.vocoder)

    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # VisualDL log directory.
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    # Shape the token ids to (1, T) and build 1-based position indices.
    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # Synthesis with Griffin-Lim.
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    else:
        # Synthesis with WaveFlow (the only other value after validation).
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
# Example 3
class VisualDL(Callback):
    """Callback that logs training scalars to VisualDL.

    Scalars from ``logs`` are written under the ``Train/`` namespace every
    ``freq`` iterations, on the rank-0 process only.
    """

    def __init__(self, log_dir="./log", freq=1):
        super(VisualDL, self).__init__()
        self.log_dir = log_dir  # directory where LogWriter stores records
        self.freq = freq        # logging period, in iterations

    def on_train_begin(self, logs=None):
        # Create the writer lazily, at the start of training.
        self.writer = LogWriter(self.log_dir)

    def on_iter_end(self, iter, logs=None):
        metrics = logs if logs else {}
        should_log = (iter % self.freq == 0
                      and ParallelEnv().local_rank == 0)
        if should_log:
            for name, value in metrics.items():
                self.writer.add_scalar("Train/{}".format(name), value, iter)
        # Flush on every iteration, matching the original behavior.
        self.writer.flush()

    def on_train_end(self, logs=None):
        self.writer.close()
# Example 4
class VisualHook(Hook):
    """Hook that writes training metrics and parameter histograms to VisualDL.

    All work is performed on the rank-0 process only; every callback returns
    immediately on other ranks.
    """

    def __init__(self, priority=1):
        # Hook ordering priority within the trainer.
        self.priority = priority

    def run_begin(self, trainer):
        """Create the VisualDL writer under ``<output_dir>/visual_dl``."""
        if dist.get_rank() != 0:
            return
        logdir = os.path.join(trainer.output_dir, 'visual_dl')
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        self.writer = LogWriter(logdir=logdir)
        # app.run(logdir=logdir, port=8040, host="0.0.0.0")

    def train_epoch_end(self, trainer):
        """Log per-epoch averaged metrics and non-BN parameter histograms."""
        if dist.get_rank() != 0:
            return
        outputs = trainer.outputs
        for k in outputs.keys():
            v = trainer.logs[k].avg
            self.writer.add_scalar(tag='train/{}'.format(k),
                                   step=trainer.current_epoch,
                                   value=v)
        with paddle.no_grad():
            # Under DataParallel the real model lives in ``_layers``; the two
            # branches of the original code were otherwise identical, so
            # select the parameter source once and share the loop.
            model = (trainer.model._layers
                     if dist.get_world_size() > 1 else trainer.model)
            for name, param in model.named_parameters():
                if 'bn' not in name:
                    self.writer.add_histogram(name, param.numpy(),
                                              trainer.current_epoch)

    def run_end(self, trainer):
        """Close the writer when the run finishes."""
        if dist.get_rank() != 0:
            return
        self.writer.close()
# Example 5
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5):
    """
    Launch training.
    Args:
        model(nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
    """
    model.train()
    # Distributed context: total ranks and this process's rank.
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        # Restore model/optimizer state and the iteration to resume from.
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir is a directory; a plain file of the same name is removed.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        # NOTE(review): both branches wrap the model in DataParallel; only
        # the environment initialization differs.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Running sums for the logging window (reset every log_iters).
    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    # FIFO of checkpoint directories, bounded by keep_checkpoint_max.
    save_models = deque()
    batch_start = time.time()

    # Main loop counts optimizer steps (iters), not epochs.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                # Optional third element carries edge labels for edge-aware losses.
                edges = data[2].astype('int64')

            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)
            loss_list = loss_computation(logits_list=logits_list,
                                         labels=labels,
                                         losses=losses,
                                         edges=edges)
            loss = sum(loss_list)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Advance the LR schedule per iteration when a scheduler is used.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            # loss.numpy()[0]: scalar from a 1-element array (pre-2.x Paddle style).
            avg_loss += loss.numpy()[0]
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            # Periodic console/VisualDL logging on rank 0 only.
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                # Reset the logging window.
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # Periodic evaluation; mean_iou/acc/class_iou are consumed by the
            # checkpoint block below, which fires on the same condition.
            if (iter % save_interval == 0 or iter == iters) and (val_dataset
                                                                 is not None):
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc, class_iou, _, _ = evaluate(
                    model, val_dataset, num_workers=num_workers)
                model.train()

            # Checkpointing on rank 0: rotate old snapshots, track best mIoU.
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        for i, iou in enumerate(class_iou):
                            log_writer.add_scalar('Evaluate/IoU {}'.format(i),
                                                  float(iou), iter)

                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:

        def count_syncbn(m, x, y):
            # Custom FLOPs counter: 2 ops per element for SyncBatchNorm.
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        # Uses the shape of the last training batch for the flops estimate.
        _, c, h, w = images.shape
        flops = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
# Example 6
def do_train(args):
    """Pretrain ERNIE-Health (ELECTRA-style generator + discriminator).

    Builds or restores the model, streams the medical corpus through a
    distributed dataloader, and runs the training loop with optional AMP.
    Rank 0 additionally logs losses to VisualDL and writes checkpoints.
    """
    # NOTE(review): conditional expression used only for its side effect.
    paddle.enable_static() if not args.eager_run else None
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    # Per-rank worker seed so dataloader workers differ across processes.
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initialize a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models:
        # Fresh start from a known pretrained configuration.
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-generator']))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-discriminator']))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            # run_states.json records the model name and resume step; note
            # that config_dict/model_name are reused later in this function.
            with open(os.path.join(args.model_name_or_path, 'run_states.json'),
                      'r') as f:
                config_dict = json.load(f)
                model_name = config_dict['model_name']
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-generator']))
                discriminator = ErnieHealthDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-discriminator']))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     'model_state.pdparams')))
            else:
                raise ValueError(
                    'initialize a model from ckpt need model_name '
                    'in model_config_file. The supported model_name '
                    'are as follows: {}'.format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                'initialize a model need identifier or the '
                'directory of storing model. if use identifier, the supported model '
                'identifiers are as follows: {}, if use directory, '
                'make sure set init_from_ckpt as True'.format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ErnieHealthPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config['vocab_size'],
        model.gen_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    logger.info('start load data : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))

    train_dataset = MedicalCorpus(data_path=args.input_dir,
                                  tokenizer=tokenizer)
    logger.info('load data done, total : %s s' % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm_prob=args.mlm_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        mode='train',
        # NOTE(review): `in 'gpu'` is a substring test, so 'g'/'gp' also
        # match — presumably intended as equality; confirm against callers.
        use_gpu=True if args.device in 'gpu' else False,
        data_collator=data_collator)

    # Derive total steps and recompute the epoch count to cover them.
    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_epochs)
    args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    logger.info('start train : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    trained_global_step = global_step = 0
    # Cumulative / last-logged loss tensors per component (loss, gen, rtd, mts, csp).
    t_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    log_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    loss_list = defaultdict(list)
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        # Resume optimizer state; config_dict was loaded in the checkpoint
        # branch above, which is guarded by the same condition.
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, 'model_state.pdopt')))
        trained_global_step = global_step = config_dict['global_step']
        if trained_global_step < num_training_steps:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, seeking next step : %s'
                % (trained_global_step, trained_global_step + 1))
        else:
            logger.info(
                '[ start train from checkpoint ] we have already trained %s steps, but total training steps is %s, please check configuration !'
                % (trained_global_step, num_training_steps))
            exit(0)

    if paddle.distributed.get_rank() == 0:
        writer = LogWriter(os.path.join(args.output_dir, 'loss_log'))

    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader):
            # Skip batches already consumed before the checkpoint was taken.
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            masked_input_ids, input_ids, gen_labels = batch

            if args.use_amp:
                # Mixed-precision path: scale the loss before backward.
                with paddle.amp.auto_cast():
                    gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                        input_ids=masked_input_ids,
                        raw_input_ids=input_ids,
                        generator_labels=gen_labels)
                    loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                        gen_logits, gen_labels, logits_rtd, logits_mts,
                        logits_csp, disc_labels, masks)

                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                # Full-precision path.
                gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                    input_ids=masked_input_ids,
                    raw_input_ids=input_ids,
                    generator_labels=gen_labels)
                loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                    gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp,
                    disc_labels, masks)
                loss.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                # Average each loss component over the logging window.
                local_loss = dict([
                    (k, (t_loss[k] - log_loss[k]) / args.logging_steps)
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                ])
                if paddle.distributed.get_world_size() > 1:
                    # Gather per-rank losses; rank 0 averages and logs them.
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']:
                        paddle.distributed.all_gather(loss_list[k],
                                                      local_loss[k])
                    if paddle.distributed.get_rank() == 0:
                        tmp_loss = dict([
                            (k,
                             float((paddle.stack(loss_list[k]).sum() /
                                    len(loss_list[k])).numpy()))
                            for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                        ])
                        log_str = (
                            'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                            'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                            'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                        ).format(
                            global_step, num_training_steps, epoch, step,
                            tmp_loss['loss'], tmp_loss['gen'],
                            tmp_loss['rtd'], tmp_loss['mts'], tmp_loss['csp'],
                            optimizer.get_lr(),
                            (time.time() - tic_train) / args.logging_steps)
                        logger.info(log_str)
                        log_list.append(log_str)
                        # rtd/mts losses are rescaled for readability on the chart.
                        writer.add_scalar('generator_loss', tmp_loss['gen'],
                                          global_step)
                        writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50,
                                          global_step)
                        writer.add_scalar('mts_loss', tmp_loss['mts'] * 20,
                                          global_step)
                        writer.add_scalar('csp_loss', tmp_loss['csp'],
                                          global_step)
                        writer.add_scalar('total_loss', tmp_loss['loss'],
                                          global_step)
                        writer.add_scalar('lr', optimizer.get_lr(),
                                          global_step)
                    loss_list = defaultdict(list)
                else:
                    # Single-process logging path.
                    local_loss = dict([(k, v.numpy()[0])
                                       for k, v in local_loss.items()])
                    log_str = (
                        'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                        'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                        'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                    ).format(global_step, num_training_steps, epoch, step,
                             local_loss['loss'], local_loss['gen'],
                             local_loss['rtd'], local_loss['mts'],
                             local_loss['csp'], optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    logger.info(log_str)
                    log_list.append(log_str)
                    loss_dict = {
                        'generator_loss': local_loss['gen'],
                        'rtd_loss': local_loss['rtd'] * 50,
                        'mts_loss': local_loss['mts'] * 20,
                        'csp_loss': local_loss['csp']
                    }
                    for k, v in loss_dict.items():
                        writer.add_scalar('loss/%s' % k, v, global_step)
                    writer.add_scalar('total_loss', local_loss['loss'],
                                      global_step)
                    writer.add_scalar('lr', optimizer.get_lr(), global_step)
                # Snapshot cumulative losses for the next window's delta.
                log_loss = dict(t_loss)
                tic_train = time.time()

            if global_step % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    # output_dir is a directory despite the .pdparams suffix.
                    output_dir = os.path.join(
                        args.output_dir, 'model_%d.pdparams' % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        'model_name': model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        'global_step': global_step,
                        'epoch': epoch,
                        'step': step,
                    }
                    with open(os.path.join(output_dir, 'model_config.json'),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, 'run_states.json'),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, 'model_state.pdparams'))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, 'model_state.pdopt'))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, 'train.log'),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            if global_step >= num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    writer.close()
                return
# Example 7
def main(args):
    """Train the vocoder that maps mel spectrograms to magnitude spectrograms.

    Supports multi-process data-parallel training; only the rank-0 process
    writes VisualDL logs and saves checkpoints.
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # writer is None on non-zero ranks; all uses below are rank-0 guarded.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    # Noam schedule: warm up, then decay with the inverse square root of step.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        # Wrap the model for gradient averaging across ranks.
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    # NOTE(review): the config key is 'max_iteration' but it is used here as
    # the number of epochs — confirm against the config's intent.
    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            # L1 loss between predicted and target magnitude spectrograms.
            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                # Scale, backprop, then all-reduce gradients across ranks.
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
# Example 8
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None):
    """
    Launch training.

    Args:
        model(nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
    """
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    # Resume from a checkpoint if requested; `resume` returns the iteration
    # to continue from (stays 0 when training from scratch).
    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir is a directory: a plain file occupying that path is
    # removed first so os.makedirs cannot fail on it.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel training environment.
        # NOTE(review): prepare_context()/DataParallel(model, strategy) is the
        # legacy fluid-style API; newer Paddle releases wrap with
        # paddle.DataParallel(model) right after init_parallel_env() --
        # confirm against the Paddle version this project pins.
        paddle.distributed.init_parallel_env()
        strategy = paddle.distributed.prepare_context()
        ddp_model = paddle.DataParallel(model, strategy)

    # Distributed-aware sampler: each rank sees a disjoint shard of the data.
    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    # Import lazily so VisualDL is only required when actually used.
    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Running accumulators, reset every `log_iters` iterations.
    timer = Timer()
    avg_loss = 0.0
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    timer.start()

    # Iteration-driven (not epoch-driven) loop: cycle the loader until the
    # global iteration budget is spent.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            # Time spent waiting on the dataloader since the last restart().
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            # An optional third element carries edge labels for edge-aware losses.
            if len(data) == 3:
                edges = data[2].astype('int64')

            # Forward through the DDP wrapper when distributed; the wrapped
            # `model` shares parameters with `ddp_model`.
            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)
            loss = loss_computation(logits_list=logits_list,
                                    labels=labels,
                                    losses=losses,
                                    edges=edges)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Step the LR scheduler per-iteration when one is attached.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            # .numpy()[0] extracts the scalar loss value from the tensor.
            avg_loss += loss.numpy()[0]
            # Total time for the iteration (reader wait + compute).
            train_batch_cost += timer.elapsed_time()

            # Periodic logging, rank 0 only.
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_train_reader_cost = train_reader_cost / log_iters
                avg_train_batch_cost = train_batch_cost / log_iters
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_iters = iters - iter
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0

            # Periodic evaluation. NOTE(review): this branch has no
            # local_rank guard, so every rank runs evaluate(); only rank 0
            # consumes mean_iou/acc below -- confirm this duplication is
            # intended.
            if (iter % save_interval == 0 or iter == iters) and (val_dataset
                                                                 is not None):
                # Reassignment persists for subsequent evals (clamped to 0/1).
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc = evaluate(model,
                                         val_dataset,
                                         num_workers=num_workers)
                # evaluate() switches the model to eval mode; restore training.
                model.train()

            # Periodic checkpointing, rank 0 only.
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))

                # mean_iou/acc exist here because the eval branch above fires
                # on the same (save_interval, val_dataset) condition.
                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
            # Reset the timer so the next iteration's reader cost starts here.
            timer.restart()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
Ejemplo n.º 9
0
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          threshold=0.1,
          nms_kernel=7,
          top_k=200):
    """
    Launch training.

    Args:
        model(nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        threshold (float, optional): A Float, threshold applied to center heatmap score. Default: 0.1.
        nms_kernel (int, optional): An Integer, NMS max pooling kernel size. Default: 7.
        top_k (int, optional): An Integer, top k centers to keep. Default: 200.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    # Resume from a checkpoint if requested; `resume` returns the iteration
    # to continue from (stays 0 when training from scratch).
    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir is a directory: a plain file occupying that path is
    # removed first so os.makedirs cannot fail on it.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        # NOTE(review): both branches build the same DataParallel wrapper;
        # only the init_parallel_env() call is conditional.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    # Distributed-aware sampler: each rank sees a disjoint shard of the data.
    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    # Import lazily so VisualDL is only required when actually used.
    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Running accumulators, reset every `log_iters` iterations.
    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_pq = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    # Rolling window of checkpoint dirs for keep_checkpoint_max pruning.
    save_models = deque()
    batch_start = time.time()

    # Iteration-driven (not epoch-driven) loop: cycle the loader until the
    # global iteration budget is spent.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            # Batch layout produced by the panoptic dataset pipeline:
            # image, semantic (+weights), center (+weights), offset (+weights),
            # foreground mask.
            images = data[0]
            semantic = data[1]
            semantic_weights = data[2]
            center = data[3]
            center_weights = data[4]
            offset = data[5]
            offset_weights = data[6]
            # NOTE(review): `foreground` is unpacked but never used below --
            # confirm whether loss_computation should receive it.
            foreground = data[7]

            # Forward through the DDP wrapper when distributed; the wrapped
            # `model` shares parameters with `ddp_model`.
            if nranks > 1:
                logits_list = ddp_model(images)
            else:
                logits_list = model(images)

            loss_list = loss_computation(logits_list=logits_list,
                                         losses=losses,
                                         semantic=semantic,
                                         semantic_weights=semantic_weights,
                                         center=center,
                                         center_weights=center_weights,
                                         offset=offset,
                                         offset_weights=offset_weights)
            # Total loss is the sum of the per-head losses.
            loss = sum(loss_list)
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            # Step the LR scheduler per-iteration when one is attached.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            # .numpy()[0] extracts the scalar loss value from the tensor.
            avg_loss += loss.numpy()[0]
            # Accumulate per-head losses elementwise for the log window.
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            # Periodic logging, rank 0 only.
            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                # Per-head breakdown: semantic / center / offset losses.
                logger.info(
                    "[LOSS] loss={:.4f}, semantic_loss={:.4f}, center_loss={:.4f}, offset_loss={:.4f}"
                    .format(avg_loss, avg_loss_list[0], avg_loss_list[1],
                            avg_loss_list[2]))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)

                # Reset the logging window.
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # save model
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Prune the oldest checkpoint once the window is full
                # (keep_checkpoint_max <= 0 disables pruning).
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            # eval model -- only in the second half of training, rank 0 only.
            if (iter % save_interval == 0 or iter == iters) and (
                    val_dataset
                    is not None) and local_rank == 0 and iter > iters // 2:
                # Reassignment persists for subsequent evals (clamped to 0/1).
                num_workers = 1 if num_workers > 0 else 0
                panoptic_results, semantic_results, instance_results = evaluate(
                    model,
                    val_dataset,
                    threshold=threshold,
                    nms_kernel=nms_kernel,
                    top_k=top_k,
                    num_workers=num_workers,
                    print_detail=False)
                pq = panoptic_results['pan_seg']['All']['pq']
                miou = semantic_results['sem_seg']['mIoU']
                map = instance_results['ins_seg']['mAP']
                map50 = instance_results['ins_seg']['mAP50']
                logger.info(
                    "[EVAL] PQ: {:.4f}, mIoU: {:.4f}, mAP: {:.4f}, mAP50: {:.4f}"
                    .format(pq, miou, map, map50))
                # evaluate() switches the model to eval mode; restore training.
                model.train()

            # save best model and add evaluate results to vdl
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                # pq/miou/map/map50 exist here because the eval branch above
                # fires on the same (save_interval, iters // 2) condition.
                if val_dataset is not None and iter > iters // 2:
                    if pq > best_pq:
                        best_pq = pq
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation pq ({:.4f}) was saved at iter {}.'
                        .format(best_pq, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/PQ', pq, iter)
                        log_writer.add_scalar('Evaluate/mIoU', miou, iter)
                        log_writer.add_scalar('Evaluate/mAP', map, iter)
                        log_writer.add_scalar('Evaluate/mAP50', map50, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:

        # Custom FLOPs rule: SyncBatchNorm costs ~2 ops per element.
        def count_syncbn(m, x, y):
            x = x[0]
            nelements = x.numel()
            m.total_ops += int(2 * nelements)

        # Uses the shape of the last processed batch (`images` escapes the
        # training loop above).
        _, c, h, w = images.shape
        flops = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
Ejemplo n.º 10
0
class Engine(object):
    """PaddleClas runtime engine.

    Builds the model, dataloaders, losses, metrics, optimizer and
    distributed/AMP wrappers from a parsed config dict, then drives one of
    four modes: "train", "eval", "infer" or "export".
    """

    def __init__(self, config, mode="train"):
        """Construct all components required by `mode` from `config`."""
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        # A model with an explicit "Head" (or flagged is_rec) is treated as a
        # recognition/retrieval model.
        if "Head" in self.config["Arch"] or self.config["Arch"].get(
                "is_rec", False):
            self.is_rec = True
        else:
            self.is_rec = False

        # set seed
        # `seed or seed == 0` keeps seed=0 valid while skipping seed=False/None.
        seed = self.config["Global"].get("seed", False)
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        assert self.eval_mode in ["classification", "retrieval"], logger.error(
            "Invalid eval mode: {}".format(self.eval_mode))
        self.train_epoch_func = train_epoch
        # Resolves to evaluation.classification_eval or evaluation.retrieval_eval.
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl -- only rank 0 writes, and only while training.
        self.vdl_writer = None
        if self.config['Global'][
                'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in [
            "cpu", "gpu", "xpu", "npu", "mlu"
        ]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training -- enabled only when an "AMP" section exists and we train.
        self.amp = True if "AMP" in self.config and self.mode == "train" else False
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_max_inplace_grad_add': 8,
            }
            if paddle.is_compiled_with_cuda():
                AMP_RELATED_FLAGS_SETTING.update(
                    {'FLAGS_cudnn_batchnorm_spatial_persistent': 1})
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        # Migrate the deprecated Global.class_num into Arch.class_num.
        if "class_num" in config["Global"]:
            global_class_num = config["Global"]["class_num"]
            if "class_num" not in config["Arch"]:
                config["Arch"]["class_num"] = global_class_num
                msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
            else:
                msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
            logger.warning(msg)
        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})
        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                # A single Eval sub-config means gallery and query share one
                # dataloader; otherwise they are built separately.
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)
                else:
                    self.eval_loss_func = None
            else:
                self.eval_loss_func = None

        # build metric
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    # TopkAcc cannot be computed when batch transforms (e.g.
                    # mixup) rewrite the labels, so it is dropped.
                    if hasattr(
                            self.train_dataloader, "collate_fn"
                    ) and self.train_dataloader.collate_fn is not None:
                        for m_idx, m in enumerate(metric_config):
                            if "TopkAcc" in m:
                                msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                                logger.warning(msg)
                                break
                        # NOTE(review): this pop is unconditional after the
                        # loop -- when no 'TopkAcc' entry exists it removes the
                        # last metric instead; looks like it belongs inside the
                        # `if "TopkAcc" in m` branch. Confirm intent.
                        metric_config.pop(m_idx)
                    self.train_metric_func = build_metrics(metric_config)
                else:
                    self.train_metric_func = None
        else:
            self.train_metric_func = None

        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                # Retrieval defaults to Recall@1/@5 when no metric is given.
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)
        else:
            self.eval_metric_func = None

        # build model
        self.model = build_model(self.config)
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # load_pretrain -- URL-hosted weights are downloaded first.
        if self.config["Global"]["pretrained_model"] is not None:
            if self.config["Global"]["pretrained_model"].startswith("http"):
                load_dygraph_pretrain_from_url(
                    self.model, self.config["Global"]["pretrained_model"])
            else:
                load_dygraph_pretrain(
                    self.model, self.config["Global"]["pretrained_model"])

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for amp training
        if self.amp:
            self.scaler = paddle.amp.GradScaler(
                init_loss_scaling=self.scale_loss,
                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
            amp_level = self.config['AMP'].get("level", "O1")
            if amp_level not in ["O1", "O2"]:
                msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'."
                logger.warning(msg)
                self.config['AMP']["level"] = "O1"
                amp_level = "O1"
            self.model, self.optimizer = paddle.amp.decorate(
                models=self.model,
                optimizers=self.optimizer,
                level=amp_level,
                save_dtype='float32')

        # for distributed
        world_size = dist.get_world_size()
        self.config["Global"]["distributed"] = world_size != 1
        # Reference configs are tuned for 4 GPUs; warn on any other count.
        if world_size != 4 and self.mode == "train":
            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
            logger.warning(msg)
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])

    def train(self):
        """Run the full training loop: epochs, periodic eval, checkpointing."""
        assert self.mode == "train"
        print_batch_step = self.config['Global']['print_batch_step']
        save_interval = self.config["Global"]["save_interval"]
        # Best-so-far validation metric and the epoch it was reached.
        best_metric = {
            "metric": 0.0,
            "epoch": 0,
        }
        # key:
        # val: metrics list word
        self.output_info = dict()
        self.time_info = {
            "batch_cost": AverageMeter("batch_cost", '.5f', postfix=" s,"),
            "reader_cost": AverageMeter("reader_cost", ".5f", postfix=" s,"),
        }
        # global iter counter
        self.global_step = 0

        # Resuming from a checkpoint restores model/optimizer state and the
        # best metric seen so far.
        if self.config["Global"]["checkpoints"] is not None:
            metric_info = init_model(self.config["Global"], self.model,
                                     self.optimizer)
            if metric_info is not None:
                best_metric.update(metric_info)

        # On Windows the last (possibly short) batch is skipped.
        self.max_iter = len(self.train_dataloader) - 1 if platform.system(
        ) == "Windows" else len(self.train_dataloader)
        for epoch_id in range(best_metric["epoch"] + 1,
                              self.config["Global"]["epochs"] + 1):
            acc = 0.0
            # for one epoch train
            self.train_epoch_func(self, epoch_id, print_batch_step)

            # DALI iterators must be reset between epochs.
            if self.use_dali:
                self.train_dataloader.reset()
            metric_msg = ", ".join([
                "{}: {:.5f}".format(key, self.output_info[key].avg)
                for key in self.output_info
            ])
            logger.info("[Train][Epoch {}/{}][Avg]{}".format(
                epoch_id, self.config["Global"]["epochs"], metric_msg))
            self.output_info.clear()

            # eval model and save model if possible
            if self.config["Global"][
                    "eval_during_train"] and epoch_id % self.config["Global"][
                        "eval_interval"] == 0:
                acc = self.eval(epoch_id)
                if acc > best_metric["metric"]:
                    best_metric["metric"] = acc
                    best_metric["epoch"] = epoch_id
                    save_load.save_model(
                        self.model,
                        self.optimizer,
                        best_metric,
                        self.output_dir,
                        model_name=self.config["Arch"]["name"],
                        prefix="best_model")
                logger.info("[Eval][Epoch {}][best metric: {}]".format(
                    epoch_id, best_metric["metric"]))
                logger.scaler(name="eval_acc",
                              value=acc,
                              step=epoch_id,
                              writer=self.vdl_writer)

                # eval() leaves the model in eval mode; restore training mode.
                self.model.train()

            # save model
            # NOTE: `acc` is 0.0 for epochs without evaluation.
            if epoch_id % save_interval == 0:
                save_load.save_model(self.model,
                                     self.optimizer, {
                                         "metric": acc,
                                         "epoch": epoch_id
                                     },
                                     self.output_dir,
                                     model_name=self.config["Arch"]["name"],
                                     prefix="epoch_{}".format(epoch_id))
            # save the latest model
            save_load.save_model(self.model,
                                 self.optimizer, {
                                     "metric": acc,
                                     "epoch": epoch_id
                                 },
                                 self.output_dir,
                                 model_name=self.config["Arch"]["name"],
                                 prefix="latest")

        if self.vdl_writer is not None:
            self.vdl_writer.close()

    @paddle.no_grad()
    def eval(self, epoch_id=0):
        """Evaluate the model and return the configured eval metric."""
        assert self.mode in ["train", "eval"]
        self.model.eval()
        eval_result = self.eval_func(self, epoch_id)
        self.model.train()
        return eval_result

    @paddle.no_grad()
    def infer(self):
        """Run batched inference over Global Infer images and print results."""
        assert self.mode == "infer" and self.eval_mode == "classification"
        total_trainer = dist.get_world_size()
        local_rank = dist.get_rank()
        image_list = get_image_list(self.config["Infer"]["infer_imgs"])
        # data split -- round-robin shard of the image list per rank.
        image_list = image_list[local_rank::total_trainer]

        batch_size = self.config["Infer"]["batch_size"]
        self.model.eval()
        batch_data = []
        image_file_list = []
        for idx, image_file in enumerate(image_list):
            # Raw bytes are fed through the configured preprocess pipeline.
            with open(image_file, 'rb') as f:
                x = f.read()
            for process in self.preprocess_func:
                x = process(x)
            batch_data.append(x)
            image_file_list.append(image_file)
            # Flush a full batch, or the final partial batch.
            if len(batch_data) >= batch_size or idx == len(image_list) - 1:
                batch_tensor = paddle.to_tensor(batch_data)
                out = self.model(batch_tensor)
                # Normalize the various model output shapes to a logits tensor.
                if isinstance(out, list):
                    out = out[0]
                if isinstance(out, dict) and "logits" in out:
                    out = out["logits"]
                if isinstance(out, dict) and "output" in out:
                    out = out["output"]
                result = self.postprocess_func(out, image_file_list)
                print(result)
                batch_data.clear()
                image_file_list.clear()

    def export(self):
        """Export the model to a static-graph inference format."""
        assert self.mode == "export"
        use_multilabel = self.config["Global"].get("use_multilabel", False)
        model = ExportModel(self.config["Arch"], self.model, use_multilabel)
        if self.config["Global"]["pretrained_model"] is not None:
            load_dygraph_pretrain(model.base_model,
                                  self.config["Global"]["pretrained_model"])

        model.eval()
        save_path = os.path.join(self.config["Global"]["save_inference_dir"],
                                 "inference")
        # Quantized models are saved through the quanter; otherwise a regular
        # jit.to_static export is performed.
        if model.quanter:
            model.quanter.save_quantized_model(
                model.base_model,
                save_path,
                input_spec=[
                    paddle.static.InputSpec(
                        shape=[None] + self.config["Global"]["image_shape"],
                        dtype='float32')
                ])
        else:
            model = paddle.jit.to_static(
                model,
                input_spec=[
                    paddle.static.InputSpec(
                        shape=[None] + self.config["Global"]["image_shape"],
                        dtype='float32')
                ])
            paddle.jit.save(model, save_path)
Ejemplo n.º 11
0
def synthesis(text_input, args):
    """Synthesize a waveform for ``text_input`` with TransformerTTS.

    Runs autoregressive mel-spectrogram decoding, logs attention maps and
    the resulting audio to VisualDL, and writes a ``.wav`` file under
    ``args.output/samples``.

    Args:
        text_input (str): text to synthesize.
        args: parsed CLI arguments (config path, checkpoints, vocoder
            choice, max_len, stop_threshold, output dir, use_gpu flag).

    Raises:
        ValueError: if ``args.vocoder`` is neither 'griffin-lim' nor
            'waveflow'.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # VisualDL log directory lives under the output directory.
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters from the transformer checkpoint.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # Initial decoder input: the text sequence plus one all-zero mel frame.
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    # Autoregressive decoding: append the newest postnet frame each step
    # until the stop token fires or max_len is reached.
    # NOTE(review): assumes args.max_len >= 1, otherwise postnet_pred and
    # attn_probs below would be unbound — confirm with callers.
    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)

    # Log the final decoder attention maps (first 4 heads per layer).
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step, x, i * 4 + j)

    if args.vocoder == 'griffin-lim':
        # Synthesis using Griffin-Lim.
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # Synthesis using WaveFlow.
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        # BUG FIX: the original only printed here and then crashed with a
        # NameError on the unbound `wav`; fail fast with a clear error.
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, '
            'but received %s.' % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Ejemplo n.º 12
0
def main(args):
    """Train TransformerTTS on LJSpeech, optionally data-parallel.

    Builds the model and a Noam-decay Adam optimizer, restores the latest
    checkpoint, then loops: forward pass, L1 mel losses plus a weighted
    stop-token cross entropy, backward/step, with periodic scalar and
    attention-image logging and checkpointing on rank 0.

    Args:
        args: parsed CLI arguments (config path, data dir, output dir,
            checkpoint/iteration options, use_gpu flag).
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes VisualDL logs; other ranks keep writer=None.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    # Adam with a Noam warm-up/decay schedule and global-norm grad clipping.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters (resumes global_step from the checkpoint).
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    # Data loader sharded across nranks ranks.
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            # Epoch exhausted: restart the reader and keep training.
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        # L1 losses of decoder output and postnet output vs. target mel.
        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        # Weighted stop-token prediction loss.
        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            # DataParallel wraps the model, so alphas live on ._layers.
            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                   model._layers.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model._layers.decoder.alpha.numpy(),
                                   global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                   model.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model.decoder.alpha.numpy(),
                                   global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            # Periodically log attention maps as images.
            # NOTE(review): the `i * 4 + j` step index assumes 4 heads per
            # layer even though j ranges over the configured head count —
            # confirm whether that is intended.
            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        # In data-parallel mode: scale loss, backward, then all-reduce grads.
        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint (rank 0 only)
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
Ejemplo n.º 13
0
def train(args):
    """Static-graph training entry for a face-recognition model.

    Builds the train (and, on rank 0, optionally test) programs, sets up a
    piecewise-decay learning rate with optional linear warm-up, then runs
    the executor-driven training loop with logging, validation callbacks
    and per-epoch checkpointing.

    Args:
        args: parsed CLI arguments (dataset paths, backbone/classifier
            names, batch size, LR schedule, fp16 settings, resume flags).
    """

    writer = LogWriter(logdir=args.logdir)

    # Rank / world size come from the Paddle distributed launcher env vars.
    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    if world_size > 1:
        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = CommonDataset(root_dir=args.data_dir,
                                 label_file=args.label_file,
                                 fp16=args.fp16,
                                 is_bin=args.is_bin)

    num_image = len(trainset)
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    # Interpret warmup/train/decay units as either epochs or raw steps.
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = [x for x in args.decay_boundaries]
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    if rank == 0:
        logging.info('world_size: {}'.format(world_size))
        logging.info('total_batch_size: {}'.format(total_batch_size))
        logging.info('warmup_steps: {}'.format(warmup_steps))
        logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
        logging.info('total_steps: {}'.format(total_steps))
        logging.info('total_epoch: {}'.format(total_epoch))
        logging.info('decay_steps: {}'.format(decay_steps))

    # Linear LR scaling rule: base LR proportional to global batch size.
    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    train_program = paddle.static.Program()
    test_program = paddle.static.Program()
    startup_program = paddle.static.Program()

    # Resolve the margin-loss class by name from the losses module.
    margin_loss_params = eval("losses.{}".format(args.loss))()
    train_model = StaticModel(
        main_program=train_program,
        startup_program=startup_program,
        backbone_class_name=args.backbone,
        embedding_size=args.embedding_size,
        classifier_class_name=args.classifier,
        num_classes=args.num_classes,
        sample_ratio=args.sample_ratio,
        lr_scheduler=lr_scheduler,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
        dropout=args.dropout,
        mode='train',
        fp16=args.fp16,
        fp16_configs={
            'init_loss_scaling': args.init_loss_scaling,
            'incr_every_n_steps': args.incr_every_n_steps,
            'decr_every_n_nan_or_inf': args.decr_every_n_nan_or_inf,
            'incr_ratio': args.incr_ratio,
            'decr_ratio': args.decr_ratio,
            'use_dynamic_loss_scaling': args.use_dynamic_loss_scaling,
            'use_pure_fp16': args.fp16,
            'custom_white_list': args.custom_white_list,
            'custom_black_list': args.custom_black_list,
        },
        margin_loss_params=margin_loss_params,
    )

    # Dump the built program for debugging (rank 0 only).
    if rank == 0:
        with open(os.path.join(args.output, 'main_program.txt'), 'w') as f:
            f.write(str(train_program))

    if rank == 0 and args.do_validation_while_train:
        test_model = StaticModel(
            main_program=test_program,
            startup_program=startup_program,
            backbone_class_name=args.backbone,
            embedding_size=args.embedding_size,
            dropout=args.dropout,
            mode='test',
            fp16=args.fp16,
        )

        callback_verification = CallBackVerification(
            args.validation_interval_step, rank, args.batch_size, test_program,
            list(test_model.backbone.input_dict.values()),
            list(test_model.backbone.output_dict.values()), args.val_targets,
            args.data_dir)

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size, writer)
    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(program=train_program, for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # there last_epoch means last_step in for PiecewiseDecay
        # since we always use step style for lr_scheduler
        global_step = lr_state['last_epoch']
        train_model.lr_scheduler.set_state_dict(lr_state)

    train_loader = paddle.io.DataLoader(
        trainset,
        feed_list=list(train_model.backbone.input_dict.values()),
        places=place,
        return_list=False,
        num_workers=args.num_workers,
        batch_sampler=paddle.io.DistributedBatchSampler(
            dataset=trainset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True))

    # NOTE(review): max_loss_scaling appears unused below — confirm.
    max_loss_scaling = np.array([args.max_loss_scaling]).astype(np.float32)
    for epoch in range(start_epoch, total_epoch):
        for step, data in enumerate(train_loader):
            global_step += 1

            loss_v = exe.run(
                train_program,
                feed=data,
                fetch_list=[train_model.classifier.output_dict['loss']],
                use_program_cache=True)

            loss_avg.update(np.array(loss_v)[0], 1)
            lr_value = train_model.optimizer.get_lr()
            callback_logging(global_step, loss_avg, epoch, lr_value)
            if rank == 0 and args.do_validation_while_train:
                callback_verification(global_step)
            train_model.lr_scheduler.step()

            # NOTE(review): this break exits only the inner loop; remaining
            # epochs still run one step each before breaking — confirm
            # whether that is intended.
            if global_step >= total_steps:
                break
            sys.stdout.flush()

        # Checkpoint once per epoch (includes the LR scheduler state).
        checkpoint.save(train_program,
                        lr_scheduler=train_model.lr_scheduler,
                        epoch=epoch,
                        for_train=True)
    writer.close()
Ejemplo n.º 14
0
def main(args):
    """Run tiled inference with one or more segmentation models.

    Predicts on every tile of the inference dataset, accumulates a
    scale-weighted ensemble over the configured models, reassembles full
    images, and saves label tif / gray / color-blended outputs while
    logging the images to VisualDL.

    Args:
        args: parsed CLI arguments; ``args.config`` is the config file
            path and ``args.save_dir`` the output root directory.
    """
    config = Config(args.config)
    cfg = config(vars(args), mode=['infer', 'init'])
    scale = cfg['infer']['scale']
    mdname = cfg['infer']['model']
    imgname = ''.join(mdname)
    sz = cfg['infer']['sz']
    infer_size = cfg['infer']['infer_size']
    save_path = create_path(args.save_dir, cfg['init']['result'])
    save_path = create_path(save_path, imgname)
    save_path = create_path(save_path, str(scale))

    tif_path = create_path(save_path, cfg['infer']['lab'])
    color_path = create_path(save_path, cfg['infer']['color'])
    gray_path = create_path(save_path, cfg['infer']['gray'])
    vdl_dir = os.path.join(args.save_dir, cfg['init']['vdl_dir'])
    palette = cfg['infer']['palette']
    palette = np.array(palette, dtype=np.uint8)
    num_class = cfg['init']['num_classes']
    batchsz = cfg['infer']['batchsz']
    infer_path = os.path.join(cfg['infer']['root_path'], cfg['infer']['path'])
    tagname = imgname + '/' + str(scale)
    vdl_dir = os.path.join(vdl_dir, 'infer')
    writer = LogWriter(logdir=vdl_dir)

    infer_ds = TeDataset(path=cfg['infer']['root_path'],
                         fl=cfg['infer']['path'],
                         sz=sz)
    total = len(infer_ds)
    # Scale-weighted ensemble accumulator over all configured models.
    addresult = np.zeros((total, num_class, sz, sz))
    for mnet in mdname:
        net = modelset(mode=mnet, num_classes=cfg['init']['num_classes'])
        # Load the trained weights for this model.
        # (locals renamed from `input`/`list` to avoid shadowing builtins)
        in_spec = InputSpec([None, 3, 64, 64], 'float32', 'x')
        label_spec = InputSpec([None, 1, 64, 64], 'int64', 'label')
        model = paddle.Model(net, in_spec, label_spec)
        model.load(path=os.path.join(args.save_dir, mnet) + '/' + mnet)
        model.prepare()
        result = model.predict(
            infer_ds,
            batch_size=batchsz,
            num_workers=cfg['infer']['num_workers'],
            stack_outputs=True  # [160,2,64,64]
        )

        addresult = result[0] + scale * addresult

    # Reassemble tile predictions into full-size images.
    pred = construct(addresult, infer_size, sz=sz)
    # NOTE(review): sorts by the single character before the extension —
    # assumes filenames end in a one-digit index; confirm for >9 files.
    file_list = os.listdir(infer_path)
    file_list.sort(key=lambda x: int(x[-5:-4]))
    step = 0
    for i, fl in enumerate(file_list):
        name, _ = fl.split(".")
        # Save the raw label map as tif.
        lab_img = Image.fromarray(pred[i].astype(np.uint8)).convert("L")
        saveimg(lab_img, tif_path, name=name, type='.tif')

        # Palette-colorized gray label, also logged to VisualDL.
        label = colorize(pred[i], palette)
        writer.add_image(tag=tagname,
                         img=saveimg(label,
                                     gray_path,
                                     name=name,
                                     type='.png',
                                     re_out=True),
                         step=step,
                         dataformats='HW')
        step += 1

        # Color label blended over the source image.
        file = os.path.join(infer_path, fl)
        out = blend_image(file, label, alpha=0.25)
        writer.add_image(tag=tagname,
                         img=saveimg(out,
                                     color_path,
                                     name=name,
                                     type='.png',
                                     re_out=True),
                         step=step,
                         dataformats='HWC')
        step += 1
    writer.close()
Ejemplo n.º 15
0
def train(model,
          train_dataset,
          places=None,
          eval_dataset=None,
          optimizer=None,
          save_dir='output',
          num_epochs=100,
          batch_size=2,
          pretrained_model=None,
          resume_model=None,
          save_interval_epochs=1,
          log_steps=10,
          num_classes=None,
          num_workers=8,
          use_vdl=False):
    """Train a dygraph segmentation model, optionally data-parallel.

    Resumes or loads pretrained weights, runs the epoch/step training
    loop with timing and loss logging, periodically saves checkpoints,
    and (when ``eval_dataset`` is given) evaluates after each save,
    tracking the best-mIoU model under ``save_dir/best_model``.

    Args:
        model: segmentation network returning a scalar loss from
            ``model(images, labels)``; must expose ``ignore_index``.
        train_dataset: training dataset.
        places: device places for the data loader.
        eval_dataset: optional dataset for periodic evaluation.
        optimizer: optimizer whose ``minimize`` is called each step.
        save_dir (str): output directory for checkpoints and VDL logs.
        num_epochs (int): total epochs to train.
        batch_size (int): per-device batch size.
        pretrained_model: path to pretrained weights (used when not
            resuming).
        resume_model: checkpoint path to resume from.
        save_interval_epochs (int): checkpoint/eval frequency in epochs.
        log_steps (int): log every N steps on local rank 0.
        num_classes: number of classes, forwarded to ``evaluate``.
        num_workers (int): data-loader workers.
        use_vdl (bool): whether to write VisualDL scalars.
    """
    ignore_index = model.ignore_index
    nranks = ParallelEnv().nranks

    start_epoch = 0
    if resume_model is not None:
        start_epoch = resume(model, optimizer, resume_model)
    elif pretrained_model is not None:
        load_pretrained_model(model, pretrained_model)

    # If save_dir exists as a plain file, remove it before creating the dir.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        strategy = fluid.dygraph.prepare_context()
        ddp_model = fluid.dygraph.DataParallel(model, strategy)

    batch_sampler = DistributedBatchSampler(train_dataset,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            drop_last=True)
    loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        places=places,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    avg_loss = 0.0
    steps_per_epoch = len(batch_sampler)
    total_steps = steps_per_epoch * (num_epochs - start_epoch)
    num_steps = 0
    best_mean_iou = -1.0
    best_model_epoch = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    for epoch in range(start_epoch, num_epochs):
        timer.start()
        for step, data in enumerate(loader):
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                loss = ddp_model(images, labels)
                # apply_collective_grads sum grads over multiple gpus.
                loss = ddp_model.scale_loss(loss)
                loss.backward()
                ddp_model.apply_collective_grads()
            else:
                loss = model(images, labels)
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
            num_steps += 1
            train_batch_cost += timer.elapsed_time()
            # Log averaged loss/costs every log_steps on local rank 0.
            # NOTE(review): avg_loss * nranks presumably undoes the
            # 1/nranks factor applied by scale_loss — confirm.
            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
                avg_loss /= log_steps
                avg_train_reader_cost = train_reader_cost / log_steps
                avg_train_batch_cost = train_batch_cost / log_steps
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_steps = total_steps - num_steps
                eta = calculate_eta(remain_steps, avg_train_batch_cost)
                logging.info(
                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
                            avg_loss * nranks, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
                                          num_steps)
                    log_writer.add_scalar('Train/lr', lr, num_steps)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, num_steps)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, num_steps)
                avg_loss = 0.0
            timer.restart()

        # Checkpoint every save_interval_epochs (and at the final epoch),
        # on local rank 0 only; model and optimizer states share a prefix.
        if ((epoch + 1) % save_interval_epochs == 0
                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
            current_save_dir = os.path.join(save_dir,
                                            "epoch_{}".format(epoch + 1))
            if not os.path.isdir(current_save_dir):
                os.makedirs(current_save_dir)
            fluid.save_dygraph(model.state_dict(),
                               os.path.join(current_save_dir, 'model'))
            fluid.save_dygraph(optimizer.state_dict(),
                               os.path.join(current_save_dir, 'model'))

            # Evaluate after saving and keep the best-mIoU model.
            if eval_dataset is not None:
                mean_iou, avg_acc = evaluate(model,
                                             eval_dataset,
                                             model_dir=current_save_dir,
                                             num_classes=num_classes,
                                             ignore_index=ignore_index,
                                             epoch_id=epoch + 1)
                if mean_iou > best_mean_iou:
                    best_mean_iou = mean_iou
                    best_model_epoch = epoch + 1
                    best_model_dir = os.path.join(save_dir, "best_model")
                    fluid.save_dygraph(model.state_dict(),
                                       os.path.join(best_model_dir, 'model'))
                logging.info(
                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
                    .format(best_model_epoch, best_mean_iou))

                if use_vdl:
                    log_writer.add_scalar('Evaluate/mIoU', mean_iou, epoch + 1)
                    log_writer.add_scalar('Evaluate/aAcc', avg_acc, epoch + 1)
                # evaluate() may switch the model to eval mode; restore.
                model.train()
    if use_vdl:
        log_writer.close()
Ejemplo n.º 16
0
def main(args):
    """Train an image-classification model, optionally data-parallel.

    Builds the network and optimizer from the config, optionally restores
    a checkpoint, then runs the epoch loop: train, periodically validate
    (tracking the best top-1 model), and save checkpoints. The VisualDL
    writer is always closed on exit.

    Args:
        args: parsed CLI arguments; ``args.config`` is the config path and
            ``args.override`` a list of config overrides.
    """
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)
    # Assign the device.
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    # Wrap in DataParallel only when training on multiple trainers.
    dp_net = net
    if config["use_data_parallel"]:
        find_unused_parameters = config.get("find_unused_parameters", False)
        dp_net = paddle.DataParallel(
            net, find_unused_parameters=find_unused_parameters)

    # Load model from checkpoint or pretrained model.
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()

    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id

    vdl_writer_path = config.get("vdl_dir", None)
    vdl_writer = None
    if vdl_writer_path:
        from visualdl import LogWriter
        vdl_writer = LogWriter(vdl_writer_path)
    # Ensure that the vdl log file can be closed normally.
    try:
        for epoch_id in range(last_epoch_id + 1, config.epochs):
            net.train()
            # 1. train with train dataset
            program.run(train_dataloader, config, dp_net, optimizer,
                        lr_scheduler, epoch_id, 'train', vdl_writer)

            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
                with paddle.no_grad():
                    top1_acc = program.run(valid_dataloader, config, net, None,
                                           None, epoch_id, 'valid', vdl_writer)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    best_top1_epoch = epoch_id
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(net, optimizer, model_path, "best_model")
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, best_top1_epoch)
                logger.info(message)

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(net, optimizer, model_path, epoch_id)
    except Exception as e:
        # BUG FIX: the original swallowed every exception here after
        # logging, silently ending training; log and re-raise so failures
        # surface. The finally block still closes the writer either way.
        logger.error(e)
        raise
    finally:
        if vdl_writer:
            vdl_writer.close()
Ejemplo n.º 17
0
def main(args):
    """Single-process face-recognition training entry point.

    Builds the backbone named by ``args.network`` and a PartialFC head with
    the margin-softmax loss named by ``args.loss``, then trains for
    ``cfg.num_epoch`` epochs with SGD + LambdaDecay schedules, logging to
    VisualDL and running periodic verification / checkpoint callbacks.

    Args:
        args: Parsed CLI namespace; must provide ``network`` and ``loss``.
            All remaining settings come from the module-level ``cfg`` object.
    """
    # This entry point is hard-wired to single-process training.
    world_size = 1
    rank = 0

    # Create the output directory; if it already exists, pause briefly so
    # concurrently launched jobs do not race on the same path.
    # (The original repeated this check twice back-to-back; the duplicate
    # was redundant and has been removed.)
    if not os.path.exists(cfg.output):
        os.makedirs(cfg.output)
    else:
        time.sleep(2)

    writer = LogWriter(logdir=cfg.logdir)

    trainset = MXFaceDataset(root_dir=cfg.rec)
    train_loader = DataLoader(dataset=trainset,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              drop_last=True,
                              num_workers=0)

    # WebFace is comparatively small, so dropout is enabled only there.
    dropout = 0.4 if cfg.dataset == "webface" else 0
    # Bug fix: `dropout` was computed above but a hard-coded 0.5 was passed.
    backbone = eval("backbones.{}".format(args.network))(False,
                                                         dropout=dropout,
                                                         fp16=False)
    backbone.train()

    clip_by_norm = ClipGradByNorm(5.0)
    margin_softmax = eval("losses.{}".format(args.loss))()

    module_partial_fc = PartialFC(rank=0,
                                  local_rank=0,
                                  world_size=1,
                                  resume=0,
                                  batch_size=cfg.batch_size,
                                  margin_softmax=margin_softmax,
                                  num_classes=cfg.num_classes,
                                  sample_rate=cfg.sample_rate,
                                  embedding_size=cfg.embedding_size,
                                  prefix=cfg.output)

    # Linear LR scaling rule: the base lr is defined for a batch of 512 and
    # scaled to the actual batch size.
    scheduler_backbone = paddle.optimizer.lr.LambdaDecay(learning_rate=cfg.lr /
                                                         512 * cfg.batch_size,
                                                         lr_lambda=cfg.lr_func,
                                                         verbose=True)
    opt_backbone = paddle.optimizer.SGD(parameters=backbone.parameters(),
                                        learning_rate=scheduler_backbone,
                                        weight_decay=cfg.weight_decay,
                                        grad_clip=clip_by_norm)
    scheduler_pfc = paddle.optimizer.lr.LambdaDecay(learning_rate=cfg.lr /
                                                    512 * cfg.batch_size,
                                                    lr_lambda=cfg.lr_func,
                                                    verbose=True)
    opt_pfc = paddle.optimizer.SGD(parameters=module_partial_fc.parameters(),
                                   learning_rate=scheduler_pfc,
                                   weight_decay=cfg.weight_decay,
                                   grad_clip=clip_by_norm)

    start_epoch = 0
    total_step = int(
        len(trainset) / cfg.batch_size / world_size * cfg.num_epoch)
    if rank == 0:
        print("Total Step is: %d" % total_step)

    callback_verification = CallBackVerification(2000, rank, cfg.val_targets,
                                                 cfg.rec)
    callback_logging = CallBackLogging(100, rank, total_step, cfg.batch_size,
                                       world_size, writer)
    callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output)

    loss = AverageMeter()
    global_step = 0
    grad_scaler = MaxClipGradScaler(
        cfg.batch_size, 128 *
        cfg.batch_size, growth_interval=100) if cfg.fp16 else None
    for epoch in range(start_epoch, cfg.num_epoch):
        for step, (img, label) in enumerate(train_loader):
            label = label.flatten()
            global_step += 1
            features = F.normalize(backbone(img))
            # PartialFC computes the classification loss and hands back the
            # gradient w.r.t. the features, so the backbone is updated
            # separately from the (possibly sampled) FC weights.
            x_grad, loss_v = module_partial_fc.forward_backward(
                label, features, opt_pfc)
            if cfg.fp16:
                scaled = grad_scaler.scale(x_grad)
                (features.multiply(scaled)).backward()
                grad_scaler._unscale(opt_backbone)
                grad_scaler.minimize(opt_backbone, scaled)
            else:
                (features.multiply(x_grad)).backward()
                opt_backbone.step()
            opt_pfc.step()
            module_partial_fc.update()
            opt_backbone.clear_gradients()
            opt_pfc.clear_gradients()
            loss.update(loss_v, 1)
            callback_logging(global_step, loss, epoch, cfg.fp16, grad_scaler)
            callback_verification(global_step, backbone)
        callback_checkpoint(global_step, backbone, module_partial_fc)
        scheduler_backbone.step()
        scheduler_pfc.step()
    writer.close()
Ejemplo n.º 18
0
    def train(self, loaders):
        """Run the adversarial image-to-image training loop.

        Each iteration performs two discriminator updates (one latent-guided,
        one reference-guided); after a 100-iteration warm-up, generator /
        mapping-network / style-encoder updates follow, EMA copies of the
        generator-side networks are maintained, and the diversity-sensitive
        loss weight is linearly decayed. Logging, debug sampling,
        checkpointing and metric evaluation run on their configured
        intervals.

        Args:
            loaders: Object exposing ``src``, ``ref`` and ``val`` data
                loaders used to build the train/val input fetchers.
        """
        args = self.args
        nets = self.nets
        nets_ema = self.nets_ema
        optims = self.optims
        writer = LogWriter(logdir=self.args.checkpoint_dir + "/log/")

        # fetch random validation images for debugging
        fetcher = InputFetcher(loaders.src, loaders.ref, args.latent_dim,
                               'train')
        fetcher_val = InputFetcher(loaders.val, None, args.latent_dim, 'val')
        inputs_val = next(fetcher_val)

        # resume training if necessary
        if args.resume_iter > 0:
            self._load_checkpoint(args.resume_iter)

        # remember the initial value of ds weight
        initial_lambda_ds = args.lambda_ds

        print('Start training...')
        import tqdm
        start_time = time.time()
        tqdm_descriptor = tqdm.trange(args.resume_iter, args.total_iters)
        for i in tqdm_descriptor:
            # fetch images and labels
            inputs = next(fetcher)
            x_real, y_org = inputs.x_src, inputs.y_src
            x_ref, x_ref2, y_trg = inputs.x_ref, inputs.x_ref2, inputs.y_ref
            z_trg, z_trg2 = inputs.z_trg, inputs.z_trg2

            # Heatmap masks are only computed when the high-pass filter
            # weight is active.
            masks = nets.fan.get_heatmap(x_real) if args.w_hpf > 0 else None

            # train the discriminator
            # NOTE(review): both backward() and minimize() are invoked for
            # each loss; confirm minimize() does not re-apply gradients on
            # this paddle version.
            d_loss, d_losses_latent = compute_d_loss(nets,
                                                     args,
                                                     x_real,
                                                     y_org,
                                                     y_trg,
                                                     z_trg=z_trg,
                                                     masks=masks)
            self._reset_grad()
            d_loss.backward()
            optims.discriminator.minimize(d_loss)

            # Second discriminator pass: reference-guided instead of
            # latent-guided.
            d_loss, d_losses_ref = compute_d_loss(nets,
                                                  args,
                                                  x_real,
                                                  y_org,
                                                  y_trg,
                                                  x_ref=x_ref,
                                                  masks=masks)
            self._reset_grad()
            d_loss.backward()
            optims.discriminator.minimize(d_loss)

            # train the generator
            # Only after a 100-iteration discriminator warm-up.
            if i - args.resume_iter > 100:  ##train discriminator first
                g_loss, g_losses_latent, sample_1 = compute_g_loss(
                    nets,
                    args,
                    x_real,
                    y_org,
                    y_trg,
                    z_trgs=[z_trg, z_trg2],
                    masks=masks)
                self._reset_grad()
                g_loss.backward()
                optims.generator.minimize(g_loss)
                optims.mapping_network.minimize(g_loss)
                optims.style_encoder.minimize(g_loss)

                # Second generator pass: reference-guided; only the generator
                # itself is stepped here.
                g_loss, g_losses_ref, sample_2 = compute_g_loss(
                    nets,
                    args,
                    x_real,
                    y_org,
                    y_trg,
                    x_refs=[x_ref, x_ref2],
                    masks=masks)
                self._reset_grad()
                g_loss.backward()
                optims.generator.minimize(g_loss)

                # compute moving average of network parameters
                moving_average(nets.generator, nets_ema.generator, beta=0.999)
                moving_average(nets.mapping_network,
                               nets_ema.mapping_network,
                               beta=0.999)
                moving_average(nets.style_encoder,
                               nets_ema.style_encoder,
                               beta=0.999)

                # decay weight for diversity sensitive loss
                if args.lambda_ds > 0:
                    args.lambda_ds -= (initial_lambda_ds / args.ds_iter)

                # print out log info
                if (i + 1) % args.print_every == 0:
                    elapsed = time.time() - start_time
                    elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                    log = "Elapsed time [%s], Iteration [%i/%i], " % (
                        elapsed, i + 1, args.total_iters)
                    all_losses = dict()
                    for loss, prefix in zip([
                            d_losses_latent, d_losses_ref, g_losses_latent,
                            g_losses_ref
                    ], ['D/latent_', 'D/ref_', 'G/latent_', 'G/ref_']):
                        for key, value in loss.items():
                            all_losses[prefix + key] = value
                            writer.add_scalar(tag=prefix + key,
                                              step=i + 1,
                                              value=value)
                    all_losses['G/lambda_ds'] = args.lambda_ds
                    log += ' '.join([
                        '%s: [%.4f]' % (key, value)
                        for key, value in all_losses.items()
                    ])
                    tqdm_descriptor.set_description(log)
                    # Log the last latent-guided fake image as HWC uint8.
                    writer.add_image("x_fake",
                                     (utils.denormalize(sample_1) *
                                      255).numpy().transpose([1, 2, 0]).astype(
                                          np.uint8), i + 1)

                # generate images for debugging
                if (i + 1) % args.sample_every == 0:
                    os.makedirs(args.sample_dir, exist_ok=True)
                    utils.debug_image(nets_ema,
                                      args,
                                      inputs=inputs_val,
                                      step=i + 1)

                # save model checkpoints
                if (i + 1) % args.save_every == 0:
                    self._save_checkpoint(step=i + 1)

                # compute FID and LPIPS if necessary
                if (i + 1) % args.eval_every == 0:
                    calculate_metrics(nets_ema, args, i + 1, mode='latent')
                    calculate_metrics(nets_ema, args, i + 1, mode='reference')
            else:
                # During discriminator warm-up, only discriminator losses
                # exist to log.
                if (i + 1) % args.print_every == 0:
                    elapsed = time.time() - start_time
                    elapsed = str(datetime.timedelta(seconds=elapsed))[:-7]
                    log = "Elapsed time [%s], Iteration [%i/%i], " % (
                        elapsed, i + 1, args.total_iters)
                    all_losses = dict()
                    for loss, prefix in zip([d_losses_latent, d_losses_ref],
                                            ['D/latent_', 'D/ref_']):
                        for key, value in loss.items():
                            all_losses[prefix + key] = value
                            writer.add_scalar(tag=prefix + key,
                                              step=i + 1,
                                              value=value)
                    log += ' '.join([
                        '%s: [%.4f]' % (key, value)
                        for key, value in all_losses.items()
                    ])
                    tqdm_descriptor.set_description(log)

        writer.close()
Ejemplo n.º 19
0
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          test_config=None,
          fp16=False,
          profiler_options=None):
    """
    Launch training.

    Args:
        model(nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict, optional): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']).
            The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        test_config(dict, optional): Evaluation config.
        fp16 (bool, optional): Whether to use amp.
        profiler_options (str, optional): The option of train profiler.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # If save_dir exists but is a regular file, replace it with a directory.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        paddle.distributed.fleet.init(is_collective=True)
        optimizer = paddle.distributed.fleet.distributed_optimizer(
            optimizer)  # The return is Fleet object
        ddp_model = paddle.distributed.fleet.distributed_model(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
        worker_init_fn=worker_init_fn,
    )

    # use amp
    if fp16:
        logger.info('use amp to train')
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    avg_loss = 0.0
    avg_loss_list = []
    iters_per_epoch = len(batch_sampler)
    best_acc = -1.0
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()  # sliding window of checkpoint dirs to keep
    batch_start = time.time()

    # NOTE(review): `iter` shadows the builtin; kept as-is for compatibility.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                # NOTE(review): presumably works around a paddle 2.1.2
                # dataloader issue where breaking out mid-epoch misbehaves,
                # so the remaining batches are drained instead — confirm.
                version = paddle.__version__
                if version == '2.1.2':
                    continue
                else:
                    break
            reader_cost_averager.record(time.time() - batch_start)
            images = data[0]
            labels = data[1].astype('int64')
            edges = None
            if len(data) == 3:
                edges = data[2].astype('int64')
            # Models declaring NHWC data format receive channel-last input.
            if hasattr(model, 'data_format') and model.data_format == 'NHWC':
                images = images.transpose((0, 2, 3, 1))

            if fp16:
                with paddle.amp.auto_cast(
                        enable=True,
                        custom_white_list={
                            "elementwise_add", "batch_norm", "sync_batch_norm"
                        },
                        custom_black_list={'bilinear_interp_v2'}):
                    if nranks > 1:
                        logits_list = ddp_model(images)
                    else:
                        logits_list = model(images)
                    loss_list = loss_computation(logits_list=logits_list,
                                                 labels=labels,
                                                 losses=losses,
                                                 edges=edges)
                    loss = sum(loss_list)

                scaled = scaler.scale(loss)  # scale the loss
                scaled.backward()  # do backward
                if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                    scaler.minimize(optimizer.user_defined_optimizer, scaled)
                else:
                    scaler.minimize(optimizer, scaled)  # update parameters
            else:
                if nranks > 1:
                    logits_list = ddp_model(images)
                else:
                    logits_list = model(images)
                loss_list = loss_computation(logits_list=logits_list,
                                             labels=labels,
                                             losses=losses,
                                             edges=edges)
                loss = sum(loss_list)
                loss.backward()
                optimizer.step()

            lr = optimizer.get_lr()

            # update lr
            if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                lr_sche = optimizer.user_defined_optimizer._learning_rate
            else:
                lr_sche = optimizer._learning_rate
            if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
                lr_sche.step()

            train_profiler.add_profiler_step(profiler_options)

            model.clear_gradients()
            # NOTE(review): assumes loss.numpy() yields a 1-element array
            # (pre-paddle-2.2 scalar behavior) — confirm on newer paddle.
            avg_loss += loss.numpy()[0]
            if not avg_loss_list:
                avg_loss_list = [l.numpy() for l in loss_list]
            else:
                for i in range(len(loss_list)):
                    avg_loss_list[i] += loss_list[i].numpy()
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                avg_loss_list = [l[0] / log_iters for l in avg_loss_list]
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss, iter)
                    # Record all losses if there are more than 2 losses.
                    if len(avg_loss_list) > 1:
                        avg_loss_dict = {}
                        for i, value in enumerate(avg_loss_list):
                            avg_loss_dict['loss_' + str(i)] = value
                        for key, value in avg_loss_dict.items():
                            log_tag = 'Train/' + key
                            log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                # Reset accumulators for the next logging window.
                avg_loss = 0.0
                avg_loss_list = []
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0 or iter == iters) and (val_dataset
                                                                 is not None):
                # Use at most one worker for the evaluation dataloader.
                num_workers = 1 if num_workers > 0 else 0

                if test_config is None:
                    test_config = {}

                acc, fp, fn = evaluate(model,
                                       val_dataset,
                                       num_workers=num_workers,
                                       save_dir=save_dir,
                                       **test_config)

                # evaluate() switches the model to eval mode; restore train.
                model.train()

            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Evict the oldest checkpoint beyond the retention window.
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

                if val_dataset is not None:
                    if acc > best_acc:
                        best_acc = acc
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation Acc ({:.4f}) was saved at iter {}.'
                        .format(best_acc, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/Acc', acc, iter)
                        log_writer.add_scalar('Evaluate/Fp', fp, iter)
                        log_writer.add_scalar('Evaluate/Fn', fn, iter)
            batch_start = time.time()

    # Calculate flops.
    if local_rank == 0:
        _, c, h, w = images.shape
        _ = paddle.flops(
            model, [1, c, h, w],
            custom_ops={paddle.nn.SyncBatchNorm: op_flops_funs.count_syncbn})

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
Ejemplo n.º 20
0
def main(args):
    """Single-process face-recognition training entry point with LR warmup.

    Builds the backbone named by ``args.network`` and a PartialFC head with
    the margin-softmax loss named by ``args.loss``, then trains with Momentum
    optimizers driven by LinearWarmup + LambdaDecay schedules, logging to
    VisualDL and running periodic verification / checkpoint callbacks.

    Args:
        args: Parsed CLI namespace providing ``network``, ``loss``,
            ``output``, ``logdir``, ``batch_size``, ``lr``, ``weight_decay``,
            ``embedding_size`` and ``is_bin``. Remaining settings come from
            the module-level ``cfg`` object.
    """
    # This entry point is hard-wired to single-process training.
    world_size = 1
    rank = 0

    # Create the output directory; if it already exists, pause briefly so
    # concurrently launched jobs do not race on the same path.
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    else:
        time.sleep(2)

    writer = LogWriter(logdir=args.logdir)
    trainset = CommonDataset(root_dir=cfg.data_dir, label_file=cfg.file_list, is_bin=args.is_bin)
    train_loader = DataLoader(
        dataset=trainset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=0)

    backbone = eval("backbones.{}".format(args.network))()
    backbone.train()

    clip_by_norm = ClipGradByNorm(5.0)
    margin_softmax = eval("losses.{}".format(args.loss))()

    module_partial_fc = PartialFC(
        rank=0,
        world_size=1,
        resume=0,
        batch_size=args.batch_size,
        margin_softmax=margin_softmax,
        num_classes=cfg.num_classes,
        sample_rate=cfg.sample_rate,
        embedding_size=args.embedding_size,
        prefix=args.output)

    # Linear LR scaling rule: the post-warmup lr is defined for a global
    # batch of 512 and scaled to the actual batch size.
    scheduler_backbone_decay = paddle.optimizer.lr.LambdaDecay(
        learning_rate=args.lr, lr_lambda=cfg.lr_func, verbose=True)
    scheduler_backbone = paddle.optimizer.lr.LinearWarmup(
        learning_rate=scheduler_backbone_decay,
        warmup_steps=cfg.warmup_epoch,
        start_lr=0,
        end_lr=args.lr / 512 * args.batch_size,
        verbose=True)
    opt_backbone = paddle.optimizer.Momentum(
        parameters=backbone.parameters(),
        learning_rate=scheduler_backbone,
        momentum=0.9,
        weight_decay=args.weight_decay,
        grad_clip=clip_by_norm)

    scheduler_pfc_decay = paddle.optimizer.lr.LambdaDecay(
        learning_rate=args.lr, lr_lambda=cfg.lr_func, verbose=True)
    scheduler_pfc = paddle.optimizer.lr.LinearWarmup(
        learning_rate=scheduler_pfc_decay,
        warmup_steps=cfg.warmup_epoch,
        start_lr=0,
        end_lr=args.lr / 512 * args.batch_size,
        verbose=True)
    opt_pfc = paddle.optimizer.Momentum(
        parameters=module_partial_fc.parameters(),
        learning_rate=scheduler_pfc,
        momentum=0.9,
        weight_decay=args.weight_decay,
        grad_clip=clip_by_norm)

    start_epoch = 0
    total_step = int(
        len(trainset) / args.batch_size / world_size * cfg.num_epoch)
    if rank == 0:
        print("Total Step is: %d" % total_step)

    callback_verification = CallBackVerification(2000, rank, cfg.val_targets,
                                                 cfg.data_dir)
    callback_logging = CallBackLogging(10, rank, total_step, args.batch_size,
                                       world_size, writer)
    callback_checkpoint = CallBackModelCheckpoint(rank, args.output,
                                                  args.network)

    loss = AverageMeter()
    global_step = 0
    for epoch in range(start_epoch, cfg.num_epoch):
        for step, (img, label) in enumerate(train_loader):
            label = label.flatten()
            global_step += 1
            features = F.normalize(backbone(img))
            # PartialFC computes the classification loss and hands back the
            # gradient w.r.t. the features, so the backbone is updated
            # separately from the (possibly sampled) FC weights.
            x_grad, loss_v = module_partial_fc.forward_backward(
                label, features, opt_pfc)
            (features.multiply(x_grad)).backward()
            opt_backbone.step()
            opt_pfc.step()
            module_partial_fc.update()
            opt_backbone.clear_gradients()
            opt_pfc.clear_gradients()

            lr_backbone_value = opt_backbone._global_learning_rate().numpy()[0]
            # Bug fix: this previously read the *backbone* optimizer's lr
            # for both values; the PFC lr must come from opt_pfc.
            lr_pfc_value = opt_pfc._global_learning_rate().numpy()[0]

            loss.update(loss_v, 1)
            callback_logging(global_step, loss, epoch, lr_backbone_value,
                             lr_pfc_value)
            # Single flush per step so progress is visible under buffered
            # stdout (the original scattered redundant flushes throughout).
            sys.stdout.flush()
            callback_verification(global_step, backbone)
        callback_checkpoint(global_step, backbone, module_partial_fc)
        scheduler_backbone.step()
        scheduler_pfc.step()
    writer.close()
Ejemplo n.º 21
0
def train(cfg):
    """Train a segmentation model using the static-graph ``fluid`` API.

    Builds the training dataset and sample generator, constructs and
    compiles the train program (optionally with multi-process readers,
    multi-GPU data parallelism and sync batch norm), resumes from a
    checkpoint or loads shape-matching pretrained weights, then runs the
    epoch loop with periodic logging, optional VisualDL reporting,
    snapshotting, evaluation and visualization.

    Args:
        cfg: Global configuration object. The DATASET, DATAAUG,
            DATALOADER, SOLVER and TRAIN sections are read. NOTE: this
            function mutates ``cfg.TRAIN_BATCH_SIZE`` (scaled by the
            detected device count).

    Side effects:
        Relies on the module-level ``args`` namespace for runtime
        switches (use_gpu, use_mpio, debug, use_vdl, vdl_log_dir,
        log_steps, do_eval). Writes checkpoints and, if enabled,
        VisualDL logs.

    Raises:
        ValueError: If the resumed begin epoch exceeds
            ``cfg.SOLVER.NUM_EPOCHS``.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    drop_last = True
    dataset = build_dataset(
        cfg.DATASET.DATASET_NAME,
        file_list=cfg.DATASET.TRAIN_FILE_LIST,
        mode=ModelPhase.TRAIN,
        shuffle=True,
        data_dir=cfg.DATASET.DATA_DIR,
        base_size=cfg.DATAAUG.BASE_SIZE,
        crop_size=cfg.DATAAUG.CROP_SIZE,
        rand_scale=True)

    def data_generator():
        # Yield samples one at a time; the py_reader performs batching.
        if args.use_mpio:
            data_gen = dataset.multiprocess_generator(
                num_processes=cfg.DATALOADER.NUM_WORKERS,
                max_queue_size=cfg.DATALOADER.BUF_SIZE)
        else:
            data_gen = dataset.generator()

        batch_data = []
        for b in data_gen:
            batch_data.append(b)
            if len(batch_data) == (cfg.TRAIN_BATCH_SIZE // cfg.NUM_TRAINERS):
                for item in batch_data:
                    yield item[0], item[1], item[2]
                batch_data = []
        # If use sync batch norm strategy, drop last batch if number of samples
        # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues
        if not cfg.TRAIN.SYNC_BATCH_NORM:
            for item in batch_data:
                yield item[0], item[1], item[2]

    # Get device environment
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()

    # Get number of GPU; in distributed mode NUM_TRAINERS overrides the
    # locally visible device count.
    dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places)
    print_info("#device count: {}".format(dev_count))
    cfg.TRAIN_BATCH_SIZE = dev_count * int(cfg.TRAIN_BATCH_SIZE_PER_GPU)
    print_info("#train_batch_size: {}".format(cfg.TRAIN_BATCH_SIZE))
    print_info("#batch_size_per_dev: {}".format(cfg.TRAIN_BATCH_SIZE_PER_GPU))

    py_reader, avg_loss, lr, pred, grts, masks = build_model(
        train_prog, startup_prog, phase=ModelPhase.TRAIN)
    py_reader.decorate_sample_generator(
        data_generator,
        batch_size=cfg.TRAIN_BATCH_SIZE_PER_GPU,
        drop_last=drop_last)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # Clear temporary variables every 100 iteration
    if args.use_gpu:
        exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()

    if cfg.NUM_TRAINERS > 1 and args.use_gpu:
        dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
        exec_strategy.num_threads = 1

    if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu:
        if dev_count > 1:
            # Apply sync batch norm strategy
            print_info("Sync BatchNorm strategy is effective.")
            build_strategy.sync_batch_norm = True
        else:
            print_info(
                "Sync BatchNorm strategy will not be effective if GPU device"
                " count <= 1")
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name,
        exec_strategy=exec_strategy,
        build_strategy=build_strategy)

    # Resume training
    begin_epoch = cfg.SOLVER.BEGIN_EPOCH
    if cfg.TRAIN.RESUME_MODEL_DIR:
        begin_epoch = load_checkpoint(exe, train_prog)
    # Load pretrained model
    elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR):
        print_info('Pretrained model dir: ', cfg.TRAIN.PRETRAINED_MODEL_DIR)
        load_vars = []
        load_fail_vars = []

        def var_shape_matched(var, shape):
            """
            Check whehter persitable variable shape is match with current network
            """
            var_exist = os.path.exists(
                os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
            if var_exist:
                var_shape = parse_shape_from_file(
                    os.path.join(cfg.TRAIN.PRETRAINED_MODEL_DIR, var.name))
                return var_shape == shape
            return False

        # Only load parameters whose saved shape matches the current
        # network; mismatching ones are reported and skipped.
        for x in train_prog.list_vars():
            if isinstance(x, fluid.framework.Parameter):
                shape = tuple(fluid.global_scope().find_var(
                    x.name).get_tensor().shape())
                if var_shape_matched(x, shape):
                    load_vars.append(x)
                else:
                    load_fail_vars.append(x)

        fluid.io.load_vars(
            exe, dirname=cfg.TRAIN.PRETRAINED_MODEL_DIR, vars=load_vars)
        for var in load_vars:
            print_info("Parameter[{}] loaded successfully!".format(var.name))
        for var in load_fail_vars:
            print_info(
                "Parameter[{}] don't exist or shape does not match current network, skip"
                " to load it.".format(var.name))
        print_info("{}/{} pretrained parameters loaded successfully!".format(
            len(load_vars),
            len(load_vars) + len(load_fail_vars)))
    else:
        print_info(
            'Pretrained model dir {} not exists, training from scratch...'.
            format(cfg.TRAIN.PRETRAINED_MODEL_DIR))

    fetch_list = [avg_loss.name, lr.name]
    if args.debug:
        # Fetch more variable info and use streaming confusion matrix to
        # calculate IoU results if in debug mode
        np.set_printoptions(
            precision=4, suppress=True, linewidth=160, floatmode="fixed")
        fetch_list.extend([pred.name, grts.name, masks.name])
        cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)

    if args.use_vdl:
        if not args.vdl_log_dir:
            print_info("Please specify the log directory by --vdl_log_dir.")
            exit(1)

        from visualdl import LogWriter
        log_writer = LogWriter(args.vdl_log_dir)

    step = 0
    all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.TRAIN_BATCH_SIZE
    if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.TRAIN_BATCH_SIZE and not drop_last:
        all_step += 1
    all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1)

    # Note: rebinds `avg_loss` from the loss Variable (already captured
    # in fetch_list by name) to a running float accumulator.
    avg_loss = 0.0
    timer = Timer()
    timer.start()
    if begin_epoch > cfg.SOLVER.NUM_EPOCHS:
        raise ValueError(
            ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format(
                begin_epoch, cfg.SOLVER.NUM_EPOCHS))

    if args.use_mpio:
        print_info("Use multiprocess reader")
    else:
        print_info("Use multi-thread reader")

    for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1):
        py_reader.start()
        while True:
            try:
                if args.debug:
                    # Print category IoU and accuracy to check whether the
                    # traning process is corresponed to expectation
                    loss, lr, pred, grts, masks = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    cm.calculate(pred, grts, masks)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0:
                        speed = args.log_steps / timer.elapsed_time()
                        avg_loss /= args.log_steps
                        category_acc, mean_acc = cm.accuracy()
                        category_iou, mean_iou = cm.mean_iou()

                        print_info((
                            "epoch={}/{} step={}/{} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, cfg.SOLVER.NUM_EPOCHS, step, all_step, lr[0], avg_loss, mean_acc,
                                 mean_iou, speed,
                                 calculate_eta(all_step - step, speed)))
                        print_info("Category IoU: ", category_iou)
                        print_info("Category Acc: ", category_acc)
                        if args.use_vdl:
                            log_writer.add_scalar('Train/mean_iou', mean_iou,
                                                  step)
                            log_writer.add_scalar('Train/mean_acc', mean_acc,
                                                  step)
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  step)
                            log_writer.add_scalar('Train/step/sec', speed,
                                                  step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        cm.zero_matrix()
                        timer.restart()
                else:
                    # If not in debug mode, avoid unnessary log and calculate
                    loss, lr = exe.run(
                        program=compiled_train_prog,
                        fetch_list=fetch_list,
                        return_numpy=True)
                    avg_loss += np.mean(np.array(loss))
                    step += 1

                    if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
                        avg_loss /= args.log_steps
                        speed = args.log_steps / timer.elapsed_time()
                        # Bug fix: this branch previously referenced an
                        # undefined name `global_step` (NameError); the
                        # step counter in this function is `step`.
                        print((
                            "epoch={}/{} step={}/{} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
                        ).format(epoch, cfg.SOLVER.NUM_EPOCHS, step, all_step,
                                 lr[0], avg_loss, speed,
                                 calculate_eta(all_step - step, speed)))
                        if args.use_vdl:
                            log_writer.add_scalar('Train/loss', avg_loss,
                                                  step)
                            log_writer.add_scalar('Train/lr', lr[0],
                                                  step)
                            log_writer.add_scalar('Train/speed', speed,
                                                  step)
                        sys.stdout.flush()
                        avg_loss = 0.0
                        timer.restart()

            except fluid.core.EOFException:
                # Reader exhausted: reset and move to the next epoch.
                py_reader.reset()
                break
            except Exception as e:
                # Best-effort: log and keep training on transient errors.
                print(e)

        if epoch % cfg.TRAIN.SNAPSHOT_EPOCH == 0 and cfg.TRAINER_ID == 0:
            ckpt_dir = save_checkpoint(exe, train_prog, epoch)

            if args.do_eval:
                print("Evaluation start")
                _, mean_iou, _, mean_acc = evaluate(
                    cfg=cfg,
                    ckpt_dir=ckpt_dir,
                    use_gpu=args.use_gpu,
                    use_mpio=args.use_mpio)
                if args.use_vdl:
                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
                                          step)
                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
                                          step)

            # Use VisualDL to visualize results
            if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
                visualize(
                    cfg=cfg,
                    use_gpu=args.use_gpu,
                    vis_file_list=cfg.DATASET.VIS_FILE_LIST,
                    vis_dir="visual",
                    ckpt_dir=ckpt_dir,
                    log_writer=log_writer)

    # save final model
    if cfg.TRAINER_ID == 0:
        save_checkpoint(exe, train_prog, 'final')

    if args.use_vdl:
        log_writer.close()
Ejemplo n.º 22
0
    def train(self,
              train_dataset_src,
              train_dataset_tgt,
              val_dataset_tgt=None,
              val_dataset_src=None,
              optimizer=None,
              save_dir='output',
              iters=10000,
              batch_size=2,
              resume_model=None,
              save_interval=1000,
              log_iters=10,
              num_workers=0,
              use_vdl=False,
              keep_checkpoint_max=5,
              test_config=None):
        """
        Launch domain-adaptation training (labeled source + pseudo-labeled
        target), with EMA weight tracking, optional edge/feature alignment
        losses, periodic evaluation and checkpointing.

        Args:
            train_dataset_src (paddle.io.Dataset): Used to read and process the labeled source-domain training dataset.
            train_dataset_tgt (paddle.io.Dataset): Used to read and process the target-domain training dataset (supervised via pseudo labels).
            val_dataset_tgt (paddle.io.Dataset, optional): Used to read and process the target-domain validation dataset.
            val_dataset_src (paddle.io.Dataset, optional): Used to read and process the source-domain validation dataset (only evaluated when ``self.cfg['eval_src']``).
            optimizer (paddle.optimizer.Optimizer): The optimizer.
            save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
            iters (int, optional): How may iters to train the model. Defualt: 10000.
            batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
            resume_model (str, optional): The path of resume model.
            save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
            log_iters (int, optional): Display logging information at every log_iters. Default: 10.
            num_workers (int, optional): Num workers for data loader. Default: 0.
            use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
            keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
            test_config(dict, optional): Evaluation config.
        """
        start_iter = 0
        self.model.train()
        nranks = paddle.distributed.ParallelEnv().nranks
        local_rank = paddle.distributed.ParallelEnv().local_rank

        if resume_model is not None:
            logger.info(resume_model)
            start_iter = resume(self.model, optimizer, resume_model)
        # EMA weights are loaded unconditionally (even without resume_model);
        # NOTE(review): presumably load_ema_model tolerates a missing path.
        load_ema_model(self.model, self.resume_ema)

        # Ensure save_dir is a directory (a plain file with that name is removed).
        if not os.path.isdir(save_dir):
            if os.path.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)

        if nranks > 1:
            # Multi-GPU: wrap optimizer and model with fleet collective mode.
            paddle.distributed.fleet.init(is_collective=True)
            optimizer = paddle.distributed.fleet.distributed_optimizer(
                optimizer)  # The return is Fleet object
            ddp_model = paddle.distributed.fleet.distributed_model(self.model)

        batch_sampler_src = paddle.io.DistributedBatchSampler(
            train_dataset_src,
            batch_size=batch_size,
            shuffle=True,
            drop_last=True)

        loader_src = paddle.io.DataLoader(
            train_dataset_src,
            batch_sampler=batch_sampler_src,
            num_workers=num_workers,
            return_list=True,
            worker_init_fn=worker_init_fn,
        )
        batch_sampler_tgt = paddle.io.DistributedBatchSampler(
            train_dataset_tgt,
            batch_size=batch_size,
            shuffle=True,
            drop_last=True)

        loader_tgt = paddle.io.DataLoader(
            train_dataset_tgt,
            batch_sampler=batch_sampler_tgt,
            num_workers=num_workers,
            return_list=True,
            worker_init_fn=worker_init_fn,
        )

        if use_vdl:
            from visualdl import LogWriter
            log_writer = LogWriter(save_dir)

        # Epoch length is defined by the target loader; zip() below stops at
        # the shorter of the two loaders each pass.
        iters_per_epoch = len(batch_sampler_tgt)
        best_mean_iou = -1.0
        best_model_iter = -1
        reader_cost_averager = TimeAverager()
        batch_cost_averager = TimeAverager()
        save_models = deque()
        batch_start = time.time()

        # NOTE: `iter` shadows the builtin within this method.
        iter = start_iter
        while iter < iters:
            for _, (data_src,
                    data_tgt) in enumerate(zip(loader_src, loader_tgt)):

                reader_cost_averager.record(time.time() - batch_start)
                loss_dict = {}

                #### training #####
                # Each batch item: [0] images, [1] labels, [2] edge maps.
                images_tgt = data_tgt[0]
                labels_tgt = data_tgt[1].astype('int64')
                images_src = data_src[0]
                labels_src = data_src[1].astype('int64')

                edges_src = data_src[2].astype('int64')
                edges_tgt = data_tgt[2].astype('int64')

                if nranks > 1:
                    logits_list_src = ddp_model(images_src)
                else:
                    logits_list_src = self.model(images_src)

                ##### source seg & edge loss ####
                # logits_list[0] = main head, [1] = aux head (0.1 weight),
                # [2] = edge head (used below when edgeconstrain is on).
                loss_src_seg_main = self.celoss(logits_list_src[0], labels_src)
                loss_src_seg_aux = 0.1 * self.celoss(logits_list_src[1],
                                                     labels_src)

                loss_src_seg = loss_src_seg_main + loss_src_seg_aux
                # .numpy()[0] assumes scalar loss tensors — TODO confirm.
                loss_dict["source_main"] = loss_src_seg_main.numpy()[0]
                loss_dict["source_aux"] = loss_src_seg_aux.numpy()[0]
                loss = loss_src_seg
                del loss_src_seg, loss_src_seg_aux, loss_src_seg_main

                #### generate target pseudo label  ####
                with paddle.no_grad():
                    if nranks > 1:
                        logits_list_tgt = ddp_model(images_tgt)
                    else:
                        logits_list_tgt = self.model(images_tgt)

                    pred_P_1 = F.softmax(logits_list_tgt[0], axis=1)
                    labels_tgt_psu = paddle.argmax(pred_P_1.detach(), axis=1)

                    # aux label
                    pred_P_2 = F.softmax(logits_list_tgt[1], axis=1)
                    pred_c = (pred_P_1 + pred_P_2) / 2
                    labels_tgt_psu_aux = paddle.argmax(pred_c.detach(), axis=1)

                if self.edgeconstrain:
                    loss_src_edge = self.bceloss_src(
                        logits_list_src[2], edges_src)  # 1, 2 640, 1280
                    src_edge = paddle.argmax(
                        logits_list_src[2].detach().clone(),
                        axis=1)  # 1, 1, 640,1280
                    # Pixel accuracy (%) of the predicted source edge map.
                    src_edge_acc = ((src_edge == edges_src).numpy().sum().astype('float32')\
                                                /functools.reduce(lambda a, b: a * b, src_edge.shape))*100

                    if (not self.src_only) and (iter > 200000):
                        ####  target seg & edge loss ####
                        # Target edges are derived from the pseudo labels.
                        logger.info("Add target edege loss")
                        edges_tgt = Func.mask_to_binary_edge(
                            labels_tgt_psu.detach().clone().numpy(),
                            radius=2,
                            num_classes=train_dataset_tgt.NUM_CLASSES)
                        edges_tgt = paddle.to_tensor(edges_tgt, dtype='int64')

                        loss_tgt_edge = self.bceloss_tgt(
                            logits_list_tgt[2], edges_tgt)
                        loss_edge = loss_tgt_edge + loss_src_edge
                    else:
                        loss_tgt_edge = paddle.zeros([1])
                        loss_edge = loss_src_edge

                    loss += loss_edge

                    loss_dict['target_edge'] = loss_tgt_edge.numpy()[0]
                    loss_dict['source_edge'] = loss_src_edge.numpy()[0]

                    del loss_edge, loss_tgt_edge, loss_src_edge

                #### target aug loss #######
                # Consistency training: augment target images + pseudo labels
                # together, then supervise the model on the augmented pair.
                augs = augmentation.get_augmentation()
                images_tgt_aug, labels_tgt_aug = augmentation.augment(
                    images=images_tgt.cpu(),
                    labels=labels_tgt_psu.detach().cpu(),
                    aug=augs,
                    iters="{}_1".format(iter))
                images_tgt_aug = images_tgt_aug.cuda()
                labels_tgt_aug = labels_tgt_aug.cuda()

                _, labels_tgt_aug_aux = augmentation.augment(
                    images=images_tgt.cpu(),
                    labels=labels_tgt_psu_aux.detach().cpu(),
                    aug=augs,
                    iters="{}_2".format(iter))
                labels_tgt_aug_aux = labels_tgt_aug_aux.cuda()

                if nranks > 1:
                    logits_list_tgt_aug = ddp_model(images_tgt_aug)
                else:
                    logits_list_tgt_aug = self.model(images_tgt_aug)

                loss_tgt_aug_main = 0.1 * (self.celoss(logits_list_tgt_aug[0],
                                                       labels_tgt_aug))
                loss_tgt_aug_aux = 0.1 * (0.1 * self.celoss(
                    logits_list_tgt_aug[1], labels_tgt_aug_aux))

                loss_tgt_aug = loss_tgt_aug_aux + loss_tgt_aug_main

                loss += loss_tgt_aug

                loss_dict['target_aug_main'] = loss_tgt_aug_main.numpy()[0]
                loss_dict['target_aug_aux'] = loss_tgt_aug_aux.numpy()[0]
                del images_tgt_aug, labels_tgt_aug_aux, images_tgt, \
                    loss_tgt_aug, loss_tgt_aug_aux, loss_tgt_aug_main

                #### edge input seg; src & tgt edge pull in ######
                if self.edgepullin:
                    # Fuse (detached) seg logits with edge logits and train the
                    # fusion head to reconstruct the segmentation.
                    src_edge_logit = logits_list_src[2]
                    feat_src = paddle.concat(
                        [logits_list_src[0], src_edge_logit], axis=1).detach()

                    out_src = self.model.fusion(feat_src)
                    loss_src_edge_rec = self.celoss(out_src, labels_src)

                    tgt_edge_logit = logits_list_tgt_aug[2]
                    # tgt_edge_logit = paddle.to_tensor(
                    #     Func.mask_to_onehot(edges_tgt.squeeze().numpy(), 2)
                    #     ).unsqueeze(0).astype('float32')
                    feat_tgt = paddle.concat(
                        [logits_list_tgt[0], tgt_edge_logit], axis=1).detach()

                    out_tgt = self.model.fusion(feat_tgt)
                    loss_tgt_edge_rec = self.celoss(out_tgt, labels_tgt)

                    loss_edge_rec = loss_tgt_edge_rec + loss_src_edge_rec
                    loss += loss_edge_rec

                    loss_dict['src_edge_rec'] = loss_src_edge_rec.numpy()[0]
                    loss_dict['tgt_edge_rec'] = loss_tgt_edge_rec.numpy()[0]

                    del loss_tgt_edge_rec, loss_src_edge_rec

                #### mask input feature & pullin  ######
                if self.featurepullin:
                    # inner-class loss: per-class feature centers tracked with
                    # an EMA (decay 0.99) for both domains.
                    feat_src = logits_list_src[0]
                    feat_tgt = logits_list_tgt_aug[0]
                    center_src_s, center_tgt_s = [], []

                    total_pixs = logits_list_src[0].shape[2] * \
                                    logits_list_src[0].shape[3]

                    for i in range(train_dataset_tgt.NUM_CLASSES):
                        pred = paddle.argmax(
                            logits_list_src[0].detach().clone(),
                            axis=1).unsqueeze(0)  # 1, 1, 640, 1280
                        sel_num = paddle.sum((pred == i).astype('float32'))
                        # ignore tensor that do not have features in this img
                        if sel_num > 0:
                            feat_sel_src = paddle.where(
                                (pred == i).expand_as(feat_src), feat_src,
                                paddle.zeros(feat_src.shape))
                            center_src = paddle.mean(feat_sel_src, axis=[
                                2, 3
                            ]) / (sel_num / total_pixs)  # 1, C
                            # EMA update of per-class source center.
                            self.src_centers[i] = 0.99 * self.src_centers[
                                i] + (1 - 0.99) * center_src

                        pred = labels_tgt_aug.unsqueeze(0)  # 1, 1,  512, 512
                        sel_num = paddle.sum((pred == i).astype('float32'))
                        if sel_num > 0:
                            feat_sel_tgt = paddle.where(
                                (pred == i).expand_as(feat_tgt), feat_tgt,
                                paddle.zeros(feat_tgt.shape))
                            center_tgt = paddle.mean(feat_sel_tgt, axis=[
                                2, 3
                            ]) / (sel_num / total_pixs)
                            # EMA update of per-class target center.
                            self.tgt_centers[i] = 0.99 * self.tgt_centers[
                                i] + (1 - 0.99) * center_tgt
                        # NOTE(review): if the very first class has no pixels,
                        # center_src / center_tgt are unbound here (NameError);
                        # otherwise the previous class's center is re-appended
                        # — confirm whether this is intended.
                        center_src_s.append(center_src)
                        center_tgt_s.append(center_tgt)

                    if iter >= 3000:  # average center structure alignment
                        src_centers = paddle.concat(self.src_centers, axis=0)
                        tgt_centers = paddle.concat(self.tgt_centers,
                                                    axis=0)  # 19, 2048

                        # Class-relation matrices (Gram) per domain, pulled
                        # toward their mean via symmetric KL.
                        relatmat_src = paddle.matmul(src_centers,
                                                     src_centers,
                                                     transpose_y=True)  # 19,19
                        relatmat_tgt = paddle.matmul(tgt_centers,
                                                     tgt_centers,
                                                     transpose_y=True)

                        loss_intra_relate = self.klloss(relatmat_src, (relatmat_tgt+relatmat_src)/2) \
                                            + self.klloss(relatmat_tgt, (relatmat_tgt+relatmat_src)/2)

                        # Pull current-batch centers toward the EMA centers.
                        loss_pix_align_src = self.mseloss(
                            paddle.to_tensor(center_src_s),
                            paddle.to_tensor(
                                self.src_centers).detach().clone())
                        loss_pix_align_tgt = self.mseloss(
                            paddle.to_tensor(center_tgt_s),
                            paddle.to_tensor(
                                self.tgt_centers).detach().clone())

                        loss_feat_align = loss_pix_align_src + loss_pix_align_tgt + loss_intra_relate

                        loss += loss_feat_align

                        loss_dict['loss_pix_align_src'] = \
                                loss_pix_align_src.numpy()[0]
                        loss_dict['loss_pix_align_tgt'] = \
                                loss_pix_align_tgt.numpy()[0]
                        loss_dict['loss_intra_relate'] = \
                                loss_intra_relate.numpy()[0]

                        del loss_pix_align_tgt, loss_pix_align_src, loss_intra_relate,

                    # Detach centers so EMA state does not hold the graph.
                    self.tgt_centers = [
                        item.detach().clone() for item in self.tgt_centers
                    ]
                    self.src_centers = [
                        item.detach().clone() for item in self.src_centers
                    ]

                loss.backward()
                del loss
                # From here on `loss` is a plain float (sum of logged terms),
                # used only for display/VDL.
                loss = sum(loss_dict.values())

                optimizer.step()
                self.ema.update_params()

                with paddle.no_grad():
                    ##### log & save #####
                    lr = optimizer.get_lr()
                    # update lr
                    if isinstance(optimizer, paddle.distributed.fleet.Fleet):
                        lr_sche = optimizer.user_defined_optimizer._learning_rate
                    else:
                        lr_sche = optimizer._learning_rate
                    if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler):
                        lr_sche.step()

                    # Optional debug dumps of edge predictions/ground truth.
                    if self.cfg['save_edge']:
                        tgt_edge = paddle.argmax(
                            logits_list_tgt_aug[2].detach().clone(),
                            axis=1)  # 1, 1, 640,1280
                        src_feed_gt = paddle.argmax(
                            src_edge_logit.astype('float32'), axis=1)
                        tgt_feed_gt = paddle.argmax(
                            tgt_edge_logit.astype('float32'), axis=1)
                        logger.info('src_feed_gt_{}_{}_{}'.format(
                            src_feed_gt.shape, src_feed_gt.max(),
                            src_feed_gt.min()))
                        logger.info('tgt_feed_gt_{}_{}_{}'.format(
                            tgt_feed_gt.shape, max(tgt_feed_gt),
                            min(tgt_feed_gt)))
                        save_edge(src_feed_gt, 'src_feed_gt_{}'.format(iter))
                        save_edge(tgt_feed_gt, 'tgt_feed_gt_{}'.format(iter))
                        save_edge(tgt_edge, 'tgt_pred_{}'.format(iter))
                        save_edge(src_edge,
                                  'src_pred_{}_{}'.format(iter, src_edge_acc))
                        save_edge(edges_src, 'src_gt_{}'.format(iter))
                        save_edge(edges_tgt, 'tgt_gt_{}'.format(iter))

                    self.model.clear_gradients()

                    batch_cost_averager.record(time.time() - batch_start,
                                               num_samples=batch_size)

                    iter += 1
                    if (iter) % log_iters == 0 and local_rank == 0:
                        # Accuracy of pseudo labels vs. real target labels (%).
                        label_tgt_acc = ((labels_tgt == labels_tgt_psu).numpy().sum().astype('float32')\
                                            /functools.reduce(lambda a, b: a * b, labels_tgt_psu.shape))*100

                        remain_iters = iters - iter
                        avg_train_batch_cost = batch_cost_averager.get_average(
                        )
                        avg_train_reader_cost = reader_cost_averager.get_average(
                        )
                        eta = calculate_eta(remain_iters, avg_train_batch_cost)
                        logger.info(
                            "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, tgt_pix_acc: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}"
                            .format(
                                (iter - 1) // iters_per_epoch + 1, iter, iters,
                                loss, label_tgt_acc, lr, avg_train_batch_cost,
                                avg_train_reader_cost,
                                batch_cost_averager.get_ips_average(), eta))

                        if use_vdl:
                            log_writer.add_scalar('Train/loss', loss, iter)
                            # Record all losses if there are more than 2 losses.
                            # NOTE: the loop variable rebinds `loss` to the
                            # last per-term value.
                            if len(loss_dict) > 1:
                                for name, loss in loss_dict.items():
                                    log_writer.add_scalar(
                                        'Train/loss_' + name, loss, iter)

                            log_writer.add_scalar('Train/lr', lr, iter)
                            log_writer.add_scalar('Train/batch_cost',
                                                  avg_train_batch_cost, iter)
                            log_writer.add_scalar('Train/reader_cost',
                                                  avg_train_reader_cost, iter)
                            log_writer.add_scalar('Train/tgt_label_acc',
                                                  label_tgt_acc, iter)

                        reader_cost_averager.reset()
                        batch_cost_averager.reset()

                    # Periodic evaluation on the target domain, using the EMA
                    # ("shadow") weights; restored afterwards.
                    if (iter % save_interval == 0 or iter
                            == iters) and (val_dataset_tgt is not None):
                        num_workers = 4 if num_workers > 0 else 0  # adjust num_worker=4

                        if test_config is None:
                            test_config = {}
                        self.ema.apply_shadow()
                        self.ema.model.eval()

                        PA_tgt, _, MIoU_tgt, _ = val.evaluate(
                            self.model,
                            val_dataset_tgt,
                            num_workers=num_workers,
                            **test_config)

                        if (iter % (save_interval * 30)) == 0 \
                            and self.cfg['eval_src']:  # add evaluate on src
                            PA_src, _, MIoU_src, _ = val.evaluate(
                                self.model,
                                val_dataset_src,
                                num_workers=num_workers,
                                **test_config)
                            logger.info(
                                '[EVAL] The source mIoU is ({:.4f}) at iter {}.'
                                .format(MIoU_src, iter))

                        self.ema.restore()
                        self.model.train()

                    if (iter % save_interval == 0
                            or iter == iters) and local_rank == 0:
                        current_save_dir = os.path.join(
                            save_dir, "iter_{}".format(iter))
                        if not os.path.isdir(current_save_dir):
                            os.makedirs(current_save_dir)
                        paddle.save(
                            self.model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                        paddle.save(
                            self.ema.shadow,
                            os.path.join(current_save_dir,
                                         'model_ema.pdparams'))
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                        save_models.append(current_save_dir)
                        # Rotate checkpoints, keeping at most keep_checkpoint_max.
                        if len(save_models) > keep_checkpoint_max > 0:
                            model_to_remove = save_models.popleft()
                            shutil.rmtree(model_to_remove)

                        if val_dataset_tgt is not None:
                            if MIoU_tgt > best_mean_iou:
                                best_mean_iou = MIoU_tgt
                                best_model_iter = iter
                                best_model_dir = os.path.join(
                                    save_dir, "best_model")
                                paddle.save(
                                    self.model.state_dict(),
                                    os.path.join(best_model_dir,
                                                 'model.pdparams'))

                            logger.info(
                                '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                                .format(best_mean_iou, best_model_iter))

                        if use_vdl:
                            log_writer.add_scalar('Evaluate/mIoU', MIoU_tgt,
                                                  iter)
                            log_writer.add_scalar('Evaluate/PA', PA_tgt, iter)

                            # NOTE(review): MIoU_src / PA_src are only bound
                            # when iter hit a (save_interval * 30) multiple —
                            # may raise NameError otherwise; confirm.
                            if self.cfg['eval_src']:
                                log_writer.add_scalar('Evaluate/mIoU_src',
                                                      MIoU_src, iter)
                                log_writer.add_scalar('Evaluate/PA_src',
                                                      PA_src, iter)

                    batch_start = time.time()

            self.ema.update_buffer()

        # # Calculate flops.
        if local_rank == 0:

            def count_syncbn(m, x, y):
                # Custom FLOPs counter for SyncBatchNorm: 2 ops per element.
                x = x[0]
                nelements = x.numel()
                m.total_ops += int(2 * nelements)

            _, c, h, w = images_src.shape
            flops = paddle.flops(
                self.model, [1, c, h, w],
                custom_ops={paddle.nn.SyncBatchNorm: count_syncbn})

        # Sleep for half a second to let dataloader release resources.
        time.sleep(0.5)
        if use_vdl:
            log_writer.close()
Ejemplo n.º 23
0
def train(args):
    """Train a face-recognition backbone plus margin-based classifier.

    Supports optional multi-GPU data-parallel training via fleet, fp16 with
    dynamic loss scaling, piecewise LR decay with linear warmup, periodic
    validation/logging callbacks, and checkpoint save/resume.

    Args:
        args: Parsed CLI namespace carrying dataset paths, model/loss/
            classifier names, optimization hyper-parameters and callback
            intervals.
    """
    writer = LogWriter(logdir=args.logdir)

    # Distributed context is provided by the Paddle launcher via env vars.
    rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
    world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))

    gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
    place = paddle.CUDAPlace(gpu_id)

    if world_size > 1:
        import paddle.distributed.fleet as fleet
        from .utils.data_parallel import sync_gradients, sync_params

        strategy = fleet.DistributedStrategy()
        strategy.without_graph_optimization = True
        fleet.init(is_collective=True, strategy=strategy)

    if args.use_synthetic_dataset:
        trainset = SyntheticDataset(args.num_classes, fp16=args.fp16)
    else:
        trainset = CommonDataset(root_dir=args.data_dir,
                                 label_file=args.label_file,
                                 fp16=args.fp16,
                                 is_bin=args.is_bin)

    num_image = len(trainset)
    total_batch_size = args.batch_size * world_size
    steps_per_epoch = num_image // total_batch_size
    # Training length may be expressed either in epochs or in raw steps.
    if args.train_unit == 'epoch':
        warmup_steps = steps_per_epoch * args.warmup_num
        total_steps = steps_per_epoch * args.train_num
        decay_steps = [x * steps_per_epoch for x in args.decay_boundaries]
        total_epoch = args.train_num
    else:
        warmup_steps = args.warmup_num
        total_steps = args.train_num
        decay_steps = list(args.decay_boundaries)
        total_epoch = (total_steps + steps_per_epoch - 1) // steps_per_epoch

    if rank == 0:
        logging.info('world_size: {}'.format(world_size))
        logging.info('total_batch_size: {}'.format(total_batch_size))
        logging.info('warmup_steps: {}'.format(warmup_steps))
        logging.info('steps_per_epoch: {}'.format(steps_per_epoch))
        logging.info('total_steps: {}'.format(total_steps))
        logging.info('total_epoch: {}'.format(total_epoch))
        logging.info('decay_steps: {}'.format(decay_steps))

    # Linear LR scaling rule: base LR proportional to global batch size.
    base_lr = total_batch_size * args.lr / 512
    lr_scheduler = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=decay_steps,
        values=[
            base_lr * (args.lr_decay**i) for i in range(len(decay_steps) + 1)
        ])
    if warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler, warmup_steps, 0, base_lr)

    if args.fp16:
        paddle.set_default_dtype("float16")

    # NOTE(review): eval() on config-supplied names executes arbitrary code;
    # acceptable only for trusted configs — a registry lookup would be safer.
    margin_loss_params = eval("losses.{}".format(args.loss))()
    backbone = eval("backbones.{}".format(args.backbone))(
        num_features=args.embedding_size, dropout=args.dropout)
    classifier = eval("classifiers.{}".format(args.classifier))(
        rank=rank,
        world_size=world_size,
        num_classes=args.num_classes,
        margin1=margin_loss_params.margin1,
        margin2=margin_loss_params.margin2,
        margin3=margin_loss_params.margin3,
        scale=margin_loss_params.scale,
        sample_ratio=args.sample_ratio,
        embedding_size=args.embedding_size,
        fp16=args.fp16)

    backbone.train()
    classifier.train()

    optimizer = paddle.optimizer.Momentum(parameters=[{
        'params':
        backbone.parameters(),
    }, {
        'params':
        classifier.parameters(),
    }],
                                          learning_rate=lr_scheduler,
                                          momentum=args.momentum,
                                          weight_decay=args.weight_decay)

    if args.fp16:
        # Keep optimizer internal state in fp32 even when params are fp16.
        optimizer._dtype = 'float32'

    if world_size > 1:
        # sync backbone params for data parallel
        sync_params(backbone.parameters())

    if args.do_validation_while_train:
        callback_verification = CallBackVerification(
            args.validation_interval_step,
            rank,
            args.batch_size,
            args.val_targets,
            args.data_dir,
            fp16=args.fp16,
        )

    callback_logging = CallBackLogging(args.log_interval_step, rank,
                                       world_size, total_steps,
                                       args.batch_size, writer)

    checkpoint = Checkpoint(
        rank=rank,
        world_size=world_size,
        embedding_size=args.embedding_size,
        num_classes=args.num_classes,
        model_save_dir=os.path.join(args.output, args.backbone),
        checkpoint_dir=args.checkpoint_dir,
        max_num_last_checkpoint=args.max_num_last_checkpoint)

    start_epoch = 0
    global_step = 0
    loss_avg = AverageMeter()
    if args.resume:
        extra_info = checkpoint.load(backbone,
                                     classifier,
                                     optimizer,
                                     for_train=True)
        start_epoch = extra_info['epoch'] + 1
        lr_state = extra_info['lr_state']
        # Here last_epoch means last_step for PiecewiseDecay,
        # since we always use step style for lr_scheduler.
        global_step = lr_state['last_epoch']
        lr_scheduler.set_state_dict(lr_state)

    train_loader = paddle.io.DataLoader(
        trainset,
        places=place,
        num_workers=args.num_workers,
        batch_sampler=paddle.io.DistributedBatchSampler(
            dataset=trainset,
            batch_size=args.batch_size,
            shuffle=True,
            drop_last=True))

    scaler = LSCGradScaler(
        enable=args.fp16,
        init_loss_scaling=args.init_loss_scaling,
        incr_ratio=args.incr_ratio,
        decr_ratio=args.decr_ratio,
        incr_every_n_steps=args.incr_every_n_steps,
        decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
        use_dynamic_loss_scaling=args.use_dynamic_loss_scaling)

    for epoch in range(start_epoch, total_epoch):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for step, (img, label) in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            global_step += 1
            train_start = time.time()
            with paddle.amp.auto_cast(enable=args.fp16):
                features = backbone(img)
                loss_v = classifier(features, label)

            scaler.scale(loss_v).backward()
            if world_size > 1:
                # data parallel sync backbone gradients
                sync_gradients(backbone.parameters())

            scaler.step(optimizer)
            classifier.step(optimizer)
            optimizer.clear_grad()
            classifier.clear_grad()

            train_run_cost += time.time() - train_start
            total_samples += len(img)

            lr_value = optimizer.get_lr()
            loss_avg.update(loss_v.item(), 1)
            callback_logging(
                global_step,
                loss_avg,
                epoch,
                lr_value,
                avg_reader_cost=train_reader_cost / args.log_interval_step,
                avg_batch_cost=(train_reader_cost + train_run_cost) /
                args.log_interval_step,
                avg_samples=total_samples / args.log_interval_step,
                ips=total_samples / (train_reader_cost + train_run_cost))

            if args.do_validation_while_train:
                callback_verification(global_step, backbone)
            lr_scheduler.step()

            if global_step >= total_steps:
                break
            sys.stdout.flush()
            # BUGFIX: compare rank with `==` rather than identity (`is`);
            # `rank is 0` only worked via CPython small-int caching and
            # emits a SyntaxWarning on Python 3.8+.
            if rank == 0 and global_step > 0 and global_step % args.log_interval_step == 0:
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
        checkpoint.save(backbone,
                        classifier,
                        optimizer,
                        epoch=epoch,
                        for_train=True)
    writer.close()
Ejemplo n.º 24
0
def main(args):
    """Train a FastSpeech model on aligned LJSpeech data (fluid dygraph).

    Builds the model and a Noam-decay Adam optimizer from a YAML config,
    restores the latest checkpoint, then runs the training loop with
    periodic VisualDL logging (rank 0 only) and checkpoint saving.

    Args:
        args: Parsed CLI namespace with config path, data/alignments paths,
            output directory, checkpoint options and the use_gpu flag.
    """
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(
        dg.parallel.Env().dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Only rank 0 writes VisualDL logs; other ranks carry no writer.
    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    # Noam schedule: LR warms up for `warm_up_step` steps, then decays.
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))
    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            args.alignments_path,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        # Re-create the iterator whenever the reader is exhausted (epoch end).
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward pass: predict mel spectrograms and phoneme durations.
        result = model(character,
                       pos_text,
                       mel_pos=pos_mel,
                       length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        # L1 loss between predicted durations and ground-truth alignments.
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(),
                              global_step)
            writer.add_scalar('duration_loss', duration_loss.numpy(),
                              global_step)
            # NOTE(review): calling _learning_rate.step() here looks like it
            # may advance the NoamDecay counter as a side effect of logging
            # — confirm against the NoamDecay implementation.
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            # Scale the loss and all-reduce gradients across ranks before
            # the optimizer update.
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(os.path.join(args.output, 'checkpoints'),
                               global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
Ejemplo n.º 25
0
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None):
    """Train a segmentation model, optionally evaluating and checkpointing.

    Resumes from ``resume_model`` when given, runs data-parallel training
    when more than one rank is present, logs every ``log_iters`` iterations
    and saves (plus optionally evaluates on ``val_dataset``) every
    ``save_interval`` iterations, keeping the best-mIoU snapshot.
    """
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Make sure save_dir is a directory (replace a plain file if present).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Set up the data-parallel environment and wrap the model.
        paddle.distributed.init_parallel_env()
        strategy = paddle.distributed.prepare_context()
        ddp_model = paddle.DataParallel(model, strategy)

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    running_loss = 0.0
    epoch_len = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    reader_cost_sum = 0.0
    batch_cost_sum = 0.0
    timer.start()

    cur_iter = start_iter
    while cur_iter < iters:
        for batch in loader:
            cur_iter += 1
            if cur_iter > iters:
                break
            reader_cost_sum += timer.elapsed_time()
            images = batch[0]
            labels = batch[1].astype('int64')
            # Same forward/backward path for both cases; only the callable
            # differs (wrapped vs. bare model).
            net = ddp_model if nranks > 1 else model
            logits = net(images)
            loss = loss_computation(logits, labels, losses)
            loss.backward()
            optimizer.step()
            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            running_loss += loss.numpy()[0]
            batch_cost_sum += timer.elapsed_time()

            if cur_iter % log_iters == 0 and local_rank == 0:
                running_loss /= log_iters
                avg_reader_cost = reader_cost_sum / log_iters
                avg_batch_cost = batch_cost_sum / log_iters
                reader_cost_sum = 0.0
                batch_cost_sum = 0.0
                eta = calculate_eta(iters - cur_iter, avg_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((cur_iter - 1) // epoch_len + 1, cur_iter, iters,
                            running_loss, lr, avg_batch_cost,
                            avg_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', running_loss,
                                          cur_iter)
                    log_writer.add_scalar('Train/lr', lr, cur_iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_batch_cost, cur_iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_reader_cost, cur_iter)
                running_loss = 0.0

            at_save_point = (cur_iter % save_interval == 0
                             or cur_iter == iters)

            if at_save_point and val_dataset is not None:
                num_workers = 1 if num_workers > 0 else 0
                mean_iou, acc = evaluate(
                    model, val_dataset, num_workers=num_workers)
                model.train()

            if at_save_point and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(cur_iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))

                if val_dataset is not None:
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = cur_iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.'
                        .format(best_mean_iou, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou,
                                              cur_iter)
                        log_writer.add_scalar('Evaluate/Acc', acc, cur_iter)
            timer.restart()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
Ejemplo n.º 26
0
def train(model,
          train_dataset,
          places=None,
          eval_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval_iters=1000,
          log_iters=10,
          num_classes=None,
          num_workers=8,
          use_vdl=False):
    """Train a segmentation model using the fluid dygraph API.

    Args:
        model: Dygraph layer exposing ``ignore_index`` and returning a loss
            when called with ``(images, labels)``.
        train_dataset: Dataset yielding (image, label) pairs.
        places: Device places forwarded to the DataLoader.
        eval_dataset: Optional dataset evaluated at every checkpoint.
        optimizer: Optimizer driving the parameter updates.
        save_dir (str): Root directory for checkpoints and VisualDL logs.
        iters (int): Total number of training iterations.
        batch_size (int): Per-device mini-batch size.
        resume_model (str): Optional checkpoint path to resume from.
        save_interval_iters (int): Checkpoint/evaluation period in iters.
        log_iters (int): Logging period in iters.
        num_classes (int): Number of classes, forwarded to evaluate().
        num_workers (int): DataLoader worker count.
        use_vdl (bool): Whether to write VisualDL logs.
    """
    ignore_index = model.ignore_index
    nranks = ParallelEnv().nranks

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir is a directory (replace a plain file if one exists).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        strategy = fluid.dygraph.prepare_context()
        ddp_model = fluid.dygraph.DataParallel(model, strategy)

    batch_sampler = DistributedBatchSampler(train_dataset,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            drop_last=True)
    loader = DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        places=places,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    timer = Timer()
    avg_loss = 0.0
    iters_per_epoch = len(batch_sampler)
    best_mean_iou = -1.0
    best_model_iter = -1
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    timer.start()

    # BUGFIX: honor the resumed iteration. Previously `iter = 0` restarted
    # the counter even after resume(), repeating already-trained iterations;
    # sibling train() implementations use `iter = start_iter`.
    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            train_reader_cost += timer.elapsed_time()
            images = data[0]
            labels = data[1].astype('int64')
            if nranks > 1:
                loss = ddp_model(images, labels)
                # apply_collective_grads sums grads over multiple gpus.
                loss = ddp_model.scale_loss(loss)
                loss.backward()
                ddp_model.apply_collective_grads()
            else:
                loss = model(images, labels)
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()
            avg_loss += loss.numpy()[0]
            lr = optimizer.current_step_lr()
            train_batch_cost += timer.elapsed_time()
            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
                avg_loss /= log_iters
                avg_train_reader_cost = train_reader_cost / log_iters
                avg_train_batch_cost = train_batch_cost / log_iters
                train_reader_cost = 0.0
                train_batch_cost = 0.0
                remain_iters = iters - iter
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss * nranks, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))
                if use_vdl:
                    log_writer.add_scalar('Train/loss', avg_loss * nranks,
                                          iter)
                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)
                avg_loss = 0.0

            if (iter % save_interval_iters == 0
                    or iter == iters) and ParallelEnv().local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                # save_dygraph appends .pdparams / .pdopt depending on the
                # state dict type, so the shared base path is intentional.
                fluid.save_dygraph(model.state_dict(),
                                   os.path.join(current_save_dir, 'model'))
                fluid.save_dygraph(optimizer.state_dict(),
                                   os.path.join(current_save_dir, 'model'))

                if eval_dataset is not None:
                    mean_iou, avg_acc = evaluate(model,
                                                 eval_dataset,
                                                 model_dir=current_save_dir,
                                                 num_classes=num_classes,
                                                 ignore_index=ignore_index,
                                                 iter_id=iter)
                    if mean_iou > best_mean_iou:
                        best_mean_iou = mean_iou
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        fluid.save_dygraph(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model'))
                    logger.info(
                        'Current evaluated best model in eval_dataset is iter_{}, miou={:4f}'
                        .format(best_model_iter, best_mean_iou))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
                        log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
                    model.train()
            timer.restart()
    if use_vdl:
        log_writer.close()
Ejemplo n.º 27
0
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          use_vdl=False,
          losses=None,
          keep_checkpoint_max=5,
          eval_begin_iters=None):
    """
    Launch training.
    Args:
        model(nn.Layer): A matting model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False.
        losses (dict, optional): A dict of loss, refer to the loss function of the model for details. Default: None.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
        eval_begin_iters (int): The iters to begin evaluation. It will evaluate at iters/2 if it is None. Default: None.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Ensure save_dir is a directory (replace a plain file if one exists).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    if use_vdl:
        from visualdl import LogWriter
        log_writer = LogWriter(save_dir)

    # Per-component running losses, keyed by the model's loss dict keys
    # ('all' holds the total used for backprop).
    avg_loss = defaultdict(float)
    iters_per_epoch = len(batch_sampler)
    best_sad = np.inf
    best_model_iter = -1
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    # Rolling list of checkpoint dirs, pruned to keep_checkpoint_max.
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)

            # model input
            if nranks > 1:
                logit_dict = ddp_model(data)
            else:
                logit_dict = model(data)
            loss_dict = model.loss(logit_dict, data, losses)

            # Backprop only the aggregated 'all' loss.
            loss_dict['all'].backward()

            optimizer.step()
            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()

            for key, value in loss_dict.items():
                avg_loss[key] += value.numpy()[0]
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                for key, value in avg_loss.items():
                    avg_loss[key] = value / log_iters
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f}, ips={:.4f} samples/sec | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss['all'], lr, avg_train_batch_cost,
                            avg_train_reader_cost,
                            batch_cost_averager.get_ips_average(), eta))
                # print loss
                loss_str = '[TRAIN] [LOSS] '
                loss_str = loss_str + 'all={:.4f}'.format(avg_loss['all'])
                for key, value in avg_loss.items():
                    if key != 'all':
                        loss_str = loss_str + ' ' + key + '={:.4f}'.format(
                            value)
                logger.info(loss_str)
                if use_vdl:
                    for key, value in avg_loss.items():
                        log_tag = 'Train/' + key
                        log_writer.add_scalar(log_tag, value, iter)

                    log_writer.add_scalar('Train/lr', lr, iter)
                    log_writer.add_scalar('Train/batch_cost',
                                          avg_train_batch_cost, iter)
                    log_writer.add_scalar('Train/reader_cost',
                                          avg_train_reader_cost, iter)

                for key in avg_loss.keys():
                    avg_loss[key] = 0.
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            # save model
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Drop the oldest checkpoint beyond keep_checkpoint_max
                # (a non-positive max disables pruning).
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            # eval model
            # Evaluation starts at the training mid-point when not given.
            if eval_begin_iters is None:
                eval_begin_iters = iters // 2
            if (iter % save_interval == 0 or iter == iters) and (
                    val_dataset is not None
            ) and local_rank == 0 and iter >= eval_begin_iters:
                # NOTE(review): num_workers is recomputed here but evaluate()
                # is called with num_workers=0 — confirm which is intended
                # (the sibling train() above passes the recomputed value).
                num_workers = 1 if num_workers > 0 else 0
                sad, mse = evaluate(model,
                                    val_dataset,
                                    num_workers=0,
                                    print_detail=True,
                                    save_results=False)
                model.train()

            # save best model and add evaluation results to vdl
            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                if val_dataset is not None and iter >= eval_begin_iters:
                    # Lower SAD (sum of absolute differences) is better.
                    if sad < best_sad:
                        best_sad = sad
                        best_model_iter = iter
                        best_model_dir = os.path.join(save_dir, "best_model")
                        paddle.save(
                            model.state_dict(),
                            os.path.join(best_model_dir, 'model.pdparams'))
                    logger.info(
                        '[EVAL] The model with the best validation sad ({:.4f}) was saved at iter {}.'
                        .format(best_sad, best_model_iter))

                    if use_vdl:
                        log_writer.add_scalar('Evaluate/SAD', sad, iter)
                        log_writer.add_scalar('Evaluate/MSE', mse, iter)

            batch_start = time.time()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
Ejemplo n.º 28
0
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          loss_computation=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          keep_checkpoint_max=5):
    """
    Launch training.

    Args:
        model (nn.Layer): A semantic segmentation model.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation
            datasets. NOTE(review): this implementation never runs evaluation; the
            parameter is accepted for interface compatibility only.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        loss_computation (nn.Layer): A loss function.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Make sure save_dir is a directory; a plain file with the same name is removed.
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize the parallel environment if it has not been set up yet,
        # then wrap the model for data-parallel training. (The original code
        # duplicated the DataParallel call in both branches of an if/else.)
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
        ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)

    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    # VisualDL log
    log_writer = LogWriter(save_dir)

    # Running sums over one logging window; divided by log_iters when reported.
    avg_loss = 0.0
    avg_loss_dict = {}
    iters_per_epoch = len(batch_sampler)

    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)
            images = data[0]
            targets = data[1]

            # Forward through the DataParallel wrapper when distributed so
            # gradients are synchronized across ranks.
            if nranks > 1:
                predictions = ddp_model(images)
            else:
                predictions = model(images)

            loss_dict = loss_computation(predictions, targets)
            loss = sum(loss_dict.values())
            loss.backward()

            optimizer.step()
            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()
            avg_loss += loss.numpy()[0]  # get the value
            if len(avg_loss_dict) == 0:
                avg_loss_dict = {k: v.numpy()[0] for k, v in loss_dict.items()}
            else:
                for key, value in loss_dict.items():
                    avg_loss_dict[key] += value.numpy()[0]

            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                for key, value in avg_loss_dict.items():
                    avg_loss_dict[key] /= log_iters

                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))

                ######################### VisualDL Log ##########################
                log_writer.add_scalar('Train/loss', avg_loss, iter)
                # Record all losses if there are more than 2 losses.
                for key, value in avg_loss_dict.items():
                    log_tag = 'Train/' + key
                    log_writer.add_scalar(log_tag, value, iter)

                log_writer.add_scalar('Train/lr', lr, iter)
                log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost,
                                      iter)
                log_writer.add_scalar('Train/reader_cost',
                                      avg_train_reader_cost, iter)
                #################################################################

                # Reset the logging-window accumulators. BUGFIX: the original
                # cleared an unused name `avg_loss_list`, so `avg_loss_dict`
                # kept accumulating across windows and every per-key value
                # after the first log was wrong.
                avg_loss = 0.0
                avg_loss_dict = {}
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Prune the oldest checkpoint once more than
                # keep_checkpoint_max snapshots exist (0 or less disables pruning).
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            batch_start = time.time()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    log_writer.close()