Example No. 1
0
def train(args):
    """Run the main training loop.

    Configures single- or multi-GPU execution, builds the task/model pair
    and the train/valid data generators, then iterates training steps with
    periodic logging, validation, and checkpointing.
    """
    if args.is_distributed:
        # Collective (multi-GPU) setup via PaddlePaddle Fleet.
        fleet.init(role_maker.PaddleCloudRoleMaker(is_collective=True))
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num, trainer_id = fleet.worker_num(), fleet.worker_index()
    else:
        # Single-process fallback: one device, one trainer.
        dev_count, gpu_id = 1, 0
        trainers_num, trainer_id = 1, 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # Training data is partitioned across trainers; validation across devices.
    train_generator = task.reader.data_generator(
        input_file=args.train_file,
        num_epochs=args.num_epochs,
        num_part=trainers_num,
        part_id=trainer_id,
        phase="train")
    valid_generator = task.reader.data_generator(
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # Main loop: the timer brackets each train_step; pass_time accumulates
    # across steps until reset at each log report.
    step_timer = Timer()
    for step, data in enumerate(train_generator(), 1):
        step_timer.start()
        metrics = task.train_step(model, data)
        step_timer.pause()

        if step % args.log_steps == 0:
            time_cost = step_timer.pass_time
            current_epoch, current_file_index, total_file = \
                task.reader.get_train_progress()
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print("\tcurrent lr:", metrics.pop('scheduled_lr'))
            print("\t" + task.show_metrics(metrics))
            step_timer.reset()

        if step % args.validation_steps == 0:
            evaluate(task, model, valid_generator, args, dev_count, gpu_id)

        if step % args.save_steps == 0:
            model.save(f"{args.save_path}/step_{step}", is_checkpoint=True)
Example No. 2
0
def train_epoch(ep, model_object, model, train_loader, test_loader=None):
    """Train `model` for one epoch over `train_loader`.

    Args:
        ep: Current epoch index (for logging; also forwarded to `predict`).
        model_object: Container holding the optimizer, criterion, training
            metrics, and logging config (`log_path`, `print_freq`, `verbose`).
        model: The network being trained (put into train mode here).
        train_loader: Iterable of training batches.
        test_loader: Optional validation loader; when given, validation runs
            every `print_freq` batches with the training timer paused.

    Side effects: updates model weights, accumulates running loss/metrics,
    and appends status lines to the log file when logging is enabled.
    """
    model.train()
    train_timer = Timer('M')
    model_object.criterion.train()
    model_object.train_metrics.reset()

    # Counter starts at 1 so the first report fires on batch `print_freq`,
    # matching the original post-increment behavior.
    for print_counter, datum in enumerate(train_loader, 1):

        model_object.optimizer.zero_grad()
        loss, prob, tgt, _ = predict(ep, model_object, model, datum)
        loss.backward()

        # Clip gradients in place; the returned total norm is not needed.
        clip_grad_norm_(model.parameters(), 10)
        model_object.train_metrics.collect(y_prob=prob, y_true=tgt)

        # update network
        model_object.optimizer.step()

        if print_counter % model_object.print_freq == 0 and model_object.log_path is not None:
            train_auc = model_object.train_metrics.roc_auc()
            train_ap = model_object.train_metrics.average_precision()
            status = '[epoch {}], train batch {} - batch loss: {:.5f}, running train auc: {:.5f}, running train ap: {:.5f} - time taken: {:.2f} mins'
            status = status.format(ep, model_object.criterion.n_batch_train,
                                   model_object.criterion.get_running_loss(),
                                   train_auc, train_ap, train_timer.time())
            # Log only when a path is set (truthy) and verbosity is enabled.
            if model_object.log_path and model_object.verbose > 0:
                log_status(model_object.log_path, status, init=False)

            # Evaluate on the validation set, keeping validation time out of
            # the training timer; valid_epoch may flip eval mode, so restore
            # train mode afterwards.
            if test_loader is not None:
                train_timer.pause()
                valid_epoch(ep, model_object, model, test_loader)
                train_timer.resume()
                model.train()

    # Epoch summary: running loss plus epoch-level metrics.
    train_loss = model_object.criterion.get_running_loss()
    train_auc = model_object.train_metrics.roc_auc()
    train_acc = model_object.train_metrics.accuracy()
    train_ap = model_object.train_metrics.average_precision()

    if model_object.log_path:
        status = '[epoch {}], train loss: {:.5f}, train acc: {:.5f}, train auc: {:.5f}, train ap: {:.5f}, time taken: {:.2f} mins'
        status = status.format(ep, train_loss, train_acc, train_auc, train_ap,
                               train_timer.time_since_start())
        if model_object.verbose > 0:
            log_status(model_object.log_path, status, init=False)

    # Reset running criterion losses and timer for the next epoch.
    model_object.criterion.reset_train()
    train_timer.reset()
Example No. 3
0
def train(args):
    """
    Train main function.

    Sets up single- or multi-GPU execution, builds the task/model and the
    train/valid data loaders, then runs the training loop with periodic
    logging, validation, and best-metric checkpointing: NSP accuracy is
    maximized for 'NSPModel', validation loss is minimized for 'Plato'.
    """
    if args.is_distributed:
        # Multi-GPU collective training via PaddlePaddle Fleet.
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
    else:
        # Single-process fallback: one device, one trainer.
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    # Training data is partitioned across trainers; validation across devices.
    train_generator = task.get_data_loader(model,
                                           input_file=args.train_file,
                                           num_epochs=args.num_epochs,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase="train")
    valid_generator = task.get_data_loader(
        model,
        input_file=args.valid_file,
        num_part=dev_count,
        part_id=gpu_id,
        phase="distributed_valid" if args.is_distributed else "valid")

    # run training
    timer = Timer()
    timer.start()
    # Best-metric baseline: NSPModel maximizes accuracy (start at 0.0);
    # otherwise loss is minimized (start at a large sentinel).
    if args.Model.model == 'NSPModel':
        best_metrics = 0.0
    else:
        best_metrics = 10000
    # Presumably shuffles the training data files before the epoch —
    # TODO confirm (defined elsewhere in this module).
    shuffledatafile()
    # Start counting from args.start_step + 1 to support resuming a run.
    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        # Timer is paused here and restarted at the bottom of the loop, so
        # logging/validation/saving time is excluded from pass_time;
        # pass_time accumulates across steps until reset at each log report.
        timer.pause()
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress(
            )
            print(
                f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                f"step: {step}, time: {time_cost:.3f}, "
                f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {outputs.pop('scheduled_lr'):.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}"
                                   for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:

            # shuffledatafile()
            metrics = evaluate(task, model, valid_generator, args, dev_count,
                               gpu_id, step)
            # Checkpoint only on improvement; the metric value is embedded in
            # the save path. NOTE(review): models other than 'NSPModel' and
            # 'Plato' are never checkpointed here — confirm that is intended.
            if args.Model.model == 'NSPModel' and metrics[
                    'nsp_acc'] > best_metrics:
                best_metrics = metrics['nsp_acc']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)

            elif args.Model.model == 'Plato' and metrics['loss'] < best_metrics:
                best_metrics = metrics['loss']
                save_path = f"{args.save_path}/step_{step}_{best_metrics}"
                model.save(save_path, is_checkpoint=True)
        # if step % args.save_steps == 0 and trainer_id == 0:
        #     save_path = f"{args.save_path}/step_{step}"
        #     model.save(save_path, is_checkpoint=True)
        #     with open(save_path + ".finish", "w") as f:
        #         pass

        timer.start()