Example #1
def train(parameters, config, gpu_list, do_test=False, local_rank=-1):
    epoch = config.getint("train", "epoch")
    batch_size = config.getint("train", "batch_size")

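    # How often to print progress (every `output_time` steps) and run validation (every `test_time` epochs)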
    output_time = config.getint("output", "output_time")
    test_time = config.getint("output", "test_time")

    output_path = os.path.join(config.get("output", "model_path"), config.get("output", "model_name"))
    if os.path.exists(output_path):
        logger.warning("Output path exists, check whether need to change a name of model")
    os.makedirs(output_path, exist_ok=True)

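    # State prepared by the caller: model, optimizer, data, and bookkeeping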
    trained_epoch = parameters["trained_epoch"] + 1
    model = parameters["model"]
    optimizer = parameters["optimizer"]
    dataset = parameters["train_dataset"]
    global_step = parameters["global_step"]
    output_function = parameters["output_function"]

    if do_test:
        init_formatter(config, ["test"])
        test_dataset = init_test_dataset(config)

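    # When training from scratch, clear any stale TensorBoard logs for this model name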
    if trained_epoch == 0:
        shutil.rmtree(
            os.path.join(config.get("output", "tensorboard_path"), config.get("output", "model_name")), True)

    os.makedirs(os.path.join(config.get("output", "tensorboard_path"), config.get("output", "model_name")),
                exist_ok=True)

    writer = SummaryWriter(os.path.join(config.get("output", "tensorboard_path"), config.get("output", "model_name")),
                           config.get("output", "model_name"))

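    # Step-decay learning-rate schedule, advanced to the epoch being resumed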
    step_size = config.getint("train", "step_size")
    gamma = config.getfloat("train", "lr_multiplier")
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
    exp_lr_scheduler.step(trained_epoch)

    logger.info("Training start....")

    print("Epoch  Stage  Iterations  Time Usage    Loss    Output Information")

    total_len = len(dataset)
    more = ""
    if total_len < 10000:
        more = "\t"
    for epoch_num in range(trained_epoch, epoch):
        start_time = timer()
        current_epoch = epoch_num

        exp_lr_scheduler.step(current_epoch)

        acc_result = None
        total_loss = 0

        output_info = ""
        step = -1
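        # Iterate over training batches, moving tensors to the GPU when one is available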
        for step, data in enumerate(dataset):
            for key in data.keys():
                if isinstance(data[key], torch.Tensor):
                    if len(gpu_list) > 0:
                        data[key] = Variable(data[key].cuda())
                    else:
                        data[key] = Variable(data[key])

            optimizer.zero_grad()

            results = model(data, config, gpu_list, acc_result, "train")

            loss, acc_result = results["loss"], results["acc_result"]
            total_loss += float(loss)

            loss.backward()
            optimizer.step()

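            # Periodic progress report, printed only by the main process when training is distributed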
            if step % output_time == 0 and local_rank <= 0:
                output_info = output_function(acc_result, config)

                delta_t = timer() - start_time

                output_value(current_epoch, "train", "%d/%d" % (step + 1, total_len), "%s/%s" % (
                    gen_time_str(delta_t), gen_time_str(delta_t * (total_len - step - 1) / (step + 1))),
                             "%.3lf" % (total_loss / (step + 1)), output_info, '\r', config)

            global_step += 1
            writer.add_scalar(config.get("output", "model_name") + "_train_iter", float(loss), global_step)
            # break

        if local_rank <= 0:
            output_info = output_function(acc_result, config)
            delta_t = timer() - start_time
            output_value(current_epoch, "train", "%d/%d" % (step + 1, total_len), "%s/%s" % (
                gen_time_str(delta_t), gen_time_str(delta_t * (total_len - step - 1) / (step + 1))),
                        "%.3lf" % (total_loss / (step + 1)), output_info, None, config)

        if step == -1:
            logger.error("There is no data given to the model in this epoch, check your data.")
            raise NotImplementedError

        if local_rank <= 0:
            checkpoint(os.path.join(output_path, "%d.pkl" % current_epoch), model, optimizer, current_epoch, config,
                    global_step)
            writer.add_scalar(config.get("output", "model_name") + "_train_epoch", float(total_loss) / (step + 1),
                            current_epoch)

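        # Periodic validation (and optional test pass) with gradients disabled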
        if current_epoch % test_time == 0:
            with torch.no_grad():
                valid(model, parameters["valid_dataset"], current_epoch, writer, config, gpu_list, output_function)
                if do_test:
                    valid(model, test_dataset, current_epoch, writer, config, gpu_list, output_function, mode="test")
        if local_rank >= 0:
            torch.distributed.barrier()
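
Example #1 expects a configparser-style `config` object plus a `parameters` dict holding the model, optimizer, train_dataset, trained_epoch, global_step and output_function it unpacks above. Below is a minimal sketch of the configuration it reads; the section and option names are taken from the get/getint/getfloat calls in Example #1, while every value is an assumption chosen only for illustration.

import configparser

# Illustrative values only; the section/option names mirror the calls in Example #1.
config = configparser.ConfigParser()
config.read_dict({
    "train": {
        "epoch": "16",            # total number of epochs to train
        "batch_size": "32",
        "step_size": "1",         # StepLR: decay the learning rate every `step_size` epochs
        "lr_multiplier": "0.95",  # StepLR gamma
    },
    "output": {
        "output_time": "100",     # print a progress line every 100 steps
        "test_time": "1",         # run validation every epoch
        "model_path": "checkpoints",
        "model_name": "my_model",
        "tensorboard_path": "runs",
    },
})
assert config.getint("train", "epoch") == 16
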
Example #2
def train(parameters, config, gpu_list):
    epoch = config.getint("train", "epoch")
    batch_size = config.getint("train", "batch_size")

    output_time = config.getint("output", "output_time")
    test_time = config.getint("output", "test_time")

    output_path = os.path.join(config.get("output", "model_path"),
                               config.get("output", "model_name"))
    if os.path.exists(output_path):
        logger.warning(
            "Output path exists, check whether you need to change the model name")
    os.makedirs(output_path, exist_ok=True)

    trained_epoch = parameters["trained_epoch"]
    model = parameters["model"]
    optimizer = parameters["optimizer"]
    dataset = parameters["train_dataset"]
    global_step = parameters["global_step"]
    output_function = parameters["output_function"]

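    # Optional mixed-precision training via NVIDIA Apex (O2 optimization level)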
    if config.getboolean('train', 'fp16'):
        opt_level = 'O2'
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=opt_level)

    step_size = config.getint("train", "step_size")
    gamma = config.getfloat("train", "lr_multiplier")
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=step_size,
                                           gamma=gamma)
    exp_lr_scheduler.step(trained_epoch)

    logger.info("Training start....")

    print("Epoch  Stage  Iterations  Time Usage    Loss    Output Information")

    total_len = len(dataset)
    more = ""
    if total_len < 10000:
        more = "\t"
    for epoch_num in range(trained_epoch, epoch):
        start_time = timer()
        current_epoch = epoch_num

        exp_lr_scheduler.step(current_epoch)

        acc_result = None
        total_loss = 0

        output_info = ""
        step = -1

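        # Gradient accumulation window: number of batches per optimizer step (defaults to 1)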
        try:
            train_steps = config.getint('train', 'train_steps')
        except Exception:
            train_steps = 1

        for step, data in enumerate(dataset):
            for key in data.keys():
                if isinstance(data[key], torch.Tensor):
                    if len(gpu_list) > 0:
                        data[key] = Variable(data[key].cuda())
                    else:
                        data[key] = Variable(data[key])

            data['epoch'] = epoch_num
            results = model(data, config, gpu_list, acc_result, "train")

            loss, acc_result = results["loss"], results["acc_result"]
            loss = loss.mean()
            total_loss += float(loss)
            loss = loss / train_steps
            if config.getboolean('train', 'fp16'):
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

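            # Clip gradients and apply the optimizer once every `train_steps` batches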
            if step % train_steps == 0:
                if config.getboolean('train', 'fp16'):
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

            if step % output_time == 0:
                output_info = output_function(acc_result, config, mode='train')

                delta_t = timer() - start_time

                output_value(
                    current_epoch, "train", "%d/%d" % (step + 1, total_len),
                    "%s/%s" % (gen_time_str(delta_t),
                               gen_time_str(delta_t * (total_len - step - 1) /
                                            (step + 1))),
                    "%.3lf" % (total_loss / (step + 1)), output_info, '\r',
                    config)

            global_step += 1

        delta_t = timer() - start_time
        output_value(
            current_epoch, "train", "%d/%d" % (step + 1, total_len), "%s/%s" %
            (gen_time_str(delta_t),
             gen_time_str(delta_t * (total_len - step - 1) / (step + 1))),
            "%.3lf" % (total_loss / (step + 1)), output_info, None, config)

        if step == -1:
            logger.error(
                "There is no data given to the model in this epoch, check your data."
            )
            raise NotImplementedError
        if config.getboolean('train', 'pre_train'):
            save_path = os.path.join(output_path, 'epoch_%d' % current_epoch)
            os.makedirs(save_path, exist_ok=True)
            model.save_pretrained(save_path)
        #else:
        checkpoint(os.path.join(output_path, "%d.pkl" % current_epoch), model,
                   optimizer, current_epoch, config, global_step)

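        # Validate at the configured interval unless validation is disabled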
        if not config.getboolean('train', 'no_valid'):
            if current_epoch % test_time == 0:
                with torch.no_grad():
                    valid(model, parameters["valid_dataset"], current_epoch,
                          config, gpu_list, output_function)
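
Example #2 adds gradient accumulation (and optional Apex mixed precision): each batch's loss is divided by `train_steps`, and the optimizer only steps, with gradient clipping, once every `train_steps` batches. The following is a minimal, self-contained sketch of that accumulation pattern; the toy model, data, and `accum_steps` value are assumptions for illustration, not part of the original code.

import torch
from torch import nn

model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_steps = 4  # plays the role of `train_steps` above

optimizer.zero_grad()
for step in range(16):
    x = torch.randn(32, 8)
    y = torch.randn(32, 1)
    loss = nn.functional.mse_loss(model(x), y)
    (loss / accum_steps).backward()  # scale so the summed gradient matches one large batch
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()

Note that Example #2 steps whenever `step % train_steps == 0`, which also fires on the very first batch; the `(step + 1) % accum_steps` form above waits for a full accumulation window before stepping.
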
Example #3
def train(parameters, config, gpu_list):
    epoch = config.getint("train", "epoch")
    batch_size = config.getint("train", "batch_size")

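    # Optional file for per-epoch IoU metrics; any existing file is removed first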
    output_file = config.get("output", "output_file") if config.get(
        "output", "output_file") != "None" else None
    if output_file is not None:
        if os.path.exists(output_file):
            os.remove(output_file)

    output_time = config.getint("output", "output_time")
    test_time = config.getint("output", "test_time")

    output_path = os.path.join(config.get("output", "model_path"),
                               config.get("output", "model_name"))
    if os.path.exists(output_path):
        logger.warning(
            "Output path exists, check whether you need to change the model name")
    os.makedirs(output_path, exist_ok=True)

    trained_epoch = parameters["trained_epoch"]
    model = parameters["model"]
    optimizer = parameters["optimizer"]
    dataset = parameters["train_dataset"]
    global_step = parameters["global_step"]
    output_function = parameters["output_function"]

    step_size = config.getint("train", "step_size")
    gamma = config.getfloat("train", "lr_multiplier")
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer,
                                           step_size=step_size,
                                           gamma=gamma)
    exp_lr_scheduler.step(trained_epoch)

    logger.info("Training start....")

    if config.get("model", "model_name") in ["Pipeline"]:
        print(
            "Epoch  Stage  Iterations  Time Usage    Loss   \tOutput Information"
        )
    else:
        print(
            "Epoch  Stage  Iterations  Time Usage    Loss    Output Information"
        )

    total_len = len(dataset)
    more = ""
    if total_len < 10000:
        more = "\t"
    for epoch_num in range(trained_epoch, epoch):
        start_time = timer()
        current_epoch = epoch_num

        exp_lr_scheduler.step(current_epoch)

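        # This variant runs a two-stage pipeline, so accuracy and loss can be per-stage lists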
        acc_result = [None, None]
        total_loss = None

        output_info = ""
        step = -1
        for step, data in enumerate(dataset):
            for key in data.keys():
                if isinstance(data[key], torch.Tensor):
                    if len(gpu_list) > 0:
                        data[key] = Variable(data[key].cuda())
                    else:
                        data[key] = Variable(data[key])

            optimizer.zero_grad()

            results = model(data, config, gpu_list, acc_result, "train")

            loss, acc_result = results["loss"], results["acc_result"]

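            # A two-stage model returns a list of losses; back-propagate and accumulate each separately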
            if isinstance(loss, list):
                loss[0].backward()
                loss[1].backward()
                if total_loss is None:
                    total_loss = [0, 0]
                total_loss[0] += float(loss[0])
                total_loss[1] += float(loss[1])
            else:
                loss.backward()
                if total_loss is None:
                    total_loss = 0
                total_loss += float(loss)
            optimizer.step()

            if step % output_time == 0:
                output_stage_1 = output_function(acc_result[0], config)
                output_stage_2 = output_function(acc_result[1], config)

                delta_t = timer() - start_time

                if isinstance(loss, list):
                    output_value(
                        current_epoch, "train",
                        "%d/%d" % (step + 1, total_len), "%s/%s" %
                        (gen_time_str(delta_t),
                         gen_time_str(delta_t * (total_len - step - 1) /
                                      (step + 1))), "%.3lf, %.3lf\t" %
                        (total_loss[0] / (step + 1), total_loss[1] /
                         (step + 1)), output_stage_1 + output_stage_2, '\r',
                        config)
                else:
                    output_value(
                        current_epoch, "train",
                        "%d/%d" % (step + 1, total_len), "%s/%s" %
                        (gen_time_str(delta_t),
                         gen_time_str(delta_t * (total_len - step - 1) /
                                      (step + 1))),
                        "%.3lf" % (total_loss / (step + 1)),
                        output_stage_1 + output_stage_2, '\r', config)

            global_step += 1

        delta_t = timer() - start_time
        if isinstance(loss, list):
            output_value(
                current_epoch, "train", "%d/%d" % (step + 1, total_len),
                "%s/%s" % (gen_time_str(delta_t),
                           gen_time_str(delta_t * (total_len - step - 1) /
                                        (step + 1))), "%.3lf, %.3lf\t" %
                (total_loss[0] / (step + 1), total_loss[1] / (step + 1)),
                output_stage_1 + output_stage_2, None, config)
        else:
            output_value(
                current_epoch, "train", "%d/%d" % (step + 1, total_len),
                "%s/%s" % (gen_time_str(delta_t),
                           gen_time_str(delta_t * (total_len - step - 1) /
                                        (step + 1))),
                "%.3lf" % (total_loss / (step + 1)), output_info, None, config)

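        # Append the stage-two IoU summary for this epoch to the output file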
        if output_file is not None:
            f = open(output_file, "a")
            f.write("epoch = %d" % epoch_num +
                    json.dumps(print_IoU(acc_result[1])) + "\n")
            f.close()

        if step == -1:
            logger.error(
                "There is no data given to the model in this epoch, check your data."
            )
            raise NotImplementedError

        checkpoint(os.path.join(output_path, "%d.pkl" % current_epoch), model,
                   optimizer, current_epoch, config, global_step)

        if current_epoch % test_time == 0:
            with torch.no_grad():
                valid(model, parameters["valid_dataset"], current_epoch, None,
                      config, gpu_list, output_function)