Example #1
def test(db, system_config, model, args):
    split = args.split
    testiter = args.testiter
    debug = args.debug
    suffix = args.suffix

    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)

    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)

    make_dirs([result_dir])

    test_iter = system_config.max_iter if testiter is None else testiter
    print("loading parameters at iteration: {}".format(test_iter))

    print("building neural network...")
    nnet = NetworkFactory(system_config, model, test=True)
    print("loading parameters...")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
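A minimal usage sketch for this entry point, assuming the surrounding repo parses these flags with argparse; the attribute names mirror exactly what the function reads from args above, and db, system_config, and model stand in for objects built elsewhere in the repo:

import argparse

# Hypothetical flag values; split/testiter/debug/suffix mirror the
# attributes that test() reads from args above.
args = argparse.Namespace(split="testing", testiter=500000, debug=False, suffix=None)

# db, system_config, and model would be constructed by the surrounding repo.
test(db, system_config, model, args)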
Example #2
def test(db, system_config, model, args):
    split = args.split
    testiter = args.testiter
    debug = args.debug
    suffix = args.suffix

    # output directory: result_dir + testiter + split
    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)

    # append the suffix
    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)

    # create the directory
    make_dirs([result_dir])

    # set test_iter; fall back to the configured default if none was passed
    test_iter = system_config.max_iter if testiter is None else testiter
    print("loading parameters at iteration: {}".format(test_iter))

    # build the neural network
    print("building neural network...")
    nnet = NetworkFactory(system_config, model)
    print("loading parameters...")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
Example #3
def test(db, system_config, model, args):
    print("\033[0;35;46m" + "{}".format(" ") * 100 + "\033[0m")
    print("\033[0;33m " +
          "现在位置:{}/{}/.{}".format(os.getcwd(), os.path.basename(__file__),
                                  sys._getframe().f_code.co_name) + "\033[0m")
    split = args.split
    testiter = args.testiter
    debug = args.debug
    suffix = args.suffix

    print("\033[0;36m " + "{}".format("测试用的配置参数") + "\033[0m")
    print("\033[1;36m " + "split: {}, testiter:{}, debug:{}, suffix:{}".format(
        split, testiter, debug, suffix) + "\033[0m")
    # output directory: result_dir + testiter + split
    result_dir = system_config.result_dir
    result_dir = os.path.join(result_dir, str(testiter), split)

    # append the suffix
    if suffix is not None:
        result_dir = os.path.join(result_dir, suffix)

    # create the directory
    make_dirs([result_dir])
    print("\033[1;36m " + "result_dir:{}".format(result_dir) + "\033[0m")

    # set test_iter; fall back to the configured default if none was passed
    test_iter = system_config.max_iter if testiter is None else testiter
    print("\033[1;36m " +
          "loading parameters at iteration(在迭代次数为test_iter的缓存文件中加载参数): " +
          "\033[0m" + "{}".format(test_iter))

    # build the neural network
    print("\033[0;36m " + "building neural network..." + "\033[0m")
    nnet = NetworkFactory(system_config, model)
    print("\033[0;36m " + "{}".format("loading parameters(加载参数)...") +
          "\033[0m")
    nnet.load_params(test_iter)

    nnet.cuda()
    nnet.eval_mode()
    test_func(system_config, db, nnet, result_dir, debug=debug)
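The \033[...m sequences in this variant are ANSI SGR color codes: 0;33 selects yellow text, 1;36 bright cyan, 0;35;46 magenta text on a cyan background, and \033[0m resets the style. A minimal standalone sketch of the same logging style:

# ANSI SGR escape sequences, as used in the example above.
YELLOW = "\033[0;33m"
BRIGHT_CYAN = "\033[1;36m"
RESET = "\033[0m"

print(YELLOW + "a yellow status line" + RESET)
print(BRIGHT_CYAN + "a bright cyan status line" + RESET)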
Example #4
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            if display and iteration % display == 0:
                print("Process {}: training loss at iteration {}: {}".format(
                    rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("Process {}: validation loss at iteration {}: {}".format(
                    rank, iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
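Every training variant relies on a pin_memory worker thread that moves batches from the multiprocessing queue into a pinned-memory queue, so host-to-GPU copies can be asynchronous. The repo's own implementation is not shown in these examples; a minimal sketch of what it could look like, assuming each batch is a dict holding tensor lists under "xs" and "ys":

def pin_memory(data_queue, pinned_data_queue, semaphore):
    # Drain the multiprocessing queue, pin each tensor, and hand the batch
    # to the consumer queue; exit once the main thread releases the semaphore.
    while True:
        data = data_queue.get()
        data["xs"] = [x.pin_memory() for x in data["xs"]]
        data["ys"] = [y.pin_memory() for y in data["ys"]]
        pinned_data_queue.put(data)
        if semaphore.acquire(blocking=False):
            return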
Example #5
def train(training_dbs, validation_db, system_config, model, args):
    print("\033[0;33m " +
          "现在位置:{}/{}/.{}".format(os.getcwd(), os.path.basename(__file__),
                                  sys._getframe().f_code.co_name) + "\033[0m")

    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("\033[1;36m " + "Process {}: building model(生成模型中)...".format(rank) +
          "\033[0m")
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)

    if initialize:
        nnet.save_params(0)
        exit(0)

    # open four queues to hold the data
    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # check whether a pretrained model was provided
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    # check whether training resumes from a given iteration
    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # train the model
    if rank == 0:
        print("\033[1;36m " + "training start..." + "\033[0m")

    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        # tqdm is a fast, extensible Python progress bar for long-running loops
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            # if a display interval is set, print the training loss at every multiple of it
            if display and iteration % display == 0:
                print(
                    "\033[1;36m " +
                    "Process {}: training loss at iteration [{}]: ".format(
                        rank, iteration) + "\033[0m" +
                    "{}".format(training_loss.item()))
            del training_loss

            # if a validation interval is set, run a validation pass here
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\033[1;33m " + "Process {}:".format(rank) + "\033[0m" +
                      "\033[1;36m " +
                      "validation loss at iteration {}:".format(iteration) +
                      "\033[0m" + "{}".format(validation_loss.item()))
                nnet.train_mode()

            # snapshot interval
            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            # learning-rate update interval
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                print("\033[1;35m " + "此时学习率更新为:" + "\033[0m" +
                      "{}".format(learning_rate))
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
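stdout_to_tqdm is used in every training variant but never defined in them. A plausible sketch (an assumption, not necessarily the repo's version): a context manager that routes print() through tqdm.write so log lines do not break the progress bar drawn below.

import contextlib
import sys
from tqdm import tqdm

class TqdmFile:
    # File-like wrapper that forwards writes to tqdm.write.
    def __init__(self, file):
        self.file = file

    def write(self, text):
        if text.strip():
            tqdm.write(text, file=self.file)

    def flush(self):
        self.file.flush()

@contextlib.contextmanager
def stdout_to_tqdm():
    save_stdout = sys.stdout
    try:
        sys.stdout = TqdmFile(sys.stdout)
        yield save_stdout
    finally:
        sys.stdout = save_stdout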
Example #6
def train(train_logger, training_dbs, validation_db, system_config, model,
          args):
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate
    stepsize = system_config.stepsize

    train_logger.train_logging("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(train_logger, system_config,
                                        training_dbs, training_queue,
                                        data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(train_logger, system_config,
                                              [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        train_logger.train_logging(
            "Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        train_logger.train_logging(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        train_logger.train_logging("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            train_logger.tb_logging('Train/loss',
                                    {'tloss': training_loss.item()}, iteration)

            if display and iteration % display == 0:
                train_logger.train_logging(
                    "Process {}: training loss at iteration {}: {}".format(
                        rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                # calculate validation loss
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                train_logger.train_logging(
                    "Process {}: validation loss at iteration {}: {}".format(
                        rank, iteration, validation_loss.item()))
                train_logger.tb_logging('Val/loss',
                                        {'vloss': validation_loss.item()},
                                        iteration)
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.eval_mode()
                # calculate validation mAP
                val_split = system_config.val_split
                mAP, _, detect_average_time = test(validation_db,
                                                   system_config,
                                                   nnet,
                                                   val_iter,
                                                   val_split,
                                                   debug=True)
                train_logger.train_logging(
                    "Process {}: mAP at iteration {}: {}".format(
                        rank, iteration, mAP))
                train_logger.train_logging(
                    "Detect average time: {}".format(detect_average_time))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            # dc = 0
            # handle = nvmlDeviceGetHandleByIndex(dc)
            # res = nvmlDeviceGetUtilizationRates(handle)
            # gpu_util = res.gpu
            # res = nvmlDeviceGetMemoryInfo(handle)
            # gpu_mem = res.used / 1024 / 1024
            # train_logger.tb_logging('data/NV', {'gpu-util': gpu_util, 'gpu-mem': gpu_mem}, iteration)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
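All of the training variants reconstruct the decayed learning rate in closed form when resuming, rather than replaying the schedule step by step. A small worked example with illustrative numbers (not taken from any config shown here):

# Illustrative values: base LR 2.5e-4, decayed by 10x every 250k iterations.
learning_rate = 2.5e-4
decay_rate = 10
stepsize = 250000
start_iter = 500000

# Two full decay steps have elapsed by iteration 500k, so divide by 10**2.
resumed_lr = learning_rate / (decay_rate ** (start_iter // stepsize))
print(resumed_lr)  # 2.5e-06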
Example #7
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    initialize = args.initialize
    gpu = args.gpu

    # reading arguments from json file
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("building model...")
    nnet = NetworkFactory(system_config, model, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    # parallel read train data to queue
    # each worker gets its own training_db; spawn one data-reading process per worker
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    # set up semaphores; a thread moves data from training_queue into pinned_training_queue
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        learning_rate = max(1e-4, learning_rate)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start, max iteration {}".format(max_iteration))

    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(training["xs"], training["ys"])

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))

            print("[log-loss]:{}={}".format(iteration, training_loss.item()))
            writer.add_scalar('train_loss',
                              training_loss,
                              global_step=iteration)

            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(validation["xs"],
                                                validation["ys"])
                print("[log-validation-loss]:{}={}".format(
                    iteration, validation_loss.item()))
                writer.add_scalar('validation_loss',
                                  validation_loss,
                                  global_step=iteration)

                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                learning_rate = max(5e-5, learning_rate)
                nnet.set_lr(learning_rate)
                print("set learning rate {}".format(learning_rate))

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)

    writer.close()
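This last variant logs to a module-level writer that is never created inside the function; presumably it is a TensorBoard SummaryWriter set up at import time. A hedged sketch of that setup (the log directory is illustrative):

# Assumed module-level setup for the writer used above.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./logs")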