Example #1
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        print("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            # training = pinned_training_queue.get(block=True)
            # training_loss = nnet.train(**training)
            #
            # if display and iteration % display == 0:
            #     print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
            # del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("Process {}: validation loss at iteration {}: {}".format(
                    rank, iteration, validation_loss.item()))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
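
All five examples hand pin_memory to a background thread but never define it. The following is a minimal sketch inferred from how the function is called here (raw queue in, pinned queue out, semaphore as the shutdown signal); the batch layout with "xs"/"ys" lists of tensors is an assumption borrowed from Example #3, not the authors' actual implementation.

def pin_memory(data_queue, pinned_data_queue, semaphore):
    while True:
        data = data_queue.get()

        # pin each tensor so later host-to-GPU copies can be asynchronous
        # (assumed batch layout: dict with "xs" and "ys" lists of tensors)
        data["xs"] = [x.pin_memory() for x in data["xs"]]
        data["ys"] = [y.pin_memory() for y in data["ys"]]

        pinned_data_queue.put(data)

        # a released semaphore is the shutdown signal from the main process
        if semaphore.acquire(blocking=False):
            return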
Example #2
def train(training_dbs, validation_db, system_config, model, args):
    print("\033[0;33m " +
          "现在位置:{}/{}/.{}".format(os.getcwd(), os.path.basename(__file__),
                                  sys._getframe().f_code.co_name) + "\033[0m")

    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("\033[1;36m " + "Process {}: building model(生成模型中)...".format(rank) +
          "\033[0m")
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)

    if initialize:
        nnet.save_params(0)
        exit(0)

    # four queues in total are used to buffer data (raw and pinned, for training and validation)
    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    # check whether a pretrained model is provided
    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    # check whether training resumes from an earlier iteration
    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        print(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    # train the model
    if rank == 0:
        print("\033[1;36m " + "training start(训练开始)...".format(rank) +
              "\033[0m")

    nnet.cuda()
    nnet.train_mode()

    #
    with stdout_to_tqdm() as save_stdout:
        # tqdm is a fast, extensible progress bar that adds a progress indicator to long Python loops
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            # if a display interval is set, print the training loss at multiples of that interval
            if display and iteration % display == 0:
                print(
                    "\033[1;36m " +
                    "Process(进程){}: iteration(迭代数) [{}]时的training loss(损失函数值):"
                    .format(rank, iteration) + "\033[0m" +
                    "{}".format(training_loss.item()))
            del training_loss

            # if a validation interval is set, evaluate on the validation set
            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                print("\033[1;33m " + "Process {}:".format(rank) + "\033[0m" +
                      "\033[1;36m " +
                      "validation loss at iteration {}:".format(iteration) +
                      "\033[0m" + "{}".format(validation_loss.item()))
                nnet.train_mode()

            # snapshot interval: save model parameters
            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            # learning-rate decay interval
            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                print("\033[1;35m " + "此时学习率更新为:" + "\033[0m" +
                      "{}".format(learning_rate))
                nnet.set_lr(learning_rate)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
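
init_parallel_jobs and terminate_tasks are likewise used but not shown. A sketch consistent with the call sites above (one reader process per db, a shared queue, a sampling function, and a data-augmentation flag) might look like the following; the prefetch_data helper, the exact sampling signature, and the choice of torch.multiprocessing are assumptions.

from torch.multiprocessing import Process


def prefetch_data(system_config, db, queue, sample_data, data_aug):
    # keep drawing samples from this db and pushing them into the shared queue
    ind = 0
    while True:
        data, ind = sample_data(system_config, db, ind, data_aug=data_aug)
        queue.put(data)


def init_parallel_jobs(system_config, dbs, queue, fn, data_aug):
    # one daemonized reader process per db so the readers die with the parent
    tasks = [Process(target=prefetch_data,
                     args=(system_config, db, queue, fn, data_aug)) for db in dbs]
    for task in tasks:
        task.daemon = True
        task.start()
    return tasks


def terminate_tasks(tasks):
    for task in tasks:
        task.terminate()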
Example #3
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    initialize = args.initialize
    gpu = args.gpu

    # reading arguments from json file
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("building model...")
    nnet = NetworkFactory(system_config, model, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    # read training data into the queue in parallel:
    # each worker is assigned one training_db, so one reader process is spawned per worker
    training_tasks = init_parallel_jobs(system_config, training_dbs,
                                        training_queue, data_sampling_func,
                                        True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    # set up semaphores; the pinning threads move data from training_queue into pinned_training_queue
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        learning_rate = max(1e-4, learning_rate)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(
            start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    print("training start, max iteration {}".format(max_iteration))

    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(training["xs"], training["ys"])

            if display and iteration % display == 0:
                print("training loss at iteration {}: {}".format(
                    iteration, training_loss.item()))


#                 writer.add_scalar('training_loss', training_loss, global_step=iteration)

            print("[log-loss]:{}={}".format(iteration, training_loss.item()))
            writer.add_scalar('train_loss',
                              training_loss,
                              global_step=iteration)

            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(validation["xs"],
                                                validation["ys"])
                print("[log-validation-loss]:{}={}".format(
                    iteration, validation_loss.item()))
                writer.add_scalar('validation_loss',
                                  validation_loss,
                                  global_step=iteration)

                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                learning_rate = max(5e-5, learning_rate)
                nnet.set_lr(learning_rate)
                print("set learning rate {}".format(learning_rate))

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)

    writer.close()
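
Example #3 logs scalars to a module-level writer and closes it at the end, but its construction is outside the snippet. Presumably it is a TensorBoard SummaryWriter; the log directory below is a placeholder.

from torch.utils.tensorboard import SummaryWriter

# only add_scalar() and close() are used above; the log_dir is a placeholder
writer = SummaryWriter(log_dir="./logs")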
Example #4
def train(train_logger, training_dbs, validation_db, system_config, model,
          args):
    # reading arguments from command
    start_iter = args.start_iter
    distributed = args.distributed
    world_size = args.world_size
    initialize = args.initialize
    gpu = args.gpu
    rank = args.rank

    # reading arguments from json file
    batch_size = system_config.batch_size
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate
    stepsize = system_config.stepsize

    train_logger.train_logging("Process {}: building model...".format(rank))
    nnet = NetworkFactory(system_config,
                          model,
                          distributed=distributed,
                          gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading
    training_tasks = init_parallel_jobs(train_logger, system_config,
                                        training_dbs, training_queue,
                                        data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(train_logger, system_config,
                                              [validation_db],
                                              validation_queue,
                                              data_sampling_func, False)

    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue,
                         training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory,
                                           args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue,
                           validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory,
                                             args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        train_logger.train_logging(
            "Process {}: loading from pretrained model".format(rank))
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate**(start_iter // stepsize))
        nnet.set_lr(learning_rate)
        train_logger.train_logging(
            "Process {}: training starts from iteration {} with learning_rate {}"
            .format(rank, start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)

    if rank == 0:
        train_logger.train_logging("training start...")
    nnet.cuda()
    nnet.train_mode()
    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1),
                              file=save_stdout,
                              ncols=80):
            training = pinned_training_queue.get(block=True)
            training_loss = nnet.train(**training)

            train_logger.tb_logging('Train/loss',
                                    {'tloss': training_loss.item()}, iteration)

            if display and iteration % display == 0:
                train_logger.train_logging(
                    "Process {}: training loss at iteration {}: {}".format(
                        rank, iteration, training_loss.item()))
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                # calculate validation loss
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(**validation)
                train_logger.train_logging(
                    "Process {}: validation loss at iteration {}: {}".format(
                        rank, iteration, validation_loss.item()))
                train_logger.tb_logging('Val/loss',
                                        {'vloss': validation_loss.item()},
                                        iteration)
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.eval_mode()
                # calculate validation mAP
                val_split = system_config.val_split
                mAP, _, detect_average_time = test(validation_db,
                                                   system_config,
                                                   nnet,
                                                   val_iter,
                                                   val_split,
                                                   debug=True)
                train_logger.train_logging(
                    "Process {}: mAP at iteration {}: {}".format(
                        rank, iteration, mAP))
                train_logger.train_logging(
                    "Detect average time: {}".format(detect_average_time))
                nnet.train_mode()

            if iteration % snapshot == 0 and rank == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                nnet.set_lr(learning_rate)

            # dc = 0
            # handle = nvmlDeviceGetHandleByIndex(dc)
            # res = nvmlDeviceGetUtilizationRates(handle)
            # gpu_util = res.gpu
            # res = nvmlDeviceGetMemoryInfo(handle)
            # gpu_mem = res.used / 1024 / 1024
            # train_logger.tb_logging('data/NV', {'gpu-util': gpu_util, 'gpu-mem': gpu_mem}, iteration)

    # sending signal to kill the thread
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
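
Example #4 relies on a train_logger exposing train_logging and tb_logging; its class is not part of the snippet. A hypothetical logger matching those two call sites (plain-text messages plus grouped TensorBoard scalars) could be sketched as:

from torch.utils.tensorboard import SummaryWriter


class TrainLogger:
    # hypothetical logger matching the train_logger.train_logging / tb_logging calls above

    def __init__(self, log_dir="./logs"):
        self.writer = SummaryWriter(log_dir=log_dir)

    def train_logging(self, message):
        # plain-text progress messages
        print(message)

    def tb_logging(self, tag, scalar_dict, step):
        # grouped scalars, e.g. tb_logging('Train/loss', {'tloss': 0.5}, 100)
        self.writer.add_scalars(tag, scalar_dict, global_step=step)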
Example #5
    def Train(self, display_interval=100):
        # reading arguments from command
        start_iter  = self.system_dict["training"]["params"]["start_iter"]
        distributed = self.system_dict["model"]["params"]["distributed"]
        world_size  = self.system_dict["model"]["params"]["world_size"]
        initialize  = self.system_dict["model"]["params"]["initialize"]
        gpu         = None
        rank        = self.system_dict["model"]["params"]["rank"]

        # reading arguments from json file
        batch_size       = self.system_dict["dataset"]["params"]["batch_size"]
        learning_rate    = self.system_dict["training"]["params"]["lr"]
        max_iteration    = self.system_dict["training"]["params"]["total_iterations"]
        pretrained_model = None

        stepsize         = int(self.system_dict["training"]["params"]["total_iterations"]*0.8)
        snapshot         = int(self.system_dict["training"]["params"]["total_iterations"]*0.5)
        val_iter         = self.system_dict["training"]["params"]["val_interval"]
        display          = display_interval
        decay_rate       = self.system_dict["local"]["system_config"].decay_rate

        print("start_iter       = {}".format(start_iter));
        print("distributed      = {}".format(distributed));
        print("world_size       = {}".format(world_size));
        print("initialize       = {}".format(initialize));
        print("batch_size       = {}".format(batch_size));
        print("learning_rate    = {}".format(learning_rate));
        print("max_iteration    = {}".format(max_iteration));
        print("stepsize         = {}".format(stepsize));
        print("snapshot         = {}".format(snapshot));
        print("val_iter         = {}".format(val_iter));
        print("display          = {}".format(display));
        print("decay_rate       = {}".format(decay_rate));



        print("Process {}: building model...".format(rank))
        self.system_dict["local"]["nnet"] = NetworkFactory(self.system_dict["local"]["system_config"], 
                                self.system_dict["local"]["model"], distributed=distributed, gpu=gpu)


        # queues storing data for training
        training_queue   = Queue(self.system_dict["local"]["system_config"].prefetch_size)
        validation_queue = Queue(5)

        # queues storing pinned data for training
        pinned_training_queue   = queue.Queue(self.system_dict["local"]["system_config"].prefetch_size)
        pinned_validation_queue = queue.Queue(5)


        # allocating resources for parallel reading
        training_tasks = init_parallel_jobs(self.system_dict["local"]["system_config"], 
                                            self.system_dict["local"]["training_dbs"], 
                                            training_queue, data_sampling_func, True)


        
        if self.system_dict["dataset"]["val"]["status"]:
            validation_tasks = init_parallel_jobs(self.system_dict["local"]["system_config"], 
                                                    [self.system_dict["local"]["validation_db"]], 
                                                    validation_queue, data_sampling_func, False)


        training_pin_semaphore   = threading.Semaphore()
        validation_pin_semaphore = threading.Semaphore()
        training_pin_semaphore.acquire()
        validation_pin_semaphore.acquire()

        training_pin_args   = (training_queue, pinned_training_queue, training_pin_semaphore)
        training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
        training_pin_thread.daemon = True
        training_pin_thread.start()

        validation_pin_args   = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
        validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
        validation_pin_thread.daemon = True
        validation_pin_thread.start()
        
        if pretrained_model is not None:
            if not os.path.exists(pretrained_model):
                raise ValueError("pretrained model does not exist")
            print("Process {}: loading from pretrained model".format(rank))
            self.system_dict["local"]["nnet"].load_pretrained_params(pretrained_model)

        if start_iter:
            self.system_dict["local"]["nnet"].load_params(start_iter)
            learning_rate /= (decay_rate ** (start_iter // stepsize))
            self.system_dict["local"]["nnet"].set_lr(learning_rate)
            print("Process {}: training starts from iteration {} with learning_rate {}".format(rank, start_iter + 1, learning_rate))
        else:
            self.system_dict["local"]["nnet"].set_lr(learning_rate)


        if rank == 0:
            print("training start...")

        self.system_dict["local"]["nnet"].cuda()
        self.system_dict["local"]["nnet"].train_mode()   

        if self.system_dict["dataset"]["val"]["status"]:
            old_val_loss = 100000.0
            with stdout_to_tqdm() as save_stdout:
                for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
                    training = pinned_training_queue.get(block=True)
                    training_loss = self.system_dict["local"]["nnet"].train(**training)

                    if display and iteration % display == 0:
                        print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
                    del training_loss

                    if val_iter and self.system_dict["local"]["validation_db"].db_inds.size and iteration % val_iter == 0:
                        self.system_dict["local"]["nnet"].eval_mode()
                        validation = pinned_validation_queue.get(block=True)
                        validation_loss = self.system_dict["local"]["nnet"].validate(**validation)
                        print("Process {}: validation loss at iteration {}: {}".format(rank, iteration, validation_loss.item()))
                        if validation_loss < old_val_loss:
                            print("Loss Reduced from {} to {}".format(old_val_loss, validation_loss))
                            self.system_dict["local"]["nnet"].save_params("best")
                            old_val_loss = validation_loss
                        else:
                            print("validation loss did not go below {}, current loss - {}".format(old_val_loss, validation_loss))

                        self.system_dict["local"]["nnet"].train_mode()
                        

                    if iteration % stepsize == 0:
                        learning_rate /= decay_rate
                        self.system_dict["local"]["nnet"].set_lr(learning_rate)

            self.system_dict["local"]["nnet"].save_params("final")

            # sending signal to kill the thread
            training_pin_semaphore.release()
            validation_pin_semaphore.release()

            # terminating data fetching processes
            terminate_tasks(training_tasks)
            terminate_tasks(validation_tasks)


        else:
            with stdout_to_tqdm() as save_stdout:
                for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
                    training = pinned_training_queue.get(block=True)
                    training_loss = self.system_dict["local"]["nnet"].train(**training)

                    if display and iteration % display == 0:
                        print("Process {}: training loss at iteration {}: {}".format(rank, iteration, training_loss.item()))
                    del training_loss


                    if iteration % val_iter == 0:
                        self.system_dict["local"]["nnet"].save_params("intermediate")

                    if iteration % stepsize == 0:
                        learning_rate /= decay_rate
                        self.system_dict["local"]["nnet"].set_lr(learning_rate)

            self.system_dict["local"]["nnet"].save_params("final")

            # sending signal to kill the thread
            training_pin_semaphore.release()

            # terminating data fetching processes
            terminate_tasks(training_tasks)
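
Every example wraps its loop in stdout_to_tqdm() so that print calls do not break the progress bar, but the context manager itself is never shown. A common implementation (a sketch, not necessarily the one used by these snippets) swaps sys.stdout for a thin wrapper around tqdm.write and hands the real stdout back to tqdm:

import sys
from contextlib import contextmanager

from tqdm import tqdm


class TqdmFile(object):
    # file-like object that forwards writes to tqdm.write

    def __init__(self, dummy_file):
        self.dummy_file = dummy_file

    def write(self, x):
        if len(x.rstrip()) > 0:
            tqdm.write(x, file=self.dummy_file)


@contextmanager
def stdout_to_tqdm():
    # yield the original stdout so tqdm can keep drawing the bar on it
    save_stdout = sys.stdout
    try:
        sys.stdout = TqdmFile(sys.stdout)
        yield save_stdout
    finally:
        sys.stdout = save_stdout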