def __init__(self, loader):
    self.dataset = loader.dataset
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout
    self.done_event = threading.Event()

    self.sample_iter = iter(self.batch_sampler)

    if self.num_workers > 0:
        self.worker_init_fn = loader.worker_init_fn
        self.index_queues = [
            multiprocessing.Queue() for _ in range(self.num_workers)
        ]
        self.worker_queue_idx = 0
        self.worker_result_queue = multiprocessing.SimpleQueue()
        self.batches_outstanding = 0
        self.worker_pids_set = False
        self.shutdown = False
        self.send_idx = 0
        self.rcvd_idx = 0
        self.reorder_dict = {}

        base_seed = torch.LongTensor(1).random_()[0]

        self.workers = [
            multiprocessing.Process(
                target=_ms_loop,
                args=(self.dataset, self.index_queues[i],
                      self.worker_result_queue, self.collate_fn,
                      base_seed + i, self.worker_init_fn, i))
            for i in range(self.num_workers)
        ]

        if self.pin_memory or self.timeout > 0:
            self.data_queue = queue.Queue()
            if self.pin_memory:
                maybe_device_id = torch.cuda.current_device()
            else:
                # do not initialize cuda context if not necessary
                maybe_device_id = None
            self.worker_manager_thread = threading.Thread(
                target=_worker_manager_loop,
                args=(self.worker_result_queue, self.data_queue,
                      self.done_event, self.pin_memory, maybe_device_id))
            self.worker_manager_thread.daemon = True
            self.worker_manager_thread.start()
        else:
            self.data_queue = self.worker_result_queue

        for w in self.workers:
            w.daemon = True  # ensure that the worker exits on process exit
            w.start()

        _update_worker_pids(id(self), tuple(w.pid for w in self.workers))
        _set_SIGCHLD_handler()
        self.worker_pids_set = True

        # prime the prefetch loop
        for _ in range(2 * self.num_workers):
            self._put_indices()
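# Hedged sketch (not the actual `_ms_loop`) of the kind of worker loop such a
# target typically implements: block on the per-worker index queue, assemble a
# batch from the dataset, and push (batch_index, batch) onto the shared result
# queue until a None sentinel signals shutdown.
import random
import torch

def example_worker_loop(dataset, index_queue, result_queue, collate_fn, seed,
                        init_fn, worker_id):
    torch.manual_seed(int(seed))
    random.seed(int(seed))
    if init_fn is not None:
        init_fn(worker_id)
    while True:
        job = index_queue.get()
        if job is None:  # shutdown sentinel from the parent iterator
            break
        idx, batch_indices = job  # (send_idx, indices chosen by the batch sampler)
        try:
            batch = collate_fn([dataset[i] for i in batch_indices])
            result_queue.put((idx, batch))
        except Exception as exc:  # surface worker errors to the parent
            result_queue.put((idx, exc))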
shared_average_model.load_state_dict(
    torch.load(args.model, map_location="cpu"))

if args.memory and os.path.isdir(args.memory):
    memory.load(args.memory)
    print("Load memory from CheckPoint {}, memory len: {}".format(
        args.memory, len(memory)))

if args.data and os.path.isfile(args.data):
    data = torch.load(args.data)  # load the checkpoint once instead of four times
    T.set(data[0])
    BEST.set(data[1])
    scores = data[2]
    m_scores = data[3]
    pre_best = BEST.value()
    print("Load data from CheckPoint {}, T: {}, BEST: {}".format(
        args.data, T.value(), BEST.value()))

memory_queue = mp.SimpleQueue()
model_queue = mp.SimpleQueue()
processes = []
p2_list = ["ReiwaThunder", "RHEA_PI", "Toothless", "FalzAI"]

if not args.evaluate:
    # Start training agents
    for rank in range(1, args.num_processes + 1):
        model_queue.put(
            (shared_model.state_dict(), shared_average_model.state_dict()))
        p2 = p2_list[(rank - 1) % len(p2_list)]
        p = mp.Process(target=actor,
                       args=(rank, args, T, BEST, memory_queue, model_queue, p2))
        p.start()
        print('Process ' + str(rank) + ' started')
        processes.append(p)
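# Hedged sketch: a self-contained illustration of the weight hand-off pattern
# used above (the learner puts state_dicts on a queue and each actor process
# takes one, then streams experience back on a second queue). The tiny linear
# model and random "transition" are placeholders, not the project's actual
# networks, environment, or `actor` function.
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def toy_actor(rank, memory_queue, model_queue):
    model = nn.Linear(4, 2)
    state_dict, _avg_state_dict = model_queue.get()  # weights published by the learner
    model.load_state_dict(state_dict)
    with torch.no_grad():
        value = model(torch.randn(1, 4)).sum().item()
    memory_queue.put((rank, value))  # experience/statistics back to the learner

if __name__ == '__main__':
    memory_q, model_q = mp.SimpleQueue(), mp.SimpleQueue()
    shared, shared_avg = nn.Linear(4, 2), nn.Linear(4, 2)
    model_q.put((shared.state_dict(), shared_avg.state_dict()))
    p = mp.Process(target=toy_actor, args=(0, memory_q, model_q))
    p.start()
    print(memory_q.get())
    p.join()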
def train(
    rank: int,
    world_size: int,
    lr: float = 5e-4,
    batch_size: int = 1000,
    epochs: int = 500,
    interval: int = 10,
    save: int = 100,
    num_workers: int = 4,
    num_basis: int = 100,
    dataset: str = 'datasets',
    load_dir: Optional[str] = None,
    load_epoch: Optional[int] = None,
    coefficient_noise: bool = False,
    verbose: bool = False,
    use_zero: bool = False,
):
    assert 0 < batch_size, "batch_size must be a positive integer."
    assert 0 < epochs, "epochs must be a positive integer."
    assert (0 <= interval) and (interval <= epochs), \
        "Interval must be a non-negative integer between 0 and epochs."
    assert (0 <= save) and (save <= epochs), \
        "Save must be a non-negative integer between 0 and epochs."

    # setup data distributed parallel training
    setup_nccl(rank, world_size)  # world size is total gpus
    torch.cuda.set_device(rank)  # rank is gpu index

    # directories
    if rank == 0:
        print(f"Loading data from {dataset}...")

    data_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/train/')
    val_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/validation/')
    noise_dir = Path('/mnt/datahole/daniel/gwosc/O1')
    psd_dir = Path(f"/mnt/datahole/daniel/gravflows/{dataset}/train/PSD/")
    basis_dir = Path(f'/mnt/datahole/daniel/gravflows/{dataset}/basis/')

    log_dir = f"{datetime.now().strftime('%b%d_%H-%M-%S')}_{os.uname().nodename}"
    save_dir = Path('gwpe/model_weights/')
    experiment_dir = save_dir / log_dir
    experiment_dir.mkdir(parents=True, exist_ok=True)

    # config files
    waveform_params_ini = str(data_dir / 'config_files/parameters.ini')
    extrinsics_ini = 'gwpe/config_files/extrinsics.ini'
    static_args_ini = str(data_dir / 'config_files/static_args.ini')

    # training data
    # dataset = BasisCoefficientsDataset(
    #     data_dir=data_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     parameters_ini=waveform_params_ini,
    # )

    # dataset = BasisEncoderDataset(
    #     n=num_basis,
    #     data_dir=data_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     intrinsics_ini=waveform_params_ini,
    #     extrinsics_ini=extrinsics_ini,
    #     psd_dir=psd_dir,
    #     ifos=['H1','L1'],
    #     ref_ifo='H1',
    #     downcast=True,
    #     add_noise=True,
    #     coefficient_noise=coefficient_noise,
    # )

    dataset = LFIGWDataset(
        n=100,
        data_dir=data_dir,
        basis_dir=basis_dir,
        static_args_ini=static_args_ini,
        data_file='coefficients.npy',
        intrinsics_ini=waveform_params_ini,
        extrinsics_ini=extrinsics_ini,
        psd_dir=psd_dir,
        ifos=['H1', 'L1'],
        ref_ifo='H1',
        downcast=True,
        add_noise=True,
        distance_scale=True,
        time_shift=False,
    )

    sampler = DistributedSampler(
        dataset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
        seed=rank,
    )

    dataloader = DataLoader(
        dataset,
        shuffle=False,
        num_workers=num_workers,
        batch_size=batch_size,
        sampler=sampler,
        pin_memory=True,
        persistent_workers=True,
        prefetch_factor=2,
        worker_init_fn=dataset._worker_init_fn,
        collate_fn=dataset._collate_fn,
    )

    # validation data
    val_dataset = LFIGWDataset(
        n=100,
        data_dir=data_dir,
        basis_dir=basis_dir,
        static_args_ini=static_args_ini,
        data_file='coefficients.npy',
        intrinsics_ini=waveform_params_ini,
        extrinsics_ini=extrinsics_ini,
        psd_dir=psd_dir,
        ifos=['H1', 'L1'],
        ref_ifo='H1',
        downcast=True,
        add_noise=True,
        coefficient_noise=coefficient_noise,
        distance_scale=True,
        time_shift=False,
    )

    # val_dataset = BasisCoefficientsDataset(
    #     data_dir=val_dir,
    #     basis_dir=basis_dir,
    #     static_args_ini=static_args_ini,
    #     parameters_ini=[waveform_params_ini, extrinsics_ini],
    #     coefficient_noise=coefficient_noise,
    # )

    val_sampler = DistributedSampler(
        val_dataset,
        shuffle=False,
        num_replicas=world_size,
        rank=rank,
        seed=rank,
    )
    val_loader = DataLoader(
        val_dataset,
        shuffle=False,
        num_workers=num_workers,
        batch_size=batch_size,
        sampler=val_sampler,
        pin_memory=True,
        prefetch_factor=4,
        worker_init_fn=val_dataset._worker_init_fn,
        collate_fn=val_dataset._collate_fn,
    )

    # validation data for posterior sampling figures
    if interval != 0:
        # specify indices in validation dataset to validate samples
        min_idx = val_dataset.parameters.distance.argmin()
        max_idx = val_dataset.parameters.distance.argmax()
        median_idx = val_dataset.parameters.loc[
            val_dataset.parameters.distance ==
            val_dataset.parameters.distance.quantile(interpolation='nearest')
        ].index[0]

        if rank == 0:
            figure_titles = [
                'GW150914', 'Min Distance', 'Median Distance', 'Max Distance'
            ]

            # validation ground truths for posterior sampling
            val_gts = torch.stack([
                torch.zeros(len(val_dataset.parameters.columns),
                            dtype=torch.float32),  # gw150914 dummy gt
                torch.tensor(val_dataset.parameters.iloc[min_idx].values,
                             dtype=torch.float32),  # rank 1
                torch.tensor(val_dataset.parameters.iloc[median_idx].values,
                             dtype=torch.float32),  # rank 2
                torch.tensor(val_dataset.parameters.iloc[max_idx].values,
                             dtype=torch.float32),  # rank 3
            ])

        with torch.no_grad():
            # load data from file manually (rather than using val_dataset._worker_init_fn)
            val_coefficients = np.load(val_dataset.data_dir / val_dataset.data_file,
                                       mmap_mode='c')

            # generate coefficients on cpu - we want to send this to tensorboard (rank 0)
            # before sending to gpus
            val_coefficients = torch.cat([
                torch.from_numpy(
                    generate_gw150914_context(num_basis, noise_dir, psd_dir,
                                              basis_dir, static_args_ini))[None],
                torch.tensor(val_coefficients[[min_idx, median_idx, max_idx]]),
            ], dim=0).to(dtype=torch.complex64)

            # place one of each stacked tensor onto corresponding gpu rank
            val_context = val_coefficients[rank] * val_dataset.standardization[:, :num_basis]
            val_context = val_context.to(device=rank)
            val_context = torch.cat([val_context.real, val_context.imag], dim=0)
            val_context = val_context.reshape(
                val_context.shape[0] * val_context.shape[1])[None]

    else:
        figure_titles = None
        val_gts = None
        val_coefficients = None

    # set torch profiling runs
    # wait = 1  # ignore first batch
    # warmup = 1
    # active = 4
    # repeat = 2

    # tensorboard
    if rank == 0:
        # tb = SummaryWriter(f'gwpe/runs/{log_dir}')
        queue = mp.SimpleQueue()
        tb_process = mp.Process(target=tensorboard_writer,
                                args=(
                                    queue,
                                    f'gwpe/runs/{log_dir}',
                                    val_dataset.generator.parameters,
                                    val_dataset.generator.latex,
                                    static_args_ini,
                                    basis_dir,
                                    num_basis,
                                    val_coefficients,
                                    val_gts,
                                    figure_titles,
                                ))
        tb_process.start()

    # instantiate neural spline coupling flow
    flow = flows.create_NDE_model(
        input_dim=14,  # we do not predict coalescence time
        context_dim=4 * num_basis,
        num_flow_steps=15,
        base_transform_kwargs={
            'base_transform_type': 'rq-coupling',
            'batch_norm': True,
            'num_transform_blocks': 10,
            'activation': 'elu',
        })

    flow = flow.to(rank)
    print_peak_memory("Max memory allocated after creating local model", rank)

    # sync_bn_flow = nn.SyncBatchNorm.convert_sync_batchnorm(flow)
    flow = DDP(flow, device_ids=[rank], output_device=rank)
    print_peak_memory("Max memory allocated after creating DDP", rank)

    if use_zero:
        # https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html
        from torch.distributed.optim import ZeroRedundancyOptimizer
        optimizer = ZeroRedundancyOptimizer(
            flow.parameters(),
            optimizer_class=torch.optim.Adam,
            lr=lr,
            parameters_as_bucket_view=True,
        )
        # optimizer = torch.optim.Adam(flow.parameters(), lr=lr)
    else:
        optimizer = torch.optim.Adam(flow.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    if load_dir is not None and load_epoch is not None:
        print(f'Loading model from {load_dir} at epoch {load_epoch}.')
        flow.module.load_state_dict(
            torch.load(f'gwpe/model_weights/{load_dir}/flow_{load_epoch}.pt',
                       map_location=rank))
        optimizer.load_state_dict(
            torch.load(f'gwpe/model_weights/{load_dir}/optimizer_{load_epoch}.pt',
                       map_location=rank))
        if Path(f'gwpe/model_weights/{load_dir}/scheduler_{load_epoch}.pt').is_file():
            scheduler.load_state_dict(
                torch.load(f'gwpe/model_weights/{load_dir}/scheduler_{load_epoch}.pt',
                           map_location=rank))

    # run training loop
    flow.train()
    train_loss = torch.zeros((1, ), device=rank, requires_grad=False)
    val_loss = torch.zeros((1, ), device=rank, requires_grad=False)

    disable_pbar = False if verbose and (rank == 0) else True

    # tqdm progress bar
    with tqdm(total=len(dataloader) * epochs,
              disable=disable_pbar,
              desc=f'[{log_dir}] Training',
              postfix={'epoch': 0}) as progress:

        # with torch.profiler.profile(
        #     activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        #     schedule=torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat),
        #     on_trace_ready=torch.profiler.tensorboard_trace_handler(f'gwpe/runs/{log_dir}'),
        #     record_shapes=True,
        #     with_stack=True
        # ) as profiler:

        for epoch in range(1, 1 + epochs):
            if rank == 0:
                progress.set_postfix({'epoch': epoch})
                progress.set_description(f'[{log_dir}] Training', refresh=True)

            # let all processes sync up before starting with a new epoch of training
            flow.train()
            distributed.barrier()

            iterator = iter(dataloader)
            coefficients, parameters = next(iterator)
            coefficients = coefficients.to(rank, non_blocking=True)
            parameters = parameters.to(rank, non_blocking=True)

            complete = False
            while not complete:
                optimizer.zero_grad()

                # if profile:
                # https://github.com/guyang3532/kineto/blob/readme/tb_plugin/docs/gpu_utilization.md
                ## WARNING: profiler may not handle async pinned memory transfer properly?
                # i.e. may not record CPU vs GPU wall times correctly
                # may be related to reported blocks per SM/achieved occupancy negative bug
                # this was an open issue for pytorch 1.9 as of july 9 - nightly may fix it
                # https://github.com/pytorch/kineto/issues/325#issuecomment-869362218
                # if (step >= (wait + warmup + active) * repeat):
                #     break

                # negative log-likelihood conditional on strain over mini-batch
                loss = -flow.module.log_prob(parameters, context=coefficients).mean()

                # record the size of the batch the loss was computed on
                # before prefetching the next one
                current_batch_size = coefficients.shape[0]

                try:
                    # async get data from CPU and move to GPU during model forward
                    coefficients, parameters = next(iterator)
                    coefficients = coefficients.to(rank, non_blocking=True)
                    parameters = parameters.to(rank, non_blocking=True)
                except StopIteration:
                    # exit while loop if iterator is complete
                    complete = True

                loss.backward()

                print_peak_memory("Max memory allocated before optimizer step()", rank)
                optimizer.step()
                print_peak_memory("Max memory allocated after optimizer step()", rank)

                # if profile: profiler.step()

                # total loss summed over each sample in batch
                train_loss += loss.detach() * current_batch_size

                if rank == 0:
                    progress.update(1)

            scheduler.step()

            # gather total loss during epoch between each GPU worker as list of tensors
            world_loss = [torch.ones_like(train_loss) for _ in range(world_size)]
            distributed.all_gather(world_loss, train_loss)
            train_loss *= 0.0  # reset loss for next epoch

            if (interval != 0) and (epoch % interval == 0):
                # evaluate model on validation dataset
                flow.eval()
                with torch.no_grad():
                    iterator = iter(enumerate(val_loader))
                    step, (coefficients, parameters) = next(iterator)
                    coefficients = coefficients.to(rank, non_blocking=True)
                    parameters = parameters.to(rank, non_blocking=True)

                    if rank == 0:
                        val_progress = int(100 * step / len(val_loader))
                        progress.set_description(
                            f'[{log_dir}] Validating ({val_progress}%)', refresh=True)

                    complete = False
                    while not complete:
                        # negative log-likelihood conditional on strain over mini-batch
                        loss = -flow.module.log_prob(parameters,
                                                     context=coefficients).mean()
                        current_batch_size = coefficients.shape[0]

                        try:
                            # async get data from CPU and move to GPU during model forward
                            step, (coefficients, parameters) = next(iterator)
                            coefficients = coefficients.to(rank, non_blocking=True)
                            parameters = parameters.to(rank, non_blocking=True)

                            if rank == 0:
                                val_progress = int(100 * step / len(val_loader))
                                progress.set_description(
                                    f'[{log_dir}] Validating ({val_progress}%)',
                                    refresh=True)
                        except StopIteration:
                            # exit while loop if iterator is complete
                            complete = True

                        # total loss summed over each sample in batch
                        val_loss += loss.detach() * current_batch_size

                    # gather total loss during epoch between each GPU worker as list of tensors
                    world_val_loss = [torch.ones_like(val_loss) for _ in range(world_size)]
                    distributed.all_gather(world_val_loss, val_loss)
                    val_loss *= 0.0  # reset loss for next epoch

                    # validation posteriors
                    if rank == 0:
                        progress.set_description(f'[{log_dir}] Sampling posteriors',
                                                 refresh=True)

                    samples = flows.sample_flow(
                        flow.module,
                        n=10000,
                        context=val_context,
                        output_device='cuda',
                        dtype=torch.float32,
                    )[0]

                    # gather samples from all gpus
                    world_samples = [torch.ones_like(samples) for _ in range(world_size)]
                    distributed.all_gather(world_samples, samples)

            if (rank == 0):
                progress.set_description(f'[{log_dir}] Sending to TensorBoard',
                                         refresh=True)

                scalars = {
                    'loss/train':
                    torch.cat(world_loss).sum().item() / len(dataloader.dataset)
                }

                # every "interval" we generate samples for vis, else None
                corner_samples = None  # reset to None for epochs where there is no corner plot

                if (interval != 0) and (epoch % interval == 0):
                    scalars['loss/validation'] = torch.cat(
                        world_val_loss).sum().item() / len(val_loader.dataset)

                    # convert gw150914 samples to cpu and undo standardization
                    corner_samples = torch.stack(world_samples).cpu()
                    corner_samples *= torch.from_numpy(val_dataset.std)
                    corner_samples += torch.from_numpy(val_dataset.mean)

                # send data to async process to generate matplotlib figures
                queue.put((epoch, scalars, corner_samples))

            if (save != 0) and (epoch % save == 0):
                # save checkpoint and write computationally expensive data to tb
                torch.save(flow.module.state_dict(),
                           experiment_dir / f'flow_{epoch}.pt')

                # if use_zero:
                #     # needs to be called on all ranks
                #     optimizer.consolidate_state_dict(to=0)
                torch.save(optimizer.state_dict(),
                           experiment_dir / f'optimizer_{epoch}.pt')

                if scheduler is not None:
                    torch.save(scheduler.state_dict(),
                               experiment_dir / f'scheduler_{epoch}.pt')

        # destroy processes from distributed training
        if rank == 0:
            # to do - graceful way to shutdown workers
            # need to send message back from child process
            sleep_time = 120
            for i in range(sleep_time):
                progress.set_description(
                    f'[{log_dir}] Shutting down in {sleep_time - i}s', refresh=True)
                time.sleep(1)

            tb_process.terminate()

    cleanup_nccl()
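# Hedged sketch of what the `setup_nccl` / `cleanup_nccl` helpers used above
# typically do (the project's own implementations are not shown here): create
# and tear down the NCCL process group that DDP and distributed.all_gather
# depend on. The rendezvous address and port below are placeholders.
import os
import torch.distributed as distributed

def setup_nccl_example(rank: int, world_size: int) -> None:
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # placeholder rendezvous address
    os.environ.setdefault('MASTER_PORT', '29500')      # placeholder port
    distributed.init_process_group('nccl', rank=rank, world_size=world_size)

def cleanup_nccl_example() -> None:
    distributed.destroy_process_group()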
def train_process(p_id, word_count_actual, word2idx, word_list, freq, args, model,
                  word2morph, word2morph_mask, ctx2morph, ctx2morph_mask):
    data_queue = mp.SimpleQueue()

    if args.opt == "Adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr)
    elif args.opt == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    elif args.opt == 'SparseAdam':
        optimizer = optim.SparseAdam(model.parameters(), lr=args.lr)

    # producer process fills data_queue with training batches
    t = mp.Process(target=train_process_sent_producer,
                   args=(p_id, data_queue, word_count_actual, word2idx, word_list,
                         freq, args))
    t.start()

    # get batches from data_queue and feed them to the model
    prev_word_cnt = 0
    losses_cnt = 0
    total_loss = 0.0
    losses_file = open(args.losslog, 'w')
    lr = args.lr

    while True:
        d = data_queue.get()
        if d is None:  # producer signals completion with a None sentinel
            break

        # lr anneal
        if args.anneal:
            if word_count_actual.value - prev_word_cnt > 10000:
                lr = args.lr * (1 - word_count_actual.value /
                                (args.iter * args.train_words))
                if lr < 0.0001 * args.lr:
                    lr = 0.0001 * args.lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
        else:
            lr = args.lr

        if args.cuda:
            data = Variable(torch.LongTensor(d).cuda(), requires_grad=False)
        else:
            data = Variable(torch.LongTensor(d), requires_grad=False)

        if args.cbow == 1:
            optimizer.zero_grad()
            loss = model(data)
            loss.backward()
            optimizer.step()
            model.emb0_lookup.weight.data[args.vocab_size].fill_(0)
        elif args.cbow == 0:
            optimizer.zero_grad()
            loss = model(data,
                         word2morph[data[:, 0]],
                         word2morph_mask[data[:, 0]],
                         ctx2morph[data[:, 1:2 + args.negative]],
                         ctx2morph_mask[data[:, 1:2 + args.negative]])
            loss.backward()
            # model.emb0morph_lookup.weight.data.grad[args.morph_size+1].fill_(0)
            optimizer.step()
            # model.emb0morph_lookup.weight.data[args.morph_size+1].zero_()

        losses_cnt += data.shape[0]
        total_loss += loss.detach()  # detach so logging does not retain the graph

        # periodic progress output
        if word_count_actual.value - prev_word_cnt > 10000:
            avg_loss = total_loss / losses_cnt
            sys.stdout.write(
                "\rAlpha: %0.8f, Loss: %0.8f, Progress: %0.2f, Words/sec: %f" %
                (lr, avg_loss,
                 word_count_actual.value / (args.iter * args.train_words) * 100,
                 word_count_actual.value / (time.monotonic() - args.t_start)))
            sys.stdout.flush()
            prev_word_cnt = word_count_actual.value
            losses_cnt = 0
            total_loss = 0.0
            losses_file.write(str(avg_loss.item()) + '\n')

    losses_file.close()
    t.join()
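# Hedged sketch of the producer/consumer protocol used above: a producer
# process fills the SimpleQueue with batches and terminates the stream with a
# trailing None, which the consumer treats as its stop sentinel. The integer
# "batches" stand in for the real sentence batches.
import torch.multiprocessing as mp

def example_producer(data_queue, n_batches):
    for i in range(n_batches):
        data_queue.put([i, i + 1, i + 2])  # a fake batch of token ids
    data_queue.put(None)  # sentinel: tell the consumer to stop

if __name__ == '__main__':
    q = mp.SimpleQueue()
    producer = mp.Process(target=example_producer, args=(q, 5))
    producer.start()
    while True:
        d = q.get()
        if d is None:  # same stop condition as in train_process above
            break
        print('got batch', d)
    producer.join()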
def batch_training(fileprefix='', tasks=[]):
    if fileprefix:
        filename = '{}-main.out'.format(fileprefix)
        filepath = pathlib.Path(filename).resolve()
        if not filepath.parent.exists():
            filepath.parent.mkdir(parents=True)
        stdout_target = filepath.open('wt')
    else:
        stdout_target = sys.__stdout__

    with contextlib.redirect_stdout(stdout_target):
        print('System-wide logical CPUs:', psutil.cpu_count())
        print('System-wide physical CPUs:', psutil.cpu_count(logical=False))

        oversubscribe = 2
        ngpus = torch.cuda.device_count()
        nworkers = ngpus * oversubscribe

        curproc = psutil.Process()
        createtime = curproc.create_time()
        print('Main process {} on CPU {} with {} threads'.format(
            curproc.pid, curproc.cpu_num(), curproc.num_threads()))
        print('Presently available CPUs:', len(curproc.cpu_affinity()))
        print('Presently available GPUs:', ngpus)
        print('Worker processes:', nworkers)

        # load input tasks into queue
        task_queue = mp.SimpleQueue()
        for i, task in enumerate(tasks):
            print('Task', i + 1, task)
            task_queue.put(task)

        # worker locks
        locks = []
        active_processes = []
        for i in range(nworkers):
            locks.append(mp.Lock())
            active_processes.append(None)

        # results queue
        result_queue = mp.SimpleQueue()

        itask = 0
        while not task_queue.empty():
            for ilock, lock in enumerate(locks):
                if lock.acquire(timeout=1):
                    # acquired the lock, so this worker slot must be free (process == None)
                    assert active_processes[ilock] is None
                    if task_queue.empty():
                        lock.release()
                        continue
                    train_kwargs = task_queue.get()
                    igpu = ilock % ngpus
                    args = (itask, ilock, igpu, fileprefix, train_kwargs, result_queue)
                    p = mp.Process(target=gpu_worker, args=args)
                    print('  Launching task {}/{} on worker {} on GPU {}'.format(
                        itask, len(tasks), ilock, igpu))
                    itask += 1
                    p.start()
                    active_processes[ilock] = p
                else:
                    # slot is locked, so a process should be running (process != None)
                    existing_process = active_processes[ilock]
                    assert existing_process is not None
                    if existing_process.exitcode is not None:
                        # process is complete; close and release
                        print('  Process {} finished'.format(existing_process.pid))
                        active_processes[ilock] = None
                        lock.release()

        print('Finished task loop')
        still_running = True
        while still_running:
            still_running = False
            for i, process in enumerate(active_processes):
                if process is None:
                    continue
                if process.exitcode is None:
                    still_running = True
                    break
                else:
                    print('  Process {} finished'.format(process.pid))
                    active_processes[i] = None
            time.sleep(1)

        results = []
        while not result_queue.empty():
            results.append(result_queue.get())
        print('Tasks:', len(tasks), 'results:', len(results))

        def sort_func(element):
            return element[0]

        results = sorted(results, key=sort_func)
        for i, result in enumerate(results):
            print('Task {:3d} worker/GPU {:2d}/{:1d} dt {:5.1f}s '
                  'max/med acc {:5.1f}%/{:5.1f}% kw: {}'.format(
                      *result[0:4], result[4].max(), np.median(result[4]), result[6]))

        delta_seconds = time.time() - createtime
        print('Main execution: {:.1f} s'.format(delta_seconds))
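# Hedged sketch of a worker compatible with the argument tuple assembled above.
# The project's real `gpu_worker` is not shown; the random accuracies below are
# a placeholder for its actual training results.
import time
import numpy as np
import torch

def example_gpu_worker(itask, iworker, igpu, fileprefix, train_kwargs, result_queue):
    t0 = time.time()
    if torch.cuda.is_available():
        torch.cuda.set_device(igpu)  # pin this worker to its assigned GPU
    accuracies = np.random.uniform(50.0, 100.0, size=10)  # stand-in for real training
    # tuple layout mirrors what the parent unpacks: indices 0-3, 4 (accuracies), 6 (kwargs)
    result_queue.put((itask, iworker, igpu, time.time() - t0,
                      accuracies, None, train_kwargs))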
print("load training indicator") last_updated = 0 last_deliver = 0 last_saved = 0 test_t = 0 if args.cuda: global_ac.to(device) global_ac_targ.to(device) if args.cpc: global_cpc.to(device) for p in global_ac_targ.parameters(): p.requires_grad = False buffer_q = mp.SimpleQueue() model_q = [mp.SimpleQueue() for _ in range(args.n_process + args.opp_num) ] # + n opp test process evaluation_queue = list() processes = [] # Process n for evaluation for rank in range(args.n_process + args.opp_num): # + n opp test process # Test during training if rank < args.opp_num: # test processes p = mp.Process(target=test_func, args=(rank, E, T, args, model_q[rank], torch.device("cpu"), tensorboard_dir)) else: # actor processes model_q[rank].put(shared_ac.state_dict())
            axes[i_l, i_step].set_ylabel(l_names[i_l])
            if i_l == 0:
                axes[i_l, i_step].set_title(f"it {i_step}")


if __name__ == "__main__":
    # "fork" is the unix default and means the child process inherits all resources
    # from the parent process. in case problems occur, "forkserver" might be used instead.
    mp.set_start_method("fork")

    # create global network and pipeline
    g_net = DRRLnet(INP_W, INP_H, N_ACT, **NET_CONFIG)  # global network
    g_net.zero_grad()
    g_net.share_memory()  # share the global parameters in multiprocessing
    # todo: check whether this makes a difference

    stats_queue = mp.SimpleQueue()  # statistics about the episodes will be returned in this queue
    grads_queue = mp.SimpleQueue()  # the calculated gradients will be returned as dicts in this queue
    start_cond = mp.Event()  # event used to signal processes to perform another iteration
    # (so the worker process needs to be still alive when the queue is accessed)

    if config["optimizer"] == "RMSprop":
        # RMSprop was used for the large state space (with IMPALA instead of A3C), not
        # for the small ones. "Learning rate was tuned between 1e-5 and 2e-4" probably
        # means a hyperparameter search was done. Scheduling is also conveniently
        # possible via torch.optim.lr_scheduler; perhaps use a smaller decay term (0.9).
        optimizer = torch.optim.RMSprop(g_net.parameters(), eps=0.1, lr=config["lr"])
    else:
        # Adam was used for the StarCraft games, with the learning rate decaying
        # linearly over 1e10 steps from
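# Hedged sketch of the gradient hand-off the two queues above imply: a worker
# computes gradients on its local copy and returns them as a name->array dict
# on grads_queue (serialized as numpy arrays so they are pickled by value),
# while episode statistics go to stats_queue. The linear model and random batch
# are placeholders, not DRRLnet or the project's actual worker.
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def example_worker(global_state, grads_queue, stats_queue):
    local_net = nn.Linear(4, 2)
    local_net.load_state_dict(global_state)
    x, y = torch.randn(8, 4), torch.randn(8, 2)  # placeholder rollout data
    loss = ((local_net(x) - y) ** 2).mean()
    loss.backward()
    grads = {name: p.grad.detach().numpy().copy()
             for name, p in local_net.named_parameters()}
    grads_queue.put(grads)  # gradients back to the parent
    stats_queue.put({"loss": float(loss.item())})  # episode statistics

if __name__ == "__main__":
    g_net = nn.Linear(4, 2)
    opt = torch.optim.SGD(g_net.parameters(), lr=1e-2)
    grads_q, stats_q = mp.SimpleQueue(), mp.SimpleQueue()
    p = mp.Process(target=example_worker, args=(g_net.state_dict(), grads_q, stats_q))
    p.start()
    grads = grads_q.get()
    for name, param in g_net.named_parameters():  # copy worker gradients onto the global net
        param.grad = torch.from_numpy(grads[name])
    opt.step()  # apply them to the shared parameters
    print(stats_q.get())
    p.join()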
from runner import Runner

if __name__ == "__main__":
    N_WORKERS = 1
    agent = Agent()

    if len(sys.argv) > 1:
        saveFile = sys.argv[1]
        print(f'Training agent from checkpoint: {saveFile}')
        checkpoint = torch.load(saveFile)
        agent.load_state_dict(checkpoint["model_state_dict"], strict=True)
        # agent.eval()
        # success = agent.load_state_dict(torch.load(saveFile))
        # print(f'Loading returned: {success}')
        # agent.eval()

        directory = './videos/car-racing/fromCheckpoint' + str(time.time())
        player = Player(agent=agent, directory=directory, train=True)
        # points = player.play()
        # print(f'loaded agent scored {points} Points')

        trainer = Trainer(gamma=0.99, agent=agent, workers=N_WORKERS)
        trainer.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        print('Training new agent')
        trainer = Trainer(gamma=0.99, agent=deepcopy(agent), workers=N_WORKERS)

    queue = mp.SimpleQueue()
    runner = Runner(agent=agent, ix=0)
    trainer.train_one(runner, queue)