Example 1
    def __init__(
            self,
            _agent: AgentAbstract,
            _env: EnvAbstract,
            _epoch_max: int,
            _epoch_train: int,
            _train_update_target: int,
            _train_save: int,
            _process_core: int = None,
            _save_path: str = './save',
            _use_cmd: bool = True,
    ):
        self.agent: AgentAbstract = _agent
        self.agent.training()
        self.env: EnvAbstract = _env

        # multiprocessing for sampling
        self.mp = mp.get_context('spawn')
        self.process_core = _process_core
        self.pool = self.mp.Pool(self.process_core)

        # training control
        self.epoch = 0
        self.train_times = 0
        self.epoch_max = _epoch_max
        self.epoch_train = _epoch_train
        self.train_update_target = _train_update_target
        self.train_save = _train_save

        self.total_reward_buf = []

        self.save_path = _save_path
        self.use_cmd = _use_cmd
        if self.use_cmd:
            self.shell = TrainShell(self)
Example 2
    def mp_search(self, graph, other_info, model_id, train_data, test_data):
        ctx = mp.get_context()
        q = ctx.Queue()
        p = ctx.Process(target=train, args=(q, graph, train_data, test_data, self.trainer_args,
                                            self.metric, self.loss, self.verbose, self.path))
        try:
            p.start()
            search_results = self._search_common(q)
            metric_value, loss, graph = q.get(block=True)
            if time.time() >= self._timeout:
                raise TimeoutError
            if self.verbose and search_results:
                for (generated_graph, generated_other_info, new_model_id) in search_results:
                    verbose_print(generated_other_info, generated_graph, new_model_id)

            if metric_value is not None:
                self.add_model(metric_value, loss, graph, model_id)
                self.update(other_info, model_id, graph, metric_value)

        except (TimeoutError, queue.Empty) as e:
            raise TimeoutError from e
        finally:
            # terminate and join the subprocess to prevent any resource leak
            p.terminate()
            p.join()
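
A minimal, generic sketch of the same timeout-and-cleanup pattern, assuming a stand-in `_slow_worker` function (not from the repo): the queue read is bounded by a timeout, and the child is always terminated and joined in the `finally` block.

import multiprocessing as mp
import queue
import time

def _slow_worker(q):
    # stand-in for the real training job
    time.sleep(0.2)
    q.put("done")

if __name__ == "__main__":
    ctx = mp.get_context()
    q = ctx.Queue()
    p = ctx.Process(target=_slow_worker, args=(q,))
    try:
        p.start()
        result = q.get(block=True, timeout=5)  # raises queue.Empty if the worker hangs
        print(result)
    except queue.Empty as e:
        raise TimeoutError from e
    finally:
        # terminate and join the subprocess to prevent any resource leak
        p.terminate()
        p.join()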
Example 3
    def test_cuda_small_tensors(self):
        # Check multiple small tensors which will likely use the same
        # underlying cached allocation
        ctx = mp.get_context('spawn')
        tensors = []
        for i in range(5):
            device = i % 2
            tensors += [torch.arange(i * 5, (i + 1) * 5).cuda(device)]

        inq = ctx.Queue()
        outq = ctx.Queue()
        inq.put(tensors)
        p = ctx.Process(target=sum_tensors, args=(inq, outq))
        p.start()

        results = []
        for i in range(5):
            results.append(outq.get())
        p.join()

        for i, tensor in enumerate(tensors):
            v, device, tensor_size, storage_size = results[i]
            self.assertEqual(v, torch.arange(i * 5, (i + 1) * 5).sum())
            self.assertEqual(device, i % 2)
            self.assertEqual(tensor_size, 5)
            self.assertEqual(storage_size, 5)
Example 4
    def __init__(self, device_ids=None, multiprocessing_method='spawn'):
        super().__init__()
        self.device_ids = tuple(device_ids)
        self.num_replicas = len(device_ids)
        self.rank = None

        self._mp = multiprocessing.get_context(multiprocessing_method)

        self._start_error_handler()
        self._start_multiprocessing()
Example 5
 def __init__(self, loader, prepro,
              sort_key, batchify,
              single_run=True, queue_size=8, fork=True):
     self._loader = loader
     self._prepro = prepro
     self._sort_key = sort_key
     self._batchify = batchify
     self._single_run = single_run
     if fork:
         ctx = mp.get_context('forkserver')
         self._queue = ctx.Queue(queue_size)
     else:
         # for easier debugging
         self._queue = None
     self._process = None
Example 6
    def test_event(self):
        ctx = mp.get_context('spawn')
        queue = ctx.Queue()
        ready = ctx.Event()
        done = ctx.Event()
        p = ctx.Process(target=cuda_multiply_two, args=(queue, ready, done))
        p.start()

        ready.wait()
        with torch.cuda.stream(torch.cuda.Stream()):
            tensor = torch.cuda.FloatTensor([1, 1, 1, 1])
            # Use a sleep kernel to test events. Without the event, the
            # multiply happens before the add.
            event = torch.cuda.Event(interprocess=True)
            torch.cuda._sleep(20000000)  # about 30 ms
            tensor.add_(1)
            event.record()
            queue.put((event, tensor))
            done.wait()  # must wait until subprocess records event
            event.synchronize()
            self.assertEqual(list(tensor), [4, 4, 4, 4])
        p.join()
Example 7
    def __call__(self, batch_size: int):
        def get_batches(hyper_batch):
            indexes = list(range(0, len(hyper_batch), batch_size))
            if not self._single_run:
                # random shuffle for training batches
                random.shuffle(hyper_batch)
                random.shuffle(indexes)
            hyper_batch.sort(key=self._sort_key)
            for i in indexes:
                batch = self._batchify(hyper_batch[i:i+batch_size])
                yield batch

        if self._queue is not None:
            ctx = mp.get_context('forkserver')
            self._process = ctx.Process(
                target=_batch2q,
                args=(self._loader, self._prepro,
                      self._queue, self._single_run)
            )
            self._process.start()
            while True:
                d = self._queue.get()
                if d is None:
                    break
                if isinstance(d, int):
                    print('\nepoch {} done'.format(d))
                    continue
                yield from get_batches(d)
            self._process.join()
        else:
            i = 0
            while True:
                for batch in self._loader:
                    yield from get_batches(self._prepro(batch))
                if self._single_run:
                    break
                i += 1
                print('\nepoch {} done'.format(i))
Example 8
 def setup(self, model):
     os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
     # pass in a state q
     smp = mp.get_context("spawn")
     self.mp_queue = smp.SimpleQueue()
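
For illustration, a self-contained sketch (the `_worker` function and its payload are assumptions, not part of the framework above) of how a spawn-context `SimpleQueue` hands a result back from a spawned process:

import multiprocessing as mp

def _worker(q):
    # send a small, picklable result back to the parent
    q.put({"best_model_path": None, "score": 0.5})

if __name__ == "__main__":
    smp = mp.get_context("spawn")
    mp_queue = smp.SimpleQueue()
    p = smp.Process(target=_worker, args=(mp_queue,))
    p.start()
    print(mp_queue.get())
    p.join()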
Example 9
    def __init__(
        self,
        make_sampler_fn: Callable[..., TaskSampler],
        sampler_fn_args: Sequence[Dict[str, Any]] = None,
        auto_resample_when_done: bool = True,
        multiprocessing_start_method: Optional[str] = "forkserver",
        mp_ctx: Optional[BaseContext] = None,
        metrics_out_queue: mp.Queue = None,
        should_log: bool = True,
        max_processes: Optional[int] = None,
    ) -> None:

        self._is_waiting = False
        self._is_closed = True
        self.should_log = should_log
        self.max_processes = max_processes

        assert (
            sampler_fn_args is not None and len(sampler_fn_args) > 0
        ), "number of processes to be created should be greater than 0"

        self._num_task_samplers = len(sampler_fn_args)
        self._num_processes = (
            self._num_task_samplers
            if max_processes is None
            else min(max_processes, self._num_task_samplers)
        )

        self._auto_resample_when_done = auto_resample_when_done

        assert (multiprocessing_start_method is None) != (
            mp_ctx is None
        ), "Exactly one of `multiprocessing_start_method` and `mp_ctx` must be not None."
        if multiprocessing_start_method is not None:
            assert multiprocessing_start_method in self._valid_start_methods, (
                "multiprocessing_start_method must be one of {}. Got '{}'"
            ).format(self._valid_start_methods, multiprocessing_start_method)
            self._mp_ctx = mp.get_context(multiprocessing_start_method)
        else:
            self._mp_ctx = typing.cast(BaseContext, mp_ctx)

        self.metrics_out_queue = metrics_out_queue or self._mp_ctx.Queue()

        self.npaused_per_process = [0] * self._num_processes
        self.sampler_index_to_process_ind_and_subprocess_ind: Optional[
            List[List[int]]
        ] = None
        self._reset_sampler_index_to_process_ind_and_subprocess_ind()

        self._workers: Optional[List] = None
        for args in sampler_fn_args:
            args["mp_ctx"] = self._mp_ctx
        (
            self._connection_read_fns,
            self._connection_write_fns,
        ) = self._spawn_workers(  # noqa
            make_sampler_fn=make_sampler_fn,
            sampler_fn_args_list=[
                args_list for args_list in self._partition_to_processes(sampler_fn_args)
            ],
        )

        self._is_closed = False

        for write_fn in self._connection_write_fns:
            write_fn((OBSERVATION_SPACE_COMMAND, None))

        observation_spaces = [
            space for read_fn in self._connection_read_fns for space in read_fn()
        ]

        if any(os is None for os in observation_spaces):
            raise NotImplementedError(
                "It appears that the `all_observation_spaces_equal`"
                " is not True for some task sampler created by"
                " VectorSampledTasks. This is not currently supported."
            )

        if any(observation_spaces[0] != os for os in observation_spaces):
            raise NotImplementedError(
                "It appears that the observation spaces of the samplers"
                " created in VectorSampledTasks are not equal."
                " This is not currently supported."
            )

        self.observation_space = observation_spaces[0]
        for write_fn in self._connection_write_fns:
            write_fn((ACTION_SPACE_COMMAND, None))
        self.action_spaces = [
            space for read_fn in self._connection_read_fns for space in read_fn()
        ]
Example 10
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--epochs', default='last', type=str)
    parser.add_argument('-d', '--devices', default='1', type=str)
    parser.add_argument('-v', '--verbose', default=False, action='store_true')
    parser.add_argument('--show_image',
                        '-s',
                        default=False,
                        action='store_true')
    parser.add_argument('--save_path', '-p', default=None)

    args = parser.parse_args()
    all_dev = parse_devices(args.devices)

    mp_ctx = mp.get_context('spawn')
    network = CPNet(config.num_classes, criterion=None)
    data_setting = {
        'img_root': config.img_root_folder,
        'gt_root': config.gt_root_folder,
        'train_source': config.train_source,
        'eval_source': config.eval_source
    }
    dataset = ADE(data_setting, 'val', None)

    with torch.no_grad():
        segmentor = SegEvaluator(dataset, config.num_classes,
                                 config.image_mean, config.image_std, network,
                                 config.eval_scale_array, config.eval_flip,
                                 all_dev, args.verbose, args.save_path,
                                 args.show_image)
Example 11
import torch
from torch import nn
from torch import optim
from torch import autograd
from torch import multiprocessing
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import TensorDataset
import numpy as np
import os

multiprocessing = multiprocessing.get_context('spawn')


class Concept(nn.Module):
    def __init__(self):
        super(Concept, self).__init__()

        self.net = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=0),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=0),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64, momentum=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
Example 12
    def train(self):
        if self.multiprocessing and self.view_id < 0:
            import torch.multiprocessing as mp
            mp = mp.get_context('spawn')
            process = []
            start = time.time()
            for v in range(self.n_view):
                process.append(mp.Process(target=self.train_view, args=(v, )))
                process[v].daemon = True
                process[v].start()
            for p in process:
                p.join()
        elif self.view_id >= 0:
            start = time.time()
            self.train_view(self.view_id)
        else:
            start = time.time()
            for v in range(self.n_view):
                self.train_view(v)
        end = time.time()
        running_time = end - start
        if self.runing_time:
            print('running_time: ' + str(running_time))
            return running_time

        if not self.just_valid:
            # valid = [self.resutls[v][0] for v in range(self.n_view)]
            # test = [self.resutls[v][1] for v in range(self.n_view)]
            valid_fea, valid_lab, test_fea, test_lab = [], [], [], []
            for v in range(self.n_view):
                tmp = sio.loadmat('features/' + self.datasets + '_' + str(v) + '.mat')
                valid_fea.append(tmp['valid_fea'])
                valid_lab.append(tmp['valid_lab'].reshape([-1])
                                 if min(tmp['valid_lab'].shape) == 1 else tmp['valid_lab'])
                test_fea.append(tmp['test_fea'])
                test_lab.append(tmp['test_lab'].reshape([-1])
                                if min(tmp['test_lab'].shape) == 1 else tmp['test_lab'])

            valid_results = utils.multi_test(valid_fea, valid_lab, self.MAP)
            test_results = utils.multi_test(test_fea, test_lab, self.MAP)
            print("valid results: " + self.view_result(valid_results) +
                  ",\t test resutls:" + self.view_result(test_results))
            sio.savemat(
                'features/' + self.datasets + '_SDML_test_feature_results.mat',
                {
                    'test': test_fea,
                    'test_labels': test_lab
                })

            return valid_results, test_results
        else:
            return (
                np.concatenate([np.array(loss).reshape([1, -1]) for loss in self.val_d_loss], axis=0),
                np.concatenate([np.array(loss).reshape([1, -1]) for loss in self.tr_d_loss], axis=0),
                np.concatenate([np.array(loss).reshape([1, -1]) for loss in self.tr_ae_loss], axis=0),
            )
Example 13
def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
    r"""Applies each `module` in :attr:`modules` in parallel on arguments
    contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
    on each of :attr:`devices`.
    Args:
        modules (Module): modules to be parallelized
        inputs (tensor): inputs to the modules
        devices (list of int or torch.device): CUDA devices
    :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
    :attr:`devices` (if given) should all have same length. Moreover, each
    element of :attr:`inputs` can either be a single object as the only argument
    to a module, or a collection of positional arguments.
    """
    assert len(modules) == len(inputs)
    if kwargs_tup is not None:
        assert len(modules) == len(kwargs_tup)
    else:
        kwargs_tup = ({}, ) * len(modules)
    if devices is not None:
        assert len(modules) == len(devices)
    else:
        devices = [None] * len(modules)
    devices = list(map(lambda x: _get_device_index(x, True), devices))
    context = mp.get_context('spawn')
    # lock = threading.Lock()
    # results = {}
    # results = []
    results_queue = context.Queue(len(devices))
    grad_enabled = torch.is_grad_enabled()

    def _worker(i, module, input, kwargs, device=None):
        torch.set_grad_enabled(grad_enabled)
        if device is None:
            device = get_a_var(input).get_device()
        try:
            with torch.cuda.device(device):
                # this also avoids accidental slicing of `input` if it is a Tensor
                if not isinstance(input, (list, tuple)):
                    input = (input, )
                output = module(*input, **kwargs)
            results_queue.put(output)
            # with lock:
            #     results[i] = output
        except Exception as e:
            results_queue.put(e)
            # with lock:
            #     results[i] = e

    if len(modules) > 1:
        processes = [
            context.Process(target=_worker,
                            args=(i, module, input, kwargs, device))
            for i, (
                module, input, kwargs,
                device) in enumerate(zip(modules, inputs, kwargs_tup, devices))
        ]

        for process in processes:
            process.start()
        for process in processes:
            process.join()
    else:
        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])

    outputs = []
    for i in range(len(inputs)):
        output = results_queue.get()
        if isinstance(output, Exception):
            raise output
        outputs.append(output)
    return outputs
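
For reference, a CPU-only sketch of the same queue-based result collection, with a hypothetical `_square_worker` in place of CUDA modules; each result is tagged with its input index so the parent can restore input order:

import multiprocessing as mp

def _square_worker(i, x, results_queue):
    # tag the result with its input index so the parent can restore order
    results_queue.put((i, x * x))

if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    inputs = [1, 2, 3, 4]
    results_queue = ctx.Queue(len(inputs))
    processes = [
        ctx.Process(target=_square_worker, args=(i, x, results_queue))
        for i, x in enumerate(inputs)
    ]
    for p in processes:
        p.start()
    outputs = [None] * len(inputs)
    for _ in inputs:
        i, value = results_queue.get()
        outputs[i] = value
    for p in processes:
        p.join()
    print(outputs)  # [1, 4, 9, 16]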
Example 14
 def test_cuda_parameter_sharing(self):
     param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5))
     self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True)
Example 15
 def test_cuda(self):
     torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
     self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
Example 16
def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    if flags.xpid is None:
        flags.xpid = 'torchbeast-%s' % time.strftime('%Y%m%d-%H%M%S')
    plogger = file_writer.FileWriter(
        xpid=flags.xpid,
        xp_args=flags.__dict__,
        rootdir=flags.savedir,
        symlink_latest=False,
    )
    checkpointpath = os.path.expandvars(
        os.path.expanduser('%s/%s/%s' %
                           (flags.savedir, flags.xpid, 'model.tar')))

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info(f'Using CUDA {flags.cuda_id}')
        flags.device = torch.device(f'cuda:{flags.cuda_id}')
    else:
        logging.info('Not using CUDA.')
        flags.device = torch.device('cpu')

    env = Net.create_env(flags)
    model = Net.make(flags, env)
    buffers = create_buffers(env.observation_space, len(env.action_space),
                             flags)

    model.share_memory()

    actor_processes = []
    ctx = mp.get_context('fork')
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(target=act,
                            args=(i, free_queue, full_queue, model, buffers,
                                  flags))
        actor.start()
        actor_processes.append(actor)

    learner_model = Net.make(flags, env).to(device=flags.device)

    optimizer = torch.optim.RMSprop(learner_model.parameters(),
                                    lr=flags.learning_rate,
                                    momentum=flags.momentum,
                                    eps=flags.epsilon,
                                    alpha=flags.alpha)

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_frames) / flags.total_frames

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    if flags.resume:
        save = torch.load(flags.resume, map_location='cpu')
        learner_model.load_state_dict(save['model_state_dict'])
        optimizer.load_state_dict(save['optimizer_state_dict'])
        if flags.resume_scheduler:
            scheduler.load_state_dict(save['scheduler_state_dict'])
        # tune only the embedding layer
        if flags.resume_strategy == 'emb':
            keep = []
            for group in optimizer.param_groups:
                if group['params'][0].size() == (len(learner_model.vocab),
                                                 flags.demb):
                    keep.append(group)
            optimizer.param_groups = keep

    logger = logging.getLogger('logfile')
    stat_keys = [
        'total_loss',
        'mean_episode_return',
        'pg_loss',
        'baseline_loss',
        'entropy_loss',
        'aux_loss',
        'mean_win_rate',
        'mean_episode_len',
    ]
    logger.info('# Step\t%s', '\t'.join(stat_keys))

    frames, stats = 0, {}

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal frames, stats
        timings = prof.Timings()
        while frames < flags.total_frames:
            timings.reset()
            batch = get_batch(free_queue, full_queue, buffers, flags, timings)
            tmp_t1 = time.time()
            stats = learn(model, learner_model, batch, optimizer, scheduler,
                          flags)
            tmp_t2 = time.time()
            print('learn time: ', tmp_t2 - tmp_t1)
            timings.time('learn')
            with lock:
                to_log = dict(frames=frames)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                frames += T * B
        print('learned frames: ', frames)
        if i == 0:
            logging.info('Batch and learn: %s', timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_threads):
        thread = threading.Thread(target=batch_and_learn,
                                  name='batch-and-learn-%d' % i,
                                  args=(i, ))
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info('Saving checkpoint to %s', checkpointpath)
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'flags': vars(flags),
            }, checkpointpath)

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while frames < flags.total_frames:
            start_frames = frames
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            fps = (frames - start_frames) / (timer() - start_time)
            if stats.get('episode_returns', None):
                mean_return = 'Return per episode: %.1f. ' % stats[
                    'mean_episode_return']
            else:
                mean_return = ''
            total_loss = stats.get('total_loss', float('inf'))
            logging.info('After %i frames: loss %f @ %.1f fps. %sStats:\n%s',
                         frames, total_loss, fps, mean_return,
                         pprint.pformat(stats))
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info('Learning finished after %d frames.', frames)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()
Example 17
def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")
    plogger = file_writer.FileWriter(xpid=flags.xpid,
                                     xp_args=flags.__dict__,
                                     rootdir=flags.savedir)
    checkpointpath = os.path.expandvars(
        os.path.expanduser("%s/%s/%s" %
                           (flags.savedir, flags.xpid, "model.tar")))

    if flags.num_buffers is None:  # Set sensible default for num_buffers.
        flags.num_buffers = max(2 * flags.num_actors, flags.batch_size)
    if flags.num_actors >= flags.num_buffers:
        raise ValueError("num_buffers should be larger than num_actors")
    if flags.num_buffers < flags.batch_size:
        raise ValueError("num_buffers should be larger than batch_size")

    T = flags.unroll_length
    B = flags.batch_size

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logging.info("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logging.info("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)
    Net = get_model(flags)

    model = Net(env.observation_space.shape, env.action_space, flags.use_lstm)

    def get_action_shape(flags, env: gym.Env) -> typing.Tuple[int]:
        # return the action shape as a tuple
        if flags.env_type == 'mujoco':
            return env.action_space.shape
        if not flags.env_type == 'atari':
            raise ValueError("Unknown env_type")
        return (env.action_space.n, )

    action_shape = get_action_shape(flags, env)
    buffers = create_buffers(flags, env.observation_space.shape, action_shape,
                             get_num_agents(flags))

    model.share_memory()

    # Add initial RNN state.
    initial_agent_state_buffers = []
    for _ in range(flags.num_buffers):
        state = model.initial_state(batch_size=1)
        for t in state:
            t.share_memory_()
        initial_agent_state_buffers.append(state)

    actor_processes = []
    ctx = mp.get_context("fork")
    free_queue = ctx.SimpleQueue()
    full_queue = ctx.SimpleQueue()

    for i in range(flags.num_actors):
        actor = ctx.Process(
            target=act,
            args=(
                flags,
                i,
                free_queue,
                full_queue,
                model,
                buffers,
                initial_agent_state_buffers,
            ),
        )
        actor.start()
        actor_processes.append(actor)
    Net = get_model(flags)
    learner_model = Net(env.observation_space.shape, env.action_space,
                        flags.use_lstm).to(device=flags.device)

    optimizer = torch.optim.RMSprop(
        learner_model.parameters(),
        lr=flags.learning_rate,
        momentum=flags.momentum,
        eps=flags.epsilon,
        alpha=flags.alpha,
    )

    def lr_lambda(epoch):
        return 1 - min(epoch * T * B, flags.total_steps) / flags.total_steps

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    logger = logging.getLogger("logfile")
    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger.info("# Step\t%s", "\t".join(stat_keys))

    step, stats = 0, {}

    def batch_and_learn(i, lock=threading.Lock()):
        """Thread target for the learning process."""
        nonlocal step, stats
        timings = prof.Timings()
        while step < flags.total_steps:
            timings.reset()
            batch, agent_state = get_batch(
                flags,
                free_queue,
                full_queue,
                buffers,
                initial_agent_state_buffers,
                timings,
            )
            stats = learn(flags, model, learner_model, batch, agent_state,
                          optimizer, scheduler)
            timings.time("learn")
            with lock:
                to_log = dict(step=step)
                to_log.update({k: stats[k] for k in stat_keys})
                plogger.log(to_log)
                step += T * B

        if i == 0:
            logging.info("Batch and learn: %s", timings.summary())

    for m in range(flags.num_buffers):
        free_queue.put(m)

    threads = []
    for i in range(flags.num_learner_threads):
        thread = threading.Thread(target=batch_and_learn,
                                  name="batch-and-learn-%d" % i,
                                  args=(i, ))
        thread.start()
        threads.append(thread)

    def checkpoint():
        if flags.disable_checkpoint:
            return
        logging.info("Saving checkpoint to %s", checkpointpath)
        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
                "flags": vars(flags),
            },
            checkpointpath,
        )

    timer = timeit.default_timer
    try:
        last_checkpoint_time = timer()
        while step < flags.total_steps:
            start_step = step
            start_time = timer()
            time.sleep(5)

            if timer() - last_checkpoint_time > 10 * 60:  # Save every 10 min.
                checkpoint()
                last_checkpoint_time = timer()

            sps = (step - start_step) / (timer() - start_time)
            if stats.get("episode_returns", None):
                mean_return = ("Return per episode: %.1f. " %
                               stats["mean_episode_return"])
            else:
                mean_return = ""
            total_loss = stats.get("total_loss", float("inf"))
            logging.info(
                "Steps %i @ %.1f SPS. Loss %f. %sStats:\n%s",
                step,
                sps,
                total_loss,
                mean_return,
                pprint.pformat(stats),
            )
            # for mujoco, need to manually print...
            print(f"sps is: {sps}")
            print(pprint.pformat(stats))
    except KeyboardInterrupt:
        return  # Try joining actors then quit.
    else:
        for thread in threads:
            thread.join()
        logging.info("Learning finished after %d steps.", step)
    finally:
        for _ in range(flags.num_actors):
            free_queue.put(None)
        for actor in actor_processes:
            actor.join(timeout=1)

    checkpoint()
    plogger.close()
Example 18
        if reg_scores is not None:
            reg_scores = reg_scores.view(-1, num_class, 2)
            reg_scores[:, :,
                       0] = reg_scores[:, :, 0] * stats[1, 0] + stats[0, 0]
            reg_scores[:, :,
                       1] = reg_scores[:, :, 1] * stats[1, 1] + stats[0, 1]

        # perform stpp on scores
        result_queue.put((dataset.video_list[index].id, rel_props.numpy(),
                          act_scores.cpu().numpy(), comp_scores.cpu().numpy(),
                          reg_scores.cpu().numpy(), 0))


if __name__ == '__main__':
    ctx = multiprocessing.get_context(
        'spawn'
    )  # this is crucial to using multiprocessing processes with PyTorch

    # This net is used to provide setup settings. It is not used for testing.

    checkpoint = torch.load(args.weights,
                            map_location=lambda storage, loc: storage.cuda(0))

    print("model epoch {} loss: {}".format(checkpoint['epoch'],
                                           checkpoint['best_loss']))
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    stats = checkpoint['reg_stats'].cpu().numpy()
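
The comment above reflects the usual reason for pairing PyTorch with the 'spawn' start method: CUDA cannot be safely re-initialized in a forked child. A minimal sketch (hypothetical `_consume` worker, requires a GPU) of passing a CUDA tensor through a spawn-context queue:

import torch
import torch.multiprocessing as mp

def _consume(q):
    t = q.get()
    print(t.device, t.sum().item())

if __name__ == '__main__':
    if torch.cuda.is_available():
        ctx = mp.get_context('spawn')
        q = ctx.Queue()
        p = ctx.Process(target=_consume, args=(q,))
        p.start()
        t = torch.ones(4, device='cuda')
        q.put(t)
        p.join()  # keep `t` referenced until the consumer is done with it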
Example 19
class GPUProcess(mp.get_context("spawn").Process):
    def __init__(
        self,
        gpu_idx: GPURank,
        subprocess_init: Optional[Callable[[], None]] = None,
        embedding_storage_freelist: Optional[Set[torch.FloatStorage]] = None,
    ) -> None:
        super().__init__(daemon=True, name=f"GPU #{gpu_idx}")
        self.gpu_idx = gpu_idx

        self.master_endpoint, self.worker_endpoint = mp.get_context("spawn").Pipe()
        self.subprocess_init = subprocess_init
        self.sub_holder: Dict[
            Tuple[EntityName, Partition, SubPartition],
            Tuple[torch.nn.Parameter, RowAdagrad],
        ] = {}
        self.embedding_storage_freelist = embedding_storage_freelist

    @property
    def my_device(self) -> torch.device:
        return torch.device("cuda", index=self.gpu_idx)

    def run(self) -> None:
        torch.set_num_threads(1)
        torch.cuda.set_device(self.my_device)
        if self.subprocess_init is not None:
            self.subprocess_init()
        self.master_endpoint.close()

        for s in self.embedding_storage_freelist:
            assert s.is_shared()
            cudart = torch.cuda.cudart()
            res = cudart.cudaHostRegister(s.data_ptr(), s.size() * s.element_size(), 0)
            torch.cuda.check_error(res)
            assert s.is_pinned()
        logger.info(f"GPU subprocess {self.gpu_idx} up and running")
        while True:
            try:
                job: SubprocessArgs = self.worker_endpoint.recv()
            except EOFError:
                break

            stats = self.do_one_job(
                lhs_types=job.lhs_types,
                rhs_types=job.rhs_types,
                lhs_part=job.lhs_part,
                rhs_part=job.rhs_part,
                lhs_subpart=job.lhs_subpart,
                rhs_subpart=job.rhs_subpart,
                next_lhs_subpart=job.next_lhs_subpart,
                next_rhs_subpart=job.next_rhs_subpart,
                model=job.model,
                trainer=job.trainer,
                all_embs=job.all_embs,
                subpart_slices=job.subpart_slices,
                subbuckets=job.subbuckets,
                batch_size=job.batch_size,
                lr=job.lr,
            )

            self.worker_endpoint.send(
                SubprocessReturn(gpu_idx=self.gpu_idx, stats=stats)
            )

    def do_one_job(  # noqa
        self,
        lhs_types: Set[str],
        rhs_types: Set[str],
        lhs_part: Partition,
        rhs_part: Partition,
        lhs_subpart: SubPartition,
        rhs_subpart: SubPartition,
        next_lhs_subpart: Optional[SubPartition],
        next_rhs_subpart: Optional[SubPartition],
        model: MultiRelationEmbedder,
        trainer: Trainer,
        all_embs: Dict[Tuple[EntityName, Partition], FloatTensorType],
        subpart_slices: Dict[Tuple[EntityName, Partition, SubPartition], slice],
        subbuckets: Dict[
            Tuple[int, int], Tuple[LongTensorType, LongTensorType, LongTensorType]
        ],
        batch_size: int,
        lr: float,
    ) -> Stats:
        tk = TimeKeeper()

        for embeddings in all_embs.values():
            assert embeddings.is_pinned()

        occurrences: Dict[
            Tuple[EntityName, Partition, SubPartition], Set[Side]
        ] = defaultdict(set)
        for entity_name in lhs_types:
            occurrences[entity_name, lhs_part, lhs_subpart].add(Side.LHS)
        for entity_name in rhs_types:
            occurrences[entity_name, rhs_part, rhs_subpart].add(Side.RHS)

        if lhs_part != rhs_part:  # Bipartite
            assert all(len(v) == 1 for v in occurrences.values())

        tk.start("copy_to_device")
        for entity_name, part, subpart in occurrences.keys():
            if (entity_name, part, subpart) in self.sub_holder:
                continue
            embeddings = all_embs[entity_name, part]
            optimizer = trainer.partitioned_optimizers[entity_name, part]
            subpart_slice = subpart_slices[entity_name, part, subpart]

            # TODO have two permanent storages on GPU and move stuff in and out
            # from them
            # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * embeddings.shape[1] * 4:,} bytes")
            gpu_embeddings = torch.empty(
                (subpart_slice.stop - subpart_slice.start, embeddings.shape[1]),
                dtype=torch.float32,
                device=self.my_device,
            )
            gpu_embeddings.copy_(embeddings[subpart_slice], non_blocking=True)
            gpu_embeddings = torch.nn.Parameter(gpu_embeddings)
            gpu_optimizer = RowAdagrad([gpu_embeddings], lr=lr)
            (cpu_state,) = optimizer.state.values()
            (gpu_state,) = gpu_optimizer.state.values()
            # logger.info(f"GPU #{self.gpu_idx} allocating {(subpart_slice.stop - subpart_slice.start) * 4:,} bytes")
            gpu_state["sum"].copy_(cpu_state["sum"][subpart_slice], non_blocking=True)

            self.sub_holder[entity_name, part, subpart] = (
                gpu_embeddings,
                gpu_optimizer,
            )
        logger.debug(
            f"Time spent copying subparts to GPU: {tk.stop('copy_to_device'):.4f} s"
        )

        for (
            (entity_name, part, subpart),
            (gpu_embeddings, gpu_optimizer),
        ) in self.sub_holder.items():
            for side in occurrences[entity_name, part, subpart]:
                model.set_embeddings(entity_name, side, gpu_embeddings)
                trainer.partitioned_optimizers[
                    entity_name, part, subpart
                ] = gpu_optimizer

        tk.start("translate_edges")
        num_edges = subbuckets[lhs_subpart, rhs_subpart][0].shape[0]
        edge_perm = torch.randperm(num_edges)
        edges_lhs, edges_rhs, edges_rel = subbuckets[lhs_subpart, rhs_subpart]
        _C.shuffle(edges_lhs, edge_perm, os.cpu_count())
        _C.shuffle(edges_rhs, edge_perm, os.cpu_count())
        _C.shuffle(edges_rel, edge_perm, os.cpu_count())
        assert edges_lhs.is_pinned()
        assert edges_rhs.is_pinned()
        assert edges_rel.is_pinned()
        gpu_edges = EdgeList(
            EntityList.from_tensor(edges_lhs),
            EntityList.from_tensor(edges_rhs),
            edges_rel,
        ).to(self.my_device, non_blocking=True)
        logger.debug(f"GPU #{self.gpu_idx} got {num_edges} edges")
        logger.debug(
            f"Time spent copying edges to GPU: {tk.stop('translate_edges'):.4f} s"
        )

        tk.start("processing")
        stats = process_in_batches(
            batch_size=batch_size, model=model, batch_processor=trainer, edges=gpu_edges
        )
        logger.debug(f"Time spent processing: {tk.stop('processing'):.4f} s")

        next_occurrences: Dict[
            Tuple[EntityName, Partition, SubPartition], Set[Side]
        ] = defaultdict(set)
        if next_lhs_subpart is not None:
            for entity_name in lhs_types:
                next_occurrences[entity_name, lhs_part, next_lhs_subpart].add(Side.LHS)
        if next_rhs_subpart is not None:
            for entity_name in rhs_types:
                next_occurrences[entity_name, rhs_part, next_rhs_subpart].add(Side.RHS)

        tk.start("copy_from_device")
        for (entity_name, part, subpart), (gpu_embeddings, gpu_optimizer) in list(
            self.sub_holder.items()
        ):
            if (entity_name, part, subpart) in next_occurrences:
                continue
            embeddings = all_embs[entity_name, part]
            optimizer = trainer.partitioned_optimizers[entity_name, part]
            subpart_slice = subpart_slices[entity_name, part, subpart]

            embeddings[subpart_slice].data.copy_(
                gpu_embeddings.detach(), non_blocking=True
            )
            del gpu_embeddings
            (cpu_state,) = optimizer.state.values()
            (gpu_state,) = gpu_optimizer.state.values()
            cpu_state["sum"][subpart_slice].copy_(gpu_state["sum"], non_blocking=True)
            del gpu_state["sum"]
            del self.sub_holder[entity_name, part, subpart]
        logger.debug(
            f"Time spent copying subparts from GPU: {tk.stop('copy_from_device'):.4f} s"
        )

        logger.debug(f"do_one_job: Time unaccounted for: {tk.unaccounted():.4f} s")

        return stats
Example 20
from .zmq_adapter import InitSender, WaitAll, SendAll
import abc
import torch.multiprocessing as _mp
mp = _mp.get_context('spawn')

class Simulator(mp.Process):
    '''
    Wrapper for simulator.
    Functions to override:
        on_init: Initialization after the process has started.
        restart: restart the environment.
        terminal: property that tells whether the game has reached a terminal state.
        get_key: from the key, get the content. e.g. ``get_key("s")`` will give the encoded state of the game.
        set_key: set the key from replies. e.g., ``set_key("a", 2)`` sets the action to 2 (and the underlying game can continue).
    '''
    def __init__(self, id, desc):
        '''
        Example:
        desc = dict(
            actor = dict(
                input = dict(s="", last_terminal=""),
                reply = dict(a="")
                connector = "name1"
            ),
            train = dict(
                input = dict(s="", r="", a=""),
                reply = None,
                connector = "name2"
            )
        )
        '''
Example 21
 def test_cuda(self):
     torch.cuda.FloatTensor([1])  # initialize CUDA outside of leak checker
     self._test_sharing(mp.get_context('spawn'), torch.cuda.FloatTensor)
Example 22
    def __init__(
        self,
        make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn,
        env_fn_args: Sequence[Tuple] = None,
        auto_reset_done: bool = True,
        multiprocessing_start_method: str = "forkserver",
        workers_ignore_signals: bool = False,
    ) -> None:
        """..

        :param make_env_fn: function which creates a single environment. An
            environment can be of type :ref:`env.Env` or :ref:`env.RLEnv`
        :param env_fn_args: tuple of tuple of args to pass to the
            :ref:`_make_env_fn`.
        :param auto_reset_done: automatically reset the environment when
            done. This functionality is provided for seamless training
            of vectorized environments.
        :param multiprocessing_start_method: the multiprocessing method used to
            spawn worker processes. Valid methods are
            :py:`{'spawn', 'forkserver', 'fork'}`; :py:`'forkserver'` is the
            recommended method as it works well with CUDA. If :py:`'fork'` is
            used, the subprocess must be started before any other GPU usage.
        :param workers_ignore_signals: Whether or not workers will ignore SIGINT and SIGTERM
            and instead will only exit when :ref:`close` is called
        """
        self._is_waiting = False
        self._is_closed = True

        assert (
            env_fn_args is not None and len(env_fn_args) > 0
        ), "number of environments to be created should be greater than 0"

        self._num_envs = len(env_fn_args)

        assert multiprocessing_start_method in self._valid_start_methods, (
            "multiprocessing_start_method must be one of {}. Got '{}'").format(
                self._valid_start_methods, multiprocessing_start_method)
        self._auto_reset_done = auto_reset_done
        self._mp_ctx = mp.get_context(multiprocessing_start_method)
        self._workers = []
        (
            self._connection_read_fns,
            self._connection_write_fns,
        ) = self._spawn_workers(  # noqa
            env_fn_args,
            make_env_fn,
            workers_ignore_signals=workers_ignore_signals,
        )

        self._is_closed = False

        for write_fn in self._connection_write_fns:
            write_fn((OBSERVATION_SPACE_COMMAND, None))
        self.observation_spaces = [
            read_fn() for read_fn in self._connection_read_fns
        ]
        for write_fn in self._connection_write_fns:
            write_fn((ACTION_SPACE_COMMAND, None))
        self.action_spaces = [
            read_fn() for read_fn in self._connection_read_fns
        ]
        for write_fn in self._connection_write_fns:
            write_fn((NUMBER_OF_EPISODES_COMMAND, None))
        self.number_of_episodes = [
            read_fn() for read_fn in self._connection_read_fns
        ]
        self._paused: List[Tuple] = []
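
A tiny standalone sketch (the `make_mp_context` helper and `VALID_START_METHODS` set are illustrative, not the library's API) of the start-method validation and context creation performed by this constructor:

import multiprocessing as mp

VALID_START_METHODS = {"forkserver", "spawn", "fork"}

def make_mp_context(method: str = "forkserver"):
    assert method in VALID_START_METHODS, (
        "multiprocessing_start_method must be one of {}. Got '{}'"
    ).format(VALID_START_METHODS, method)
    # note: 'fork' and 'forkserver' are unavailable on Windows
    return mp.get_context(method)

if __name__ == "__main__":
    ctx = make_mp_context("spawn")
    print(type(ctx).__name__)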
Example 23
if __name__ == '__main__':
    parser = get_parser()
    argv = sys.argv[1:]
    args, _ = parser.parse_known_args(argv)

    if args.config == 'multiwoz':
        config = MultiWozConfig()
    else:
        raise NotImplementedError(
            'Config of the dataset {} not implemented'.format(args.config))

    init_logging_handler(args.log_dir)
    logging.debug(str(args))

    try:
        mp = mp.get_context('spawn')
    except RuntimeError:
        pass

    if args.pretrain:
        pass

    elif args.test:
        pass

    else:  # training
        current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
        logging.debug('train {}'.format(current_time))

        print("training start")
Example 24
            output[cnt: cnt + sc.size(0), :] = sc
            cnt += sc.size(0)
        act_scores, comp_scores, reg_scores = reorg_stpp.forward(output, prop_ticks, prop_scaling)

        if reg_scores is not None:
            reg_scores = reg_scores.view(-1, num_class, 2)
            reg_scores[:, :, 0] = reg_scores[:, :, 0] * stats[1, 0] + stats[0, 0]
            reg_scores[:, :, 1] = reg_scores[:, :, 1] * stats[1, 1] + stats[0, 1]

        # perform stpp on scores
        result_queue.put((dataset.video_list[index].id, rel_props.numpy(), act_scores.cpu().numpy(), \
               comp_scores.cpu().numpy(), reg_scores.cpu().numpy(), output.cpu().numpy()))


if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')  # this is crucial to using multiprocessing processes with PyTorch

    # This net is used to provide setup settings. It is not used for testing.
    net = SSN(num_class, 2, 5, 2,
              args.modality, test_mode=True,
              base_model=args.arch, no_regression=args.no_regression, stpp_cfg=stpp_configs)

    if args.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif args.test_crops == 10:
        cropping = torchvision.transforms.Compose([
            GroupOverSample(net.input_size, net.scale_size)
        ])
Example 25
 def test_cuda_variable_sharing(self):
     for requires_grad in [True, False]:
         var = torch.arange(1., 26, device='cuda').view(5, 5).requires_grad_(requires_grad)
         self._test_autograd_sharing(var, mp.get_context('spawn'))