Example #1
    def log_fn(self, stop_event: Event):
        try:
            self._super_create_loggers()
            self.resposne_queue.put({
                k: self.__dict__[k]
                for k in ["save_dir", "tb_logdir", "is_sweep"]
            })

            while True:
                try:
                    cmd = self.draw_queue.get(True, 0.1)
                except EmptyQueue:
                    if stop_event.is_set():
                        break
                    else:
                        continue

                self._super_log(*cmd)
                self.resposne_queue.put(True)
        except:
            print("Logger process crashed.")
            raise
        finally:
            print("Logger: syncing")
            if self.use_wandb:
                wandb.join()

            stop_event.set()
            print("Logger process terminating...")
Example #2
class TensorEvent:
    """Basically a tuple of several torch.Tensors and a multiprocessing.Event.

  The Tensors can be used as "shared tensors" for passing intermediate tensors
  across processes.

  The Event should be used to signal that the consumer process has finished
  reading from the Tensors. When writing values to Tensors, the producer
  process should first check if Tensors are free, by calling event.wait(). If
  the Tensors are indeed free, then event.wait() will return at once. If not,
  then event.wait() will block until the consumer process calls event.set().
  Thus, the consumer should make sure that it calls event.set() AFTER the
  Tensors' contents have been copied to a safe area, such as the consumer's own
  local tensors.

  This class also includes an Array object living on shared memory, consisting
  of integers for indicating the valid region in each tensor. For example, if
  a process uses only 3 rows of a 4-row tensor, then the corresponding entry
  in the Array would be set to 3. Later, when values are read from the tensor
  by another process, that process would first check the Array value and know
  that it can ignore the final row.
  """
    def __init__(self, shapes, device, dtype=torch.float32):
        self.tensors = tuple(
            torch.empty(*shape, dtype=dtype, device=device)
            for shape in shapes)
        self.event = Event()
        self.event.set()
        self.valid_batch_sizes = Array('i', len(shapes))
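A minimal sketch of the hand-off the docstring above describes, assuming a single shared tensor and illustrative producer/consumer functions that are not part of the original class:

import torch
from torch.multiprocessing import Array, Event, Process


def producer(shared, event, valid):
    event.wait()                    # wait until the consumer says the buffer is free
    event.clear()                   # mark the buffer as in use
    rows = 3                        # pretend only 3 of the 4 rows are valid
    shared[:rows] = torch.randn(rows, 8)
    valid[0] = rows                 # record the valid region in the shared Array


def consumer(shared, event, valid):
    rows = valid[0]
    local = shared[:rows].clone()   # copy into a process-local tensor first
    event.set()                     # only now is the shared buffer free again
    print(local.shape)


if __name__ == "__main__":
    shared = torch.empty(4, 8).share_memory_()
    event = Event()
    event.set()                     # the buffer starts out free
    valid = Array('i', 1)
    p = Process(target=producer, args=(shared, event, valid))
    p.start()
    p.join()
    c = Process(target=consumer, args=(shared, event, valid))
    c.start()
    c.join()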
Example #3
 def __init__(self, queue: Queue, collect_event: Event, actor_net, args):
     self._env = ManyUavEnv(1, True)
     self._queue = queue
     self._collect_event = collect_event
     self._actor = actor_net
     self._args = args
     self.event = Event()
Example #4
    def create_loggers(self):
        self._super_create_loggers = super().create_loggers
        self.stop_event = Event()
        self.proc = Process(target=self.log_fn, args=(self, self.stop_event))
        self.proc.start()

        atexit.register(self.finish)
Example #5
def run_in_process_group(world_size, filename, fn, inputs):
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()
    processes = []
    q = Queue()
    wait_event = Event()

    # run one process per rank
    # for rank in range(world_size - 1):
    for rank in range(world_size):
        p = Process(
            target=init_and_run_process,
            args=(rank, world_size, filename, fn, inputs[rank], q, wait_event),
        )
        p.start()
        processes.append(p)

    # fetch the results from the queue before joining, the background processes
    # need to be alive if the queue contains tensors. See
    # https://discuss.pytorch.org/t/using-torch-tensor-over-multiprocessing-queue-process-fails/2847/3  # noqa: B950
    results = []
    for _ in range(len(processes)):
        results.append(q.get())

    wait_event.set()

    for p in processes:
        p.join()
    return results
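`init_and_run_process` is not shown here; the following is only a rough sketch of what such a helper could look like, assuming a gloo backend, a file:// init method over the shared `filename`, and an `fn` that returns a picklable result. The final `wait_event.wait()` is what keeps each worker alive until the parent has drained the queue, matching the comment above.

import torch.distributed as dist


def init_and_run_process(rank, world_size, filename, fn, fn_input, q, wait_event):
    # hypothetical helper: set up the process group, run the workload,
    # publish the result, then wait for the parent before tearing down
    dist.init_process_group(
        backend="gloo",
        init_method=f"file://{filename}",
        rank=rank,
        world_size=world_size,
    )
    result = fn(fn_input)
    q.put(result)       # if this holds tensors, the process must stay alive
    wait_event.wait()   # released by the parent once all results are fetched
    dist.destroy_process_group()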
Example #6
    def __init__(self, worker_id, args):
        super().__init__()

        self.id = worker_id
        self.args = args
        # pipe_master is for the master's use, pipe_worker for the worker's
        self.pipe_master, self.pipe_worker = Pipe()
        self.exit_event = Event()

        # determine n_e
        q, r = divmod(args.n_e, args.n_w)

        if r:
            print('Warning: n_e % n_w != 0')

        if worker_id == args.n_w - 1:
            self.n_e = n_e = q + r
        else:
            self.n_e = n_e = q

        print('[Worker', self.id, '] n_e = %d' % n_e)

        self.env_start = worker_id * q
        self.env_slice = slice(self.env_start, self.env_start + n_e)
        self.env_range = range(self.env_start, self.env_start + n_e)
        self.envs = None

        self.start()
Example #7
def _worker_loop(dataset, job_queue: mp.Queue, result_queue: mp.Queue,
                 interrupt_event: mp.Event):
    logger = logging.getLogger("worker_loop")
    logger.debug("Worker started.")
    while True:
        logger.debug("Trying to fetch from job_queue.")
        if interrupt_event.is_set():
            logger.debug("Received interrupt signal, breaking.")
            break
        try:
            # This assumes that the job_queue is fully populated before the worker is started.
            index = job_queue.get_nowait()
            logger.debug("Fetch successful.")
        except Empty:
            logger.debug("Queue empty, setting up poison pill.")
            index = None
        if index is None or interrupt_event.is_set():
            logger.debug(
                "Fetched poison pill or received interrupt signal, breaking.")
            break
        try:
            logger.debug("Sampling index {} from dataset.".format(index))
            sample = dataset[index]
        except Exception:
            logger.debug("Dataset threw an exception.".format(index),
                         exc_info=1)
            result_queue.put((index, ExceptionWrapper(sys.exc_info())))
        else:
            logger.debug(
                "Putting sample at index {} in the result queue.".format(
                    index))
            result_queue.put((index, sample))
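A rough usage sketch for this worker loop, following the assumption in the comment above that `job_queue` is fully populated before the worker starts; the integer dataset and all names here are only illustrative:

import logging
import multiprocessing as mp

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    dataset = list(range(100))      # stand-in for a real dataset

    job_queue, result_queue = mp.Queue(), mp.Queue()
    interrupt_event = mp.Event()

    # fully populate the job queue before starting the worker
    for index in range(len(dataset)):
        job_queue.put(index)

    worker = mp.Process(target=_worker_loop,
                        args=(dataset, job_queue, result_queue, interrupt_event))
    worker.start()

    # drain the results before joining; each job yields exactly one result
    results = [result_queue.get() for _ in range(len(dataset))]

    interrupt_event.set()           # redundant here: the worker also exits on Empty
    worker.join()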
Example #8
 def __init__(self, shapes, device, dtype=torch.float32):
     self.tensors = tuple(
         torch.empty(*shape, dtype=dtype, device=device)
         for shape in shapes)
     self.event = Event()
     self.event.set()
     self.valid_batch_sizes = Array('i', len(shapes))
Example #9
 def __init__(self, port_out, front_sink_addr, verbose=False):
     super().__init__()
     self.port = port_out
     self.exit_flag = Event()
     self.logger = set_logger(colored('SINK', 'green'), verbose)
     self.front_sink_addr = front_sink_addr
     self.is_ready = Event()
     self.verbose = verbose
Example #10
 def __init__(self, queue: Queue, collect_event: Event, actor_net, args,
              seed):
     self._queue = queue
     self._collect_event = collect_event
     self._actor = actor_net
     self._args = args
     self.event = Event()
     self.seed = seed
Example #11
def test_routine_as_process():
    r = DummyRoutine()
    e = Event()
    r.stop_event = e
    r.as_process()
    r.start()
    e.set()
    r.runner.join()
Example #12
class DataLoader:
    def __init__(self, data_store, epochs=1):
        """
          Start the batchCreator and sampleCreator.
          Read the memory config file, and create the right number
          of processes.
        """
        self.ds = data_store
        # Event to stop batch creator and sample creator
        self.stop_sc = Event()
        self.stop_bc = Event()

        # Start separate processes for sample_creator(s)
        self.epochs = epochs
        self.sc = SampleCreator(self.ds,
                                event=self.stop_sc,
                                epochs=self.epochs,
                                sampled=self.ds.points_sampled)
        self.sc.start()

        # Start batch_creator(s)
        self.bc = BatchCreator(self.ds, self.stop_bc)
        self.bc.start()

    def get_next_batch(self):
        """
         Get the next batch from the queue
        """
        if self.ds.batch_creator_done.full():
            return None

        # Access batches from data_store.batches and return the batch
        try:
            batch = self.ds.batches.get()
            return batch
        except Exception as e:
            print(e)
            return None

    def stop_batch_creation(self):
        # Attempt to gracefully terminate the processes
        self.stop_bc.set()
        self.stop_sc.set()
        time.sleep(1)

        # Terminate the processes forcefully
        self.bc.terminate()
        self.sc.terminate()

        # Wait for child processes to end
        self.bc.join()
        self.sc.join()
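A brief usage sketch for this DataLoader, assuming a `data_store` object that already provides the attributes used above (`batches`, `points_sampled`, `batch_creator_done`); the training step is a placeholder:

loader = DataLoader(data_store, epochs=3)   # spawns the sample and batch creators

while True:
    batch = loader.get_next_batch()
    if batch is None:                       # creators finished or the queue failed
        break
    # ... run one training step on `batch` ...

loader.stop_batch_creation()                # set the stop events, then terminate and join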
Example #13
def test_routine_no_runner():
    r = DummyRoutine(name="dummy")
    with pytest.raises(NoRunnerException):
        r.start()
    e = Event()
    r.stop_event = e
    r.as_thread()
    try:
        r.start()
    except NoRunnerException:
        pytest.fail("NoRunnerException was thrown...")
    e.set()
    r.runner.join()
Example #14
def multiprocess_training_loader(process_number: int, _config,
                                 _queue: mp.Queue, _wait_for_exit: mp.Event,
                                 _local_file, _fasttext_vocab_cached_mapping,
                                 _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {
            "tokens": SingleIdTokenIndexer(lowercase_tokens=True)
        }
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {
            "tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])
        }
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping,
                               _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(
        lazy=True,
        tokenizer=_tokenizer,
        token_indexers=_token_indexers,
        max_doc_length=_config["max_doc_length"],
        max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file),
                                    num_epochs=1):

        _queue.put(
            training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait(
    )  # keep this process alive until all the shared memory is used and not needed anymore
Example #15
def test_routine_crash_message(caplog):
    r = DummyCrashingRoutine()
    e = Event()
    r.stop_event = e
    r.as_thread()
    r.start()
    time.sleep(0.01)
    e.set()
    r.runner.join()
    logs_text_list = [record[2] for record in caplog.record_tuples]
    assert any("The routine has crashed" in log for log in logs_text_list)
Example #16
class Worker:
    def __init__(self, queue: Queue, collect_event: Event, actor_net, args,
                 seed):
        self._queue = queue
        self._collect_event = collect_event
        self._actor = actor_net
        self._args = args
        self.event = Event()
        self.seed = seed

    def run(self, episode):
        env = ManyUavEnv(self._args.agents, self.seed, self._args.reward_type)
        state = env.reset()
        while True:
            self.event.set()
            self._collect_event.wait()
            actions = []
            for i in range(self._args.agents):
                action = self._choose_action_with_exploration(state[i])
                actions.append(action)
            next_state, reward, done, info = env.step(
                np.array(actions) * self._args.action_bound)

            transition = []
            for i in range(self._args.agents):
                transition.append(
                    (state[i], actions[i], reward[i], next_state[i], done))

            self._queue.put(transition)

            state = next_state
            if done:
                state = env.reset()
                with episode.get_lock():
                    episode.value += 1
            if self._queue.qsize() >= self._args.update_interval:
                self._collect_event.clear()

    def _choose_action_with_exploration(self, state):
        action = self._choose_action(state)
        noise = np.random.normal(0, self._args.scale, (2, ))
        action = np.clip(action + noise, -1, 1)  # clip action between [-1, 1]
        return action

    def _choose_action(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().to(CHIP)
            action = self._actor(state)
        action = action.detach().cpu().numpy()
        return action
Example #17
    def __init__(self, generator, max_queue_size=5, nb_worker=1):
        self.generator = generator
        self.nb_worker = nb_worker
        self.max_queue_size = max_queue_size

        self._queue = Queue()
        self._signal = Event()
        self._available_cv = Condition()
        self._full_cv = Condition()

        args = (generator, self._queue, self._signal, self._available_cv,
                self._full_cv, self.nb_worker, self.max_queue_size)
        self.working_process = Process(target=self.generator_process,
                                       args=args)
        self.working_process.daemon = True
        self.working_process.start()
Example #18
 def __init__(self, component_config, start_component=False):
     self.name = ""
     self.ROUTINES_FOLDER_PATH = "pipert/contrib/routines"
     self.MONITORING_SYSTEMS_FOLDER_PATH = "pipert/contrib/metrics_collectors"
     self.use_memory = False
     self.stop_event = Event()
     self.stop_event.set()
     self.queues = {}
     self._routines = {}
     self.metrics_collector = NullCollector()
     self.parent_logger = None
     self.logger = None
     self.setup_component(component_config)
     self.metrics_collector.setup()
     if start_component:
         self.run_comp()
Example #19
    def __init__(self, n_workers, actor, args):
        self._now_episode = Value('i', 0)

        self.queue = Queue()
        self.collect_event = Event()

        self.worker = []
        for i in range(n_workers):
            self.worker.append(
                Worker(self.queue, self.collect_event, actor, args, i))
        self.process = [
            Process(target=self.worker[i].run, args=(self._now_episode, ))
            for i in range(n_workers)
        ]

        for p in self.process:
            p.start()
        print(f'Start {n_workers} workers.')
Example #20
    def _prefetch(in_queue: mp.Queue,
                  out_queue: mp.Queue,
                  batchsize: int,
                  shutdown_event: mp.Event,
                  target_device,
                  waiting_time=5):
        """Continuously prefetches complete trajectories dropped by
        the :py:class:`~.TrajectoryStore` for training.

        As long as shutdown is not set, this method
        pulls :py:attr:`batchsize` trajectories from :py:attr:`in_queue`,
        transforms them into batches using :py:meth:`~_to_batch()`
        and puts them onto the :py:attr:`out_queue`.

        This usually runs as an asynchronous :py:obj:`multiprocessing.Process`.

        Parameters
        ----------
        in_queue: :py:obj:`multiprocessing.Queue`
            A queue that delivers dropped trajectories from :py:class:`~.TrajectoryStore`.
        out_queue: :py:obj:`multiprocessing.Queue`
            A queue that delivers batches to :py:meth:`_loop()`.
        batchsize: `int`
            The number of trajectories that shall be processed into a batch.
        shutdown_event: :py:obj:`multiprocessing.Event`
            An event that breaks this method's internal loop.
        target_device: :py:obj:`torch.device`
            The target device of the batch.
        waiting_time: `float`
            Time the methods loop sleeps between each iteration.
        """

        while not shutdown_event.is_set():
            try:
                trajectories = [
                    in_queue.get(timeout=waiting_time)
                    for _ in range(batchsize)
                ]
            except queue.Empty:
                continue

            batch = Learner._to_batch(trajectories, target_device)
            # delete Tensors after usage to free memory (see torch multiprocessing)
            del trajectories

            try:
                out_queue.put(batch)
            except (AssertionError, ValueError):  # queue closed
                continue

        # delete Tensors after usage to free memory (see torch multiprocessing)
        del batch
        try:
            del trajectories
        except UnboundLocalError:  # already deleted
            pass
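A minimal sketch of wiring `_prefetch` up as a background process, assuming it is exposed as a static method on a `Learner` class (it calls `Learner._to_batch` internally); the queues, batch size and device below are illustrative:

import multiprocessing as mp
import torch

in_queue, out_queue = mp.Queue(), mp.Queue()
shutdown_event = mp.Event()

prefetch_proc = mp.Process(
    target=Learner._prefetch,
    args=(in_queue, out_queue, 4, shutdown_event, torch.device("cpu")),
    kwargs={"waiting_time": 1.0},
    daemon=True,
)
prefetch_proc.start()

# ... the trajectory store drops trajectories into in_queue,
# ... the training loop consumes batches from out_queue ...

shutdown_event.set()    # breaks the prefetch loop
prefetch_proc.join()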
Example #21
class Worker:
    def __init__(self, queue: Queue, collect_event: Event, actor_net, args):
        self._env = ManyUavEnv(1, True)
        self._queue = queue
        self._collect_event = collect_event
        self._actor = actor_net
        self._args = args
        self.event = Event()

    def run(self, episode):
        state = self._env.reset()

        while True:
            self.event.set()
            self._collect_event.wait()

            action = self._choose_action_with_exploration(state)
            next_state, reward, done, info = self._env.step(
                action * self._args.action_bound)

            self._queue.put((state, action, reward, next_state, done))

            state = next_state
            if done:
                state = self._env.reset()
                with episode.get_lock():
                    episode.value += 1

            if self._queue.qsize() >= self._args.update_interval:
                self._collect_event.clear()

    def _choose_action_with_exploration(self, state):
        action = self._choose_action(state)
        noise = np.random.normal(0, self._args.scale, (2, ))
        action = np.clip(action + noise, -1, 1)  # clip action between [-1, 1]
        return action

    def _choose_action(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float()
            action = self._actor(state)
        action = action.detach().numpy()
        return action
Example #22
 def __init__(self,
              endpoint="tcp://0.0.0.0:4242",
              name="",
              *args,
              **kwargs):
     """
     Args:
          endpoint: the endpoint that the component's zerorpc server will
              listen on.
         *args: TBD
         **kwargs: TBD
     """
     super().__init__()
     self.name = name
     self.stop_event = Event()
     self.endpoint = endpoint
     self._routines = []
     self.zrpc = zerorpc.Server(self)
     self.zrpc.bind(endpoint)
Example #23
    def __init__(self,
                 fp16=False,
                 mean=(0., 0., 0.),
                 std=(1., 1., 1.),
                 pin_memory=True,
                 pca_jitter=False,
                 **kwargs):
        super().__init__(**kwargs)
        print('Using DALI CPU iterator')
        self.stream = torch.cuda.Stream()

        self.fp16 = fp16
        self.mean = torch.tensor(mean).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(std).cuda().view(1, 3, 1, 1)
        self.pin_memory = pin_memory
        self.pca_jitter = pca_jitter

        if self.fp16:
            self.mean = self.mean.half()
            self.std = self.std.half()

        self.proc_next_input = Event()
        self.done_event = Event()
        self.output_queue = queue.Queue(maxsize=5)
        self.preproc_thread = threading.Thread(
            target=_preproc_worker,
            kwargs={
                'dali_iterator': self._dali_iterator,
                'cuda_stream': self.stream,
                'fp16': self.fp16,
                'mean': self.mean,
                'std': self.std,
                'proc_next_input': self.proc_next_input,
                'done_event': self.done_event,
                'output_queue': self.output_queue,
                'pin_memory': self.pin_memory,
                'pca_jitter': self.pca_jitter
            })
        self.preproc_thread.daemon = True
        self.preproc_thread.start()

        self.proc_next_input.set()
Example #24
class WorkerManager:
    def __init__(self, n_workers, actor, args):
        self._now_episode = Value('i', 0)

        self.queue = Queue()
        self.collect_event = Event()

        self.worker = []
        for i in range(n_workers):
            self.worker.append(
                Worker(self.queue, self.collect_event, actor, args))
            time.sleep(1)

        self.process = [
            Process(target=self.worker[i].run, args=(self._now_episode, ))
            for i in range(n_workers)
        ]

        for p in self.process:
            p.start()
        print(f'Start {n_workers} workers.')

    def collect(self):
        result = []
        self.collect_event.set()
        while self.collect_event.is_set():
            # WAIT FOR DATA COLLECT END
            pass

        for w in self.worker:
            w.event.wait()

        while not self.queue.empty():
            result.append(self.queue.get())

        for w in self.worker:
            w.event.clear()
        return result

    def now_episode(self):
        value = self._now_episode.value
        return value
Example #25
    def __init__(self, data_store, epochs=1):
        """
          Start the batchCreator and sampleCreator.
          Read the memory config file, and create the right number
          of processes.
        """
        self.ds = data_store
        # Event to stop batch creator and sample creator
        self.stop_sc = Event()
        self.stop_bc = Event()

        # Start separate processes for sample_creator(s)
        self.epochs = epochs
        self.sc = SampleCreator(self.ds,
                                event=self.stop_sc,
                                epochs=self.epochs,
                                sampled=self.ds.points_sampled)
        self.sc.start()

        # Start batch_creator(s)
        self.bc = BatchCreator(self.ds, self.stop_bc)
        self.bc.start()
Example #26
def test_add_event_handler():
    r = DummyRoutine()
    e = Event()
    r.stop_event = e
    r.as_thread()
    r.add_event_handler(Events.BEFORE_LOGIC, dummy_before_handler)
    r.add_event_handler(Events.AFTER_LOGIC, dummy_after_handler)

    @r.on(Events.AFTER_LOGIC)
    def dummy_handler(_):
        pass

    r.start()
    r.runner.join()
    assert r.state.dummy == 667
Example #27
class DaliIteratorCPU(DaliIterator):
    """
    Wrapper class to decode the DALI iterator output & provide an iterator that functions the same as torchvision.
    Note that permuting to channels first, converting from 8 bit to float & normalization are all performed on the GPU.

    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set
    fp16 (bool): Use fp16 as output format, f32 otherwise
    mean (tuple): Image mean value for each channel
    std (tuple): Image standard deviation value for each channel
    pin_memory (bool): Transfer input tensor to pinned memory, before moving to GPU
    """
    def __init__(self,
                 fp16=False,
                 mean=(0., 0., 0.),
                 std=(1., 1., 1.),
                 pin_memory=True,
                 pca_jitter=False,
                 **kwargs):
        super().__init__(**kwargs)
        print('Using DALI CPU iterator')
        self.stream = torch.cuda.Stream()

        self.fp16 = fp16
        self.mean = torch.tensor(mean).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(std).cuda().view(1, 3, 1, 1)
        self.pin_memory = pin_memory
        self.pca_jitter = pca_jitter

        if self.fp16:
            self.mean = self.mean.half()
            self.std = self.std.half()

        self.proc_next_input = Event()
        self.done_event = Event()
        self.output_queue = queue.Queue(maxsize=5)
        self.preproc_thread = threading.Thread(
            target=_preproc_worker,
            kwargs={
                'dali_iterator': self._dali_iterator,
                'cuda_stream': self.stream,
                'fp16': self.fp16,
                'mean': self.mean,
                'std': self.std,
                'proc_next_input': self.proc_next_input,
                'done_event': self.done_event,
                'output_queue': self.output_queue,
                'pin_memory': self.pin_memory,
                'pca_jitter': self.pca_jitter
            })
        self.preproc_thread.daemon = True
        self.preproc_thread.start()

        self.proc_next_input.set()

    def __next__(self):
        torch.cuda.current_stream().wait_stream(self.stream)
        data = self.output_queue.get()
        self.proc_next_input.set()
        if data is None:
            raise StopIteration
        return data

    def __del__(self):
        self.done_event.set()
        self.proc_next_input.set()
        torch.cuda.current_stream().wait_stream(self.stream)
        self.preproc_thread.join()
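`_preproc_worker` itself is not shown; judging only from how it is invoked above, a hypothetical version of such a worker would gate each iteration on `proc_next_input`, stop when `done_event` is set, and feed `output_queue` with `None` as the end-of-data marker that `__next__` turns into StopIteration. A sketch under those assumptions:

def _preproc_worker(dali_iterator, cuda_stream, fp16, mean, std,
                    proc_next_input, done_event, output_queue,
                    pin_memory, pca_jitter):
    # hypothetical sketch, not the original implementation
    while not done_event.is_set():
        proc_next_input.wait()      # __next__ sets this after consuming an item
        proc_next_input.clear()
        if done_event.is_set():     # __del__ sets both events to unblock us
            break
        try:
            data = next(dali_iterator)
        except StopIteration:
            output_queue.put(None)  # __next__ raises StopIteration on None
            break
        with torch.cuda.stream(cuda_stream):
            # ... move to the GPU, optionally pin memory / apply PCA jitter,
            # ... convert to fp16 if requested, then normalize with mean/std
            pass
        output_queue.put(data)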
Example #28
    def buildCache(self, limit):
        # print("Building cache: ",
        #       self.cache_names[self.current_cache_build.value]
        # )
        dataset = MultimodalPatchesDatasetAll(
            self.dataset_dir,
            self.dataset_list,
            rejection_radius_position=self.rejection_radius_position,
            #self.images_path, list=train_sampled,
            numpatches=self.numpatches,
            numneg=self.numneg,
            pos_thr=self.pos_thr,
            reject=self.reject,
            mode=self.mode,
            rejection_radius=self.rejection_radius,
            dist_type=self.dist_type,
            patch_radius=self.patch_radius,
            use_depth=self.use_depth,
            use_normals=self.use_normals,
            use_silhouettes=self.use_silhouettes,
            color_jitter=self.color_jitter,
            greyscale=self.greyscale,
            maxres=self.maxres,
            scale_jitter=self.scale_jitter,
            photo_jitter=self.photo_jitter,
            uniform_negatives=self.uniform_negatives,
            needles=self.needles,
            render_only=self.render_only)
        n_triplets = len(dataset)

        if limit == -1:
            limit = n_triplets

        dataloader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=False,
            num_workers=1,  # self.num_workers
            collate_fn=MultimodalPatchesCache.my_collate)

        qmaxsize = 15
        data_queue = JoinableQueue(maxsize=qmaxsize)

        # cannot load to cuda from background, therefore use cpu device
        preloader_resume = Event()
        preloader = Process(target=MultimodalPatchesCache.generateTrainingData,
                            args=(data_queue, dataset, dataloader,
                                  self.batch_size, qmaxsize, preloader_resume,
                                  True, True))
        preloader.do_run_generate = True
        preloader.start()
        preloader_resume.set()

        i_batch = 0
        data = data_queue.get()
        i_batch = data[0]

        counter = 0
        while i_batch != -1:

            self.cache_builder_resume.wait()

            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            batch_fname = os.path.join(build_dataset_dir,
                                       'batch_' + str(counter) + '.pt')

            # print("ibatch", i_batch,
            #        "___data___", data[3].shape, data[6].shape)

            anchor = data[1]
            pos = data[2]
            neg = data[3]
            anchor_r = data[4]
            pos_p = data[5]
            neg_p = data[6]
            c1 = data[7]
            c2 = data[8]
            cneg = data[9]
            id = data[10]

            if not (self.use_depth or self.use_normals):
                #no need to store image data as float, convert to uint
                anchor = (anchor * 255.0).to(torch.uint8)
                pos = (pos * 255.0).to(torch.uint8)
                neg = (neg * 255.0).to(torch.uint8)
                anchor_r = (anchor_r * 255.0).to(torch.uint8)
                pos_p = (pos_p * 255.0).to(torch.uint8)
                neg_p = (neg_p * 255.0).to(torch.uint8)

            tosave = {
                'anchor': anchor,
                'pos': pos,
                'neg': neg,
                'anchor_r': anchor_r,
                'pos_p': pos_p,
                'neg_p': neg_p,
                'c1': c1,
                'c2': c2,
                'cneg': cneg,
                'id': id
            }

            try:
                torch.save(tosave, batch_fname)
                torch.load(batch_fname)
                counter += 1
            except Exception as e:
                print("Could not save ",
                      batch_fname,
                      ", due to:",
                      e,
                      "skipping...",
                      file=sys.stderr)
                if os.path.isfile(batch_fname):
                    os.remove(batch_fname)

            data_queue.task_done()

            if counter >= limit:
                self.cache_done_lock.acquire()
                self.cache_done.value = 1  # 1 is True
                self.cache_done_lock.release()
                counter = 0
                # sleep until calling thread wakes us
                self.cache_builder_resume.clear()
                # resume calling thread so that it can work
                self.wait_for_cache_builder.set()

            data = data_queue.get()
            i_batch = data[0]
            #print("ibatch", i_batch)

        data_queue.task_done()

        self.cache_done_lock.acquire()
        self.cache_done.value = 1  # 1 is True
        self.all_done.value = 1
        print("Cache done ALL")
        self.cache_done_lock.release()
        # resume calling thread so that it can work
        self.wait_for_cache_builder.set()
        preloader.join()
        preloader = None
        data_queue = None
Example #29
    def __init__(self,
                 cache_dir,
                 dataset_dir,
                 dataset_list,
                 cuda,
                 batch_size=500,
                 num_workers=3,
                 renew_frequency=5,
                 rejection_radius_position=0,
                 numpatches=900,
                 numneg=3,
                 pos_thr=50.0,
                 reject=True,
                 mode='train',
                 rejection_radius=3000,
                 dist_type='3D',
                 patch_radius=None,
                 use_depth=False,
                 use_normals=False,
                 use_silhouettes=False,
                 color_jitter=False,
                 greyscale=False,
                 maxres=4096,
                 scale_jitter=False,
                 photo_jitter=False,
                 uniform_negatives=False,
                 needles=0,
                 render_only=False,
                 maxitems=200,
                 cache_once=False):
        super(MultimodalPatchesCache, self).__init__()
        self.cache_dir = cache_dir
        self.dataset_dir = dataset_dir
        #self.images_path = images_path
        self.dataset_list = dataset_list
        self.cuda = cuda
        self.batch_size = batch_size

        self.num_workers = num_workers
        self.renew_frequency = renew_frequency
        self.rejection_radius_position = rejection_radius_position
        self.numpatches = numpatches
        self.numneg = numneg
        self.pos_thr = pos_thr
        self.reject = reject
        self.mode = mode
        self.rejection_radius = rejection_radius
        self.dist_type = dist_type
        self.patch_radius = patch_radius
        self.use_depth = use_depth
        self.use_normals = use_normals
        self.use_silhouettes = use_silhouettes
        self.color_jitter = color_jitter
        self.greyscale = greyscale
        self.maxres = maxres
        self.scale_jitter = scale_jitter
        self.photo_jitter = photo_jitter
        self.uniform_negatives = uniform_negatives
        self.needles = needles
        self.render_only = render_only

        self.cache_done_lock = Lock()
        self.all_done = Value('B', 0)  # 0 is False
        self.cache_done = Value('B', 0)  # 0 is False

        self.wait_for_cache_builder = Event()
        # prepare for wait until initial cache is built
        self.wait_for_cache_builder.clear()
        self.cache_builder_resume = Event()

        self.maxitems = maxitems
        self.cache_once = cache_once

        if self.mode == 'eval':
            self.maxitems = -1
        self.cache_builder = Process(target=self.buildCache,
                                     args=[self.maxitems])
        self.current_cache_build = Value('B', 0)  # 0th cache
        self.current_cache_use = Value('B', 1)  # 1th cache

        self.cache_names = ["cache1", "cache2"]  # constant

        rebuild_cache = True
        if self.mode == 'eval':
            validation_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(validation_dir):
                # we don't need to rebuild validation cache
                # TODO: check if cache is VALID
                rebuild_cache = False
        elif cache_once:
            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(build_dataset_dir):
                # we don't need to rebuild training cache if we are training
                # on limited subset of the training set
                rebuild_cache = False

        if rebuild_cache:
            # clear the caches if they already exist
            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(build_dataset_dir):
                shutil.rmtree(build_dataset_dir)
            use_dataset_dir = os.path.join(
                self.cache_dir, self.cache_names[self.current_cache_use.value])
            if os.path.isdir(use_dataset_dir):
                shutil.rmtree(use_dataset_dir)

            os.makedirs(build_dataset_dir)

            self.cache_builder_resume.set()
            self.cache_builder.start()

            # wait until initial cache is built
            # print("before wait to build")
            # print("wait for cache builder state",
            #       self.wait_for_cache_builder.is_set())
            self.wait_for_cache_builder.wait()
            # print("after wait to build")

        # we have been resumed
        if self.mode != 'eval' and (not self.cache_once):
            # for training, we can set up the cache builder to build
            # the second cache
            self.restart()
        else:
            # else for validation we don't need second cache
            # we just need to switch the built cache to the use cache in order
            # to use it
            tmp = self.current_cache_build.value
            self.current_cache_build.value = self.current_cache_use.value
            self.current_cache_use.value = tmp
Example #30
class MultimodalPatchesCache(object):
    def __init__(self,
                 cache_dir,
                 dataset_dir,
                 dataset_list,
                 cuda,
                 batch_size=500,
                 num_workers=3,
                 renew_frequency=5,
                 rejection_radius_position=0,
                 numpatches=900,
                 numneg=3,
                 pos_thr=50.0,
                 reject=True,
                 mode='train',
                 rejection_radius=3000,
                 dist_type='3D',
                 patch_radius=None,
                 use_depth=False,
                 use_normals=False,
                 use_silhouettes=False,
                 color_jitter=False,
                 greyscale=False,
                 maxres=4096,
                 scale_jitter=False,
                 photo_jitter=False,
                 uniform_negatives=False,
                 needles=0,
                 render_only=False,
                 maxitems=200,
                 cache_once=False):
        super(MultimodalPatchesCache, self).__init__()
        self.cache_dir = cache_dir
        self.dataset_dir = dataset_dir
        #self.images_path = images_path
        self.dataset_list = dataset_list
        self.cuda = cuda
        self.batch_size = batch_size

        self.num_workers = num_workers
        self.renew_frequency = renew_frequency
        self.rejection_radius_position = rejection_radius_position
        self.numpatches = numpatches
        self.numneg = numneg
        self.pos_thr = pos_thr
        self.reject = reject
        self.mode = mode
        self.rejection_radius = rejection_radius
        self.dist_type = dist_type
        self.patch_radius = patch_radius
        self.use_depth = use_depth
        self.use_normals = use_normals
        self.use_silhouettes = use_silhouettes
        self.color_jitter = color_jitter
        self.greyscale = greyscale
        self.maxres = maxres
        self.scale_jitter = scale_jitter
        self.photo_jitter = photo_jitter
        self.uniform_negatives = uniform_negatives
        self.needles = needles
        self.render_only = render_only

        self.cache_done_lock = Lock()
        self.all_done = Value('B', 0)  # 0 is False
        self.cache_done = Value('B', 0)  # 0 is False

        self.wait_for_cache_builder = Event()
        # prepare for wait until initial cache is built
        self.wait_for_cache_builder.clear()
        self.cache_builder_resume = Event()

        self.maxitems = maxitems
        self.cache_once = cache_once

        if self.mode == 'eval':
            self.maxitems = -1
        self.cache_builder = Process(target=self.buildCache,
                                     args=[self.maxitems])
        self.current_cache_build = Value('B', 0)  # 0th cache
        self.current_cache_use = Value('B', 1)  # 1th cache

        self.cache_names = ["cache1", "cache2"]  # constant

        rebuild_cache = True
        if self.mode == 'eval':
            validation_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(validation_dir):
                # we don't need to rebuild validation cache
                # TODO: check if cache is VALID
                rebuild_cache = False
        elif cache_once:
            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(build_dataset_dir):
                # we don't need to rebuild training cache if we are training
                # on limited subset of the training set
                rebuild_cache = False

        if rebuild_cache:
            # clear the caches if they already exist
            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(build_dataset_dir):
                shutil.rmtree(build_dataset_dir)
            use_dataset_dir = os.path.join(
                self.cache_dir, self.cache_names[self.current_cache_use.value])
            if os.path.isdir(use_dataset_dir):
                shutil.rmtree(use_dataset_dir)

            os.makedirs(build_dataset_dir)

            self.cache_builder_resume.set()
            self.cache_builder.start()

            # wait until initial cache is built
            # print("before wait to build")
            # print("wait for cache builder state",
            #       self.wait_for_cache_builder.is_set())
            self.wait_for_cache_builder.wait()
            # print("after wait to build")

        # we have been resumed
        if self.mode != 'eval' and (not self.cache_once):
            # for training, we can set up the cache builder to build
            # the second cache
            self.restart()
        else:
            # else for validation we don't need second cache
            # we just need to switch the built cache to the use cache in order
            # to use it
            tmp = self.current_cache_build.value
            self.current_cache_build.value = self.current_cache_use.value
            self.current_cache_use.value = tmp

        # initialization finished, now this dataset can be used

    def getCurrentCache(self):
        # Lock should not be needed - cache_done is not touched
        # and cache_len is read only for the cache in use, which should not
        # be touched by other threads
        # self.cache_done_lock.acquire()
        h5_dataset_filename = os.path.join(
            self.cache_dir, self.cache_names[self.current_cache_use.value])
        # self.cache_done_lock.release()
        return h5_dataset_filename

    def restart(self):
        # print("Restarting - waiting for lock...")
        self.cache_done_lock.acquire()
        # print("Restarting cached dataset...")
        if self.cache_done.value and (not self.cache_once):
            cache_changed = True
            tmp_cache_name = self.current_cache_use.value
            self.current_cache_use.value = self.current_cache_build.value
            self.current_cache_build.value = tmp_cache_name
            # clear the old cache if exists
            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            if os.path.isdir(build_dataset_dir):
                shutil.rmtree(build_dataset_dir)
            os.makedirs(build_dataset_dir)
            self.cache_done.value = 0  # 0 is False
            self.cache_builder_resume.set()
            # print("Switched cache to: ",
            #       self.cache_names[self.current_cache_use.value]
            # )
        else:
            cache_changed = False
            # print(
            #     "New cache not ready, continuing with old cache:",
            #     self.cache_names[self.current_cache_use.value]
            # )
        all_done_value = self.all_done.value
        self.cache_done_lock.release()
        # returns true if no more items are available to be loaded
        # this object should be destroyed and new dataset should be created
        # in order to start over.
        return cache_changed, all_done_value

    def buildCache(self, limit):
        # print("Building cache: ",
        #       self.cache_names[self.current_cache_build.value]
        # )
        dataset = MultimodalPatchesDatasetAll(
            self.dataset_dir,
            self.dataset_list,
            rejection_radius_position=self.rejection_radius_position,
            #self.images_path, list=train_sampled,
            numpatches=self.numpatches,
            numneg=self.numneg,
            pos_thr=self.pos_thr,
            reject=self.reject,
            mode=self.mode,
            rejection_radius=self.rejection_radius,
            dist_type=self.dist_type,
            patch_radius=self.patch_radius,
            use_depth=self.use_depth,
            use_normals=self.use_normals,
            use_silhouettes=self.use_silhouettes,
            color_jitter=self.color_jitter,
            greyscale=self.greyscale,
            maxres=self.maxres,
            scale_jitter=self.scale_jitter,
            photo_jitter=self.photo_jitter,
            uniform_negatives=self.uniform_negatives,
            needles=self.needles,
            render_only=self.render_only)
        n_triplets = len(dataset)

        if limit == -1:
            limit = n_triplets

        dataloader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            pin_memory=False,
            num_workers=1,  # self.num_workers
            collate_fn=MultimodalPatchesCache.my_collate)

        qmaxsize = 15
        data_queue = JoinableQueue(maxsize=qmaxsize)

        # cannot load to cuda from background, therefore use cpu device
        preloader_resume = Event()
        preloader = Process(target=MultimodalPatchesCache.generateTrainingData,
                            args=(data_queue, dataset, dataloader,
                                  self.batch_size, qmaxsize, preloader_resume,
                                  True, True))
        preloader.do_run_generate = True
        preloader.start()
        preloader_resume.set()

        i_batch = 0
        data = data_queue.get()
        i_batch = data[0]

        counter = 0
        while i_batch != -1:

            self.cache_builder_resume.wait()

            build_dataset_dir = os.path.join(
                self.cache_dir,
                self.cache_names[self.current_cache_build.value])
            batch_fname = os.path.join(build_dataset_dir,
                                       'batch_' + str(counter) + '.pt')

            # print("ibatch", i_batch,
            #        "___data___", data[3].shape, data[6].shape)

            anchor = data[1]
            pos = data[2]
            neg = data[3]
            anchor_r = data[4]
            pos_p = data[5]
            neg_p = data[6]
            c1 = data[7]
            c2 = data[8]
            cneg = data[9]
            id = data[10]

            if not (self.use_depth or self.use_normals):
                #no need to store image data as float, convert to uint
                anchor = (anchor * 255.0).to(torch.uint8)
                pos = (pos * 255.0).to(torch.uint8)
                neg = (neg * 255.0).to(torch.uint8)
                anchor_r = (anchor_r * 255.0).to(torch.uint8)
                pos_p = (pos_p * 255.0).to(torch.uint8)
                neg_p = (neg_p * 255.0).to(torch.uint8)

            tosave = {
                'anchor': anchor,
                'pos': pos,
                'neg': neg,
                'anchor_r': anchor_r,
                'pos_p': pos_p,
                'neg_p': neg_p,
                'c1': c1,
                'c2': c2,
                'cneg': cneg,
                'id': id
            }

            try:
                torch.save(tosave, batch_fname)
                torch.load(batch_fname)
                counter += 1
            except Exception as e:
                print("Could not save ",
                      batch_fname,
                      ", due to:",
                      e,
                      "skipping...",
                      file=sys.stderr)
                if os.path.isfile(batch_fname):
                    os.remove(batch_fname)

            data_queue.task_done()

            if counter >= limit:
                self.cache_done_lock.acquire()
                self.cache_done.value = 1  # 1 is True
                self.cache_done_lock.release()
                counter = 0
                # sleep until calling thread wakes us
                self.cache_builder_resume.clear()
                # resume calling thread so that it can work
                self.wait_for_cache_builder.set()

            data = data_queue.get()
            i_batch = data[0]
            #print("ibatch", i_batch)

        data_queue.task_done()

        self.cache_done_lock.acquire()
        self.cache_done.value = 1  # 1 is True
        self.all_done.value = 1
        print("Cache done ALL")
        self.cache_done_lock.release()
        # resume calling thread so that it can work
        self.wait_for_cache_builder.set()
        preloader.join()
        preloader = None
        data_queue = None

    @staticmethod
    def loadBatch(sample_batched, mode, device, keep_all=False):
        if mode == 'eval':
            coords1 = sample_batched[6]
            coords2 = sample_batched[7]
            coords_neg = sample_batched[8]
            keep = sample_batched[10]
            item_id = sample_batched[11]
        else:
            coords1 = sample_batched[6]
            coords2 = sample_batched[7]
            coords_neg = sample_batched[8]
            keep = sample_batched[9]
            item_id = sample_batched[10]
        if keep_all:
            # requested to return the full batch
            batchsize = sample_batched[0].shape[0]
            keep = torch.ones(batchsize).byte()
        keep = keep.reshape(-1)
        keep = keep.bool()
        anchor = sample_batched[0]
        pos = sample_batched[1]
        neg = sample_batched[2]

        # swapped photo to render
        anchor_r = sample_batched[3]
        pos_p = sample_batched[4]
        neg_p = sample_batched[5]

        anchor = anchor[keep].to(device)
        pos = pos[keep].to(device)
        neg = neg[keep].to(device)

        anchor_r = anchor_r[keep]
        pos_p = pos_p[keep]
        neg_p = neg_p[keep]

        coords1 = coords1[keep]
        coords2 = coords2[keep]
        coords_neg = coords_neg[keep]
        item_id = item_id[keep]
        return anchor, pos, neg, anchor_r, pos_p, neg_p, coords1, coords2, \
            coords_neg, item_id

    @staticmethod
    def generateTrainingData(queue,
                             dataset,
                             dataloader,
                             batch_size,
                             qmaxsize,
                             resume,
                             shuffle=True,
                             disable_tqdm=False):
        local_buffer_a = []
        local_buffer_p = []
        local_buffer_n = []

        local_buffer_ar = []
        local_buffer_pp = []
        local_buffer_np = []

        local_buffer_c1 = []
        local_buffer_c2 = []
        local_buffer_cneg = []
        local_buffer_id = []
        nbatches = 10
        # cannot load to cuda in background process!
        device = torch.device('cpu')

        buffer_size = min(qmaxsize * batch_size, nbatches * batch_size)
        bidx = 0
        for i_batch, sample_batched in enumerate(dataloader):
            # tqdm(dataloader, disable=disable_tqdm)
            resume.wait()
            anchor, pos, neg, anchor_r, \
                pos_p, neg_p, c1, c2, cneg, id = \
                MultimodalPatchesCache.loadBatch(
                    sample_batched, dataset.mode, device
                )
            if anchor.shape[0] == 0:
                continue
            local_buffer_a.extend(list(anchor))  # [:current_batches]
            local_buffer_p.extend(list(pos))
            local_buffer_n.extend(list(neg))

            local_buffer_ar.extend(list(anchor_r))
            local_buffer_pp.extend(list(pos_p))
            local_buffer_np.extend(list(neg_p))

            local_buffer_c1.extend(list(c1))
            local_buffer_c2.extend(list(c2))
            local_buffer_cneg.extend(list(cneg))
            local_buffer_id.extend(list(id))
            if len(local_buffer_a) >= buffer_size:
                if shuffle:
                    local_buffer_a, local_buffer_p, local_buffer_n, \
                        local_buffer_ar, local_buffer_pp, local_buffer_np, \
                        local_buffer_c1, local_buffer_c2, local_buffer_cneg, \
                        local_buffer_id = sklearn.utils.shuffle(
                            local_buffer_a,
                            local_buffer_p,
                            local_buffer_n,
                            local_buffer_ar,
                            local_buffer_pp,
                            local_buffer_np,
                            local_buffer_c1,
                            local_buffer_c2,
                            local_buffer_cneg,
                            local_buffer_id
                        )
                curr_nbatches = int(np.floor(len(local_buffer_a) / batch_size))
                for i in range(0, curr_nbatches):
                    queue.put([
                        bidx,
                        torch.stack(local_buffer_a[:batch_size]),
                        torch.stack(local_buffer_p[:batch_size]),
                        torch.stack(local_buffer_n[:batch_size]),
                        torch.stack(local_buffer_ar[:batch_size]),
                        torch.stack(local_buffer_pp[:batch_size]),
                        torch.stack(local_buffer_np[:batch_size]),
                        torch.stack(local_buffer_c1[:batch_size]),
                        torch.stack(local_buffer_c2[:batch_size]),
                        torch.stack(local_buffer_cneg[:batch_size]),
                        torch.stack(local_buffer_id[:batch_size])
                    ])
                    del local_buffer_a[:batch_size]
                    del local_buffer_p[:batch_size]
                    del local_buffer_n[:batch_size]
                    del local_buffer_ar[:batch_size]
                    del local_buffer_pp[:batch_size]
                    del local_buffer_np[:batch_size]
                    del local_buffer_c1[:batch_size]
                    del local_buffer_c2[:batch_size]
                    del local_buffer_cneg[:batch_size]
                    del local_buffer_id[:batch_size]
                    bidx += 1
        remaining_batches = len(local_buffer_a) // batch_size
        for i in range(0, remaining_batches):
            queue.put([
                bidx,
                torch.stack(local_buffer_a[:batch_size]),
                torch.stack(local_buffer_p[:batch_size]),
                torch.stack(local_buffer_n[:batch_size]),
                torch.stack(local_buffer_ar[:batch_size]),
                torch.stack(local_buffer_pp[:batch_size]),
                torch.stack(local_buffer_np[:batch_size]),
                torch.stack(local_buffer_c1[:batch_size]),
                torch.stack(local_buffer_c2[:batch_size]),
                torch.stack(local_buffer_cneg[:batch_size]),
                torch.stack(local_buffer_id[:batch_size])
            ])
            del local_buffer_a[:batch_size]
            del local_buffer_p[:batch_size]
            del local_buffer_n[:batch_size]
            del local_buffer_ar[:batch_size]
            del local_buffer_pp[:batch_size]
            del local_buffer_np[:batch_size]
            del local_buffer_c1[:batch_size]
            del local_buffer_c2[:batch_size]
            del local_buffer_cneg[:batch_size]
            del local_buffer_id[:batch_size]
        ra = torch.randn(batch_size, 3, 64, 64)
        queue.put([-1, ra, ra, ra])
        queue.join()

    @staticmethod
    def my_collate(batch):
        batch = list(filter(lambda x: x is not None, batch))
        return default_collate(batch)
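The hand-off between `generateTrainingData` and `buildCache` above rests on a JoinableQueue plus a sentinel batch whose first element is -1: the producer pushes batches, ends with the sentinel and blocks on `queue.join()`, while the consumer calls `task_done()` for every item it takes, including the sentinel. A stripped-down sketch of that pattern with illustrative names:

from multiprocessing import JoinableQueue, Process


def producer(q: JoinableQueue):
    for i in range(5):
        q.put([i, f"batch-{i}"])
    q.put([-1, None])   # sentinel: no more batches
    q.join()            # wait until the consumer has task_done()'d everything


def consumer(q: JoinableQueue):
    data = q.get()
    while data[0] != -1:
        # ... store the batch to disk, as buildCache does ...
        q.task_done()
        data = q.get()
    q.task_done()       # acknowledge the sentinel so the producer's join() returns


if __name__ == "__main__":
    q = JoinableQueue(maxsize=3)
    p = Process(target=producer, args=(q,))
    p.start()
    consumer(q)
    p.join()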