Esempio n. 1
0
    def observe(self, hpo):
        """Drain the pending results and forward them to ``hpo``.

        Trial results are handed to ``hpo.observe``; worker join/leave
        notifications adjust ``self.worker_count``. A message is marked as
        actioned once handled, except for a result whose trial raised
        ``TrialDoesNotExist``.

        Returns the number of results that were successfully observed.
        """
        debug('observe')
        observed = 0

        while True:
            m = self.pop_result()
            if m is None:
                break

            handled = True

            if m.mtype == RESULT_ITEM:
                info(f'HPO {self.experiment} observed {m.message[0]["uid"]}')
                try:
                    hpo.observe(m.message[0], m.message[1])
                except TrialDoesNotExist as e:
                    warning(f'Could not observe trial: {e}')
                    # Leave the message un-actioned when the trial is unknown
                    handled = False
                else:
                    observed += 1

            elif m.mtype == WORKER_JOIN:
                self.worker_count += 1

            elif m.mtype == WORKER_LEFT:
                self.worker_count -= 1

            else:
                debug(f'Received: {m}')

            if handled:
                self.future_client.mark_actioned(RESULT_QUEUE, m)

        return observed
Esempio n. 2
0
    def run_hpo(self, message, _):
        """Perform a single HPO step for the namespace carried by ``message``.

        The HPO is rebuilt from the message state (restoring ``hpo_state`` when
        present) and stepped once through an ``HPOManager``. The per-namespace
        backoff exponent is reset when new trials were queued and otherwise
        incremented, up to 8.

        Returns ``manager.recorded_operations()``, the work that has to be done
        before this task can be marked as complete.
        """
        state = message.message
        namespace = message.namespace
        info(f'Starting (hpo: {namespace})')

        # Rebuild the HPO and restore its previous state, if any
        hpo = exec_remote_call(state['hpo'])
        saved_state = state.get('hpo_state')
        if saved_state is not None:
            hpo.load_state_dict(saved_state)

        current_backoff = self.backoff.get(namespace, 0)
        manager = HPOManager(self.client, state, current_backoff)
        new_results, new_trials = manager.step(hpo)

        # Exponent is capped at 8: 2 ** 8 = 256 seconds, a bit under 5 minutes
        self.backoff[namespace] = (
            0 if new_trials else min(self.backoff.get(namespace, 0) + 1, 8))

        info(
            f'HPO read (results: {new_results}) and queued (trials: {new_trials})'
        )

        return manager.recorded_operations()
Esempio n. 3
0
    def __init__(self,
                 uri,
                 database,
                 id,
                 experiment=None,
                 hpo_allowed=True,
                 work_allowed=True,
                 log_capture=False):
        """Set up a worker that can serve trials and/or HPO steps.

        ``work_allowed`` and ``hpo_allowed`` select which message types get a
        handler, ``log_capture`` is forwarded to ``self.client.capture``.
        A worker created without an experiment replaces the shutdown handler
        with one that only prints a notice.
        """
        super(TrialWorker, self).__init__(uri, database, experiment, id,
                                          WORK_QUEUE, RESULT_QUEUE)
        self.namespaced = experiment is not None
        self.client.capture = log_capture

        # Collect the handlers this worker serves, then register them in order
        handlers = []
        if work_allowed:
            handlers.append((WORK_ITEM, self.run_trial))
        if hpo_allowed:
            handlers.append((HPO_ITEM, self.run_hpo))
        handlers.append((WORKER_JOIN, self.ignore_message))

        for mtype, handler in handlers:
            self.new_handler(mtype, handler)

        self.timeout = option('worker.timeout', 5 * 60, type=int)
        self.max_retry = option('worker.max_retry', 3, type=int)
        self.backoff = {}

        # Disable shutting down when receiving shut down
        if experiment is None:
            info(f'Disabling message shutdown because {experiment}')

            def _ignore_shutdown(*args, **kwargs):
                print('ignoring shutdown signal')

            self.dispatcher[SHUTDOWN] = _ignore_shutdown
Esempio n. 4
0
    def suggest(self, depth=0):
        """Pop the next work item from the work queue.

        Polls the work queue until a message arrives. HPO messages are executed
        through ``self.run_hpo`` and unsupported messages are logged; in both
        cases the queue is polled again after a one second pause.

        Args:
            depth: number of attempts already made; any positive value makes
                the call wait one second before polling.

        Returns:
            A single-element list holding the ``kwargs`` of the work item.

        Raises:
            OptimizationIsDone: when a shutdown message is received.
        """
        # Retry with a loop instead of recursion: a long-lived worker can see an
        # unbounded number of HPO or unsupported messages, and one stack frame
        # per retry would eventually exceed the interpreter recursion limit.
        while True:
            if depth > 0:
                time.sleep(1)

            m = None
            while m is None:
                m = self.client.pop(WORK_QUEUE, self.experiment)

                if m is None:
                    time.sleep(0.001)

            if m.mtype == HPO_ITEM:
                self.run_hpo(m)

            elif m.mtype == WORK_ITEM:
                self.current_message = m
                return [m.message['kwargs']]

            elif m.mtype == SHUTDOWN:
                # Tell the result queue that this worker is leaving
                self.client.push(RESULT_QUEUE,
                                 self.experiment, {},
                                 mtype=WORKER_LEFT)
                raise OptimizationIsDone()

            else:
                info(f'Received unsupported message {m}')

            depth += 1
Esempio n. 5
0
    def result(self):
        """Return the HPO result rebuilt from the final stored state.

        Returns ``None`` when ``self._fetch_final_state()`` finds nothing;
        otherwise loads the message's ``hpo_state`` into ``self.hpo`` and
        returns ``self.hpo.result()``.
        """
        final = self._fetch_final_state()

        if final is not None:
            payload = final.message
            self.hpo.load_state_dict(payload['hpo_state'])
            return self.hpo.result()

        info('No HPO_ITEM message found')
        return None
Esempio n. 6
0
def single_gpu_launch(task_name, script_args, job_env, device_id, rank, world_size, port):
    """Launch the task for a given GPU

    The script ``<task_name>.py`` located next to this module is started with
    the current interpreter in unbuffered mode.

    Args:
        task_name: name of the script (without ``.py``) to run.
        script_args: iterable of command line arguments passed to the script.
        job_env: environment of the child process, ``None`` to inherit the
            current one.
        device_id: value exported to the child as ``CUDA_VISIBLE_DEVICES``.
        rank, world_size, port: unused here; kept so the signature matches the
            other launchers.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    info(f'Launching job on (device: {device_id})')

    script = os.path.join(os.path.dirname(__file__), f'{task_name}.py')

    # Select the device through the child's environment instead of a shell
    # prefix, so the command can be passed as an argument list: arguments with
    # spaces or shell metacharacters then reach the script untouched.
    env = dict(os.environ if job_env is None else job_env)
    env['CUDA_VISIBLE_DEVICES'] = str(device_id)

    cmd = [sys.executable, '-u', script]
    cmd.extend(str(arg) for arg in script_args)

    return subprocess.Popen(cmd, env=env)
Esempio n. 7
0
    def launch_workers(self, count, namespaced=True):
        """Start ``count`` async trial workers and keep them in ``self.workers``.

        Workers are bound to ``self.experiment`` unless ``namespaced`` is false,
        in which case they are created without a namespace.
        """
        info('starting workers')
        namespace = self.experiment if namespaced else None

        for worker_id in range(count):
            worker = TrialWorker.async_worker(self.uri, self.database,
                                              worker_id, namespace)
            self.workers.append(worker)
Esempio n. 8
0
    def run_trial(self, message, context):
        """Run a trial and return its result

        The experiment name (``context['namespace']``) and the worker client are
        injected into the trial ``kwargs`` for the duration of the call and
        removed afterwards, including when the trial raises.

        Returns:
            A ``(kwargs, result)`` tuple: the trial arguments and the value
            returned by the remote call.
        """
        state = message.message
        uid = state['kwargs']['uid']
        info(f'Starting (trial: {uid})')

        state['kwargs']['experiment_name'] = context['namespace']
        state['kwargs']['client'] = self.client
        try:
            result = exec_remote_call(state)
        finally:
            # Always strip the injected entries so a failed trial does not
            # leave the client object inside the message state
            state['kwargs'].pop('experiment_name', None)
            state['kwargs'].pop('client', None)

        info(f'Finished (trial: {uid}) with (objective: {result:.5f})')
        return state['kwargs'], result
Esempio n. 9
0
def split(datasets, data_size, seed, ratio, index, balanced):
    """Return the dataset's own train/valid/test partition as index ranges.

    Only ``datasets`` is used: its ``train_size``, ``valid_size`` and
    ``test_size`` must add up to ``len(datasets)``. The three subsets are
    consecutive ranges in that order.
    """
    n_train, n_valid, n_test = (datasets.train_size,
                                datasets.valid_size,
                                datasets.test_size)
    n_points = len(datasets)

    assert n_points == n_train + n_valid + n_test

    info('Using the original split')

    valid_end = n_train + n_valid
    return Split(
        train=range(0, n_train),
        valid=range(n_train, valid_end),
        test=range(valid_end, n_points))
Esempio n. 10
0
def local_multigpu_launch(task_name, script_args, job_env, device_id, rank, world_size, port):
    """Launch the task using multiple GPUs

    The script ``<task_name>.py`` located next to this module is started with
    the current interpreter in unbuffered mode, receiving ``--rank``,
    ``--world-size`` and ``--dist-url`` ahead of ``script_args``.

    Args:
        task_name: name of the script (without ``.py``) to run.
        script_args: iterable of extra command line arguments for the script.
        job_env: environment of the child process, ``None`` to inherit the
            current one.
        device_id: value exported to the child as ``CUDA_VISIBLE_DEVICES``.
        rank: value of the ``--rank`` argument.
        world_size: value of the ``--world-size`` argument.
        port: local port used to build the ``--dist-url`` argument.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    info(f'Launching job on (device: {device_id})')

    script = os.path.join(os.path.dirname(__file__), f'{task_name}.py')

    # Select the device through the child's environment instead of a shell
    # prefix, so the command can be passed as an argument list: arguments with
    # spaces or shell metacharacters then reach the script untouched.
    env = dict(os.environ if job_env is None else job_env)
    env['CUDA_VISIBLE_DEVICES'] = str(device_id)

    cmd = [
        sys.executable, '-u', script,
        '--rank', str(rank),
        '--world-size', str(world_size),
        '--dist-url', f'nccl:tcp://localhost:{port}',
    ]
    cmd.extend(str(arg) for arg in script_args)

    return subprocess.Popen(cmd, env=env)
Esempio n. 11
0
    def safe_load(self, name, device):
        """Load ``name`` and return ``None`` if the file is not found.

        Args:
            name: name of the state to load, forwarded to ``self.load``.
            device: forwarded to ``self.load`` as ``device``.

        Returns:
            Whatever ``self.load`` returns, or ``None`` when it raises
            ``FileNotFoundError``.

        Raises:
            KeyboardInterrupt: when loading fails with a ``RuntimeError``
                mentioning a CPU-only machine.
            RuntimeError: any other runtime failure of ``self.load``.
        """
        try:
            return self.load(name, device=device)

        except RuntimeError as e:
            # This error happens when there is a mismatch between save device and current device
            if 'CPU-only machine' in str(e):
                raise KeyboardInterrupt(
                    'Job got scheduled on bad node.') from e

            # Any other failure must not be mistaken for a missing file
            raise

        except FileNotFoundError:
            info(f'State file {name} was not found')
            return None
Esempio n. 12
0
    def kill_idle_worker(self, hpo):
        """Queue a shutdown message for every worker that is no longer needed.

        The number of workers to stop is ``self.worker_count`` minus
        ``hpo.remaining() + 1`` (one spare worker is kept), never below zero.
        """
        remaining = hpo.remaining()
        worker = self.worker_count

        # Keep a spare worker
        needed = remaining + 1
        kill_worker = max(worker - needed, 0)
        info(
            f'killing {kill_worker} workers because (worker: {worker}) > (remaining: {remaining}) '
        )

        for _ in range(kill_worker):
            self.future_client.push(
                WORK_QUEUE, self.experiment, {}, mtype=SHUTDOWN)
Esempio n. 13
0
    def save(self, task):
        """Checkpoint ``task``, at most once per ``self.time_buffer`` seconds.

        The state is ``state_dict(task)`` with the RNG states added under
        ``'rng'``. It is written to ``self.storage`` under ``self.uid`` only
        when more than ``self.time_buffer`` seconds have elapsed since
        ``self.last_save``; otherwise it is kept in ``self.pending`` as
        ``(is_best, state)``.

        When ``self.keep_best`` is set it is called with
        ``task.metrics.value()`` to decide whether this state is the best so
        far, and ``self.best_name`` names a separate copy of the best state.

        Logs whether a checkpoint was written or skipped and returns ``None``.

        Raises:
            BadCheckpoint: if ``self.uid`` is ``None``.
        """
        if self.uid is None:
            raise BadCheckpoint('No uid was given cannot save state')

        was_saved = False
        state = state_dict(task)
        state['rng'] = get_rng_states()

        # Has enough time passed since the last save?
        now = datetime.utcnow()
        elapsed = now - self.last_save
        should_save = elapsed.total_seconds() > self.time_buffer

        # Is it the best model we have seen so far?
        # Without a keep_best criterion every state counts as the best.
        is_best = True
        if self.keep_best is not None:
            is_best = self.keep_best(task.metrics.value())

        # NOTE(review): 'rng' was just inserted above, so a dict state is never
        # empty at this point -- confirm whether this check was meant to apply
        # to state_dict(task) alone.
        if state:
            # Current model is not the best and we did not save the last model in a different path
            # (which is the best right now)
            # So we need to move the last state so it does not get overridden by current state
            if not is_best and self.best_name is None:
                info(f'Saving best ({self.keep_best.metric}: {self.keep_best.best})')
                self.best_name = self.new_best_name()

                # NOTE(review): save_pending() presumably writes self.pending
                # to storage and reports whether there was anything to
                # write -- confirm against its definition.
                was_pending = self.save_pending()
                if not was_pending:
                    self.storage.rename(self.uid, self.best_name)

            if should_save:
                was_saved = self.storage.save(self.uid, state)
                self.save_pending()
                self.pending = None
                self.last_save = datetime.utcnow()
            else:
                # Too early to write: keep the state around as the pending one
                self.save_pending()
                self.pending = (is_best, state)

            # We have a new best and the previous best was saved under a different filename
            # So we need to change both the best state and the latest state
            if is_best and self.best_name is not None:
                info(f'New best ({self.keep_best.metric}: {self.keep_best.best})')

                self.storage.remove(self.best_name)
                self.best_name = self.new_best_name()

                was_pending = self.save_pending()
                if not was_pending:
                    self.storage.copyfile(self.uid, self.best_name)

        else:
            warning('The state dictionary was empty!')

        if was_saved:
            info('Checkpoint saved')
            return

        info('Skipped Checkpoint')
Esempio n. 14
0
    def run_hpo(self, message):
        """Rebuild the HPO from ``message`` and run one manager step on it.

        The HPO is stored in ``self.hpo``; its ``hpo_state`` is restored when
        the message carries one.
        """
        state = message.message

        # Instantiate the HPO and restore its previous state, if any
        self.hpo = exec_remote_call(state['hpo'])
        saved_state = state.get('hpo_state')
        if saved_state is not None:
            self.hpo.load_state_dict(saved_state)

        new_results, new_trials = HPOManager(self.client, state).step(self.hpo)
        info(
            f'HPO read (results: {new_results}) and queued (trials: {new_trials})'
        )
Esempio n. 15
0
    def suggest(self, hpo):
        """Ask ``hpo`` for new trials and queue one work item per trial.

        Returns the number of trials queued, 0 when nothing was suggested.
        """
        debug('suggest')
        trials = self._maybe_suggest(hpo, **self.work['kwargs'])
        if trials is None:
            return 0

        for trial in trials:
            # Every trial gets its own copy of the work template
            work_item = copy.deepcopy(self.work)
            work_item['kwargs'] = trial
            info(f'HPO {self.experiment} suggested {trial["uid"]}')
            self.future_client.push(
                WORK_QUEUE, self.experiment, work_item, mtype=WORK_ITEM)

        return len(trials)
Esempio n. 16
0
    def __call__(self, input_size, output_size, attention_probs_dropout_prob,
                 hidden_dropout_prob):
        """Build a pretrained ``bert-base-uncased`` model with custom dropout.

        ``input_size`` and ``output_size`` are not used. The configuration is
        loaded with two labels and ``self.task`` as fine-tuning task, then both
        dropout probabilities are overridden before the model is created.
        """
        pretrained = 'bert-base-uncased'

        cache_dir = option('model.cache', '/tmp/olympus/cache')
        info('model cache folder: {}'.format(cache_dir))

        config = BertConfig.from_pretrained(
            pretrained,
            num_labels=2,
            finetuning_task=self.task,
            cache_dir=cache_dir)
        config.attention_probs_dropout_prob = attention_probs_dropout_prob
        config.hidden_dropout_prob = hidden_dropout_prob

        return BertWrapper.from_pretrained(
            pretrained,
            from_tf=False,
            config=config,
            cache_dir=cache_dir)
Esempio n. 17
0
    def step(self, hpo):
        """Observe pending results, suggest new trials and requeue the HPO.

        When the HPO is done the workers are shut down, the HPO is queued on
        the result queue and ``(0, 0)`` is returned. Otherwise idle workers are
        stopped, the call sleeps ``2 ** self.backoff`` seconds if no trial was
        suggested, and the HPO is queued again when the state holds an
        ``hpo_state`` entry.

        Returns a ``(new_results, new_trials)`` tuple.
        """
        new_results = self.observe(hpo)
        new_trials = self.suggest(hpo)

        if hpo.is_done():
            self.shutdown()
            # Queue the HPO but this time in the result queue
            self.queue_hpo(hpo, RESULT_QUEUE)
            return 0, 0

        self.kill_idle_worker(hpo)

        nothing_suggested = new_trials == 0
        if nothing_suggested:
            info(f'HPO sleeping {2 ** self.backoff} seconds')
            time.sleep(2 ** self.backoff)

        if 'hpo_state' in self.state:
            self.queue_hpo(hpo)

        return new_results, new_trials
Esempio n. 18
0
def build(input_size, output_size):
    """Build a MobileNetV2 adapted to the given input size.

    Args:
        input_size: one of ``(1, 28, 28)``, ``(3, 32, 32)`` or ``(3, 64, 64)``.
        output_size: forwarded to ``MobileNetV2`` as ``num_classes``.

    Raises:
        ValueError: if there is no configuration for ``input_size``.
    """
    # One row per stage, forwarded as-is to MobileNetV2
    # NOTE(review): columns look like (expansion, out_planes, num_blocks,
    # stride) -- confirm against the MobileNetV2 class
    cfg = [[1,  16, 1, 1],
           [6,  24, 2, 1],
           [6,  32, 3, 2],
           [6,  64, 4, 2],
           [6,  96, 3, 1],
           [6, 160, 3, 2],
           [6, 320, 1, 1]]

    if input_size == (1, 28, 28):
        info('Using MobileNetV2 architecture for MNIST')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
    elif input_size == (3, 32, 32):
        info('Using MobileNetV2 architecture for CIFAR10/100')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
    elif input_size == (3, 64, 64):
        info('Using MobileNetV2 architecture for TinyImageNet')
        conv = {'kernel_size': 3, 'stride': 2, 'padding': 1}
        avgpool = {'kernel_size': 2}
        # Larger inputs: change the last entry of the second stage to 2
        cfg[1][-1] = 2
    else:
        # TODO: Add support for ImageNet
        # Fail explicitly instead of hitting unbound conv/avgpool below
        raise ValueError(
            'There is no MobileNetV2 architecture for an input size {}'.format(
                input_size))

    return MobileNetV2(cfg, input_size, num_classes=output_size, conv=conv, avgpool=avgpool)
Esempio n. 19
0
    def __init__(self, layers, input_size, num_classes, batch_norm):
        """Build a VGG network adapted to the given input size.

        Supported input sizes are ``(1, 28, 28)``, ``(3, 32, 32)`` and
        ``(3, 64, 64)``; any other size raises ``ValueError``. For the
        ``(1, 28, 28)`` case the last entry of ``layers`` is dropped. The
        classifier is a single linear layer unless a hidden size is configured,
        in which case it is a three-layer perceptron with ReLU and dropout.
        """
        super(VGG, self).__init__()

        if input_size == (1, 28, 28):
            info('Using VGG architecture for MNIST')
            classifier = {'input': 512, 'hidden': None}
            layers = layers[:-1]  # Drop last maxpool
        elif input_size == (3, 32, 32):
            info('Using VGG architecture for CIFAR10/100')
            classifier = {'input': 512, 'hidden': None}
        elif input_size == (3, 64, 64):
            info('Using VGG architecture for TinyImageNet')
            classifier = {'input': 2048, 'hidden': 1024}
        # TODO: Add support for ImageNet
        else:
            raise ValueError(
                'There is no VGG architecture for an input size {}'.format(
                    input_size))

        n_channels = input_size[0]
        self.features = self.make_layers(n_channels, layers, batch_norm)

        if not classifier.get('hidden'):
            # No hidden layer configured: plain linear classifier
            self.classifier = nn.Linear(classifier['input'], num_classes)
        else:
            head = [
                nn.Linear(classifier['input'], classifier['hidden']),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(classifier['hidden'], classifier['hidden']),
                nn.ReLU(True),
                nn.Dropout(),
                nn.Linear(classifier['hidden'], num_classes),
            ]
            self.classifier = nn.Sequential(*head)

        self._initialize_weights()
Esempio n. 20
0
    def __init__(self, input_size, num_classes):
        """Build a LeNet network adapted to the given input size.

        Args:
            input_size: ``(channels, height, width)``; one of ``(1, 28, 28)``,
                ``(3, 32, 32)`` or ``(3, 64, 64)``.
            num_classes: number of outputs, either an ``int`` or a shape whose
                product is used.

        Raises:
            ValueError: if there is no architecture for ``input_size``.
        """
        super(LeNet, self).__init__()

        if not isinstance(num_classes, int):
            # numpy.product was removed in NumPy 2.0; numpy.prod is the
            # supported spelling. Convert to a plain int for the layer size.
            num_classes = int(numpy.prod(num_classes))

        n_channels = input_size[0]

        # input size -> (dataset name, pooling window and stride,
        #                number of features fed to fc1)
        variants = {
            (1, 28, 28): ('MNIST', 2, 50 * 4 * 4),
            (3, 32, 32): ('CIFAR10/100', 2, 50 * 5 * 5),
            (3, 64, 64): ('TinyImageNet', 3, 50 * 5 * 5),
        }

        key = tuple(input_size)
        if key not in variants:
            raise ValueError(
                'There is no LeNet architecture for an input size {}'.format(
                    input_size))

        dataset, pool, flat_features = variants[key]
        info(f'Using LeNet architecture for {dataset}')

        # Layers are created in the same order for every variant
        self.conv1 = nn.Conv2d(n_channels, 20, 5, 1)
        self.pool1 = nn.MaxPool2d(pool, pool)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.pool2 = nn.MaxPool2d(pool, pool)
        self.fc1 = nn.Linear(flat_features, 500)
        self.fc2 = nn.Linear(500, num_classes)
Esempio n. 21
0
def build(block, cfg, input_size, output_size):
    """Build a ResNet adapted to the given input size.

    Args:
        block: forwarded to ``ResNet``.
        cfg: forwarded to ``ResNet``.
        input_size: one of ``(1, 28, 28)``, ``(3, 32, 32)`` or ``(3, 64, 64)``.
        output_size: number of classes, either an ``int`` or a shape whose
            product is used.

    Raises:
        ValueError: if there is no configuration for ``input_size``.
    """
    if not isinstance(output_size, int):
        # numpy.product was removed in NumPy 2.0; numpy.prod is the supported
        # spelling. Convert to a plain int for the layer size.
        output_size = int(numpy.prod(output_size))

    if input_size == (1, 28, 28):
        info('Using PreActResNet architecture for MNIST')

        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}
    elif input_size == (3, 32, 32):
        info('Using PreActResNet architecture for CIFAR10/100')

        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}
    elif input_size == (3, 64, 64):
        info('Using PreActResNet architecture for TinyImageNet')

        conv = {'kernel_size': 7, 'stride': 2, 'padding': 3}
        avgpool = {'kernel_size': 2}
        maxpool = {'kernel_size': 3, 'stride': 2, 'padding': 1}
    else:
        # Add Resnet for ImageNet (3, 224, 224)!
        # Fail explicitly instead of hitting unbound conv/maxpool/avgpool below
        raise ValueError(
            'There is no ResNet architecture for an input size {}'.format(
                input_size))

    model = ResNet(block,
                   cfg,
                   input_size=input_size,
                   conv=conv,
                   maxpool=maxpool,
                   avgpool=avgpool,
                   num_classes=output_size)

    return model
Esempio n. 22
0
    def on_new_trial(self, task, step, parameters, uid):
        """Resume the trial from storage when a saved state exists.

        The trial id is ``parameters['uid']`` when present, else ``uid``, else
        an id derived from the task class name and the parameters. When a state
        is found the RNG states and the task state are restored; otherwise the
        trial metadata is stored and, if ``self.save_init`` is set, the initial
        task state is saved under ``init_<uid>``.
        """
        # Make a unique id for resuming
        self.uid = parameters.get('uid', uid)
        if self.uid is None:
            self.uid = unique_trial_id(task.__class__.__name__, parameters)

        state = self.storage.safe_load(self.uid, device=task.device)

        if state is None:
            # Nothing to resume from: record the trial metadata
            meta = dict(parameters=parameters, task=type(task).__name__)
            self.storage.save_meta(self.uid, meta)
            info(f'Starting a new (trial_id: {self.uid})')

            if self.save_init:
                initial_state = state_dict(task)
                self.storage.save(f'init_{self.uid}', initial_state)
            return

        set_rng_states(state['rng'])
        load_state_dict(task, state)
        info(f'Resuming (trial_id: {self.uid})')
Esempio n. 23
0
def build(block, cfg, input_size, output_size):
    """Build a PreActResNet adapted to the given input size.

    Args:
        block: forwarded to ``PreActResNet``.
        cfg: forwarded to ``PreActResNet``.
        input_size: one of ``(1, 28, 28)``, ``(3, 32, 32)`` or ``(3, 64, 64)``.
        output_size: forwarded to ``PreActResNet`` as ``num_classes``.

    Raises:
        ValueError: if there is no configuration for ``input_size``.
    """
    if input_size == (1, 28, 28):
        info('Using PreActResNet architecture for MNIST')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}
    elif input_size == (3, 32, 32):
        info('Using PreActResNet architecture for CIFAR10/100')
        conv = {'kernel_size': 3, 'stride': 1, 'padding': 1}
        avgpool = {'kernel_size': 4}
        maxpool = {}
    elif input_size == (3, 64, 64):
        info('Using PreActResNet architecture for TinyImageNet')
        conv = {'kernel_size': 7, 'stride': 2, 'padding': 3}
        avgpool = {'kernel_size': 2}
        maxpool = {'kernel_size': 3, 'stride': 2, 'padding': 1}
    else:
        # Fail explicitly instead of hitting unbound conv/maxpool/avgpool below
        raise ValueError(
            'There is no PreActResNet architecture for an input size {}'.format(
                input_size))

    return PreActResNet(block, cfg, input_size=input_size, num_classes=output_size, conv=conv,
                        maxpool=maxpool, avgpool=avgpool)
Esempio n. 24
0
 def wait(self):
     """Join and close every worker in ``self.workers``, logging each one."""
     for worker in self.workers:
         worker.join()
         worker.close()
         info(f'joining worker{worker}')