Code example #1
    def launch_inference(self):
        """Submit inference tasks for the yet-unlabelled samples"""

        self.logger.info('Beginning to submit inference tasks')
        # Make a folder for the models
        model_folder = self.output_dir.joinpath('models')
        model_folder.mkdir(exist_ok=True)

        # Submit the chunks to the workflow engine
        for mid in range(len(self.mpnns)):
            # Get a model that is ready for inference
            model = self.ready_models.get()

            # Convert it to a pickle-able message
            model_msg = MPNNMessage(model)

            # Proxy it once, to be used by all inference tasks
            model_msg_proxy = ps.store.get_store(self.ps_names['infer']).proxy(
                model_msg, key=f'model-{mid}-{self.inference_batch}')

            # Run inference with all segments available
            for cid, (chunk, chunk_msg) in enumerate(
                    zip(self.inference_chunks, self.inference_proxies)):
                self.queues.send_inputs([model_msg_proxy],
                                        chunk_msg,
                                        topic='infer',
                                        method='evaluate_mpnn',
                                        keep_inputs=False,
                                        task_info={
                                            'chunk_id': cid,
                                            'chunk_size': len(chunk),
                                            'model_id': mid
                                        })
        self.logger.info('Finished submitting molecules for inference')
Code example #2
    def train_models(self):
        """Train machine learning models"""
        self.start_training.clear()
        self.logger.info('Started retraining')

        for mid, model in enumerate(self.mpnns):
            # Wait until we have nodes
            if not self.rec.acquire('training', 1, cancel_if=self.done):
                # If unsuccessful, exit because we are finished
                return 

            # Build the training set: map each SMILES to its oxidation potential,
            # keeping only entries that have the target property
            train_data = dict(
                (d.identifier['smiles'], d.oxidation_potential[self.output_property])
                for d in self.database
                if self.output_property in d.oxidation_potential
            )

            # Submit the training task (fresh retrain or update of the current weights)
            if self.retrain_from_initial:
                self.queues.send_inputs(model.get_config(), train_data, method='retrain_mpnn', topic='train',
                                        task_info={'model_id': mid},
                                        keep_inputs=False,
                                        input_kwargs={'random_state': mid + self.random_seed})
            else:
                model_msg = MPNNMessage(model)
                self.queues.send_inputs(model_msg, train_data, method='update_mpnn', topic='train',
                                        task_info={'model_id': mid},
                                        keep_inputs=False,
                                        input_kwargs={'random_state': mid + self.random_seed})
            self.logger.info(f'Submitted model {mid} to train with {len(train_data)} entries')
        self.all_training_started.set()
Code example #3
File: test_mpnn.py Project: tskluzac/colmena
def test_evaluate(model, atom_types, bond_types):
    smiles = ['CC', 'CCC', 'CC=C', 'CCC']
    output = evaluate_mpnn(MPNNMessage(model),
                           smiles,
                           atom_types,
                           bond_types,
                           batch_size=2)
    assert output.size == 4
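    # smiles[1] and smiles[3] are both 'CCC', so their predictions should agree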
    assert isclose(output[1], output[3], abs_tol=1e-6)
Code example #4
File: test_mpnn.py Project: tskluzac/colmena
def test_training(model, atom_types, bond_types):
    smiles = ['CC', 'CCCC', 'CC=C', 'CCC']
    y = [1, 2, 3, 4]
    model.compile('adam', 'mean_squared_error')
    new_weights, history = update_mpnn(MPNNMessage(model),
                                       dict(zip(smiles, y)),
                                       4,
                                       atom_types,
                                       bond_types,
                                       validation_split=0.5)
    model.set_weights(new_weights)
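    # The third positional argument (4) appears to be the epoch count,
    # so the training history should hold four loss values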
    assert len(history['loss']) == 4
Code example #5
    def train_models(self):
        """Train machine learning models"""
        self.logger.info('Started retraining')

        # Set that a retraining event is in progress
        self.update_complete.clear()
        self.update_in_progress.set()
        self.num_training_complete = 0

        # Save the models as pickle-able messages
        model_msgs = [MPNNMessage(model) for model in self.mpnns]

        # If desired store them as proxies in a large batch
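        # (if no ProxyStore is configured for 'train', the raw MPNNMessage objects are sent)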
        if self.ps_names['train'] is not None:
            keys = [f'model-{mid}' for mid in range(len(self.mpnns))]
            model_msgs = ps.store.get_store(
                self.ps_names['train']).proxy_batch(model_msgs,
                                                    keys=keys,
                                                    strict=True)

        for mid, (model, model_msg) in enumerate(zip(self.mpnns, model_msgs)):
            # Build the training set: map each SMILES to its oxidation potential,
            # keeping only entries that have the target property
            train_data = dict((d.identifier['smiles'],
                               d.oxidation_potential[self.property_name])
                              for d in self.database
                              if self.property_name in d.oxidation_potential)

            # Submit the training task (fresh retrain or update of the current weights)
            if self.retrain_from_initial:
                self.queues.send_inputs(model.get_config(),
                                        train_data,
                                        method='retrain_mpnn',
                                        topic='train',
                                        task_info={'model_id': mid},
                                        keep_inputs=False,
                                        input_kwargs={'random_state': mid})
            else:
                self.queues.send_inputs(
                    model_msg,
                    train_data,
                    method='update_mpnn',
                    topic='train',
                    task_info={'model_id': mid},
                    keep_inputs=False,
                    input_kwargs={'random_state': mid})
            self.logger.info(
                f'Submitted model {mid} to train with {len(train_data)} entries'
            )
Code example #6
    def _create_message(self, path: Path) -> Union[MPNNMessage, TorchMessage]:
        """Create a message for a model at a certain path

        Args:
            path: Path to the model file
        Returns:
            Model in serializable format
        """
        if self.model_type == ModelType.MPNN:
            model = tf.keras.models.load_model(path,
                                               custom_objects=custom_objects)
            return MPNNMessage(model)
        elif self.model_type == ModelType.SCHNET:
            model = torch.load(path, map_location=torch.device('cpu'))
            return TorchMessage(model)
        else:
            raise NotImplementedError(
                f'Loading not implemented for {self.model_type}')
Code example #7
def test_train(train_dataset, model):
    # Make the MPNN into a message object
    model_msg = MPNNMessage(model)
    new_weights, history = update_mpnn(model_msg,
                                       train_dataset,
                                       2,
                                       validation_split=0.5)
    assert 'val_loss' in history
    assert len(new_weights) == len(model_msg.weights)

    # Try training from fresh
    new_weights, history = retrain_mpnn(model.get_config(),
                                        train_dataset,
                                        2,
                                        validation_split=0.5)
    assert 'val_loss' in history
    assert len(new_weights) == len(model_msg.weights)

    # Try training from fresh, with bootstrap
    new_weights, history = retrain_mpnn(model.get_config(),
                                        train_dataset,
                                        2,
                                        validation_split=0.5,
                                        bootstrap=True)
    assert 'val_loss' in history
    assert len(new_weights) == len(model_msg.weights)

    # Test with a timeout
    start_time = perf_counter()
    update_mpnn(model_msg, train_dataset, 512, validation_split=0.5, timeout=1)
    assert perf_counter() - start_time < 2

    # Test with a test set
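    # Passing test SMILES ('C', 'CC') yields a third return value with predictions for them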
    _, _, y_pred = \
        update_mpnn(model_msg, train_dataset, 512, ['C', 'CC'],
                    validation_split=0.5, timeout=1)
    assert np.array(y_pred).shape == (2, )
Code example #8
    def run(self):
        # Launch the "simulator" thread
        design_thread = Thread(target=self.simulation_dispatcher)
        design_thread.start()

        # Submit some initial molecules so that the simulator gets started immediately
        num_to_seed = self._task_queue.maxsize
        self.logger.info(f'Sending {num_to_seed} initial molecules')
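        # Seed tasks use priority (1, 0); the design loop below submits with
        # (-step_number, rank), which (in a priority queue) sorts ahead of leftover seeds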
        for smiles in sample(self.initial_search_space, num_to_seed):
            self._task_queue.put(((1, 0), smiles))

        # Perform the design loop iteratively
        step_number = 0
        while len(self.database) < self.n_evals:
            self.logger.info(f'Generating new molecules')

            # Update the MPNN
            self.queues.send_inputs(MPNNMessage(self.mpnn),
                                    self.database,
                                    4,
                                    method='update_mpnn',
                                    topic='ML')
            self.logger.info(
                f'Updating the model with training set size {len(self.database)}'
            )
            result = self.queues.get_result(topic='ML')
            new_weights, _ = result.value
            self.mpnn.set_weights(new_weights)

            # Use RL to generate new molecules
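            # Point the agent's reward function at the freshly updated MPNN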
            self.moldqn.env.reward_fn.model = self.mpnn
            self.queues.send_inputs(self.moldqn,
                                    method='generate_molecules',
                                    topic='ML')
            result = self.queues.get_result(topic='ML')
            new_molecules, self.moldqn = result.value  # Also update the RL agent
            self.logger.info(
                f'Generated {len(new_molecules)} candidate molecules')

            # Assign them scores
            self.queues.send_inputs(MPNNMessage(self.mpnn),
                                    new_molecules,
                                    method='evaluate_mpnn',
                                    topic='ML')
            result = self.queues.get_result(topic='ML')
            scores = result.value
            self.logger.info(f'Assigned scores to all molecules')

            # Pick a set of calculations to run
            #   Greedy selection for now
            task_options = [{
                'smiles': s,
                'pred_atom': e
            } for s, e in zip(new_molecules, scores)]
            selections = greedy_selection(task_options, self.n_parallel,
                                          lambda x: -x['pred_atom'])
            self.logger.info(f'Selected {len(selections)} new molecules')

            # Add requested simulations to the queue
            for rank, task in enumerate(selections):
                self._task_queue.put(
                    ((-step_number, rank),
                     task['smiles']))  # Sort by recency and then by best
            step_number += 1  # Increment the loop

        self.logger.info('No longer generating new candidates')
        self._gen_done.set()
Code example #9
File: run.py Project: exalearn/electrolyte-design
    def task_ranker(self):
        """Prioritize list of available tasks"""

        # Submit some initial molecules so that the simulator gets started immediately
        num_to_seed = self.queue_length
        self.logger.info(f'Sending {num_to_seed} initial molecules')
        for smiles in sample(self.search_space, num_to_seed):
            # We send: smiles, task_info
            self._task_queue.put((smiles, {
                'reason': 'initial',
                'batch': -1,
                'smiles': smiles
            }))

        # Perform the design loop iteratively
        batch_number = 0
        while not self._done.is_set():
            # Get the current copy of the search space
            search_space = self.search_space.copy()

            # Assign them scores
            with self._update_lock:
                self.queues.send_inputs(
                    [MPNNMessage(m) for m, _ in self.mpnns],
                    search_space,
                    method='evaluate_mpnn',
                    topic='rank')

                # Capture the training set of models used in this inference run
                training_sets = [s.copy() for _, s in self.mpnns]

            self.logger.info(f'Submitted inference task')
            result = self.queues.get_result(topic='rank')
            scores = result.value
            result.task_info = {
                'training_sets': training_sets
            }  # Record the training sets
            self._write_result(result,
                               'screen_records.jsonld',
                               keep_inputs=False,
                               keep_outputs=False)
            self.logger.info(f'Assigned scores to all {len(scores)} molecules')

            # Assign scores to each SMILES
            mean_score = scores.mean(axis=1)
            std_score = scores.std(axis=1)
            task_options = [{
                'smiles': s,
                'pred': float(m),
                'pred_std': float(u),
                'batch': batch_number
            } for s, m, u in zip(search_space, mean_score, std_score)]

            # Rank according to different metrics. Best at the right end (so .pop works)
            random_selections = task_options.copy()
            shuffle(random_selections)
            greedy_selections = sorted(task_options, key=lambda x: -x['pred'])
            uq_selections = sorted(task_options, key=lambda x: x['pred_std'])
            self.logger.info(
                'Sorted molecules by greedy, random and uncertainty selection.'
            )

            # Pick enough to fill the queue
            already_picked = set()
            selections = []
            while len(already_picked) < self.queue_length:
                # Make sure none of the lists are empty
                if min(
                        map(len, [
                            greedy_selections, random_selections, uq_selections
                        ])) == 0:
                    self.logger.info('Ran out of molecules to select from')
                    break

                # Pick a task
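                # Draw r in [0, 1); greedy_frac and random_frac partition the interval
                # among the greedy, random, and uncertainty-based strategies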
                r = random()
                if r < self.greedy_frac:
                    task = greedy_selections.pop()
                    task['reason'] = 'greedy'
                elif r < self.greedy_frac + self.random_frac:
                    task = random_selections.pop()
                    task['reason'] = 'random'
                else:
                    task = uq_selections.pop()
                    task['reason'] = 'uq'

                # If it is not yet selected
                if (task['smiles'] not in already_picked
                        and task['smiles'] not in self.database):
                    already_picked.add(task['smiles'])
                    selections.append(task)
            self.logger.info(f'Selected {len(selections)} new molecules')

            # Clear out the queue
            while not self._task_queue.empty():
                try:
                    self._task_queue.get_nowait()
                except Empty:
                    break
            self.logger.info('Cleared out the current queue')

            # Add requested simulations to the queue
            for rank, task in enumerate(selections):
                self._task_queue.put((task['smiles'], task))
            batch_number += 1  # Increment the loop
            self.logger.info('Added all of them to the task queue')
Code example #10
File: run.py Project: exalearn/electrolyte-design
    def model_updater(self):
        """Handle updating the ML models"""

        # Randomly order the MPNNs
        ready_to_retrain = list(range(len(self.mpnns)))
        shuffle(ready_to_retrain)
        ready_to_retrain = deque(ready_to_retrain)

        # Launch the first models to be updated
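        # Keep n_parallel_updating training tasks in flight: seed that many here,
        # then submit a replacement each time a result returns in the loop below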
        for _ in range(self.n_parallel_updating):
            ind = ready_to_retrain.popleft()
            mpnn = self.mpnns[ind]
            self.queues.send_inputs(MPNNMessage(mpnn[0]),
                                    self.database,
                                    4,
                                    method='update_mpnn',
                                    topic='update',
                                    task_info={
                                        'index': ind,
                                        'training_molecules': list(self.database.keys())
                                    })
            self.logger.info(f'Submitted model {ind} to be updated')

        # Make a directory to store updated models
        model_dir = os.path.join(self.output_dir, 'models')
        os.makedirs(model_dir, exist_ok=True)

        # Continually wait for new models to come back
        result_ind = 0
        while not self._done.is_set():
            # Wait for a model to be returned
            result = self.queues.get_result(topic='update')

            # Update the weights
            completed_ind = result.task_info['index']
            if result.success:
                new_weights, _ = result.value
                with self._update_lock:
                    self.mpnns[completed_ind][0].set_weights(new_weights)
                    self.mpnns[completed_ind][1] = result.task_info[
                        'training_molecules']
                self.logger.info(f'Updated weights for model {completed_ind}')
            else:
                self.logger.info(f'Retraining failed for model {completed_ind}')

            # Mark the model as ready to be updated again
            ready_to_retrain.append(completed_ind)

            # Submit another model to be updated
            ind = ready_to_retrain.popleft()
            mpnn = self.mpnns[ind]
            self.queues.send_inputs(MPNNMessage(mpnn[0]),
                                    self.database,
                                    4,
                                    method='update_mpnn',
                                    topic='update',
                                    task_info={
                                        'index': ind,
                                        'training_molecules': list(self.database.keys())
                                    })
            self.logger.info(f'Submitted model {ind} to be updated')

            # Save the results
            self._write_result(result,
                               'update_records.jsonld',
                               keep_inputs=False,
                               keep_outputs=False)

            # Save the updated model if re-training was successful
            result_ind += 1
            if result.success:
                model_name = os.path.join(
                    model_dir, f'{result_ind}_model_{completed_ind}.h5')
                self.logger.info(
                    f'Saving model {completed_ind} to disk as {model_name}')
                self.mpnns[completed_ind][0].save(model_name, include_optimizer=False)
                self.logger.info(
                    'Model saved. Waiting for next update task to complete')