def launch_inference(self):
    """Submit inference tasks for the yet-unlabelled samples

    One model is taken from ``self.ready_models`` per entry of ``self.mpnns``.
    Each model is wrapped in an ``MPNNMessage`` and proxied once through the
    store named by ``self.ps_names['infer']``; that proxy is then sent with
    every inference chunk as an ``evaluate_mpnn`` task on the ``infer`` topic.
    """
    self.logger.info('Beginning to submit inference tasks')

    # Make sure the folder for the models exists
    models_path = self.output_dir.joinpath('models')
    models_path.mkdir(exist_ok=True)

    n_models = len(self.mpnns)
    for mid in range(n_models):
        # Take the next model that is ready and make it pickle-able
        ready_model = self.ready_models.get()
        message = MPNNMessage(ready_model)

        # Proxy the model a single time; every chunk task reuses this proxy
        store = ps.store.get_store(self.ps_names['infer'])
        proxy_key = f'model-{mid}-{self.inference_batch}'
        shared_proxy = store.proxy(message, key=proxy_key)

        # Pair the model with every available segment
        chunk_pairs = zip(self.inference_chunks, self.inference_proxies)
        for cid, (chunk, chunk_msg) in enumerate(chunk_pairs):
            info = {
                'chunk_id': cid,
                'chunk_size': len(chunk),
                'model_id': mid
            }
            self.queues.send_inputs(
                [shared_proxy], chunk_msg,
                topic='infer', method='evaluate_mpnn',
                keep_inputs=False,
                task_info=info,
            )

    self.logger.info('Finished submitting molecules for inference')
def train_models(self):
    """Train machine learning models

    Clears ``self.start_training`` and then, for each model in ``self.mpnns``,
    acquires one slot from ``self.rec`` and submits a task on the ``train``
    topic. The task is ``retrain_mpnn`` with the model config when
    ``self.retrain_from_initial`` is truthy, and ``update_mpnn`` with an
    ``MPNNMessage`` of the model otherwise. Each task gets
    ``random_state = model index + self.random_seed``.

    Returns early, without setting ``self.all_training_started``, as soon as
    an acquisition is unsuccessful. Otherwise ``self.all_training_started`` is
    set after the last submission.
    """
    self.start_training.clear()
    self.logger.info('Started retraining')

    for mid, model in enumerate(self.mpnns):
        # Wait until we have nodes; if unsuccessful, exit because we are finished
        if not self.rec.acquire('training', 1, cancel_if=self.done):
            return

        # Make the database. It is rebuilt per model, after the acquisition,
        # so it holds whatever records are present at submission time.
        train_data = {
            d.identifier['smiles']: d.oxidation_potential[self.output_property]
            for d in self.database
            if self.output_property in d.oxidation_potential
        }

        # Only the first argument and the method name differ between the modes
        if self.retrain_from_initial:
            payload, method = model.get_config(), 'retrain_mpnn'
        else:
            payload, method = MPNNMessage(model), 'update_mpnn'

        self.queues.send_inputs(
            payload, train_data,
            method=method,
            topic='train',
            task_info={'model_id': mid},
            keep_inputs=False,
            input_kwargs={'random_state': mid + self.random_seed},
        )
        self.logger.info(f'Submitted model {mid} to train with {len(train_data)} entries')

    self.all_training_started.set()
def test_evaluate(model, atom_types, bond_types):
    """Check the output size of ``evaluate_mpnn`` and its consistency on a repeated SMILES"""
    # 'CCC' is listed twice on purpose (positions 1 and 3)
    molecules = ['CC', 'CCC', 'CC=C', 'CCC']
    message = MPNNMessage(model)

    predictions = evaluate_mpnn(message, molecules, atom_types, bond_types,
                                batch_size=2)

    # One value per input molecule
    assert predictions.size == 4

    # The two 'CCC' entries must agree to within the tolerance
    first_ccc, second_ccc = predictions[1], predictions[3]
    assert isclose(first_ccc, second_ccc, abs_tol=1e-6)
def test_training(model, atom_types, bond_types):
    """Run ``update_mpnn`` on a four-molecule set and check the loss history length"""
    # SMILES -> target value
    train_set = {'CC': 1, 'CCCC': 2, 'CC=C': 3, 'CCC': 4}

    model.compile('adam', 'mean_squared_error')
    message = MPNNMessage(model)
    new_weights, history = update_mpnn(message, train_set, 4,
                                       atom_types, bond_types,
                                       validation_split=0.5)

    # The returned weights must be loadable into the original model
    model.set_weights(new_weights)

    loss_trace = history['loss']
    assert len(loss_trace) == 4
def train_models(self):
    """Train machine learning models

    Marks a retraining event as in progress (clears ``self.update_complete``,
    sets ``self.update_in_progress``, resets ``self.num_training_complete``)
    and submits one task on the ``train`` topic per model in ``self.mpnns``.

    When ``self.retrain_from_initial`` is truthy each task is ``retrain_mpnn``
    with the model config. Otherwise each task is ``update_mpnn`` with an
    ``MPNNMessage`` of the model, proxied in one batch through the store named
    by ``self.ps_names['train']`` when that name is not ``None``.

    The training dictionary (SMILES -> ``oxidation_potential[property_name]``)
    is built once and passed to every task. Model messages and proxies are
    only created on the ``update_mpnn`` path, the only path that sends them.
    """
    self.logger.info('Started retraining')

    # Set that a retraining event is in progress
    self.update_complete.clear()
    self.update_in_progress.set()
    self.num_training_complete = 0

    from_scratch = self.retrain_from_initial

    # Pickle-able model messages are only needed when updating existing models
    model_msgs = None
    if not from_scratch:
        model_msgs = [MPNNMessage(model) for model in self.mpnns]

        # If desired store them as proxies in a large batch
        if self.ps_names['train'] is not None:
            keys = [f'model-{mid}' for mid in range(len(self.mpnns))]
            store = ps.store.get_store(self.ps_names['train'])
            model_msgs = store.proxy_batch(model_msgs, keys=keys, strict=True)

    # Make the database once: nothing inside the loop below modifies it.
    # NOTE(review): all tasks now receive the same dict object; this assumes
    # send_inputs does not mutate its arguments - confirm.
    train_data = {
        d.identifier['smiles']: d.oxidation_potential[self.property_name]
        for d in self.database
        if self.property_name in d.oxidation_potential
    }

    for mid, model in enumerate(self.mpnns):
        if from_scratch:
            payload, method = model.get_config(), 'retrain_mpnn'
        else:
            payload, method = model_msgs[mid], 'update_mpnn'

        self.queues.send_inputs(
            payload, train_data,
            method=method,
            topic='train',
            task_info={'model_id': mid},
            keep_inputs=False,
            input_kwargs={'random_state': mid},
        )
        self.logger.info(
            f'Submitted model {mid} to train with {len(train_data)} entries'
        )
def _create_message(self, path: Path) -> Union[MPNNMessage, TorchMessage]:
    """Load the model stored at a path and wrap it in a message object

    Args:
        path: Path to the model file
    Returns:
        Model in serializable format
    Raises:
        NotImplementedError: If ``self.model_type`` is neither
            ``ModelType.MPNN`` nor ``ModelType.SCHNET``
    """
    kind = self.model_type

    # Keras models are loaded with the project's custom objects
    if kind == ModelType.MPNN:
        keras_model = tf.keras.models.load_model(path, custom_objects=custom_objects)
        return MPNNMessage(keras_model)

    # Torch models are mapped onto the CPU while loading
    if kind == ModelType.SCHNET:
        cpu = torch.device('cpu')
        torch_model = torch.load(path, map_location=cpu)
        return TorchMessage(torch_model)

    raise NotImplementedError(
        f'Loading not implemented for {self.model_type}')
def test_train(train_dataset, model):
    """Exercise ``update_mpnn`` and ``retrain_mpnn``: plain, bootstrap, timeout and test-set runs"""
    # Make the MPNN into a message object
    message = MPNNMessage(model)

    def check_outputs(weights, history):
        # A validation loss was recorded and the number of weight arrays is unchanged
        assert 'val_loss' in history
        assert len(weights) == len(message.weights)

    # Update the existing model
    weights, history = update_mpnn(message, train_dataset, 2,
                                   validation_split=0.5)
    check_outputs(weights, history)

    # Try training from fresh
    weights, history = retrain_mpnn(model.get_config(), train_dataset, 2,
                                    validation_split=0.5)
    check_outputs(weights, history)

    # Try training from fresh, with bootstrap
    weights, history = retrain_mpnn(model.get_config(), train_dataset, 2,
                                    validation_split=0.5, bootstrap=True)
    check_outputs(weights, history)

    # Test with a timeout
    began = perf_counter()
    update_mpnn(message, train_dataset, 512, validation_split=0.5, timeout=1)
    elapsed = perf_counter() - began
    assert elapsed < 2

    # Test with a test set: a third return value holds its predictions
    outputs = update_mpnn(message, train_dataset, 512, ['C', 'CC'],
                          validation_split=0.5, timeout=1)
    _, _, y_pred = outputs
    assert np.array(y_pred).shape == (2, )
def run(self):
    """Drive the design loop until the database holds ``self.n_evals`` entries

    Starts ``self.simulation_dispatcher`` in a thread, seeds the task queue
    with randomly sampled molecules, and then repeats: update the MPNN,
    generate molecules with the RL agent, score them, and enqueue the greedy
    selections. ``self._gen_done`` is set when the loop exits.
    """
    # Launch the "simulator" thread
    dispatcher = Thread(target=self.simulation_dispatcher)
    dispatcher.start()

    # Seed the queue so that the simulator gets started immediately
    seed_count = self._task_queue.maxsize
    self.logger.info(f'Sending {seed_count} initial molecules')
    for seed_smiles in sample(self.initial_search_space, seed_count):
        self._task_queue.put(((1, 0), seed_smiles))

    def negated_prediction(option):
        # Key function handed to greedy_selection
        return -option['pred_atom']

    # Perform the design loop iteratively
    step_number = 0
    while len(self.database) < self.n_evals:
        self.logger.info('Generating new molecules')

        # Update the MPNN
        self.queues.send_inputs(MPNNMessage(self.mpnn), self.database, 4,
                                method='update_mpnn', topic='ML')
        self.logger.info(
            f'Updating the model with training set size {len(self.database)}'
        )
        updated_weights, _ = self.queues.get_result(topic='ML').value
        self.mpnn.set_weights(updated_weights)

        # Use RL to generate new molecules, scored by the refreshed model
        self.moldqn.env.reward_fn.model = self.mpnn
        self.queues.send_inputs(self.moldqn, method='generate_molecules',
                                topic='ML')
        # The task also returns the RL agent, which replaces ours
        new_molecules, self.moldqn = self.queues.get_result(topic='ML').value
        self.logger.info(
            f'Generated {len(new_molecules)} candidate molecules')

        # Assign them scores
        self.queues.send_inputs(MPNNMessage(self.mpnn), new_molecules,
                                method='evaluate_mpnn', topic='ML')
        scores = self.queues.get_result(topic='ML').value
        self.logger.info('Assigned scores to all molecules')

        # Pick a set of calculations to run (greedy selection for now)
        task_options = []
        for smiles, score in zip(new_molecules, scores):
            task_options.append({'smiles': smiles, 'pred_atom': score})
        selections = greedy_selection(task_options, self.n_parallel,
                                      negated_prediction)
        self.logger.info(f'Selected {len(selections)} new molecules')

        # Add requested simulations to the queue.
        # The priority tuple sorts by recency and then by best.
        for rank, task in enumerate(selections):
            priority = (-step_number, rank)
            self._task_queue.put((priority, task['smiles']))

        # Increment the loop
        step_number += 1

    self.logger.info('No longer generating new candidates')
    self._gen_done.set()
def task_ranker(self):
    """Prioritize list of available tasks

    First seeds ``self._task_queue`` with ``self.queue_length`` molecules
    sampled from ``self.search_space``. Then, until ``self._done`` is set,
    repeatedly:

    1. submits an ``evaluate_mpnn`` task for a copy of the search space and
       records the training sets of the models used,
    2. turns the returned scores into per-molecule mean and standard deviation,
    3. picks tasks from the greedy, random and uncertainty orderings
       (chosen with ``self.greedy_frac`` / ``self.random_frac``) until
       ``self.queue_length`` distinct molecules not in ``self.database`` are
       selected or one ordering runs out, and
    4. replaces the contents of ``self._task_queue`` with the selections.

    The ``'reason'`` of a task is recorded only when the task is accepted.
    The three orderings share the same task dictionaries, so writing it on
    every pop would overwrite the reason of a task already selected through a
    different ordering.
    """
    # Submit some initial molecules so that the simulator gets started immediately
    num_to_seed = self.queue_length
    self.logger.info(f'Sending {num_to_seed} initial molecules')
    for smiles in sample(self.search_space, num_to_seed):
        # We send: smiles, task_info
        self._task_queue.put((smiles, {
            'reason': 'initial',
            'batch': -1,
            'smiles': smiles
        }))

    # Perform the design loop iteratively
    batch_number = 0
    while not self._done.is_set():
        # Get the current copy of the search space
        search_space = self.search_space.copy()

        # Assign them scores
        with self._update_lock:
            self.queues.send_inputs(
                [MPNNMessage(m) for m, _ in self.mpnns],
                search_space,
                method='evaluate_mpnn',
                topic='rank')

            # Capture the training set of models used in this inference run
            training_sets = [s.copy() for _, s in self.mpnns]
        self.logger.info('Submitted inference task')
        result = self.queues.get_result(topic='rank')
        scores = result.value

        # Record the training sets
        result.task_info = {'training_sets': training_sets}
        self._write_result(result, 'screen_records.jsonld',
                           keep_inputs=False, keep_outputs=False)
        self.logger.info(f'Assigned scores to all {len(scores)} molecules')

        # Assign scores to each SMILES
        mean_score = scores.mean(axis=1)
        std_score = scores.std(axis=1)
        task_options = [{
            'smiles': s,
            'pred': float(m),
            'pred_std': float(u),
            'batch': batch_number
        } for s, m, u in zip(search_space, mean_score, std_score)]

        # Rank according to different metrics; tasks are taken from the right
        # end with .pop(). Greedy is sorted by descending 'pred' (pop gives
        # the smallest prediction), uncertainty by ascending 'pred_std' (pop
        # gives the largest spread).
        random_selections = task_options.copy()
        shuffle(random_selections)
        greedy_selections = sorted(task_options, key=lambda x: -x['pred'])
        uq_selections = sorted(task_options, key=lambda x: x['pred_std'])
        self.logger.info(
            'Sorted molecules by greedy, random and uncertainty selection.'
        )

        # Pick enough to fill the queue
        pools = {
            'greedy': greedy_selections,
            'random': random_selections,
            'uq': uq_selections,
        }
        already_picked = set()
        selections = []
        while len(already_picked) < self.queue_length:
            # Make sure none of the lists are empty
            if not all(pools.values()):
                self.logger.info('Ran out of molecules to select from')
                break

            # Pick which ordering supplies the next task
            r = random()
            if r < self.greedy_frac:
                reason = 'greedy'
            elif r < self.greedy_frac + self.random_frac:
                reason = 'random'
            else:
                reason = 'uq'
            task = pools[reason].pop()

            # Keep it only if it is not yet selected and not already in the
            # database. The reason is written here, on acceptance, because
            # the same dict object lives in all three pools.
            smiles = task['smiles']
            if smiles not in already_picked and smiles not in self.database:
                task['reason'] = reason
                already_picked.add(smiles)
                selections.append(task)
        self.logger.info(f'Selected {len(selections)} new molecules')

        # Clear out the queue
        while not self._task_queue.empty():
            try:
                self._task_queue.get_nowait()
            except Empty:
                break
        self.logger.info('Cleared out the current queue')

        # Add requested simulations to the queue
        for task in selections:
            self._task_queue.put((task['smiles'], task))

        # Increment the loop
        batch_number += 1
        self.logger.info('Added all of them the task queue')
def model_updater(self):
    """Handle updating the ML models

    Submits ``update_mpnn`` tasks on the ``update`` topic for the models in
    ``self.mpnns`` (entries are indexed as ``[model, training molecules]``),
    visiting them in a random order. At most ``self.n_parallel_updating``
    tasks are launched up front, and never more than there are models. Then,
    until ``self._done`` is set, each returned result is handled as follows:

    * on success the model weights and its training-molecule list are
      replaced under ``self._update_lock``,
    * the model is put back at the end of the retraining order and the next
      model in line is submitted,
    * the result is written to ``update_records.jsonld``, and
    * on success the model is saved under ``<output_dir>/models``.
    """

    def submit_update(ind):
        # Send the model at ``ind`` together with the current database
        mpnn = self.mpnns[ind]
        self.queues.send_inputs(MPNNMessage(mpnn[0]), self.database, 4,
                                method='update_mpnn', topic='update',
                                task_info={
                                    'index': ind,
                                    'training_molecules': list(self.database.keys())
                                })
        self.logger.info(f'Submitted model {ind} to be updated')

    # Randomly order the MPNNs
    ready_to_retrain = list(range(len(self.mpnns)))
    shuffle(ready_to_retrain)
    ready_to_retrain = deque(ready_to_retrain)

    # Launch the first models to be updated. The count is clamped so that
    # asking for more parallel updates than models does not pop from an
    # empty deque.
    n_initial = min(self.n_parallel_updating, len(ready_to_retrain))
    for _ in range(n_initial):
        submit_update(ready_to_retrain.popleft())

    # Make a directory to store updated models
    model_dir = os.path.join(self.output_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    # Continually wait for new models to come back
    result_ind = 0
    while not self._done.is_set():
        # Wait for a model to be returned
        result = self.queues.get_result(topic='update')

        # Update the weights
        completed_ind = result.task_info['index']
        if result.success:
            new_weights, _ = result.value
            with self._update_lock:
                self.mpnns[completed_ind][0].set_weights(new_weights)
                self.mpnns[completed_ind][1] = result.task_info[
                    'training_molecules']
            self.logger.info(f'Updated weights for model {completed_ind}')
        else:
            self.logger.info(f'Retraining failed for model {completed_ind}')

        # Mark the model as ready to be updated again
        ready_to_retrain.append(completed_ind)

        # Submit another model to be updated; the deque is never empty here
        # because an index was appended just above
        submit_update(ready_to_retrain.popleft())

        # Save the results
        self._write_result(result, 'update_records.jsonld',
                           keep_inputs=False, keep_outputs=False)

        # Save the updated model, if re-training was successful
        result_ind += 1
        if result.success:
            model_name = os.path.join(
                model_dir, f'{result_ind}_model_{completed_ind}.h5')
            self.logger.info(
                f'Saving model {completed_ind} to disk as {model_name}')
            self.mpnns[completed_ind][0].save(model_name,
                                              include_optimizer=False)
            self.logger.info(
                'Model saved. Waiting for next update task to complete')