def __init__(self, args: InterpretArgs) -> None:
    """
    Load the training arguments, scalers and model checkpoints used for interpretation.

    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    :raises ValueError: If features were used during training but no features generator
                        was supplied in :code:`args`.
    :raises NotImplementedError: If the training arguments report additional atom or bond features.
    """
    self.args = args
    first_checkpoint = args.checkpoint_paths[0]
    self.train_args = load_args(first_checkpoint)
    train_args = self.train_args

    # If features were used during training, they must be used when predicting
    trained_with_features = (train_args.features_path is not None
                             or train_args.features_generator is not None)
    if trained_with_features and args.features_generator is None:
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with --features_generator <generator> '
            'and using --no_features_scaling if applicable).')

    # Extra atom/bond features are rejected outright
    has_extra_atom_or_bond_features = (train_args.atom_descriptors_size > 0
                                       or train_args.atom_features_size > 0
                                       or train_args.bond_features_size > 0)
    if has_extra_atom_or_bond_features:
        raise NotImplementedError(
            'The interpret function does not yet work with additional atom or bond features'
        )

    # Scalers are read from the first checkpoint only
    (self.scaler,
     self.features_scaler,
     self.atom_descriptor_scaler,
     self.bond_feature_scaler) = load_scalers(first_checkpoint)

    self.checkpoints = []
    for checkpoint_path in args.checkpoint_paths:
        self.checkpoints.append(load_checkpoint(checkpoint_path, device=args.device))
def load_model(args: PredictArgs, generator: bool = False):
    """
    Load a model or an ensemble of models, together with their scalers, from file.

    When :code:`generator` is True the models and scalers are returned as lazy generators
    (memory efficient); otherwise they are materialised into lists (all models are held
    in memory, which is necessary for preloading).

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param generator: A boolean to return a generator instead of a list of models and scalers.
    :return: A tuple of updated prediction arguments, training arguments, a list or generator
             object of models, a list or generator object of scalers, the number of tasks and
             their respective names.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks = train_args.num_tasks
    task_names = train_args.task_names

    # Merge the training arguments into the prediction arguments (in place)
    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    # Load model and scalers
    checkpoint_paths = args.checkpoint_paths
    if generator:
        models = (load_checkpoint(path, device=args.device) for path in checkpoint_paths)
        scalers = (load_scalers(path) for path in checkpoint_paths)
    else:
        models = [load_checkpoint(path, device=args.device) for path in checkpoint_paths]
        scalers = [load_scalers(path) for path in checkpoint_paths]

    return args, train_args, models, scalers, num_tasks, task_names
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using a single checkpoint.

    :param checkpoint_path: Path to the checkpoint from which the scalers, the training
                            arguments and the model are loaded.
    :param smile: The SMILES string to make a prediction on.
    :return: :code:`sum_preds[0][0]`, i.e. the first entry of the prediction for the molecule,
             or :code:`[None]` if the SMILES string did not yield a valid molecule.
    """
    smiles = [smile]
    args = Namespace()

    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # The original guarded this call with `if smiles is not None`, which is always true
    # for a one-element list, so the unreachable fallback branch has been dropped.
    test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)

    # Keep only the datapoints whose SMILES produced a molecule
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case: nothing valid to predict on
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Allocate the prediction accumulator (multiclass carries an extra class axis)
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model,
                          data=test_data,
                          batch_size=1,
                          scaler=scaler)
    sum_preds += np.array(model_preds)

    return sum_preds[0][0]
def __init__(self, checkpoint_dir):
    """
    Load every ``.pt`` checkpoint found anywhere under a directory tree.

    The scalers and training arguments are re-read for each checkpoint, so the values
    kept on the instance are those of the last checkpoint visited. If no ``.pt`` file
    is found, only :code:`self.checkpoints` (empty) is set.

    :param checkpoint_dir: Root directory that is walked recursively for ``.pt`` files.
    """
    self.checkpoints = []
    for dirpath, _, filenames in os.walk(checkpoint_dir):
        for filename in filenames:
            if not filename.endswith('.pt'):
                continue
            checkpoint_path = os.path.join(dirpath, filename)
            self.scaler, self.features_scaler = load_scalers(checkpoint_path)
            self.train_args = load_args(checkpoint_path)
            self.checkpoints.append(load_checkpoint(checkpoint_path, cuda=True))
def __init__(self, args: InterpretArgs) -> None:
    """
    Load the training arguments, scalers and model checkpoints used for interpretation.

    :param args: The interpretation arguments; :code:`checkpoint_paths`, :code:`device`
                 and :code:`features_generator` are read from it.
    :raises ValueError: If features were used during training but no features generator
                        was supplied in :code:`args`.
    """
    self.args = args
    first_checkpoint = args.checkpoint_paths[0]
    self.train_args = load_args(first_checkpoint)

    # If features were used during training, they must be used when predicting
    trained_with_features = (self.train_args.features_path is not None
                             or self.train_args.features_generator is not None)
    if trained_with_features and args.features_generator is None:
        raise ValueError('Features were used during training so they must be specified again during prediction '
                         'using the same type of features as before (with --features_generator <generator> '
                         'and using --no_features_scaling if applicable).')

    # Scalers are read from the first checkpoint only
    self.scaler, self.features_scaler = load_scalers(first_checkpoint)

    self.checkpoints = []
    for checkpoint_path in args.checkpoint_paths:
        self.checkpoints.append(load_checkpoint(checkpoint_path, device=args.device))
def __init__(self, checkpoint_paths: List[str], device: torch.device) -> None:
    """
    Load the training arguments, scalers and model checkpoints.

    :param checkpoint_paths: Paths of the checkpoints to load; the training arguments
                             and scalers are taken from the first one.
    :param device: The device passed to :code:`load_checkpoint` for every model.
    :raises ValueError: If the training arguments have a features path but no features generator.
    """
    first_checkpoint = checkpoint_paths[0]
    self.train_args = load_args(first_checkpoint)

    uses_features_path = self.train_args.features_path is not None
    lacks_generator = self.train_args.features_generator is None
    if uses_features_path and lacks_generator:
        raise ValueError(
            'Must specify features generator using --features_generator <generator> '
            'when using a model trained with additional features.')

    self.scaler, self.features_scaler = load_scalers(first_checkpoint)

    self.checkpoints = []
    for checkpoint_path in checkpoint_paths:
        self.checkpoints.append(load_checkpoint(checkpoint_path, device=device))
def upload_checkpoint(return_page: str):
    """
    Uploads a checkpoint .pt file.

    The upload is saved to a temporary file, its training arguments are read to fill in
    the database record, and the temporary file is then copied into the checkpoint folder
    under the newly assigned model id.

    :param return_page: The name of the page to render after uploading the checkpoint file.
    :return: A redirect to :code:`return_page` carrying JSON-encoded warnings and errors.
    """
    upload_warnings, upload_errors = [], []

    # Use DEFAULT as current user if the client's cookie is not set.
    current_user = request.cookies.get('currentUser') or app.config['DEFAULT_USER_ID']

    ckpt = request.files['checkpoint']
    ckpt_name = request.form['checkpointName']

    # Create temporary file to get ckpt_args without losing data.
    with NamedTemporaryFile() as temp_file:
        ckpt.save(temp_file.name)

        # NOTE(review): the file object (not temp_file.name) is handed to load_args,
        # exactly as in the original - confirm load_args accepts file-like objects.
        ckpt_args = load_args(temp_file)

        ckpt_id, new_ckpt_name = db.insert_ckpt(ckpt_name,
                                                current_user,
                                                ckpt_args.dataset_type,
                                                ckpt_args.epochs,
                                                1,
                                                ckpt_args.train_data_size)
        model_id = db.insert_model(ckpt_id)
        model_path = os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model_id}.pt')

        # The database may have renamed the checkpoint to avoid a clash
        if new_ckpt_name != ckpt_name:
            upload_warnings.append(
                name_already_exists_message('Checkpoint', ckpt_name, new_ckpt_name))

        # Copy while the temporary file still exists (it is removed on leaving the block)
        shutil.copy(temp_file.name, model_path)

    return redirect(
        url_for(return_page,
                checkpoint_upload_warnings=json.dumps(upload_warnings),
                checkpoint_upload_errors=json.dumps(upload_errors)))
def __init__(self, args: Namespace):
    """
    Set up either a computed-property scorer or an ensemble of chemprop models.

    If :code:`args.computed_prop` is given, only :code:`self.computed_prop` and
    :code:`self.scorer` are set and initialisation stops there. Otherwise every ``.pt``
    file under :code:`args.chemprop_dir` is loaded as a model.

    :param args: Arguments; reads :code:`computed_prop`, :code:`chemprop_dir`,
                 :code:`batch_size`, :code:`features_generator`, :code:`neg_threshold`
                 and :code:`prop_index`.
    :raises ValueError: If :code:`args.computed_prop` is not one of the supported names.
    """
    if args.computed_prop is not None:
        self.computed_prop = True
        scorers = {
            'penalized_logp': penalized_logp,
            'logp': logp,
            'qed': qed,
            'sascore': sascore,
            'drd2': drd2,
        }
        if args.computed_prop not in scorers:
            raise ValueError
        self.scorer = scorers[args.computed_prop]
        return

    self.computed_prop = False

    # Gather every checkpoint file below the chemprop directory
    chemprop_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(args.chemprop_dir)
        for filename in filenames
        if filename.endswith('.pt')
    ]

    # Scalers and training arguments come from the first checkpoint found
    first_path = chemprop_paths[0]
    self.scaler, self.features_scaler = load_scalers(first_path)
    self.train_args = load_args(first_path)

    if self.train_args.features_path is not None:
        self.train_args.features_path = None
        self.train_args.features_generator = ['rdkit_2d_normalized']  # just assume this

    self.num_tasks = self.train_args.num_tasks
    self.batch_size = args.batch_size * 4

    if args.features_generator is not None:
        self.features_generator = get_features_generator(args.features_generator[0])
    else:
        self.features_generator = None

    self.neg_threshold = args.neg_threshold
    self.prop_index = args.prop_index

    self.chemprop_models = [
        load_checkpoint(checkpoint_path, cuda=True)
        for checkpoint_path in chemprop_paths
    ]
def predict():
    """
    Renders the predict page and makes predictions if the method is POST.

    SMILES are taken, in order of preference, from the text box, the drawing widget or an
    uploaded data file. Predictions are made with every model of the selected checkpoint.

    :return: The rendered predict page.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [
        os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
        for model in models
    ]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += [
            '--features_generator', 'rdkit_2d_normalized',
            '--no_features_scaling'
        ]
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions
    preds = make_predictions(args=args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any SMILES was invalid BEFORE the Nones are replaced below;
    # checking `None in preds` afterwards can never be true, so the warning was never shown.
    has_invalid_smiles = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [
        pred if pred is not None else [invalid_smiles_warning] * num_tasks
        for pred in preds
    ]

    return render_predict(
        predicted=True,
        smiles=smiles,
        num_smiles=min(10, len(smiles)),
        show_more=max(0, len(smiles) - 10),
        task_names=task_names,
        num_tasks=len(task_names),
        preds=preds,
        warnings=["List contains invalid SMILES strings"] if has_invalid_smiles else None,
        errors=["No SMILES strings given"] if len(preds) == 0 else None)
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions with uncertainty estimates and writes them to :code:`args.preds_path`.

    If smiles is provided, makes predictions on smiles.
    Otherwise makes predictions on args.test_data.

    Every checkpoint in :code:`args.checkpoint_paths` is evaluated and the results are
    averaged. With :code:`args.estimate_variance` the epistemic uncertainty is the variance
    of the predictions across the ensemble; otherwise it is the mean of the per-model
    epistemic uncertainties.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions, with :code:`None` in the position of
             every invalid SMILES.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         args=args)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    if args.use_compound_names:
        # Rows are written per entry of full_data, so the names must come from full_data
        # as well; taking them from the filtered test_data misaligns (or overruns) them
        # as soon as one SMILES is invalid.
        compound_names = full_data.compound_names()
    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    num_models = len(args.checkpoint_paths)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        accumulator_shape = (len(test_data), args.num_tasks, args.multiclass_num_classes)
    else:
        accumulator_shape = (len(test_data), args.num_tasks)
    sum_preds = np.zeros(accumulator_shape)
    sum_ale_uncs = np.zeros(accumulator_shape)
    sum_epi_uncs = np.zeros(accumulator_shape)

    # Partial results for variance robust calculation.
    all_preds = np.zeros((len(test_data), args.num_tasks, num_models))

    print(f'Predicting with an ensemble of {num_models} models')
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=num_models)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds, ale_uncs, epi_uncs = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler,
            sampling_size=args.sampling_size)
        sum_preds += np.array(model_preds)
        # A model may not provide one or both uncertainty estimates
        if ale_uncs is not None:
            sum_ale_uncs += np.array(ale_uncs)
        if epi_uncs is not None:
            sum_epi_uncs += np.array(epi_uncs)
        if args.estimate_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions: preds <- mean(preds), ale_uncs <- mean(ale_uncs)
    avg_preds = (sum_preds / num_models).tolist()
    avg_ale_uncs = (sum_ale_uncs / num_models).tolist()
    if args.estimate_variance:
        # Use ensemble variance to estimate uncertainty: epi_uncs <- var(preds).
        # This overwrites existing epistemic uncertainty estimates.
        avg_epi_uncs = np.var(all_preds, axis=2).tolist()
    else:
        # Use another method to estimate uncertainty: epi_uncs <- mean(epi_uncs)
        avg_epi_uncs = (sum_epi_uncs / num_models).tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    assert len(test_data) == len(avg_ale_uncs)
    assert len(test_data) == len(avg_epi_uncs)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    full_ale_uncs = [None] * len(full_data)
    full_epi_uncs = [None] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]
        full_ale_uncs[si] = avg_ale_uncs[i]
        full_epi_uncs[si] = avg_epi_uncs[i]
    avg_preds = full_preds
    avg_ale_uncs = full_ale_uncs
    avg_epi_uncs = full_epi_uncs
    test_smiles = full_data.smiles()

    is_multiclass = args.dataset_type == 'multiclass'

    # Write predictions (newline='' is what the csv module expects of its file object)
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)

        header = []
        if args.use_compound_names:
            header.append('compound_names')
        header.append('smiles')
        if is_multiclass:
            for name in args.task_names:
                for i in range(args.multiclass_num_classes):
                    header.append(name + '_class' + str(i))
        else:
            header.extend(args.task_names)
            header.extend([tn + "_ale_unc" for tn in args.task_names])
            header.extend([tn + "_epi_unc" for tn in args.task_names])
        writer.writerow(header)

        for i in range(len(avg_preds)):
            row = []
            if args.use_compound_names:
                row.append(compound_names[i])
            row.append(test_smiles[i])

            if avg_preds[i] is not None:
                if is_multiclass:
                    for task_probs in avg_preds[i]:
                        row.extend(task_probs)
                else:
                    row.extend(avg_preds[i])
                    row.extend(avg_ale_uncs[i])
                    row.extend(avg_epi_uncs[i])
            else:
                if is_multiclass:
                    row.extend([''] * args.num_tasks * args.multiclass_num_classes)
                else:
                    # Both the prediction, the aleatoric uncertainty and the
                    # epistemic uncertainty are None
                    row.extend([''] * 3 * args.num_tasks)

            writer.writerow(row)

    return avg_preds
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    Predictions of all checkpoints are averaged; with :code:`args.ensemble_variance` the
    variance across the ensemble is written next to each prediction. Results are saved
    as CSV to :code:`args.preds_path`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print("Loading training args")
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks = train_args.num_tasks
    task_names = train_args.task_names

    update_prediction_args(predict_args=args, train_args=train_args)
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == "feature":
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    # set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print("Loading data")
    if smiles is None:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )
    else:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )

    print("Validating SMILES")
    # Maps an index in full_data to its index in the filtered (valid-only) dataset
    full_to_valid_indices = {}
    for full_index in range(len(full_data)):
        if any(mol is None for mol in full_data[full_index].mol):
            continue
        full_to_valid_indices[full_index] = len(full_to_valid_indices)

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f"Test size = {len(test_data):,}")

    num_models = len(args.checkpoint_paths)

    # Predict with each model individually and sum predictions
    if args.dataset_type == "multiclass":
        sum_shape = (len(test_data), num_tasks, args.multiclass_num_classes)
    else:
        sum_shape = (len(test_data), num_tasks)
    sum_preds = np.zeros(sum_shape)

    # Create data loader
    loader_workers = 0 if sys.platform == "darwin" else args.num_workers
    test_data_loader = MoleculeDataLoader(
        dataset=test_data,
        batch_size=args.batch_size,
        num_workers=loader_workers,
    )

    # Partial results for variance robust calculation.
    if args.ensemble_variance:
        all_preds = np.zeros((len(test_data), num_tasks, num_models))

    print(f"Predicting with an ensemble of {num_models} models")
    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=num_models)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        loaded_scalers = load_scalers(checkpoint_path)
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = loaded_scalers

        # Normalize features
        needs_scaling = (args.features_scaling
                         or train_args.atom_descriptor_scaling
                         or train_args.bond_feature_scaling)
        if needs_scaling:
            # Undo the previous model's scaling before applying this model's scalers
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if (train_args.atom_descriptor_scaling
                    and args.atom_descriptors is not None):
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)
        if args.ensemble_variance:
            all_preds[:, :, index] = model_preds

    # Ensemble predictions
    avg_preds = (sum_preds / num_models).tolist()

    if args.ensemble_variance:
        all_epi_uncs = np.var(all_preds, axis=2).tolist()

    # Save predictions
    print(f"Saving predictions to {args.preds_path}")
    assert len(test_data) == len(avg_preds)
    if args.ensemble_variance:
        assert len(test_data) == len(all_epi_uncs)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == "multiclass":
        task_names = [
            f"{name}_class_{i}" for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    invalid_row = ["Invalid SMILES"] * len(task_names)
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        is_valid = valid_index is not None

        preds = avg_preds[valid_index] if is_valid else list(invalid_row)
        if args.ensemble_variance:
            epi_uncs = all_epi_uncs[valid_index] if is_valid else list(invalid_row)

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()
            for column, smiles_string in zip(args.smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles_string

        # Add predictions columns
        if args.ensemble_variance:
            for pred_name, pred, epi_unc in zip(task_names, preds, epi_uncs):
                datapoint.row[pred_name] = pred
                datapoint.row[pred_name + "_epi_unc"] = epi_unc
        else:
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, "w") as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def predict():
    """
    Renders the predict page and makes predictions if the method is POST.

    SMILES are taken, in order of preference, from the text box, the drawing widget or an
    uploaded data file. The training arguments of the first model are loaded and adapted
    in place to serve as prediction arguments.

    :return: The rendered predict page.
    """
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if header is smiles
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining smiles
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt') for model in models]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')

    # Create and modify args
    args = load_args(model_paths[0])

    # A model trained from a features file is served with a fixed features generator
    if args.features_path is not None:
        args.features_generator = ["rdkit_2d_normalized"]
        args.features_path = None

    preds_path = os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME'])
    args.test_path = 'None'  # TODO: Remove this hack to avoid assert crashing in modify_predict_args
    args.preds_path = preds_path
    args.checkpoint_paths = model_paths

    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    modify_predict_args(args)

    # Run predictions
    preds = make_predictions(args, smiles=smiles)

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Record whether any SMILES was invalid BEFORE the Nones are replaced below;
    # checking `None in preds` afterwards can never be true, so the warning was never shown.
    has_invalid_smiles = any(pred is None for pred in preds)

    # Replace invalid smiles with message
    invalid_smiles_warning = "Invalid SMILES String"
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles)-10),
                          task_names=task_names,
                          num_tasks=len(task_names),
                          preds=preds,
                          warnings=["List contains invalid SMILES strings"] if has_invalid_smiles else None,
                          errors=["No SMILES strings given"] if len(preds) == 0 else None)
def make_predictions(args: Namespace,
                     smiles: List[str] = None,
                     invalid_smiles_warning: str = None) -> List[List[float]]:
    """
    Makes predictions.

    Predictions of every checkpoint are summed, divided by :code:`args.ensemble_size` and
    written to :code:`args.preds_path`.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on; if None, data is read from :code:`args.test_path`.
    :param invalid_smiles_warning: If given, SMILES that RDKit cannot parse are left out of
                                   the prediction and their entry in the returned list is
                                   :code:`[invalid_smiles_warning]`.
    :return: A list of lists of target predictions.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    filter_invalid = invalid_smiles_warning is not None
    if filter_invalid:
        # Remember where the parsable SMILES sit so predictions can be put back in place
        success_indices = [
            position for position, smiles_string in enumerate(smiles)
            if Chem.MolFromSmiles(smiles_string) is not None
        ]
        full_smiles = smiles
        smiles = [full_smiles[position] for position in success_indices]

    print('Loading training args')
    first_checkpoint = args.checkpoint_paths[0]
    scaler, features_scaler = load_scalers(first_checkpoint)
    train_args = load_args(first_checkpoint)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is None:
        test_data = get_data(args.test_path, args, use_compound_names=args.compound_names)
    else:
        test_data = get_data_from_smiles(smiles)
    test_smiles = test_data.smiles()
    if args.compound_names:
        compound_names = test_data.compound_names()

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict with each model individually and sum predictions
    sum_preds = np.zeros((len(test_data), args.num_tasks))
    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model, data=test_data, args=args, scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = (sum_preds / args.ensemble_size).tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    with open(args.preds_path, 'w') as f:
        header = ''
        if args.write_smiles:
            header += 'smiles,'
        if args.compound_names:
            header += 'compound_name,'
        header += ','.join(args.task_names) + '\n'
        f.write(header)

        for i, pred_row in enumerate(avg_preds):
            line = ''
            if args.write_smiles:
                line += test_smiles[i] + ','
            if args.compound_names:
                line += compound_names[i] + ','
            line += ','.join(str(p) for p in pred_row) + '\n'
            f.write(line)

    if not filter_invalid:
        return avg_preds

    # Re-insert the predictions at their original positions; invalid SMILES keep the warning
    full_preds = [[invalid_smiles_warning] for _ in range(len(full_smiles))]
    for i, si in enumerate(success_indices):
        full_preds[si] = avg_preds[i]
    return full_preds
def make_predictions(args: Namespace, smiles: List[str] = None) -> List[Optional[List[float]]]: """ Makes predictions. If smiles is provided, makes predictions on smiles. Otherwise makes predictions on args.test_data. :param args: Arguments. :param smiles: Smiles to make predictions on. :return: A list of lists of target predictions. """ if args.gpu is not None: torch.cuda.set_device(args.gpu) print('Loading training args') scaler, features_scaler = load_scalers(args.checkpoint_paths[0]) train_args = load_args(args.checkpoint_paths[0]) # Update args with training arguments for key, value in vars(train_args).items(): if not hasattr(args, key): setattr(args, key, value) print('Loading data') if smiles is not None: test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False, args=args) else: test_data = get_data(path=args.test_path, args=args, use_compound_names=args.use_compound_names, skip_invalid_smiles=False) print('Validating SMILES') valid_indices = [ i for i in range(len(test_data)) if test_data[i].mol is not None ] full_data = test_data test_data = MoleculeDataset([test_data[i] for i in valid_indices]) # Edge case if empty list of smiles is provided if len(test_data) == 0: return [None] * len(full_data) if args.use_compound_names: compound_names = test_data.compound_names() print(f'Test size = {len(test_data):,}') # Normalize features if train_args.features_scaling: test_data.normalize_features(features_scaler) # Predict with each model individually and sum predictions if args.dataset_type == 'multiclass': sum_preds = np.zeros( (len(test_data), args.num_tasks, args.multiclass_num_classes)) else: sum_preds = np.zeros((len(test_data), args.num_tasks)) print( f'Predicting with an ensemble of {len(args.checkpoint_paths)} models') #for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)): # Load model model = load_checkpoint(args.checkpoint_path, cuda=args.cuda) test_preds, test_smiles_batch = predict(model=model, 
data=test_data, batch_size=args.batch_size, scaler=scaler) return test_preds, test_smiles_batch '''
def molecule_fingerprint(
        args: FingerprintArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    One fingerprint is produced per checkpoint in :code:`args.checkpoint_paths`; the
    fingerprints are written as CSV to :code:`args.preds_path`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats)
    :raises ValueError: If the requested fingerprint type is unsupported or has no latent
                        representation for the given model configuration.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments.
    # Input features only need to be supplied when using the FFN latent representation
    # and the model calls for them, so the MPN fingerprint skips that validation.
    validate_feature_sources = args.fingerprint_type != 'MPN'
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=validate_feature_sources)
    args: Union[FingerprintArgs, TrainArgs]

    # set explicit H option and reaction option
    reset_featurization_parameters()

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    if args.bond_features_path is not None:
        set_extra_bond_fdim(train_args.bond_features_size)

    set_explicit_h(train_args.explicit_h)
    set_adding_hs(args.adding_h)
    if train_args.reaction:
        set_reaction(train_args.reaction, train_args.reaction_mode)
    elif train_args.reaction_solvent:
        set_reaction(True, train_args.reaction_mode)

    print('Loading data')
    if smiles is None:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)
    else:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)

    print('Validating SMILES')
    # Maps an index in full_data to its index in the filtered (valid-only) dataset
    full_to_valid_indices = {}
    for full_index in range(len(full_data)):
        if any(mol is None for mol in full_data[full_index].mol):
            continue
        full_to_valid_indices[full_index] = len(full_to_valid_indices)

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Set fingerprint size
    fingerprint_type = args.fingerprint_type
    if fingerprint_type == 'MPN':
        if args.atom_descriptors == "descriptor":
            # special case when we have 'descriptor' extra dimensions need to be added
            per_molecule_size = args.hidden_size + test_data.atom_descriptors_size()
            total_fp_size = per_molecule_size * args.number_of_molecules
        elif args.reaction_solvent:
            total_fp_size = args.hidden_size + args.hidden_size_solvent
        else:
            total_fp_size = args.hidden_size * args.number_of_molecules
        if args.features_only:
            raise ValueError(
                'With features_only models, there is no latent MPN representation. Use last_FFN fingerprint type instead.'
            )
    elif fingerprint_type == 'last_FFN':
        if args.ffn_num_layers == 1:
            raise ValueError(
                'With a ffn_num_layers of 1, there is no latent FFN representation. Use MPN fingerprint type instead.'
            )
        total_fp_size = args.ffn_hidden_size
    else:
        raise ValueError(
            f'Fingerprint type {args.fingerprint_type} not supported')

    num_models = len(args.checkpoint_paths)
    all_fingerprints = np.zeros((len(test_data), total_fp_size, num_models))

    # Load model
    print(
        f'Encoding smiles into a fingerprint vector from {len(args.checkpoint_paths)} models.'
    )

    for index, checkpoint_path in enumerate(
            tqdm(args.checkpoint_paths, total=num_models)):
        model = load_checkpoint(checkpoint_path, device=args.device)
        loaded_scalers = load_scalers(args.checkpoint_paths[index])
        scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = loaded_scalers

        # Normalize features
        needs_scaling = (args.features_scaling
                         or train_args.atom_descriptor_scaling
                         or train_args.bond_feature_scaling)
        if needs_scaling:
            # Undo the previous model's scaling before applying this model's scalers
            test_data.reset_features_and_targets()
            if args.features_scaling:
                test_data.normalize_features(features_scaler)
            if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
                test_data.normalize_features(atom_descriptor_scaler,
                                             scale_atom_descriptors=True)
            if train_args.bond_feature_scaling and args.bond_features_size > 0:
                test_data.normalize_features(bond_feature_scaler,
                                             scale_bond_features=True)

        # Make fingerprints
        model_fp = model_fingerprint(model=model,
                                     data_loader=test_data_loader,
                                     fingerprint_type=args.fingerprint_type)
        has_extra_features = args.features_path is not None or args.features_generator
        if fingerprint_type == 'MPN' and has_extra_features:
            # truncate any features from MPN fingerprint
            model_fp = np.array(model_fp)[:, :total_fp_size]
        all_fingerprints[:, :, index] = model_fp

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    # assert len(test_data) == len(all_fingerprints) #TODO: add unit test for this
    makedirs(args.preds_path, isfile=True)

    # Set column names
    if fingerprint_type == 'MPN':
        num_molecules = args.number_of_molecules
        size_per_molecule = total_fp_size // num_molecules
        if num_models == 1:
            fingerprint_columns = [
                f'fp_{j}_mol_{k}'
                for j in range(size_per_molecule)
                for k in range(num_molecules)
            ]
        else:
            fingerprint_columns = [
                f'fp_{j}_mol_{k}_model_{i}'
                for j in range(size_per_molecule)
                for i in range(num_models)
                for k in range(num_molecules)
            ]
    else:  # args == 'last_FNN'
        if num_models == 1:
            fingerprint_columns = [f'fp_{j}' for j in range(total_fp_size)]
        else:
            fingerprint_columns = [
                f'fp_{j}_model_{i}'
                for j in range(total_fp_size)
                for i in range(num_models)
            ]

    # Copy predictions over to full_data
    flat_size = num_models * total_fp_size
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = all_fingerprints[valid_index].reshape((flat_size))
        else:
            preds = ['Invalid SMILES'] * flat_size

        for position, column in enumerate(fingerprint_columns):
            datapoint.row[column] = preds[position]

    # Write predictions
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns + fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return all_fingerprints
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions, optionally processing the test data in parcels.

    If smiles is provided, makes predictions on smiles. Otherwise makes predictions on
    the data read from :code:`args.test_path`.

    When both :code:`args.parcel_size` and :code:`args.max_data_size` are set, the data is
    processed in ``ceil(max_data_size / parcel_size)`` parcels of at most ``parcel_size``
    rows each. Parcel 0 is written to :code:`args.preds_path`; parcel ``k > 0`` is written
    to ``<preds_path without extension>.<k>.csv``. :code:`args.parcel_offset` is counted in
    parcels and shifts the starting row by ``parcel_offset * parcel_size``.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions for the last parcel processed. If a
             parcel contains no valid molecule, a list of ``None`` (one per row of that
             parcel) is returned immediately.
    :raises ValueError: If features were used during training but none are supplied now.
    """
    print('Loading training args')
    # Scalers are taken from the first checkpoint only and reused for every model.
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    trained_with_features = (train_args.features_path is not None
                             or train_args.features_generator is not None)
    if trained_with_features and args.features_path is None and args.features_generator is None:
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
        print('Using parcels: ' + str(num_iterations))
    else:
        num_iterations = 1
        max_data_size = args.max_data_size
        print('Not using parcels.')

    # Row stride between parcels. The original referenced an undefined `parcel_size`
    # name here; the value lives on `args`. Falls back to 0 when parcels are disabled.
    parcel_size = args.parcel_size or 0
    offset = args.parcel_offset * parcel_size if args.parcel_offset else 0

    # Prediction column names are computed once: expanding them inside the parcel loop
    # would append another `_class_<i>` suffix on every iteration.
    if args.dataset_type == 'multiclass':
        pred_columns = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]
    else:
        pred_columns = task_names

    for iteration in range(num_iterations):
        print('Loading data')
        if smiles is not None:
            full_data = get_data_from_smiles(
                smiles=smiles,
                skip_invalid_smiles=False,
                features_generator=args.features_generator)
        else:
            print("Getting without SMILES")
            full_data = get_data(path=args.test_path,
                                 args=args,
                                 target_columns=[],
                                 max_data_size=max_data_size,
                                 data_offset=offset,
                                 skip_invalid_smiles=False)

        print('Validating SMILES')
        # Map position in full_data -> position in the filtered (valid-only) dataset.
        full_to_valid_indices = {}
        valid_index = 0
        for full_index, datapoint in enumerate(full_data):
            if datapoint.mol is not None:
                full_to_valid_indices[full_index] = valid_index
                valid_index += 1

        test_data = MoleculeDataset(
            [full_data[i] for i in sorted(full_to_valid_indices)])

        # Edge case if empty list of smiles is provided
        if len(test_data) == 0:
            return [None] * len(full_data)

        print(f'Test size = {len(test_data):,}')

        # Normalize features
        if args.features_scaling:
            test_data.normalize_features(features_scaler)

        # Predict with each model individually and sum predictions
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

        # Create data loader
        test_data_loader = MoleculeDataLoader(dataset=test_data,
                                              batch_size=args.batch_size,
                                              num_workers=args.num_workers)

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            # Load model
            model = load_checkpoint(checkpoint_path, device=args.device)
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = (sum_preds / len(args.checkpoint_paths)).tolist()

        # Save predictions: parcel 0 goes to preds_path, later parcels get a numbered name
        if iteration != 0:
            name = os.path.splitext(args.preds_path)[0]
            preds_path = f'{name}.{iteration}.csv'
        else:
            preds_path = args.preds_path

        print(f'Saving predictions to {preds_path}')
        assert len(test_data) == len(avg_preds)
        makedirs(preds_path, isfile=True)

        # Copy predictions over to full_data
        for full_index, datapoint in enumerate(full_data):
            valid_index = full_to_valid_indices.get(full_index, None)
            if valid_index is not None:
                preds = avg_preds[valid_index]
            else:
                preds = ['Invalid SMILES'] * len(pred_columns)

            for pred_name, pred in zip(pred_columns, preds):
                datapoint.row[pred_name] = pred

        # newline='' lets the csv module control line endings itself
        with open(preds_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
            writer.writeheader()
            for datapoint in full_data:
                writer.writerow(datapoint.row)

        # Advance to the next parcel
        offset += parcel_size

    return avg_preds
def molecule_fingerprint(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to encode fingerprint vectors for the data.

    The fingerprints are also written to :code:`args.preds_path` as CSV, one row per input
    datapoint, with the SMILES columns followed by ``fp_0 … fp_{n-1}`` where
    ``n = args.hidden_size * args.number_of_molecules``. Rows with an invalid SMILES get
    the string ``'Invalid SMILES'`` in every fingerprint column.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of fingerprint vectors (list of floats) for the valid datapoints, or a
             list of ``None`` (one per input datapoint) if no datapoint is valid.
    :raises ValueError: If more than one checkpoint path is supplied.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    update_prediction_args(predict_args=args,
                           train_args=train_args,
                           validate_feature_sources=False)
    args: Union[PredictArgs, TrainArgs]

    # Set explicit H option and reaction option
    set_explicit_h(train_args.explicit_h)
    set_reaction(train_args.reaction, train_args.reaction_mode)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             smiles_columns=args.smiles_columns,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    # Map position in full_data -> position in the filtered (valid-only) dataset.
    # A datapoint is valid only if every one of its molecules parsed.
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if all(mol is not None for mol in datapoint.mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    # Load model
    print('Encoding smiles into a fingerprint vector from a single model')
    if len(args.checkpoint_paths) != 1:
        raise ValueError(
            "Fingerprint generation only supports one model, cannot use an ensemble"
        )

    model = load_checkpoint(args.checkpoint_paths[0], device=args.device)
    scaler, features_scaler, atom_descriptor_scaler, bond_feature_scaler = load_scalers(
        args.checkpoint_paths[0])

    # Normalize features
    if (args.features_scaling or train_args.atom_descriptor_scaling
            or train_args.bond_feature_scaling):
        test_data.reset_features_and_targets()
        if args.features_scaling:
            test_data.normalize_features(features_scaler)
        if train_args.atom_descriptor_scaling and args.atom_descriptors is not None:
            test_data.normalize_features(atom_descriptor_scaler,
                                         scale_atom_descriptors=True)
        if train_args.bond_feature_scaling and args.bond_features_size > 0:
            test_data.normalize_features(bond_feature_scaler,
                                         scale_bond_features=True)

    # Make fingerprints
    model_preds = model_fingerprint(model=model, data_loader=test_data_loader)

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(model_preds)
    makedirs(args.preds_path, isfile=True)

    # Column names do not depend on the datapoint, so build them once up front
    total_hidden_size = args.hidden_size * args.number_of_molecules
    fingerprint_columns = [f'fp_{i}' for i in range(total_hidden_size)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = model_preds[valid_index]
        else:
            preds = ['Invalid SMILES'] * total_hidden_size

        # Indexing (rather than zip) keeps a too-short fingerprint a loud IndexError
        for i, column in enumerate(fingerprint_columns):
            datapoint.row[column] = preds[i]

    # Write predictions; newline='' lets the csv module control line endings itself
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f,
                                fieldnames=args.smiles_columns + fingerprint_columns,
                                extrasaction='ignore')
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return model_preds
def upload_checkpoint(return_page: str):
    """
    Uploads a checkpoint .pt file (or a .zip archive containing .pt files).

    The upload is staged in a temporary directory, registered in the database, and each
    model file is copied to ``app.config['CHECKPOINT_FOLDER']`` as ``<model_id>.pt``.
    Warnings and errors are JSON-encoded and passed to the redirect target as the
    ``checkpoint_upload_warnings`` / ``checkpoint_upload_errors`` URL parameters.

    :param return_page: The name of the page to render after uploading the checkpoint file.
    :return: A redirect response to :code:`return_page`.
    """
    warnings, errors = [], []

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    ckpt = request.files['checkpoint']
    ckpt_name = request.form['checkpointName']
    ckpt_ext = os.path.splitext(ckpt.filename)[1]

    # The context manager removes the staging directory even if loading or copying fails
    with TemporaryDirectory() as temp_dir:
        # Collect paths to all uploaded checkpoints (and unzip if necessary)
        ckpt_paths = []

        if ckpt_ext.endswith('.pt'):
            ckpt_path = os.path.join(temp_dir, 'model.pt')
            ckpt.save(ckpt_path)
            ckpt_paths = [ckpt_path]

        elif ckpt_ext.endswith('.zip'):
            ckpt_dir = os.path.join(temp_dir, 'models')
            zip_path = os.path.join(temp_dir, 'models.zip')
            ckpt.save(zip_path)

            # NOTE(review): the archive comes from an upload and is extracted without
            # inspecting its members or their sizes.
            with zipfile.ZipFile(zip_path, mode='r') as z:
                z.extractall(ckpt_dir)

            for root, _, fnames in os.walk(ckpt_dir):
                ckpt_paths += [
                    os.path.join(root, fname) for fname in fnames
                    if fname.endswith('.pt')
                ]

            if not ckpt_paths:
                # Previously an archive without checkpoints was ignored silently
                errors.append(
                    'Uploaded .zip file does not contain any .pt checkpoint files')

        else:
            errors.append(
                f'Uploaded checkpoint(s) file must be either .pt or .zip but got {ckpt_ext}'
            )

        # Insert checkpoints into database
        if ckpt_paths:
            ckpt_args = load_args(ckpt_paths[0])
            ckpt_id, new_ckpt_name = db.insert_ckpt(ckpt_name,
                                                    current_user,
                                                    ckpt_args.dataset_type,
                                                    ckpt_args.epochs,
                                                    len(ckpt_paths),
                                                    ckpt_args.train_data_size)

            # Report a rename once per upload, not once per model file in an ensemble
            if ckpt_name != new_ckpt_name:
                warnings.append(
                    name_already_exists_message('Checkpoint', ckpt_name,
                                                new_ckpt_name))

            for ckpt_path in ckpt_paths:
                model_id = db.insert_model(ckpt_id)
                model_path = os.path.join(app.config['CHECKPOINT_FOLDER'],
                                          f'{model_id}.pt')
                shutil.copy(ckpt_path, model_path)

    warnings, errors = json.dumps(warnings), json.dumps(errors)

    return redirect(
        url_for(return_page,
                checkpoint_upload_warnings=warnings,
                checkpoint_upload_errors=errors))
def make_predictions(
        args: PredictArgs,
        smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_data`.

    Predictions of all checkpoints in :code:`args.checkpoint_paths` are averaged and
    written to :code:`args.preds_path` as CSV. Rows with an invalid SMILES get the string
    ``'Invalid SMILES'`` in every prediction column.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions for the valid datapoints, or a list of
             ``None`` (one per input datapoint) if no datapoint is valid.
    :raises ValueError: If features were used during training but none are supplied now.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    trained_with_features = (train_args.features_path is not None
                             or train_args.features_generator is not None)
    if trained_with_features and args.features_path is None and args.features_generator is None:
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=True)

    print('Validating SMILES')
    # Map position in full_data -> position in the filtered (valid-only) dataset.
    # A datapoint is valid only if every one of its molecules parsed.
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if all(mol is not None for mol in datapoint.mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features; reset first so each model's scaler is applied to the
        # raw features rather than to the previous model's normalized ones
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = (sum_preds / len(args.checkpoint_paths)).tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        if valid_index is not None:
            preds = avg_preds[valid_index]
        else:
            preds = ['Invalid SMILES'] * len(task_names)

        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save; newline='' lets the csv module control line endings itself
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def make_predictions(args: PredictArgs,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions, optionally with an uncertainty estimate.

    If smiles is provided, makes predictions on smiles. Otherwise makes predictions on
    args.test_data.

    Without :code:`args.uncertainty` the predictions of all checkpoints are averaged.
    With it, every model is handed to the estimator built by
    :code:`uncertainty_estimator_builder(args.uncertainty)` and predictions plus
    uncertainties come from its :code:`calculate_UQ()`. The uncertainty is written to an
    ``Uncertainty`` column, or to ``Aleatoric`` and ``Epistemic`` columns when
    :code:`args.split_UQ` is set.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions for the valid datapoints, or a list of
             ``None`` (one per input datapoint) if no datapoint is valid.
    :raises ValueError: If features were used during training but none are supplied now,
                        or if :code:`args.split_UQ` is set but the estimator does not
                        return separate aleatoric and epistemic parts.
    """
    print('Loading training args')
    # Scalers are taken from the first checkpoint only and reused for every model.
    scaler, features_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    trained_with_features = (train_args.features_path is not None
                             or train_args.features_generator is not None)
    if trained_with_features and args.features_path is None and args.features_generator is None:
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)
    args: Union[PredictArgs, TrainArgs]

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             args=args,
                             target_columns=[],
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    # Map position in full_data -> position in the filtered (valid-only) dataset.
    full_to_valid_indices = {}
    valid_index = 0
    for full_index, datapoint in enumerate(full_data):
        if datapoint.mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices)])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Initialize uncertainty estimator
    if args.uncertainty:
        uncertainty_estimator = uncertainty_estimator_builder(
            args.uncertainty)(args, test_data, scaler)

    # Predict with each model individually and sum predictions
    if not args.uncertainty:
        if args.dataset_type == 'multiclass':
            sum_preds = np.zeros(
                (len(test_data), num_tasks, args.multiclass_num_classes))
        else:
            sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for model_index, checkpoint_path in tqdm(enumerate(args.checkpoint_paths),
                                             total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, device=args.device)
        # NOTE(review): this only flips the flag on the top-level object. If `model` is a
        # torch nn.Module, `model.eval()` would also switch its submodules - confirm intent.
        model.training = False

        if not args.uncertainty:
            model_preds = predict(model=model,
                                  data_loader=test_data_loader,
                                  scaler=scaler)
            sum_preds += np.array(model_preds)
        else:
            uncertainty_estimator.process_model(model, model_index)

    # Ensemble predictions
    if not args.uncertainty:
        avg_preds = (sum_preds / len(args.checkpoint_paths)).tolist()
    else:
        avg_preds, avg_UQ = uncertainty_estimator.calculate_UQ()

        if isinstance(avg_UQ, tuple):
            aleatoric, epistemic = avg_UQ
            # NOTE(review): without split_UQ the tuple itself is indexed per datapoint
            # below, which looks unintended - confirm how the parts should be combined.
        elif args.split_UQ:
            # Fail with a clear message instead of a NameError inside the row loop
            raise ValueError(
                'split_UQ was requested but the uncertainty estimator did not return '
                'separate aleatoric and epistemic uncertainties.')

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [
            f'{name}_class_{i}' for name in task_names
            for i in range(args.multiclass_num_classes)
        ]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        is_valid = valid_index is not None

        if is_valid:
            preds = avg_preds[valid_index]
        else:
            preds = ['Invalid SMILES'] * len(task_names)

        if args.uncertainty:
            if args.split_UQ:
                if is_valid:
                    cur_al = aleatoric[valid_index]
                    cur_ep = epistemic[valid_index]
                else:
                    cur_al = ['Invalid SMILES'] * len(task_names)
                    cur_ep = ['Invalid SMILES'] * len(task_names)
                datapoint.row['Aleatoric'] = cur_al
                datapoint.row['Epistemic'] = cur_ep
            else:
                if is_valid:
                    cur_UQ = avg_UQ[valid_index]
                else:
                    cur_UQ = ['Invalid SMILES'] * len(task_names)
                datapoint.row['Uncertainty'] = cur_UQ

        # A non-list prediction is stored whole under the first task's column
        if isinstance(preds, list):
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred
        else:
            datapoint.row[task_names[0]] = preds

    # Save; newline='' lets the csv module control line endings itself
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()
        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions for drug/compound pairs.

    If smiles is provided, makes predictions on smiles. Otherwise makes predictions on
    the data read from :code:`args.test_path`. Predictions of all checkpoints in
    :code:`args.checkpoint_paths` are averaged and written to :code:`args.preds_path` as
    CSV with the columns ``drugSMILE``, ``cmpdSMILE`` followed by one column per task (or
    per task and class for multiclass). Rows whose drug molecule is invalid get empty
    prediction cells.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list with one entry per input datapoint: the list of target predictions,
             or ``None`` where the drug molecule is invalid.
    """
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    # Scalers are taken from the first checkpoint only and reused for every model.
    scaler, drug_scaler, cmpd_scaler = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False)
    else:
        test_data = get_data(path=args.test_path,
                             args=args,
                             use_compound_names=args.use_compound_names,
                             skip_invalid_smiles=False)

    print('Validating SMILES')
    # Only the drug molecule is checked for validity here
    valid_indices = [
        i for i, datapoint in enumerate(test_data)
        if datapoint.drug_mol is not None
    ]
    full_data = test_data
    test_data = MolPairDataset([full_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(drug_scaler, cmpd_scaler)

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=args.batch_size,
            scaler=scaler  # TODO: Shouldn't this be the custom scalers if avail?
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = (sum_preds / len(args.checkpoint_paths)).tolist()

    # Save predictions
    assert len(test_data) == len(avg_preds)
    print(f'Saving predictions to {args.preds_path}')

    # Put Nones for invalid smiles
    full_preds = [None] * len(full_data)
    for valid_pos, full_pos in enumerate(valid_indices):
        full_preds[full_pos] = avg_preds[valid_pos]
    avg_preds = full_preds
    test_smiles = full_data.smiles()

    is_multiclass = args.dataset_type == 'multiclass'

    header = ['drugSMILE', 'cmpdSMILE']
    if is_multiclass:
        header.extend(name + '_class' + str(i)
                      for name in args.task_names
                      for i in range(args.multiclass_num_classes))
        num_pred_cells = args.num_tasks * args.multiclass_num_classes
    else:
        header.extend(args.task_names)
        num_pred_cells = args.num_tasks

    # Write predictions; newline='' lets the csv module control line endings itself
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for pair, preds in zip(test_smiles, avg_preds):
            row = [pair[0], pair[1]]

            if preds is None:
                # Invalid datapoint: keep the row but leave its prediction cells empty
                row.extend([''] * num_pred_cells)
            elif is_multiclass:
                for task_probs in preds:
                    row.extend(task_probs)
            else:
                row.extend(preds)

            writer.writerow(row)

    return avg_preds
def make_predictions(args: Namespace,
                     smiles: List[str] = None) -> List[Optional[List[float]]]:
    """
    Makes predictions directly from raw SMILES strings.

    If smiles is provided, makes predictions on smiles. Otherwise the SMILES are read from
    the first column of the CSV at :code:`args.test_path` (the header row is skipped).
    Predictions of all checkpoints in :code:`args.checkpoint_paths` are averaged and
    written to :code:`args.preds_path` as CSV with a ``smiles`` column followed by one
    column per task (or per task and class for multiclass).

    No SMILES validation is performed here; the strings are passed to :code:`predict`
    as-is. :code:`args.use_compound_names` is not supported by this code path: compound
    names are never read, so no ``compound_names`` column is written.

    :param args: Arguments.
    :param smiles: Smiles to make predictions on.
    :return: A list of lists of target predictions, one entry per SMILES. An entry is
             ``None`` if any model returned ``None`` for that SMILES.
    """
    # Local import: the original body did not use numpy outside of commented-out code
    import numpy as np

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    print('Loading training args')
    # Scalers are taken from the first checkpoint only and reused for every model.
    scaler, _ = load_scalers(args.checkpoint_paths[0])
    train_args = load_args(args.checkpoint_paths[0])

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    print('Loading data')
    # Honor the `smiles` argument; previously it was always overwritten by the file
    if smiles is None:
        with open(args.test_path, 'r', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            # Blank lines yield an empty row; keep them as '' so row counts still match
            smiles = [row[0].strip() if row else '' for row in reader]

    print('Validating SMILES')

    # Predict with each model individually and sum predictions
    num_models = len(args.checkpoint_paths)
    print(f'Predicting with an ensemble of {num_models} models')
    sum_preds = None
    for checkpoint_path in tqdm(args.checkpoint_paths, total=num_models):
        # Load model
        model = load_checkpoint(checkpoint_path, cuda=args.cuda)
        model_preds = predict(model=model,
                              data=smiles,
                              batch_size=args.batch_size,
                              scaler=scaler,
                              args=args)
        # presumably predict may return None for a SMILES it cannot handle (the writer
        # below already allows for it) - such entries stay None in the ensemble
        model_preds = [
            None if pred is None else np.asarray(pred, dtype=float)
            for pred in model_preds
        ]

        if sum_preds is None:
            sum_preds = model_preds
        else:
            sum_preds = [
                None if total is None or pred is None else total + pred
                for total, pred in zip(sum_preds, model_preds)
            ]

    # Ensemble predictions (previously only the last model's output was kept)
    avg_preds = [
        None if total is None else (total / num_models).tolist()
        for total in sum_preds
    ]

    # Save predictions
    assert len(smiles) == len(avg_preds), \
        f'Got {len(avg_preds)} predictions for {len(smiles)} SMILES'
    print(f'Saving predictions to {args.preds_path}')

    is_multiclass = args.dataset_type == 'multiclass'

    # The header must match the rows, which never contain compound names
    header = ['smiles']
    if is_multiclass:
        header.extend(name + '_class' + str(i)
                      for name in args.task_names
                      for i in range(args.multiclass_num_classes))
        num_pred_cells = args.num_tasks * args.multiclass_num_classes
    else:
        header.extend(args.task_names)
        num_pred_cells = args.num_tasks

    # Write predictions; newline='' lets the csv module control line endings itself
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for smi, preds in zip(smiles, avg_preds):
            row = [smi]

            if preds is None:
                # No prediction available: keep the row but leave its cells empty
                row.extend([''] * num_pred_cells)
            elif is_multiclass:
                for task_probs in preds:
                    row.extend(task_probs)
            else:
                row.extend(preds)

            writer.writerow(row)

    return avg_preds