def average_duplicates(args: Args):
    """Averages duplicate data points in a dataset."""
    print('Loading data')
    header = get_header(args.data_path)
    data = get_data(path=args.data_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=args.target_columns)
    print(f'Data size = {len(data):,}')

    # Map each SMILES string to the list of target rows that share it
    smiles_in_order = []
    smiles_to_targets = defaultdict(list)
    for smiles, targets in zip(data.smiles(flatten=True), data.targets()):
        smiles_to_targets[smiles].append(targets)
        if len(smiles_to_targets[smiles]) == 1:
            smiles_in_order.append(smiles)

    # Find duplicates and average their targets task by task
    duplicate_count = 0
    stds = []
    new_data = []
    for smiles in smiles_in_order:
        all_targets = smiles_to_targets[smiles]
        duplicate_count += len(all_targets) - 1
        num_tasks = len(all_targets[0])

        # Collect the non-missing values for each task across all duplicate rows
        targets_by_task = [[] for _ in range(num_tasks)]
        for task in range(num_tasks):
            for targets in all_targets:
                if targets[task] is not None:
                    targets_by_task[task].append(targets[task])

        stds.append([np.std(task_targets) if len(task_targets) > 0 else 0.0
                     for task_targets in targets_by_task])
        means = [np.mean(task_targets) if len(task_targets) > 0 else None
                 for task_targets in targets_by_task]
        new_data.append((smiles, means))

    print(f'Number of duplicates = {duplicate_count:,}')
    print(f'Duplicate standard deviation per task = '
          f'{", ".join(f"{std:.4e}" for std in np.mean(stds, axis=0))}')
    print(f'New data size = {len(new_data):,}')

    # Save new data
    with open(args.save_path, 'w') as f:
        f.write(','.join(header) + '\n')

        for smiles, avg_targets in new_data:
            f.write(smiles + ',' +
                    ','.join(str(value) if value is not None else '' for value in avg_targets) +
                    '\n')
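
# Usage sketch (illustrative, not from the original source): `average_duplicates` is
# written to be driven by a Tap-style argument class, as Chemprop's other scripts are.
# The `Args` definition below is a hypothetical stand-in with the four fields the
# function actually reads (`data_path`, `save_path`, `smiles_columns`, `target_columns`).
from typing import List, Optional

from tap import Tap


class Args(Tap):
    data_path: str                              # Path to the CSV file containing duplicate rows
    save_path: str                              # Path where the averaged CSV file will be saved
    smiles_columns: Optional[List[str]] = None  # Name(s) of the SMILES column(s)
    target_columns: Optional[List[str]] = None  # Name(s) of the target column(s)


if __name__ == '__main__':
    average_duplicates(Args().parse_args())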
def interpret(args: InterpretArgs) -> None:
    """
    Runs interpretation of a Chemprop model using the Monte Carlo Tree Search algorithm.

    :param args: A :class:`~chemprop.args.InterpretArgs` object containing arguments for interpretation.
    """
    if args.number_of_molecules != 1:
        raise ValueError('Interpreting is currently only available for single-molecule models.')

    global C_PUCT, MIN_ATOMS

    chemprop_model = ChempropModel(args)

    def scoring_function(smiles: List[str]) -> List[float]:
        return chemprop_model(smiles)[:, args.property_id - 1]

    C_PUCT = args.c_puct
    MIN_ATOMS = args.min_atoms

    all_smiles = get_smiles(path=args.data_path, smiles_columns=args.smiles_columns)
    header = get_header(path=args.data_path)

    property_name = header[args.property_id] if len(header) > args.property_id else 'score'
    print(f'smiles,{property_name},rationale,rationale_score')

    # Each entry of all_smiles is a single-element list of SMILES (single-molecule models only)
    for smiles in all_smiles:
        score = scoring_function([smiles])[0]
        if score > args.prop_delta:
            rationales = mcts(smiles=smiles[0],
                              scoring_function=scoring_function,
                              n_rollout=args.rollout,
                              max_atoms=args.max_atoms,
                              prop_delta=args.prop_delta)
        else:
            rationales = []

        if len(rationales) == 0:
            print(f'{smiles[0]},{score:.3f},,')
        else:
            min_size = min(len(x.atoms) for x in rationales)
            min_rationales = [x for x in rationales if len(x.atoms) == min_size]
            rats = sorted(min_rationales, key=lambda x: x.P, reverse=True)
            print(f'{smiles[0]},{score:.3f},{rats[0].smiles},{rats[0].P:.3f}')
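
# Usage sketch (illustrative): `interpret` is normally driven by an InterpretArgs object
# parsed from the command line, mirroring Chemprop's `chemprop_interpret` entry point.
# The paths below are placeholders, not real files.
from chemprop.args import InterpretArgs

if __name__ == '__main__':
    interpret(args=InterpretArgs().parse_args([
        '--data_path', 'data/regression.csv',    # placeholder CSV of molecules to explain
        '--checkpoint_dir', 'ckpts/regression',  # placeholder directory of trained checkpoints
        '--property_id', '1',                    # 1-indexed target column to interpret
    ]))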
def make_predictions(args: PredictArgs,
                     smiles: List[List[str]] = None) -> List[List[Optional[float]]]:
    """
    Loads data and a trained model and uses the model to make predictions on the data.

    If SMILES are provided, then makes predictions on smiles.
    Otherwise makes predictions on :code:`args.test_path`.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: List of list of SMILES to make predictions on.
    :return: A list of lists of target predictions.
    """
    print('Loading training args')
    train_args = load_args(args.checkpoint_paths[0])
    num_tasks, task_names = train_args.num_tasks, train_args.task_names

    # If features were used during training, they must be used when predicting
    if ((train_args.features_path is not None or train_args.features_generator is not None)
            and args.features_path is None
            and args.features_generator is None):
        raise ValueError(
            'Features were used during training so they must be specified again during prediction '
            'using the same type of features as before (with either --features_generator or '
            '--features_path and using --no_features_scaling if applicable).')

    # If atom descriptors were used during training, they must be used when predicting and vice versa
    if train_args.atom_descriptors != args.atom_descriptors:
        raise ValueError(
            'The use of atom descriptors is inconsistent between training and prediction. '
            'If atom descriptors were used during training, they must be specified again during '
            'prediction using the same type of descriptors as before. If they were not used '
            'during training, they cannot be specified during prediction.')

    # Update predict args with training arguments to create a merged args object
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # The merged args object now carries both predict-time and train-time attributes
    args: Union[PredictArgs, TrainArgs]

    if args.atom_descriptors == 'feature':
        set_extra_atom_fdim(train_args.atom_features_size)

    print('Loading data')
    if smiles is not None:
        full_data = get_data_from_smiles(smiles=smiles,
                                         skip_invalid_smiles=False,
                                         features_generator=args.features_generator)
    else:
        full_data = get_data(path=args.test_path,
                             target_columns=[],
                             ignore_columns=[],
                             skip_invalid_smiles=False,
                             args=args,
                             store_row=not args.drop_extra_columns)

    print('Validating SMILES')
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset([full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Edge case: an empty list of SMILES was provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    print(f'Test size = {len(test_data):,}')

    # Predict with each model individually and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros((len(test_data), num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), num_tasks))

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    for checkpoint_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Load model and scalers
        model = load_checkpoint(checkpoint_path, device=args.device)
        scaler, features_scaler = load_scalers(checkpoint_path)

        # Normalize features
        if args.features_scaling:
            test_data.reset_features_and_targets()
            test_data.normalize_features(features_scaler)

        # Make predictions
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions: average over all checkpoints
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    # Save predictions
    print(f'Saving predictions to {args.preds_path}')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    # Get prediction column names
    if args.dataset_type == 'multiclass':
        task_names = [f'{name}_class_{i}'
                      for name in task_names
                      for i in range(args.multiclass_num_classes)]

    # Copy predictions over to full_data
    for full_index, datapoint in enumerate(full_data):
        valid_index = full_to_valid_indices.get(full_index, None)
        preds = (avg_preds[valid_index] if valid_index is not None
                 else ['Invalid SMILES'] * len(task_names))

        # If extra columns have been dropped, add back in SMILES columns
        if args.drop_extra_columns:
            datapoint.row = OrderedDict()

            smiles_columns = args.smiles_columns
            if None in smiles_columns:
                smiles_columns = get_header(args.test_path)[:len(smiles_columns)]

            for column, smiles in zip(smiles_columns, datapoint.smiles):
                datapoint.row[column] = smiles

        # Add prediction columns
        for pred_name, pred in zip(task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    with open(args.preds_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=full_data[0].row.keys())
        writer.writeheader()

        for datapoint in full_data:
            writer.writerow(datapoint.row)

    return avg_preds
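
# Usage sketch (illustrative): calling `make_predictions` programmatically on in-memory
# SMILES. Note that `smiles` is a list of lists, one inner list per molecule, matching
# the signature above; `--test_path None` mirrors how the web handler below satisfies
# the required argument when SMILES are passed directly. Paths are placeholders.
from chemprop.args import PredictArgs

if __name__ == '__main__':
    args = PredictArgs().parse_args([
        '--test_path', 'None',                   # unused when `smiles` is given directly
        '--preds_path', 'preds.csv',             # where the ensemble predictions are written
        '--checkpoint_dir', 'ckpts/regression',  # placeholder directory of trained .pt files
    ])
    preds = make_predictions(args=args, smiles=[['CCO'], ['c1ccccc1']])
    print(preds)  # one list of per-task predictions per input molecule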
def predict():
    """Renders the predict page and makes predictions if the method is POST."""
    if request.method == 'GET':
        return render_predict()

    # Get arguments
    ckpt_id = request.form['checkpointName']

    if request.form['textSmiles'] != '':
        smiles = request.form['textSmiles'].split()
    elif request.form['drawSmiles'] != '':
        smiles = [request.form['drawSmiles']]
    else:
        # Upload data file with SMILES
        data = request.files['data']
        data_name = secure_filename(data.filename)
        data_path = os.path.join(app.config['TEMP_FOLDER'], data_name)
        data.save(data_path)

        # Check if the header itself is a valid SMILES string (i.e. the file has no header row)
        possible_smiles = get_header(data_path)[0]
        smiles = [possible_smiles] if Chem.MolFromSmiles(possible_smiles) is not None else []

        # Get remaining SMILES
        smiles.extend(get_smiles(data_path))

    models = db.get_models(ckpt_id)
    model_paths = [os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model["id"]}.pt')
                   for model in models]

    task_names = load_task_names(model_paths[0])
    num_tasks = len(task_names)
    gpu = request.form.get('gpu')
    train_args = load_args(model_paths[0])

    # Build arguments
    arguments = [
        '--test_path', 'None',
        '--preds_path', os.path.join(app.config['TEMP_FOLDER'], app.config['PREDICTIONS_FILENAME']),
        '--checkpoint_paths', *model_paths
    ]

    if gpu is not None:
        if gpu == 'None':
            arguments.append('--no_cuda')
        else:
            arguments += ['--gpu', gpu]

    # Handle additional features
    if train_args.features_path is not None:
        # TODO: make it possible to specify the features generator if trained using features_path
        arguments += ['--features_generator', 'rdkit_2d_normalized', '--no_features_scaling']
    elif train_args.features_generator is not None:
        arguments += ['--features_generator', *train_args.features_generator]

        if not train_args.features_scaling:
            arguments.append('--no_features_scaling')

    # Parse arguments
    args = PredictArgs().parse_args(arguments)

    # Run predictions (make_predictions expects a list of SMILES lists, one per molecule)
    preds = make_predictions(args=args, smiles=[[s] for s in smiles])

    if all(p is None for p in preds):
        return render_predict(errors=['All SMILES are invalid'])

    # Replace invalid SMILES with an error message, remembering whether any were invalid
    has_invalid_smiles = any(p is None for p in preds)
    invalid_smiles_warning = 'Invalid SMILES String'
    preds = [pred if pred is not None else [invalid_smiles_warning] * num_tasks
             for pred in preds]

    return render_predict(predicted=True,
                          smiles=smiles,
                          num_smiles=min(10, len(smiles)),
                          show_more=max(0, len(smiles) - 10),
                          task_names=task_names,
                          num_tasks=num_tasks,
                          preds=preds,
                          warnings=['List contains invalid SMILES strings'] if has_invalid_smiles else None,
                          errors=['No SMILES strings given'] if len(preds) == 0 else None)
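
# Registration sketch (illustrative): the `request.method` check implies this view is
# mounted for both GET and POST. A minimal Flask wiring could look like the following;
# the '/predict' URL rule and the config values are assumptions, though the config keys
# (`TEMP_FOLDER`, `CHECKPOINT_FOLDER`, `PREDICTIONS_FILENAME`) are the ones the handler reads.
from flask import Flask

app = Flask(__name__)
app.config.update(TEMP_FOLDER='/tmp/chemprop_web',              # placeholder
                  CHECKPOINT_FOLDER='/tmp/chemprop_web/ckpts',  # placeholder
                  PREDICTIONS_FILENAME='predictions.csv')       # placeholder

app.add_url_rule('/predict', view_func=predict, methods=['GET', 'POST'])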
def test_bad_path(self):
    """Tests that a nonexistent input path raises a FileNotFoundError."""
    bad_path = os.path.join(self.temp_dir.name, 'bad_path.csv')

    with self.assertRaises(FileNotFoundError):
        get_header(bad_path)
def test_correct_file(self):
    """Tests that the header is read correctly from a well-formed CSV file."""
    header = get_header(os.path.join(self.temp_dir.name, 'dummy_data.csv'))

    self.assertEqual(header, ['column0', 'column1'])
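
# Fixture sketch (illustrative): these tests assume a setUp that creates a temporary
# directory containing 'dummy_data.csv' whose header row is 'column0,column1'. A minimal
# version could look like this; the class name `TestGetHeader` and the data rows are
# hypothetical.
import os
import unittest
from tempfile import TemporaryDirectory

class TestGetHeader(unittest.TestCase):
    def setUp(self):
        self.temp_dir = TemporaryDirectory()
        with open(os.path.join(self.temp_dir.name, 'dummy_data.csv'), 'w') as f:
            f.write('column0,column1\nCC,10\nCCC,15\n')  # header plus two dummy rows

    def tearDown(self):
        self.temp_dir.cleanup()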