def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [i for i in range(len(test_data)) if test_data[i].mol is not None]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = np.zeros((len(test_data), 1))
    for model in self.checkpoints:
        model_preds = predict(
            model=model,
            data=test_data,
            batch_size=batch_size,
            scaler=self.scaler
        )
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)

def compute_molecule_vectors(model: nn.Module,
                             data: MoleculeDataset,
                             batch_size: int) -> List[np.ndarray]:
    """
    Computes the molecule vectors output from the last layer of a MoleculeModel.

    :param model: A MoleculeModel.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :return: A list of 1D numpy arrays of length hidden_size containing the molecule vectors
             generated by the model for each molecule provided.
    """
    model.eval()
    model.ffn[-1] = Identity()  # Replace last linear layer with identity
    if hasattr(model, 'sigmoid'):
        model.sigmoid = Identity()

    vecs = []
    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch
        with torch.no_grad():
            batch_vecs = model(batch, features_batch)

        # Collect vectors
        batch_vecs = batch_vecs.data.cpu().numpy()
        vecs.extend(batch_vecs)

    return vecs

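# Usage sketch for compute_molecule_vectors (added for illustration, not part of
# the original source). It assumes chemprop-style helpers load_checkpoint and
# get_data are available, as they are elsewhere in this collection; the paths and
# the helper function name are illustrative only.
def _example_compute_vectors(checkpoint_path: str, data_path: str, batch_size: int = 50):
    model = load_checkpoint(checkpoint_path)  # assumed helper returning a MoleculeModel
    data = get_data(data_path)                # assumed helper returning a MoleculeDataset
    vecs = compute_molecule_vectors(model=model, data=data, batch_size=batch_size)
    return vecs  # one hidden_size-length numpy vector per molecule
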
def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(smiles=smiles,
                                     skip_invalid_smiles=False,
                                     args=self.train_args)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data=test_data,
                              batch_size=batch_size,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds

def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]:
    test_data = get_data_from_smiles(
        smiles=smiles,
        skip_invalid_smiles=False,
        features_generator=self.args.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)

    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds

def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            scaler: StandardScaler = None,
            uncertainty: bool = False) -> List[List[float]]:
    """
    Makes predictions on a dataset using a single model.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param scaler: A StandardScaler object fit on the training targets.
    :param uncertainty: Whether uncertainty values should be returned.
    :return: A list of lists of predictions. The outer list is examples while the inner list is tasks.
    """
    model.eval()

    preds = []
    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch
        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = batch_preds.data.cpu().numpy()

        # Collect predictions
        batch_preds = batch_preds.tolist()
        preds.extend(batch_preds)

    if model.uncertainty:
        # The model interleaves outputs along the task dimension:
        # even indices are predictions, odd indices are uncertainties.
        p = []
        c = []
        for i in range(len(preds)):
            p.append([preds[i][j] for j in range(len(preds[i])) if j % 2 == 0])
            c.append([preds[i][j] for j in range(len(preds[i])) if j % 2 == 1])

        if scaler is not None:
            p = scaler.inverse_transform(p).tolist()
            c = (scaler.stds**2 * c).tolist()

        if uncertainty:
            return p, c

        return p

    if scaler is not None:
        preds = scaler.inverse_transform(preds).tolist()

    return preds

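# Usage sketch (illustrative, not from the original source): calling predict with
# uncertainty enabled on a model trained with an uncertainty head. load_checkpoint,
# load_scalers and get_data are assumed chemprop-style helpers, as used elsewhere
# in this collection; the checkpoint and data paths are placeholders.
def _example_predict_with_uncertainty(checkpoint_path: str, data_path: str):
    scaler, features_scaler = load_scalers(checkpoint_path)  # assumed helper
    model = load_checkpoint(checkpoint_path)                 # assumed helper
    data = get_data(data_path)                               # assumed helper
    preds, uncertainties = predict(model=model,
                                   data=data,
                                   batch_size=50,
                                   scaler=scaler,
                                   uncertainty=True)
    return preds, uncertainties  # per-example, per-task values
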
def predict_smile(checkpoint_path: str, smile: str):
    """
    Makes a prediction for a single SMILES string using the model stored at checkpoint_path.

    :param checkpoint_path: Path to a model checkpoint.
    :param smile: The SMILES string to make a prediction on.
    :return: The prediction for the first task, or a list of None if the SMILES is invalid.
    """
    smiles = [smile]
    args = Namespace()

    # print('Loading training args')
    scaler, features_scaler = load_scalers(checkpoint_path)
    train_args = load_args(checkpoint_path)

    # Update args with training arguments
    for key, value in vars(train_args).items():
        if not hasattr(args, key):
            setattr(args, key, value)

    # print('Loading data')
    if smiles is not None:
        test_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    else:
        print("Enter Valid Smile String")
        return

    # print('Validating SMILES')
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    # Edge case if empty list of smiles is provided
    if len(test_data) == 0:
        return [None] * len(full_data)

    # Normalize features
    if train_args.features_scaling:
        test_data.normalize_features(features_scaler)

    # Predict and sum predictions
    if args.dataset_type == 'multiclass':
        sum_preds = np.zeros(
            (len(test_data), args.num_tasks, args.multiclass_num_classes))
    else:
        sum_preds = np.zeros((len(test_data), args.num_tasks))

    model = load_checkpoint(checkpoint_path, cuda=args.cuda)
    model_preds = predict(model=model, data=test_data, batch_size=1, scaler=scaler)
    sum_preds += np.array(model_preds)

    return sum_preds[0][0]

def __init__(self, train_data: MoleculeDataset, val_data: MoleculeDataset,
             test_data: MoleculeDataset, scaler: StandardScaler,
             args: Namespace):
    super().__init__(train_data, val_data, test_data, scaler, args)
    self.sum_val_uncertainty = np.zeros(
        (len(val_data.smiles()), args.num_tasks))
    self.sum_test_uncertainty = np.zeros(
        (len(test_data.smiles()), args.num_tasks))

def single_task_sklearn(model: Union[RandomForestRegressor, RandomForestClassifier, SVR, SVC],
                        train_data: MoleculeDataset,
                        test_data: MoleculeDataset,
                        metrics: List[str],
                        args: SklearnTrainArgs,
                        logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains a single-task scikit-learn model, meaning a separate model is trained for each task.

    This is necessary if some tasks have None (unknown) values.

    :param model: The scikit-learn model to train.
    :param train_data: The training data.
    :param test_data: The test data.
    :param metrics: A list of names of metric functions.
    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 training the scikit-learn model.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    scores = {}
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(*[(features, targets[task_num])
                                              for features, targets in zip(train_data.features(), train_data.targets())
                                              if targets[task_num] is not None])
        test_features, test_targets = zip(*[(features, targets[task_num])
                                            for features, targets in zip(test_data.features(), test_data.targets())
                                            if targets[task_num] is not None])

        model.fit(train_features, train_targets)

        test_preds = predict(
            model=model,
            model_type=args.model_type,
            dataset_type=args.dataset_type,
            features=test_features
        )
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            num_tasks=1,
            metrics=metrics,
            dataset_type=args.dataset_type,
            logger=logger
        )

        for metric in metrics:
            if metric not in scores:
                scores[metric] = []
            scores[metric].append(score[metric][0])

    return scores

def test_filter_good_smiles(self):
    """Test pass through for all good smiles"""
    smiles_list = [['C'], ['CC'], ['CN'], ['O']]
    dataset = MoleculeDataset([MoleculeDatapoint(s) for s in smiles_list])
    filtered_dataset = filter_invalid_smiles(dataset)
    self.assertEqual(filtered_dataset.smiles(), [['C'], ['CC'], ['CN'], ['O']])

def run_split_data(data_path: str,
                   split_type: str,
                   split_sizes: Tuple[int, int, int],
                   seed: int,
                   save_dir: str):
    with open(data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    data = []
    for line in tqdm(lines):
        datapoint = MoleculeDatapoint(line=line)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    train, dev, test = split_data(data=data,
                                  split_type=split_type,
                                  sizes=split_sizes,
                                  seed=seed)

    makedirs(save_dir)

    for name, dataset in [('train', train), ('dev', dev), ('test', test)]:
        with open(os.path.join(save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)

def setUp(self):
    smiles_list = [['C', 'CC'], ['CC', 'CCC'], ['CCC', 'CN'], ['CN', 'CCN'],
                   ['CCN', 'CCCN'], ['CCCN', 'CCCCN'], ['CCCCN', 'CO'],
                   ['CO', 'CCO'], ['CO', 'CCCO'], ['CN', 'CCC']]
    self.dataset = MoleculeDataset(
        [MoleculeDatapoint(s) for s in smiles_list])

def evaluate(model: nn.Module,
             data: MoleculeDataset,
             num_tasks: int,
             metric_func: Callable,
             batch_size: int,
             dataset_type: str,
             scaler: StandardScaler = None,
             logger: logging.Logger = None) -> List[float]:
    """
    Evaluates a model on a dataset.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param num_tasks: Number of tasks.
    :param metric_func: Metric function which takes in a list of targets and a list of predictions.
    :param batch_size: Batch size.
    :param dataset_type: Dataset type.
    :param scaler: A StandardScaler object fit on the training targets.
    :param logger: Logger.
    :return: A list with the score for each task based on `metric_func`.
    """
    # check_fp is returned alongside the predictions for debugging the fingerprint at each depth
    preds, check_fp = predict(
        model=model,
        data=data,
        batch_size=batch_size,
        scaler=scaler)

    targets = data.targets()

    results = evaluate_predictions(preds=preds,
                                   targets=targets,
                                   num_tasks=num_tasks,
                                   metric_func=metric_func,
                                   dataset_type=dataset_type,
                                   logger=logger)

    return results

def split_indices(all_indices: List[int],
                  num_folds: int,
                  scaffold: bool = False,
                  data: MoleculeDataset = None,
                  shuffle: bool = True) -> List[List[int]]:
    num_data = len(all_indices)
    if scaffold:
        # Group molecules by scaffold and greedily assign each scaffold group
        # (largest first) to the currently smallest fold.
        scaffold_to_indices = scaffold_to_smiles(data.mols(), use_indices=True)
        index_sets = sorted(list(scaffold_to_indices.values()),
                            key=lambda index_set: len(index_set),
                            reverse=True)
        fold_indices = [[] for _ in range(num_folds)]
        for s in index_sets:
            length_array = [len(fi) for fi in fold_indices]
            min_index = length_array.index(min(length_array))
            fold_indices[min_index] += s
        if shuffle:
            random.shuffle(fold_indices)
    else:  # random
        if shuffle:
            random.shuffle(all_indices)
        fold_indices = []
        for i in range(num_folds):
            begin, end = int(i * num_data / num_folds), int(
                (i + 1) * num_data / num_folds)
            fold_indices.append(np.array(all_indices[begin:end]))
    return fold_indices

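# Usage sketch (illustrative, not from the original source): splitting a toy index
# list into three random folds versus three sequential (time-ordered) folds using
# split_indices as defined above.
def _example_split_indices():
    indices = list(range(10))
    random_folds = split_indices(list(indices), num_folds=3, shuffle=True)
    time_folds = split_indices(list(indices), num_folds=3, shuffle=False)
    # Each result is a list of num_folds index arrays that together cover all indices.
    return random_folds, time_folds
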
def run_split_data(args: Args):
    # Load raw data
    with open(args.data_path) as f:
        reader = csv.reader(f)
        header = next(reader)
        lines = list(reader)

    # Load SMILES
    smiles = get_smiles(path=args.data_path, smiles_column=args.smiles_column)

    # Make sure lines and smiles line up
    assert len(lines) == len(smiles)
    assert all(smile in line for smile, line in zip(smiles, lines))

    # Create data
    data = []
    for smile, line in tqdm(zip(smiles, lines), total=len(smiles)):
        datapoint = MoleculeDatapoint(smiles=smile)
        datapoint.line = line
        data.append(datapoint)
    data = MoleculeDataset(data)

    train, val, test = split_data(data=data,
                                  split_type=args.split_type,
                                  sizes=args.split_sizes,
                                  seed=args.seed)

    makedirs(args.save_dir)

    for name, dataset in [('train', train), ('val', val), ('test', test)]:
        with open(os.path.join(args.save_dir, f'{name}.csv'), 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for datapoint in dataset:
                writer.writerow(datapoint.line)

def examine_split_balance(split_type: str):
    results = []

    for dataset in DATASETS:
        # Load task names for the dataset
        data_path = os.path.join(BASE, dataset, f'{dataset}.csv')
        data = get_data(data_path)

        # Get class balance ratios for full dataset
        ratios = compute_ratios(data)

        # Initialize array of diffs between ratios
        ratio_diffs = []

        # Loop through folds
        for fold in os.listdir(os.path.join(BASE, dataset, split_type)):
            # Open fold indices
            with open(
                    os.path.join(BASE, dataset, split_type, fold, '0',
                                 'split_indices.pckl'), 'rb') as f:
                indices = pickle.load(f)

            # Get test data
            test_data = MoleculeDataset([data[index] for index in indices[2]])

            # Get test ratios
            test_ratios = compute_ratios(test_data)

            # Compute ratio diff
            ratio_diff = np.maximum(ratios / test_ratios, test_ratios / ratios)
            ratio_diff[np.where(np.isinf(ratio_diff))[0]] = np.nan

            # Add ratio diff
            ratio_diffs.append(ratio_diff)

        # Convert to numpy array
        ratio_diffs = np.array(ratio_diffs)  # num_folds x num_tasks

        # Determine number of folds and number of failures
        num_folds = len(ratio_diffs)
        num_failures = np.sum(np.isnan(ratio_diffs))

        # Average across tasks
        ratio_diffs = np.nanmean(ratio_diffs, axis=1)  # num_folds

        # Compute mean and standard deviation across folds
        mean, std = np.nanmean(ratio_diffs), np.nanstd(ratio_diffs)

        # Add results
        results.append({
            'dataset': dataset,
            'mean': mean,
            'std': std,
            'num_folds': num_folds,
            'num_failures': num_failures
        })

    pprint(results)

def single_task_random_forest(train_data: MoleculeDataset,
                              test_data: MoleculeDataset,
                              metric_func: Callable,
                              args: Namespace,
                              logger: Logger = None) -> List[float]:
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        if args.dataset_type == 'regression':
            model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
        elif args.dataset_type == 'classification':
            model = RandomForestClassifier(class_weight=args.class_weight,
                                           n_estimators=args.num_trees,
                                           n_jobs=-1)
        else:
            raise ValueError(
                f'dataset_type "{args.dataset_type}" not supported.')

        model.fit(train_features, train_targets)

        test_preds = model.predict(test_features)
        test_preds = [[pred] for pred in test_preds]
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores

def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            scaler: StandardScaler = None) -> Tuple[List[np.ndarray], List[str]]:
    """
    Makes predictions on a dataset using a single model.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param scaler: A StandardScaler object fit on the training targets.
    :return: A tuple of the model predictions and the SMILES they correspond to.
    """
    model.eval()

    preds = []
    num_iters, iter_step = len(data), batch_size
    smiles_batch_all = []

    for i in trange(0, num_iters, iter_step):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch
        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = [x.data.cpu().numpy() for x in batch_preds]

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect predictions
        preds.append(batch_preds)
        smiles_batch_all.extend(smiles_batch)

    preds = [np.concatenate(x) for x in zip(*preds)]

    return preds, smiles_batch_all

def predict(model: nn.Module,
            data: MoleculeDataset,
            batch_size: int,
            disable_progress_bar: bool = False,
            scaler: StandardScaler = None) -> List[List[float]]:
    """
    Makes predictions on a dataset using a single model.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param batch_size: Batch size.
    :param disable_progress_bar: Whether to disable the progress bar.
    :param scaler: A StandardScaler object fit on the training targets.
    :return: A list of lists of predictions. The outer list is examples while the inner list is tasks.
    """
    model.eval()

    preds = []
    num_iters, iter_step = len(data), batch_size

    for i in trange(0, num_iters, iter_step, disable=disable_progress_bar):
        # Prepare batch
        mol_batch = MoleculeDataset(data[i:i + batch_size])
        smiles_batch, features_batch = mol_batch.smiles(), mol_batch.features()

        # Run model
        batch = smiles_batch
        with torch.no_grad():
            batch_preds = model(batch, features_batch)

        batch_preds = batch_preds.data.cpu().numpy()

        # Inverse scale if regression
        if scaler is not None:
            batch_preds = scaler.inverse_transform(batch_preds)

        # Collect predictions
        batch_preds = batch_preds.tolist()
        preds.extend(batch_preds)

    return preds

def gcnn_predict(self) -> Tuple[array, array]:
    """
    Function that handles graph convolutional neural network predictions,
    enters them into the predictions DataFrame and reports any errors.

    Parameters:
        rdkit_mols (array): a numpy array containing RDKit molecules
    Returns:
        predictions, prediction_labels (Tuple[array, array]): predictions and labels
    """
    smiles = self.rdkit_mols.tolist()
    full_data = get_data_from_smiles(smiles=smiles, skip_invalid_smiles=False)
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if full_data[full_index].mol is not None:
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=50,
                                          num_workers=0)

    model_preds = predict(model=rlm_gcnn_model,
                          data_loader=test_data_loader,
                          scaler=rlm_gcnn_scaler)

    predictions = np.ma.empty(len(full_data))
    predictions.mask = True
    labels = np.ma.empty(len(full_data))
    labels.mask = True
    for key in full_to_valid_indices.keys():
        full_index = int(key)
        predictions[full_index] = model_preds[full_to_valid_indices[key]][0]
        labels[full_index] = np.round(
            model_preds[full_to_valid_indices[key]][0], 0)

    self.predictions_df['GCNN'] = pd.Series(
        pd.Series(labels).fillna('').astype(str) + ' (' +
        pd.Series(predictions).round(2).astype(str) + ')').str.replace(
            '(nan)', '', regex=False)

    if len(self.predictions_df.index) > len(
            predictions) or np.ma.count_masked(predictions) > 0:
        self.model_errors.append('graph convolutional neural network')
        self.has_errors = True

    return predictions, labels

def multi_task_sklearn(model,
                       train_data: MoleculeDataset,
                       test_data: MoleculeDataset,
                       metric_func: Callable,
                       args: SklearnTrainArgs,
                       logger: Logger = None) -> List[float]:
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    model.fit(train_data.features(), train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    test_preds = predict(model=model,
                         model_type=args.model_type,
                         dataset_type=args.dataset_type,
                         features=test_data.features())

    scores = evaluate_predictions(preds=test_preds,
                                  targets=test_data.targets(),
                                  num_tasks=num_tasks,
                                  metric_func=metric_func,
                                  dataset_type=args.dataset_type,
                                  logger=logger)

    return scores

def multi_task_random_forest(train_data: MoleculeDataset,
                             test_data: MoleculeDataset,
                             metric_func: Callable,
                             args: Namespace) -> List[float]:
    if args.dataset_type == 'regression':
        model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
    elif args.dataset_type == 'classification':
        model = RandomForestClassifier(n_estimators=args.num_trees, n_jobs=-1)
    else:
        raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    model.fit(train_data.features(), train_targets)

    test_preds = model.predict(test_data.features())
    if train_data.num_tasks() == 1:
        test_preds = [[pred] for pred in test_preds]

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        metric_func=metric_func,
        dataset_type=args.dataset_type
    )

    return scores

def create_time_splits(args: Args):
    # ASSUMES DATA IS GIVEN IN CHRONOLOGICAL ORDER.
    # This dumps a different format of indices, with all splits in one file per fold.
    # TODO: modify as convenient later.
    data = get_data(path=args.data_path, smiles_columns=args.smiles_columns)
    num_data = len(data)
    all_indices = list(range(num_data))
    fold_indices = {'random': [], 'scaffold': [], 'time': []}

    for i in range(args.num_folds - args.time_folds_per_train_set - 1):
        begin, end = int(i * num_data / args.num_folds), int(
            (i + args.time_folds_per_train_set + 2) * num_data / args.num_folds)
        subset_indices = all_indices[begin:end]
        subset_data = MoleculeDataset(data[begin:end])

        fold_indices['random'].append(
            split_indices(deepcopy(subset_indices),
                          args.time_folds_per_train_set + 2))
        fold_indices['scaffold'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          scaffold=True,
                          split_key_molecule=args.split_key_molecule,
                          data=subset_data))
        fold_indices['time'].append(
            split_indices(subset_indices,
                          args.time_folds_per_train_set + 2,
                          shuffle=False))

    for split_type in ['random', 'scaffold', 'time']:
        all_splits = []
        for i in range(len(fold_indices[split_type])):
            os.makedirs(os.path.join(args.save_dir, split_type,
                                     'fold_' + str(i), '0'),
                        exist_ok=True)
            with open(
                    os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                                 '0', 'split_indices.pckl'), 'wb') as wf:
                train = np.concatenate([
                    fold_indices[split_type][i][j]
                    for j in range(args.time_folds_per_train_set)
                ])
                val = fold_indices[split_type][i][-2]
                test = fold_indices[split_type][i][-1]
                # Each file contains a length-3 list of index lists for train/val/test
                pickle.dump([train, val, test], wf)
                all_splits.append([train, val, test])
        with open(
                os.path.join(args.save_dir, split_type, 'fold_' + str(i),
                             'split_indices.pckl'), 'wb') as wf:
            pickle.dump(all_splits, wf)

def single_task_sklearn(model,
                        train_data: MoleculeDataset,
                        test_data: MoleculeDataset,
                        metric_func: Callable,
                        args: SklearnTrainArgs,
                        logger: Logger = None) -> List[float]:
    scores = []
    num_tasks = train_data.num_tasks()
    for task_num in trange(num_tasks):
        # Only get features and targets for molecules where target is not None
        train_features, train_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                train_data.features(), train_data.targets())
              if targets[task_num] is not None])
        test_features, test_targets = zip(
            *[(features, targets[task_num]) for features, targets in zip(
                test_data.features(), test_data.targets())
              if targets[task_num] is not None])

        model.fit(train_features, train_targets)

        test_preds = predict(model=model,
                             model_type=args.model_type,
                             dataset_type=args.dataset_type,
                             features=test_features)
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(preds=test_preds,
                                     targets=test_targets,
                                     num_tasks=1,
                                     metric_func=metric_func,
                                     dataset_type=args.dataset_type,
                                     logger=logger)
        scores.append(score[0])

    return scores

def load_data(args: PredictArgs, smiles: List[List[str]]):
    """
    Function to load data from a list of smiles or a file.

    :param args: A :class:`~chemprop.args.PredictArgs` object containing arguments for
                 loading data and a model and making predictions.
    :param smiles: A list of list of smiles, or None if data is to be read from file.
    :return: A tuple of a :class:`~chemprop.data.MoleculeDataset` containing all datapoints,
             a :class:`~chemprop.data.MoleculeDataset` containing only valid datapoints,
             a :class:`~chemprop.data.MoleculeDataLoader` and a dictionary mapping full to valid indices.
    """
    print("Loading data")
    if smiles is not None:
        full_data = get_data_from_smiles(
            smiles=smiles,
            skip_invalid_smiles=False,
            features_generator=args.features_generator,
        )
    else:
        full_data = get_data(
            path=args.test_path,
            smiles_columns=args.smiles_columns,
            target_columns=[],
            ignore_columns=[],
            skip_invalid_smiles=False,
            args=args,
            store_row=not args.drop_extra_columns,
        )

    print("Validating SMILES")
    full_to_valid_indices = {}
    valid_index = 0
    for full_index in range(len(full_data)):
        if all(mol is not None for mol in full_data[full_index].mol):
            full_to_valid_indices[full_index] = valid_index
            valid_index += 1

    test_data = MoleculeDataset(
        [full_data[i] for i in sorted(full_to_valid_indices.keys())])

    print(f"Test size = {len(test_data):,}")

    # Create data loader
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=args.batch_size,
                                          num_workers=args.num_workers)

    return full_data, test_data, test_data_loader, full_to_valid_indices

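# Usage sketch (illustrative, not from the original source): loading a small list of
# SMILES for prediction with load_data. The PredictArgs instance is assumed to be
# fully populated elsewhere (e.g. from a parsed command line or a loaded checkpoint).
def _example_load_data(args: PredictArgs):
    smiles = [['CCO'], ['c1ccccc1'], ['not_a_smiles']]
    full_data, test_data, test_data_loader, full_to_valid_indices = load_data(args, smiles)
    # The invalid entry is dropped from test_data; full_to_valid_indices maps the
    # surviving positions in full_data to their positions in test_data.
    return full_data, test_data, test_data_loader, full_to_valid_indices
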
def __call__(self, smiles: List[str], batch_size: int = 500) -> List[List[float]]:
    """
    Makes predictions on a list of SMILES.

    :param smiles: A list of SMILES to make predictions on.
    :param batch_size: The batch size.
    :return: A list of lists of floats containing the predicted values.
    """
    test_data = get_data_from_smiles(
        smiles=smiles,
        skip_invalid_smiles=False,
        features_generator=self.args.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol is not None
    ]
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])

    if self.train_args.features_scaling:
        test_data.normalize_features(self.features_scaler)
    if self.train_args.atom_descriptor_scaling and self.args.atom_descriptors is not None:
        test_data.normalize_features(self.atom_descriptor_scaler,
                                     scale_atom_descriptors=True)
    if self.train_args.bond_feature_scaling and self.args.bond_features_size > 0:
        test_data.normalize_features(self.bond_feature_scaler,
                                     scale_bond_features=True)

    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = []
    for model in self.checkpoints:
        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=self.scaler,
                              disable_progress_bar=True)
        sum_preds.append(np.array(model_preds))

    # Ensemble predictions
    sum_preds = sum(sum_preds)
    avg_preds = sum_preds / len(self.checkpoints)

    return avg_preds

def evaluate(model: nn.Module,
             data: MoleculeDataset,
             metric_func: Callable,
             args: Namespace,
             scaler: StandardScaler = None,
             logger: logging.Logger = None) -> List[float]:
    """
    Evaluates a model on a dataset.

    :param model: A model.
    :param data: A MoleculeDataset.
    :param metric_func: Metric function which takes in a list of targets and a list of predictions.
    :param args: Arguments.
    :param scaler: A StandardScaler object fit on the training targets.
    :param logger: Logger.
    :return: A list with the score for each task based on `metric_func`.
    """
    preds = predict(
        model=model,
        data=data,
        args=args,
        scaler=scaler,
        bert_save_memory=True,
        logger=logger
    )

    if args.maml:
        # In this case the targets are determined by the tasks sampled during prediction
        preds, targets = preds
    else:
        targets = data.targets()

    if args.dataset_type == 'bert_pretraining':
        # Only predict targets that are masked out
        targets['vocab'] = [target if mask == 0 else None
                            for target, mask in zip(targets['vocab'], data.mask())]

    results = evaluate_predictions(
        preds=preds,
        targets=targets,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        args=args,
        logger=logger
    )

    return results

def multi_task_sklearn(model: Union[RandomForestRegressor, RandomForestClassifier, SVR, SVC],
                       train_data: MoleculeDataset,
                       test_data: MoleculeDataset,
                       metrics: List[str],
                       args: SklearnTrainArgs,
                       logger: Logger = None) -> Dict[str, List[float]]:
    """
    Trains a multi-task scikit-learn model, meaning one model is trained simultaneously on all tasks.

    This is only possible if none of the tasks have None (unknown) values.

    :param model: The scikit-learn model to train.
    :param train_data: The training data.
    :param test_data: The test data.
    :param metrics: A list of names of metric functions.
    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 training the scikit-learn model.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    """
    num_tasks = train_data.num_tasks()

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    # Train
    model.fit(train_data.features(), train_targets)

    # Save model
    with open(os.path.join(args.save_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)

    test_preds = predict(
        model=model,
        model_type=args.model_type,
        dataset_type=args.dataset_type,
        features=test_data.features()
    )

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        num_tasks=num_tasks,
        metrics=metrics,
        dataset_type=args.dataset_type,
        logger=logger
    )

    return scores

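# Usage sketch (illustrative, not from the original source): training and scoring a
# multi-task random forest with multi_task_sklearn. The SklearnTrainArgs fields used
# by the function (model_type, dataset_type, save_dir) are assumed to be set by the
# caller; 'auc' is used here as an example classification metric name.
def _example_multi_task_sklearn(args: SklearnTrainArgs,
                                train_data: MoleculeDataset,
                                test_data: MoleculeDataset):
    model = RandomForestClassifier(n_estimators=500, n_jobs=-1)
    scores = multi_task_sklearn(model=model,
                                train_data=train_data,
                                test_data=test_data,
                                metrics=['auc'],
                                args=args,
                                logger=None)
    return scores  # dictionary mapping each metric name to per-task values
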
def async_mol2graph(q: Queue,
                    data: MoleculeDataset,
                    args: Namespace,
                    num_iters: int,
                    iter_size: int,
                    exit_q: Queue,
                    last_batch: bool = False):
    batches = []
    for i in range(0, num_iters, iter_size):  # will only go up to max size of queue, then yield
        if not last_batch and i + args.batch_size > len(data):
            break
        batch = MoleculeDataset(data[i:i + args.batch_size])
        batches.append(batch)
        if len(batches) == args.batches_per_queue_group:  # process many at a time, since synchronization is expensive
            with Pool() as pool:
                processed_batches = pool.map(mol2graph_helper,
                                             [(batch, args) for batch in batches])
            q.put(processed_batches)
            batches = []
    if len(batches) > 0:
        with Pool() as pool:
            processed_batches = pool.map(mol2graph_helper,
                                         [(batch, args) for batch in batches])
        q.put(processed_batches)
    # Block until the main process signals; otherwise the end of the queue
    # cannot be read and the worker crashes.
    exit_q.get()

def train(inv_model, src_data, tgt_data, loss_func, inv_opt, args):
    inv_model.train()

    src_data.shuffle()
    new_size = len(tgt_data) / args.batch_size * args.src_batch_size
    new_size = int(new_size)
    src_pos_data = [d for d in src_data if d.targets[0] == 1]
    src_neg_data = [d for d in src_data if d.targets[0] == 0]
    print(len(tgt_data))
    print(len(src_pos_data), len(src_neg_data), new_size)
    src_data = MoleculeDataset(src_pos_data + src_neg_data[:new_size])

    src_data.shuffle()
    tgt_data.shuffle()

    src_iter = range(0, len(src_data), args.src_batch_size)
    tgt_iter = range(0, len(tgt_data), args.batch_size)

    for i, j in zip(src_iter, tgt_iter):
        inv_model.zero_grad()

        src_batch = src_data[i:i + args.src_batch_size]
        src_batch = MoleculeDataset(src_batch)
        src_loss = forward(inv_model, src_batch, loss_func, is_source=True)

        tgt_batch = tgt_data[j:j + args.batch_size]
        tgt_batch = MoleculeDataset(tgt_batch)
        tgt_loss = forward(inv_model, tgt_batch, loss_func, is_source=False)

        loss = (src_loss + tgt_loss) / 2
        loss.backward()
        inv_opt[0].step()
        inv_opt[1].step()

        lr = inv_opt[1].get_lr()[0]
        ignorm = compute_gnorm(inv_model)
        print(f'lr: {lr:.5f}, loss: {loss:.4f}, gnorm: {ignorm:.4f}')

def __call__(self, smiles, batch_size=500):
    test_data = get_data_from_smiles(
        smiles=[[s] for s in smiles],
        skip_invalid_smiles=False,
        features_generator=self.features_generator)
    valid_indices = [
        i for i in range(len(test_data)) if test_data[i].mol[0] is not None
    ]
    full_data = test_data
    test_data = MoleculeDataset([test_data[i] for i in valid_indices])
    test_data_loader = MoleculeDataLoader(dataset=test_data,
                                          batch_size=batch_size)

    sum_preds = np.zeros((len(test_data), 1))
    for model, scaler, features_scaler in zip(self.checkpoints, self.scalers,
                                              self.features_scalers):
        test_data.reset_features_and_targets()
        if features_scaler is not None:
            test_data.normalize_features(features_scaler)

        model_preds = predict(model=model,
                              data_loader=test_data_loader,
                              scaler=scaler)
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(self.checkpoints)
    avg_preds = avg_preds.squeeze(-1).tolist()

    # Put zero for invalid smiles
    full_preds = [0.0] * len(full_data)
    for i, si in enumerate(valid_indices):
        full_preds[si] = avg_preds[i]

    return np.array(full_preds, dtype=np.float32)