def __init__(self,
             smiles: str,
             targets: List[Optional[float]] = None,
             row: OrderedDict = None,
             features: np.ndarray = None,
             features_generator: List[str] = None):
    """
    :param smiles: The SMILES string for the molecule.
    :param targets: A list of targets for the molecule (contains None for unknown target values).
    :param row: The raw CSV row containing the information for this molecule.
    :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
    :param features_generator: A list of features generators to use.
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if features is not None and features_generator is not None:
        raise ValueError(
            'Cannot provide both loaded features and a features generator.'
        )

    self.smiles = smiles
    self.targets = targets
    self.row = row
    self.features = features
    self.features_generator = features_generator
    # Initialize with the string 'None' to distinguish a not-yet-computed molecule
    # from the None returned by RDKit for an invalid SMILES.
    self._mol = 'None'

    # Generate additional features if given a generator
    if self.features_generator is not None:
        self.features = []

        for fg in self.features_generator:
            # Distinct name: the original rebound `features_generator`, shadowing the parameter
            fg_func = get_features_generator(fg)
            if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                self.features.extend(fg_func(self.mol))

        self.features = np.array(self.features)

    # Fix nans in features
    if self.features is not None:
        replace_token = 0
        self.features = np.where(np.isnan(self.features), replace_token,
                                 self.features)

    # Save a copy of the (possibly generated) features and targets to enable different
    # scaling later on. Bug fix: previously the raw function parameters were saved, which
    # left raw_features as None whenever a features generator was used.
    self.raw_features, self.raw_targets = self.features, self.targets
def __init__(self,
             smiles: str,
             mol: Chem.Mol,
             targets: np.ndarray,
             features: np.ndarray = None,
             features_generator: List[str] = None,
             compound_name: str = None):
    """
    Initializes a MoleculeDatapoint, which contains a single molecule.

    :param smiles: Smiles string
    :param mol: Chem.Mol
    :param targets: A numpy array containing target values.
    :param features: A numpy array containing additional features (e.g. Morgan fingerprint).
    :param features_generator: List of strings of features_generator names.
    :param compound_name: The name of the compound, if known.
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    self.smiles = smiles
    self.mol = mol
    self.compound_name = compound_name

    # Bug fix: the original used `if features and features_generator`, which truth-tests
    # a numpy array and raises ValueError for any multi-element `features` array.
    if features is not None and features_generator:
        raise ValueError('Currently cannot provide both loaded features'
                         ' and a features generator.')

    if features_generator:
        # Generate additional features if given a generator.
        # Bug fix: the original extended `self.features`, which was never initialized
        # (AttributeError), instead of the local `features` list it had just created.
        features = []
        for fg in features_generator:
            fg_func = get_features_generator(fg)
            if self.mol is not None and self.mol.GetNumHeavyAtoms():
                features.extend(fg_func(self.mol))
        features = np.array(features)

    self.set_features(features)
    self.set_targets(targets)
def predict_sklearn(args: Namespace):
    """
    Loads a test set, featurizes it with Morgan fingerprints, and writes
    ensemble-averaged predictions from pickled scikit-learn models to a CSV.

    :param args: Arguments containing test_path, fingerprint settings,
                 checkpoint_paths, and preds_path.
    """
    print('Loading data')
    test_data = get_data(path=args.test_path)

    print('Computing morgan fingerprints')
    compute_morgan = get_features_generator('morgan')
    for dp in tqdm(test_data, total=len(test_data)):
        dp.set_features(compute_morgan(mol=dp.smiles,
                                       radius=args.radius,
                                       num_bits=args.num_bits))

    print(f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    cumulative = np.zeros((len(test_data), args.num_tasks))

    for ckpt_path in tqdm(args.checkpoint_paths, total=len(args.checkpoint_paths)):
        # Each checkpoint is a pickled scikit-learn model
        with open(ckpt_path, 'rb') as f:
            model = pickle.load(f)

        cumulative += np.array(predict(model=model,
                                       model_type=args.model_type,
                                       dataset_type=args.dataset_type,
                                       features=test_data.features()))

    # Ensemble predictions: mean over all checkpoints
    avg_preds = (cumulative / len(args.checkpoint_paths)).tolist()

    print('Saving predictions')
    assert len(test_data) == len(avg_preds)
    makedirs(args.preds_path, isfile=True)

    with open(args.preds_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['smiles'] + get_task_names(args.test_path))
        for smi, row_preds in zip(test_data.smiles(), avg_preds):
            writer.writerow([smi] + row_preds)
def __init__(self,
             smiles: List[str],
             targets: List[Optional[float]] = None,
             row: OrderedDict = None,
             features: np.ndarray = None,
             features_generator: List[str] = None,
             atom_features: np.ndarray = None,
             atom_descriptors: np.ndarray = None):
    """
    :param smiles: A list of the SMILES strings for the molecules.
    :param targets: A list of targets for the molecule (contains None for unknown target values).
    :param row: The raw CSV row containing the information for this molecule.
    :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
    :param features_generator: A list of features generators to use.
    :param atom_features: A numpy array of additional per-atom features.
    :param atom_descriptors: A numpy array of additional per-atom descriptors.
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if features is not None and features_generator is not None:
        raise ValueError(
            'Cannot provide both loaded features and a features generator.'
        )

    self.smiles = smiles
    self.targets = targets
    self.row = row
    self.features = features
    self.features_generator = features_generator
    self.atom_descriptors = atom_descriptors
    self.atom_features = atom_features

    # Generate additional features if given a generator
    if self.features_generator is not None:
        self.features = []

        for fg in self.features_generator:
            # Distinct name: the original rebound `features_generator`, shadowing the parameter
            fg_func = get_features_generator(fg)
            for m in self.mol:
                if m is not None and m.GetNumHeavyAtoms() > 0:
                    self.features.extend(fg_func(m))
                # for molecules without heavy atoms, e.g. H2
                elif m is not None and m.GetNumHeavyAtoms() == 0:
                    # not all features are equally long, so use methane as dummy molecule to determine length
                    self.features.extend(
                        np.zeros(len(fg_func(Chem.MolFromSmiles('C')))))

        self.features = np.array(self.features)

    # Replace any nans in the feature arrays with zeros
    replace_token = 0
    if self.features is not None:
        self.features = np.where(np.isnan(self.features), replace_token,
                                 self.features)

    if self.atom_descriptors is not None:
        self.atom_descriptors = np.where(np.isnan(self.atom_descriptors),
                                         replace_token, self.atom_descriptors)

    if self.atom_features is not None:
        self.atom_features = np.where(np.isnan(self.atom_features),
                                      replace_token, self.atom_features)

    # Save a copy of the raw features and targets to enable different scaling later on
    self.raw_features, self.raw_targets = self.features, self.targets
def generate_and_save_features(args: Namespace):
    """
    Computes and saves features for a dataset of molecules as a 2D array in a .npz file.

    Features are periodically checkpointed to a temporary directory so the job can be
    restarted without recomputing everything.

    :param args: Arguments (data_path, save_path, features_generator, restart,
                 sequential, save_frequency).
    :raises ValueError: If save_path already exists and args.restart is False.
    """
    # Create directory for save_path
    makedirs(args.save_path, isfile=True)

    # Get data and features function
    data = get_data(path=args.data_path, max_data_size=None)
    features_generator = get_features_generator(args.features_generator)
    temp_save_dir = args.save_path + '_temp'

    # Load partially complete data
    if args.restart:
        if os.path.exists(args.save_path):
            os.remove(args.save_path)
        if os.path.exists(temp_save_dir):
            shutil.rmtree(temp_save_dir)
    else:
        if os.path.exists(args.save_path):
            raise ValueError(
                f'"{args.save_path}" already exists and args.restart is False.'
            )
        if os.path.exists(temp_save_dir):
            features, temp_num = load_temp(temp_save_dir)

    if not os.path.exists(temp_save_dir):
        makedirs(temp_save_dir)
        features, temp_num = [], 0

    # Build features map function
    # restrict to data for which features have not been computed yet
    data = data[len(features):]
    mols = (d.mol for d in data)

    # Bug fix: the original created `Pool()` anonymously and never closed it,
    # leaking worker processes. Keep a reference and close/join it when done.
    pool = None
    if args.sequential:
        features_map = map(features_generator, mols)
    else:
        pool = Pool()
        features_map = pool.imap(features_generator, mols)

    try:
        # Get features
        temp_features = []
        for i, feats in tqdm(enumerate(features_map), total=len(data)):
            temp_features.append(feats)

            # Save temporary features every save_frequency
            if (i > 0 and (i + 1) % args.save_frequency == 0) or i == len(data) - 1:
                save_features(os.path.join(temp_save_dir, f'{temp_num}.npz'),
                              temp_features)
                features.extend(temp_features)
                temp_features = []
                temp_num += 1
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    try:
        # Save all features
        save_features(args.save_path, features)

        # Remove temporary features
        shutil.rmtree(temp_save_dir)
    except OverflowError:
        print(
            'Features array is too large to save as a single file. Instead keeping features as a directory of files.'
        )
def run_sklearn(args: SklearnTrainArgs,
                data: MoleculeDataset,
                logger: Logger = None) -> Dict[str, List[float]]:
    """
    Loads data, trains a scikit-learn model, and returns test scores for the model
    checkpoint with the highest validation score.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :param data: A :class:`~chemprop.data.MoleculeDataset` containing the data.
    :param logger: A logger to record output.
    :return: A dictionary mapping each metric in :code:`metrics` to a list of values for each task.
    :raises ValueError: For an SVM with multi-task data, or an unsupported
                        model/dataset type.
    """
    if logger is None:
        debug = info = print
    else:
        debug, info = logger.debug, logger.info

    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          num_folds=args.num_folds,
                                          args=args)
    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    morgan = get_features_generator('morgan')
    for split in (train_data, test_data):
        for dp in tqdm(split, total=len(split)):
            dp.set_features(morgan(mol=dp.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

    debug('Building model')
    if args.dataset_type == 'regression':
        if args.model_type == 'random_forest':
            model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
        elif args.model_type == 'svm':
            model = SVR()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    elif args.dataset_type == 'classification':
        if args.model_type == 'random_forest':
            model = RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight)
        elif args.model_type == 'svm':
            model = SVC()
        else:
            raise ValueError(f'Model type "{args.model_type}" not supported')
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    debug(model)

    # Stash the training arguments on the model so they travel with the checkpoint
    model.train_args = args.as_dict()

    debug('Training')
    train_fn = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = train_fn(model=model,
                      train_data=train_data,
                      test_data=test_data,
                      metrics=args.metrics,
                      args=args,
                      logger=logger)

    for metric in args.metrics:
        info(f'Test {metric} = {np.nanmean(scores[metric])}')

    return scores
def compare_datasets_tsne(args: Args):
    """
    Embeds several SMILES datasets into 2D with t-SNE over Morgan fingerprints
    (Jaccard metric) and saves a combined scatter/molecule plot.

    :param args: Arguments with smiles_paths, colors, sizes, plotting options,
                 and save_path.
    :raises ValueError: If fewer colors or sizes than datasets are provided.
    """
    if len(args.smiles_paths) > len(args.colors) or len(args.smiles_paths) > len(args.sizes):
        raise ValueError(
            'Must have at least as many colors and sizes as datasets')

    # Random seed for random subsampling
    np.random.seed(0)

    # Load the smiles datasets
    print('Loading data')
    smiles, slices, labels = [], [], []
    for path in args.smiles_paths:
        # Label each dataset by its file name
        dataset_label = os.path.basename(path).replace('.csv', '')

        # Get SMILES
        dataset_smiles = get_smiles(path=path,
                                    smiles_columns=args.smiles_column,
                                    flatten=True)
        print(f'{dataset_label}: {len(dataset_smiles):,}')

        # Subsample if dataset is too large
        if len(dataset_smiles) > args.max_per_dataset:
            print(f'Subsampling to {args.max_per_dataset:,} molecules')
            dataset_smiles = np.random.choice(dataset_smiles,
                                              size=args.max_per_dataset,
                                              replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(dataset_smiles)))
        labels.append(dataset_label)
        smiles += dataset_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    fingerprints = [morgan_generator(s) for s in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    start = time.time()
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    coords = tsne.fit_transform(fingerprints)
    print(f'time = {time.time() - start:.2f} seconds')

    if args.cluster:
        import hdbscan  # pip install hdbscan
        print('Running HDBSCAN')
        start = time.time()
        clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
        cluster_colors = clusterer.fit_predict(coords)
        print(f'time = {time.time() - start:.2f} seconds')

    print('Plotting t-SNE')
    # Rescale the embedding into the unit square before plotting
    lo, hi = np.min(coords, axis=0), np.max(coords, axis=0)
    coords = (coords - lo) / (hi - lo)

    makedirs(args.save_path, isfile=True)

    plt.clf()
    fontsize = 50 * args.scale
    fig = plt.figure(figsize=(64 * args.scale, 48 * args.scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity',
              fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    if args.cluster:
        plt.scatter(coords[:, 0],
                    coords[:, 1],
                    s=150 * np.mean(args.sizes),
                    c=cluster_colors,
                    cmap='nipy_spectral')
    else:
        for sl, color, label, size in zip(slices, args.colors, labels, args.sizes):
            if args.plot_molecules:
                # Plot each molecule as a rendered structure image
                handles.append(mpatches.Patch(color=color, label=label))
                for smi, (x, y) in zip(smiles[sl], coords[sl]):
                    img = Draw.MolsToGridImage([Chem.MolFromSmiles(smi)],
                                               molsPerRow=1,
                                               subImgSize=(200, 200))
                    imagebox = offsetbox.AnnotationBbox(
                        offsetbox.OffsetImage(img), (x, y),
                        bboxprops=dict(color=color))
                    ax.add_artist(imagebox)
            else:
                # Plot each molecule as a point
                plt.scatter(coords[sl, 0],
                            coords[sl, 1],
                            s=150 * size,
                            color=color,
                            label=label)

    if args.plot_molecules:
        legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])

    print('Saving t-SNE')
    plt.savefig(args.save_path)
def __init__(self,
             drug_smiles: str,
             cmpd_smiles: str,
             targets: List[float],
             args: Namespace = None,
             drug_feats: np.ndarray = None,
             cmpd_feats: np.ndarray = None,
             context: np.ndarray = None,
             use_compound_names: bool = False):
    """
    Initializes a MolPairDatapoint, which contains a molecule pair.

    :param drug_smiles: SMILES string of the first molecule (the drug).
    :param cmpd_smiles: SMILES string of the second molecule (the compound).
    :param targets: A list of target values for the pair.
    :param args: Arguments.
    :param drug_feats: A numpy array containing additional features for molecule 1 (ex. Morgan fingerprint).
    :param cmpd_feats: A numpy array containing additional features for molecule 2 (ex. Morgan fingerprint).
    :param context: A numpy array containing additional features defining context (ex. cell line). NOT SUPPORTED
    :param use_compound_names: Whether the data CSV includes the compound name on each line.
    :raises ValueError: If both loaded features and a features generator are provided.
    :raises NotImplementedError: If use_compound_names is True.
    """
    if args is None:
        self.features_generator = self.args = None
    else:
        self.features_generator = args.features_generator
        self.args = args

    if self.features_generator is not None and (drug_feats is not None
                                                or cmpd_feats is not None):
        raise ValueError(
            'Currently cannot provide both loaded features and a features generator.'
        )

    self.drug_feats = drug_feats
    self.cmpd_feats = cmpd_feats
    self.context = context

    if use_compound_names:
        raise NotImplementedError
    self.compound_name = None

    self.drug_smiles = drug_smiles  # str
    self.drug_mol = Chem.MolFromSmiles(self.drug_smiles)
    self.cmpd_smiles = cmpd_smiles  # str
    self.cmpd_mol = Chem.MolFromSmiles(self.cmpd_smiles)

    # Create targets
    self.targets = targets

    # Generate additional features if given a generator
    if self.features_generator is not None:
        drug_feat_list, cmpd_feat_list = [], []

        for fg in self.features_generator:
            generator = get_features_generator(fg)
            if self.drug_mol is not None and self.drug_mol.GetNumHeavyAtoms() > 0:
                drug_feat_list.extend(generator(self.drug_mol))
            if self.cmpd_mol is not None and self.cmpd_mol.GetNumHeavyAtoms() > 0:
                cmpd_feat_list.extend(generator(self.cmpd_mol))

        self.drug_feats = np.array(drug_feat_list)
        self.cmpd_feats = np.array(cmpd_feat_list)

    # Replace any nans in the feature arrays with zeros
    replace_token = 0
    for attr in ('drug_feats', 'cmpd_feats', 'context'):
        value = getattr(self, attr)
        if value is not None:
            setattr(self, attr, np.where(np.isnan(value), replace_token, value))
def __init__(
    self,
    smiles: List[str],
    targets: List[Optional[float]] = None,
    row: OrderedDict = None,
    features: np.ndarray = None,
    features_generator: List[str] = None,
    atom_features: np.ndarray = None,
    atom_descriptors: np.ndarray = None,
    bond_features: np.ndarray = None,
    overwrite_default_atom_features: bool = False,
    overwrite_default_bond_features: bool = False,
):
    """
    :param smiles: A list of the SMILES strings for the molecules.
    :param targets: A list of targets for the molecule (contains None for unknown target values).
    :param row: The raw CSV row containing the information for this molecule.
    :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
    :param features_generator: A list of features generators to use.
    :param atom_features: A numpy array containing additional atom features to featurize the molecule
    :param atom_descriptors: A numpy array containing additional atom descriptors to featurize the molecule
    :param bond_features: A numpy array containing additional bond features to featurize the molecule
    :param overwrite_default_atom_features: Boolean to overwrite default atom features by atom_features
    :param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if features is not None and features_generator is not None:
        raise ValueError(
            "Cannot provide both loaded features and a features generator."
        )

    self.smiles = smiles
    self.targets = targets
    self.row = row
    self.features = features
    self.features_generator = features_generator
    self.atom_descriptors = atom_descriptors
    self.atom_features = atom_features
    self.bond_features = bond_features
    self.overwrite_default_atom_features = overwrite_default_atom_features
    self.overwrite_default_bond_features = overwrite_default_bond_features

    # Create One-Hot Encoding for the Molecule
    self.one_hot_lookup = {
        char: idx
        for (idx, char) in enumerate(
            "#%)(+-/.0123456789=A@CBEDGFIHKMLONPSRUTWVY[Z]\\acbedgfihmlonsrutpy"
        )
    }
    self.smiles_one_hot_encoding = self.one_hot_encode_smiles()

    # Generate additional features if given a generator
    if self.features_generator is not None:
        self.features = []

        for fg in self.features_generator:
            # Distinct name: the original rebound `features_generator`, shadowing the parameter
            fg_func = get_features_generator(fg)
            for m in self.mol:
                if m is not None and m.GetNumHeavyAtoms() > 0:
                    self.features.extend(fg_func(m))
                # for molecules without heavy atoms, e.g. H2
                elif m is not None and m.GetNumHeavyAtoms() == 0:
                    # not all features are equally long, so use methane as dummy molecule to determine length
                    self.features.extend(
                        np.zeros(len(fg_func(Chem.MolFromSmiles("C")))))

        self.features = np.array(self.features)

    # Replace any nans in the feature arrays with zeros
    replace_token = 0
    if self.features is not None:
        self.features = np.where(np.isnan(self.features), replace_token,
                                 self.features)

    if self.atom_descriptors is not None:
        self.atom_descriptors = np.where(np.isnan(self.atom_descriptors),
                                         replace_token, self.atom_descriptors)

    if self.atom_features is not None:
        self.atom_features = np.where(np.isnan(self.atom_features),
                                      replace_token, self.atom_features)

    if self.bond_features is not None:
        self.bond_features = np.where(np.isnan(self.bond_features),
                                      replace_token, self.bond_features)

    # Save a copy of the raw features and targets to enable different scaling later on
    self.raw_features, self.raw_targets = self.features, self.targets
    self.raw_atom_descriptors, self.raw_atom_features, self.raw_bond_features = (
        self.atom_descriptors,
        self.atom_features,
        self.bond_features,
    )
def predict_sklearn(args: SklearnPredictArgs):
    """
    Loads data (optionally in parcels), featurizes it with Morgan fingerprints, and
    writes ensemble-averaged predictions from pickled scikit-learn models to CSV.

    When parceling is enabled, each parcel after the first is written to its own
    numbered predictions file.

    :param args: Arguments containing test_path, fingerprint settings,
                 checkpoint_paths, parceling options, and preds_path.
    """
    if args.parcel_size and args.max_data_size:
        num_iterations = math.ceil(args.max_data_size / args.parcel_size)
        max_data_size = args.parcel_size
    else:
        num_iterations = 1
        max_data_size = args.max_data_size

    offset = 0
    for iteration in range(num_iterations):
        if iteration > 0:
            offset = offset + args.parcel_size
            max_data_size = max_data_size + args.parcel_size

        print('Loading data')
        data = get_data(path=args.test_path,
                        smiles_column=args.smiles_column,
                        target_columns=[],
                        max_data_size=max_data_size,
                        data_offset=offset)

        print('Computing morgan fingerprints')
        morgan_fingerprint = get_features_generator('morgan')
        for datapoint in tqdm(data, total=len(data)):
            datapoint.set_features(
                morgan_fingerprint(mol=datapoint.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

        print(
            f'Predicting with an ensemble of {len(args.checkpoint_paths)} models'
        )
        sum_preds = np.zeros((len(data), args.num_tasks))

        for checkpoint_path in tqdm(args.checkpoint_paths,
                                    total=len(args.checkpoint_paths)):
            with open(checkpoint_path, 'rb') as f:
                model = pickle.load(f)

            model_preds = predict(model=model,
                                  model_type=args.model_type,
                                  dataset_type=args.dataset_type,
                                  features=data.features())
            sum_preds += np.array(model_preds)

        # Ensemble predictions
        avg_preds = sum_preds / len(args.checkpoint_paths)
        avg_preds = avg_preds.tolist()

        print(f'Saving predictions to {args.preds_path}')
        assert len(data) == len(avg_preds)
        makedirs(args.preds_path, isfile=True)

        # Copy predictions over to data
        task_names = get_task_names(path=args.test_path)
        for datapoint, preds in zip(data, avg_preds):
            for pred_name, pred in zip(task_names, preds):
                datapoint.row[pred_name] = pred

        # Each parcel after the first gets its own numbered output file
        if iteration != 0:
            name, ext = os.path.splitext(args.preds_path)
            preds_path = "{name}.{it}{ext}".format(name=name,
                                                   it=iteration,
                                                   ext=ext)
        else:
            preds_path = args.preds_path

        # Bug fix: previously this opened args.preds_path, so every parcel
        # overwrote the same file and the numbered paths were never used.
        with open(preds_path, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
            writer.writeheader()
            for datapoint in data:
                writer.writerow(datapoint.row)
def compare_datasets_tsne(smiles_paths: List[str],
                          smiles_col_name: str,
                          colors: List[str],
                          plot_molecules: bool,
                          max_num_per_dataset: int,
                          save_path: str):
    """
    Embeds several SMILES datasets into 2D with t-SNE over Morgan fingerprints
    (Jaccard metric), saves a combined plot, and additionally saves pairwise plots
    of 'sars_pos' against each other dataset when present.

    :param smiles_paths: Paths to CSV files containing SMILES.
    :param smiles_col_name: Name of the SMILES column in each CSV.
    :param colors: One plot color per dataset (must be at least as many as datasets).
    :param plot_molecules: Whether to render molecule images instead of points.
    :param max_num_per_dataset: Maximum number of molecules sampled per dataset.
    :param save_path: Path where the plot image is saved.
    """
    assert len(smiles_paths) <= len(colors)

    # Random seed for random subsampling
    np.random.seed(1)

    # Generate labels based on file names
    labels = [os.path.basename(p).replace('.csv', '') for p in smiles_paths]

    # Load the smiles datasets
    print('Loading data')
    smiles, slices = [], []
    for smiles_path in smiles_paths:
        # Get SMILES, excluding missing entries
        column = pd.read_csv(smiles_path)[smiles_col_name]
        dataset_smiles = list(column[column.notna()])
        print(f'{os.path.basename(smiles_path)}: {len(dataset_smiles):,}')

        # Subsample if dataset is too large
        if len(dataset_smiles) > max_num_per_dataset:
            print(f'Subsampling to {max_num_per_dataset:,} molecules')
            dataset_smiles = np.random.choice(dataset_smiles,
                                              size=max_num_per_dataset,
                                              replace=False).tolist()

        slices.append(slice(len(smiles), len(smiles) + len(dataset_smiles)))
        smiles += dataset_smiles

    # Compute Morgan fingerprints
    print('Computing Morgan fingerprints')
    morgan_generator = get_features_generator('morgan')
    fingerprints = [morgan_generator(s) for s in tqdm(smiles, total=len(smiles))]

    print('Running t-SNE')
    import time
    start = time.time()
    tsne = TSNE(n_components=2, init='pca', random_state=0, metric='jaccard')
    coords = tsne.fit_transform(fingerprints)
    print(f'time = {time.time() - start}')

    print('Plotting t-SNE')
    # Rescale the embedding into the unit square before plotting
    lo, hi = np.min(coords, axis=0), np.max(coords, axis=0)
    coords = (coords - lo) / (hi - lo)

    makedirs(save_path, isfile=True)

    plt.clf()
    scale = 10
    fontsize = 5 * scale
    fig = plt.figure(figsize=(6.4 * scale, 4.8 * scale))
    plt.title('t-SNE using Morgan fingerprint with Jaccard similarity',
              fontsize=2 * fontsize)
    ax = fig.gca()
    handles = []
    legend_kwargs = dict(loc='upper right', fontsize=fontsize)

    for sl, color, label in zip(slices, colors, labels):
        if plot_molecules:
            # Plot each molecule as a rendered structure image
            handles.append(mpatches.Patch(color=color, label=label))
            for smi, (x, y) in zip(smiles[sl], coords[sl]):
                img = Draw.MolsToGridImage([Chem.MolFromSmiles(smi)],
                                           molsPerRow=1,
                                           subImgSize=(200, 200))
                imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(img),
                                                    (x, y),
                                                    bboxprops=dict(color=color))
                ax.add_artist(imagebox)
        else:
            # Plot each molecule as a point; emphasize the positives
            marker_size = 450 if label == 'sars_pos' else 150
            plt.scatter(coords[sl, 0],
                        coords[sl, 1],
                        s=marker_size,
                        color=color,
                        label=label)

    if plot_molecules:
        legend_kwargs['handles'] = handles

    plt.legend(**legend_kwargs)
    plt.xticks([]), plt.yticks([])
    plt.savefig(save_path)

    # Plot pairs of sars_pos and other dataset
    if 'sars_pos' in labels:
        pos_index = labels.index('sars_pos')

        for index in range(len(labels) - 1):
            plt.clf()
            fontsize = 50
            plt.figure(figsize=(6.4 * 10, 4.8 * 10))
            plt.title('t-SNE using Morgan fingerprint with Jaccard similarity',
                      fontsize=2 * fontsize)
            plt.scatter(coords[slices[index], 0],
                        coords[slices[index], 1],
                        s=150,
                        color=colors[index],
                        label=labels[index])
            plt.scatter(coords[slices[pos_index], 0],
                        coords[slices[pos_index], 1],
                        s=450,
                        color=colors[pos_index],
                        label=labels[pos_index])
            plt.xticks([]), plt.yticks([])
            plt.legend(loc='upper right', fontsize=fontsize)
            plt.savefig(save_path.replace('.png', f'_{labels[index]}.png'))
import os from typing import List, Optional, Tuple, Union from rdkit import Chem, DataStructs from rdkit.Chem import AllChem from tqdm import tqdm from chemprop.data.utils import get_smiles from chemprop.features import get_features_generator UNCONDITIONAL_EVAL_DUMMY = '<DUMMY>' SMILES_TO_MORGAN = {} morgan_fingerprint = get_features_generator('morgan') def get_pred_smiles( pred_smiles_dir: str, return_num_decode: bool = False ) -> Union[List[str], Tuple[List[str], int]]: # Collect all paths to .txt files pred_paths = [] for root, _, files in os.walk(pred_smiles_dir): pred_paths += [ os.path.join(root, fname) for fname in files if fname.endswith('.txt') ] # Get SMILES all_smiles = []
def __init__(self,
             line: pd.Series,
             args: Namespace = None,
             features: np.ndarray = None,
             use_compound_names: bool = False,
             pred: bool = False):
    """
    Initializes a MoleculeDatapoint, which contains a single molecule.

    :param line: a pandas Series
    :param args: Arguments.
    :param features: A numpy array containing additional features (ex. Morgan fingerprint).
    :param use_compound_names: Whether the data CSV includes the compound name on each line.
    :param pred: If True, stop after featurization and do not build targets.
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if args is not None:
        self.features_generator = args.features_generator
        self.args = args
    else:
        self.features_generator = self.args = None

    if features is not None and self.features_generator is not None:
        raise ValueError(
            'Currently cannot provide both loaded features and a features generator.'
        )

    self.features = features

    if use_compound_names:
        self.compound_name = line['compound_name']  # str
        line = line.drop('compound_name')
    else:
        self.compound_name = None

    self.smiles = line['smiles']  # str
    self.mol = Chem.MolFromSmiles(self.smiles)

    # Generate additional features if given a generator
    if self.features_generator is not None:
        self.features = []
        for fg in self.features_generator:
            fg_func = get_features_generator(fg)
            if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                self.features.extend(fg_func(self.mol))
        self.features = np.array(self.features)

    # Fix nans in features
    if self.features is not None:
        replace_token = 0
        self.features = np.where(np.isnan(self.features), replace_token,
                                 self.features)

    line = line.drop('smiles')
    if pred:
        return

    # Create targets
    # atom targets
    self.atom_targets = line[args.atom_targets].values.tolist()

    # Bug fix: Chem.AddHs was previously re-applied inside the per-bond-target loop
    # (once per target). Hoist the loop-invariant call so hydrogens are added
    # exactly once, as before only when there is at least one bond target.
    if args.explicit_Hs and args.bond_targets:
        self.mol = Chem.AddHs(self.mol)

    bond_targets = []
    for b_target_name in args.bond_targets:
        bond_target = line[b_target_name]
        bond_target_arranged = []
        # Re-order the target matrix entries to match RDKit's bond ordering
        for b in self.mol.GetBonds():
            bond_target_arranged.append(bond_target[b.GetBeginAtom().GetIdx(),
                                                    b.GetEndAtom().GetIdx()])
        bond_targets.append(bond_target_arranged)
    self.bond_targets = bond_targets
def __init__(self,
             drug_smiles: str,
             cmpd_smiles: str,
             targets: List[float],
             args: Namespace = None,
             drug_feats: np.ndarray = None,
             cmpd_feats: np.ndarray = None,
             context: np.ndarray = None,
             no_generation: bool = False):
    """
    Initializes a MolPairDatapoint, which contains a molecule pair.

    :param drug_smiles: SMILES string of the first molecule (the drug).
    :param cmpd_smiles: SMILES string of the second molecule (the compound).
    :param targets: A list of target values for the pair.
    :param args: Arguments.
    :param drug_feats: A numpy array containing additional features for molecule 1 (ex. Morgan fingerprint).
    :param cmpd_feats: A numpy array containing additional features for molecule 2 (ex. Morgan fingerprint).
    :param context: A numpy array containing additional features defining context (ex. cell line). NOT SUPPORTED
    :param no_generation: Overrides args. If True, generation doesn't happen.
    """
    if args is not None:
        self.features_generator = args.features_generator
        self.args = args
    else:
        self.features_generator = self.args = None

    self.compound_name = None
    self.drug_smiles = drug_smiles  # str
    self.drug_mol = Chem.MolFromSmiles(self.drug_smiles)
    self.cmpd_smiles = cmpd_smiles  # str
    self.cmpd_mol = Chem.MolFromSmiles(self.cmpd_smiles)

    # Create targets
    self.targets = targets

    # Set features
    self.drug_feats = drug_feats
    self.cmpd_feats = cmpd_feats
    self.context = context

    # Generate additional features if given a generator
    if not no_generation and self.features_generator is not None:
        gen_drug_feats = []
        gen_cmpd_feats = []
        for fg in self.features_generator:
            fg_func = get_features_generator(fg)
            if self.drug_mol is not None and self.drug_mol.GetNumHeavyAtoms() > 0:
                gen_drug_feats.extend(fg_func(self.drug_mol))
            if self.cmpd_mol is not None and self.cmpd_mol.GetNumHeavyAtoms() > 0:
                gen_cmpd_feats.extend(fg_func(self.cmpd_mol))

        # Bug fix: the original used `if self.drug_feats:` / `if self.cmpd_feats:`,
        # which truth-tests a numpy array and raises ValueError whenever input
        # feature arrays with more than one element were provided.
        if self.drug_feats is not None:
            # Concatenate with given input feats if they already exist
            self.drug_feats = np.concatenate((self.drug_feats, gen_drug_feats))
        else:
            self.drug_feats = np.array(gen_drug_feats)
        if self.cmpd_feats is not None:
            self.cmpd_feats = np.concatenate((self.cmpd_feats, gen_cmpd_feats))
        else:
            self.cmpd_feats = np.array(gen_cmpd_feats)

    # Fix nans in features
    if self.drug_feats is not None:
        replace_token = 0
        self.drug_feats = np.where(np.isnan(self.drug_feats), replace_token,
                                   self.drug_feats)
    if self.cmpd_feats is not None:
        replace_token = 0
        self.cmpd_feats = np.where(np.isnan(self.cmpd_feats), replace_token,
                                   self.cmpd_feats)
    if self.context is not None:
        replace_token = 0
        self.context = np.where(np.isnan(self.context), replace_token,
                                self.context)
def __init__(self,
             line: List[str],
             args: Namespace = None,
             features: np.ndarray = None,
             use_compound_names: bool = False):
    """
    Initializes a MoleculeDatapoint, which contains a single molecule.

    :param line: A list of strings generated by separating a line in a data CSV file by comma.
    :param args: Arguments.
    :param features: A numpy array containing additional features (ex. Morgan fingerprint).
    :param use_compound_names: Whether the data CSV includes the compound name on each line.
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if args is None:
        self.features_generator = self.args = None
    else:
        self.features_generator = args.features_generator
        self.args = args

    if features is not None and self.features_generator is not None:
        raise ValueError(
            'Currently cannot provide both loaded features and a features generator.'
        )

    self.features = features

    if use_compound_names:
        self.compound_name = line[0]  # str
        line = line[1:]
    else:
        self.compound_name = None

    self.smiles = line[0]  # str
    self.mol = Chem.MolFromSmiles(self.smiles)

    # Generate additional features if given a generator
    if self.features_generator is not None:
        generated = []
        for fg in self.features_generator:
            generator = get_features_generator(fg)
            if self.mol is not None and self.mol.GetNumHeavyAtoms() > 0:
                generated.extend(generator(self.mol))
        self.features = np.array(generated)

    # Replace any nans in the features with zeros
    if self.features is not None:
        self.features = np.where(np.isnan(self.features), 0, self.features)

    # The target comes from column 1; an optional per-datapoint weight from column 2.
    self.targets = [float(x) if x != '' else None for x in [line[1]]]
    if len(line) > 2:
        self.weights = [float(x) if x != '' else None for x in [line[2]]]
    else:
        self.weights = [1.0]
def __init__(self, smiles: List[str],
             targets: List[Optional[float]] = None,
             row: OrderedDict = None,
             data_weight: float = None,
             gt_targets: List[bool] = None,
             lt_targets: List[bool] = None,
             features: np.ndarray = None,
             features_generator: List[str] = None,
             phase_features: List[float] = None,
             atom_features: np.ndarray = None,
             atom_descriptors: np.ndarray = None,
             bond_features: np.ndarray = None,
             overwrite_default_atom_features: bool = False,
             overwrite_default_bond_features: bool = False):
    """
    :param smiles: A list of the SMILES strings for the molecules.
    :param targets: A list of targets for the molecule (contains None for unknown target values).
    :param row: The raw CSV row containing the information for this molecule.
    :param data_weight: Weighting of the datapoint for the loss function.
    :param gt_targets: Indicates whether the targets are an inequality regression target of the form ">x".
    :param lt_targets: Indicates whether the targets are an inequality regression target of the form "<x".
    :param features: A numpy array containing additional features (e.g., Morgan fingerprint).
    :param features_generator: A list of features generators to use.
    :param phase_features: A one-hot vector indicating the phase of the data, as used in spectra data.
    :param atom_features: A numpy array containing additional atom features to featurize the molecule
    :param atom_descriptors: A numpy array containing additional atom descriptors to featurize the molecule
    :param bond_features: A numpy array containing additional bond features to featurize the molecule
    :param overwrite_default_atom_features: Boolean to overwrite default atom features by atom_features
    :param overwrite_default_bond_features: Boolean to overwrite default bond features by bond_features
    :raises ValueError: If both loaded features and a features generator are provided.
    """
    if features is not None and features_generator is not None:
        raise ValueError(
            'Cannot provide both loaded features and a features generator.')

    self.smiles = smiles
    self.targets = targets
    self.row = row
    self.features = features
    self.features_generator = features_generator
    self.phase_features = phase_features
    self.atom_descriptors = atom_descriptors
    self.atom_features = atom_features
    self.bond_features = bond_features
    self.overwrite_default_atom_features = overwrite_default_atom_features
    self.overwrite_default_bond_features = overwrite_default_bond_features
    # Global featurization settings are snapshotted at construction time
    self.is_reaction = is_reaction()
    self.is_explicit_h = is_explicit_h()
    self.is_adding_hs = is_adding_hs()

    # Only set when provided, so hasattr() checks elsewhere keep working
    if data_weight is not None:
        self.data_weight = data_weight
    if gt_targets is not None:
        self.gt_targets = gt_targets
    if lt_targets is not None:
        self.lt_targets = lt_targets

    # Generate additional features if given a generator
    if self.features_generator is not None:
        self.features = []

        for fg in self.features_generator:
            # Renamed from `features_generator` to avoid shadowing the parameter
            generator = get_features_generator(fg)

            def _zero_features():
                # Not all generators produce equally long vectors, so use
                # methane as a dummy molecule to determine the length for
                # an atom-less molecule (e.g. H2).
                return np.zeros(len(generator(Chem.MolFromSmiles('C'))))

            for m in self.mol:
                if not self.is_reaction:
                    if m is not None and m.GetNumHeavyAtoms() > 0:
                        self.features.extend(generator(m))
                    # for H2
                    elif m is not None and m.GetNumHeavyAtoms() == 0:
                        self.features.extend(_zero_features())
                else:
                    # Reaction: m is a (reactant, product) pair; featurize the reactant
                    if m[0] is not None and m[1] is not None and m[0].GetNumHeavyAtoms() > 0:
                        self.features.extend(generator(m[0]))
                    elif m[0] is not None and m[1] is not None and m[0].GetNumHeavyAtoms() == 0:
                        self.features.extend(_zero_features())

        self.features = np.array(self.features)

    # Fix nans in features
    replace_token = 0
    if self.features is not None:
        self.features = np.where(np.isnan(self.features), replace_token,
                                 self.features)

    # Fix nans in atom_descriptors
    if self.atom_descriptors is not None:
        self.atom_descriptors = np.where(np.isnan(self.atom_descriptors),
                                         replace_token,
                                         self.atom_descriptors)

    # Fix nans in atom_features
    if self.atom_features is not None:
        self.atom_features = np.where(np.isnan(self.atom_features),
                                      replace_token, self.atom_features)

    # Fix nans in bond_features
    if self.bond_features is not None:
        self.bond_features = np.where(np.isnan(self.bond_features),
                                      replace_token, self.bond_features)

    # Save a copy of the raw features and targets to enable different scaling later on
    self.raw_features, self.raw_targets = self.features, self.targets
    self.raw_atom_descriptors, self.raw_atom_features, self.raw_bond_features = \
        self.atom_descriptors, self.atom_features, self.bond_features
def predict_sklearn(args: SklearnPredictArgs) -> None:
    """
    Loads data and a trained scikit-learn model and uses the model to make predictions on the data.

    :param args: A :class:`~chemprop.args.SklearnPredictArgs` object containing arguments for
                 loading data, loading a trained scikit-learn model, and making predictions with the model.
    """
    print('Loading data')
    data = get_data(path=args.test_path,
                    smiles_columns=args.smiles_columns,
                    target_columns=[],
                    ignore_columns=[],
                    store_row=True)

    print('Loading training arguments')
    # NOTE(security): pickle.load executes arbitrary code from the checkpoint
    # file — only load checkpoints from trusted sources.
    with open(args.checkpoint_paths[0], 'rb') as f:
        model = pickle.load(f)
        train_args: SklearnTrainArgs = SklearnTrainArgs().from_dict(
            model.train_args, skip_unsettable=True)

    print('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for datapoint in tqdm(data, total=len(data)):
        for s in datapoint.smiles:
            datapoint.extend_features(
                morgan_fingerprint(mol=s,
                                   radius=train_args.radius,
                                   num_bits=train_args.num_bits))

    print(
        f'Predicting with an ensemble of {len(args.checkpoint_paths)} models')
    sum_preds = np.zeros((len(data), train_args.num_tasks))

    for checkpoint_path in tqdm(args.checkpoint_paths,
                                total=len(args.checkpoint_paths)):
        with open(checkpoint_path, 'rb') as f:
            model = pickle.load(f)

        model_preds = predict(model=model,
                              model_type=train_args.model_type,
                              dataset_type=train_args.dataset_type,
                              features=data.features())
        sum_preds += np.array(model_preds)

    # Ensemble predictions
    avg_preds = sum_preds / len(args.checkpoint_paths)
    avg_preds = avg_preds.tolist()

    print(f'Saving predictions to {args.preds_path}')
    # assert len(data) == len(avg_preds) #TODO: address with unit test later
    makedirs(args.preds_path, isfile=True)

    # Copy predictions over to data
    for datapoint, preds in zip(data, avg_preds):
        for pred_name, pred in zip(train_args.task_names, preds):
            datapoint.row[pred_name] = pred

    # Save
    # newline='' is required by the csv module to avoid blank rows on Windows
    with open(args.preds_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=data[0].row.keys())
        writer.writeheader()

        for datapoint in data:
            writer.writerow(datapoint.row)
def run_sklearn(args: SklearnTrainArgs, logger: Logger = None) -> List[float]:
    """
    Trains a scikit-learn model (random forest or SVM) on molecular data and
    evaluates it on a held-out test split.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object with training arguments.
    :param logger: An optional logger; falls back to print when absent.
    :return: A list of test scores, one per task.
    :raises ValueError: For unsupported model/dataset types, or multi-task data with SVM.
    """
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    debug(pformat(vars(args)))

    scorer = get_metric_func(args.metric)

    debug('Loading data')
    data = get_data(path=args.data_path,
                    smiles_column=args.smiles_column,
                    target_columns=args.target_columns)
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # SVMs here are single-output; reject multi-task data up front
    if args.model_type == 'svm' and data.num_tasks() != 1:
        raise ValueError(
            f'SVM can only handle single-task data but found {data.num_tasks()} tasks'
        )

    debug(f'Splitting data with seed {args.seed}')
    # Need to have val set so that train and test sets are the same as when doing MPN
    train_data, _, test_data = split_data(data=data,
                                          split_type=args.split_type,
                                          seed=args.seed,
                                          sizes=args.split_sizes,
                                          args=args)

    debug(
        f'Total size = {len(data):,} | train size = {len(train_data):,} | test size = {len(test_data):,}'
    )

    debug('Computing morgan fingerprints')
    morgan_fingerprint = get_features_generator('morgan')
    for split in (train_data, test_data):
        for dp in tqdm(split, total=len(split)):
            dp.set_features(
                morgan_fingerprint(mol=dp.smiles,
                                   radius=args.radius,
                                   num_bits=args.num_bits))

    debug('Building model')
    # Dispatch table of model constructors per dataset type
    if args.dataset_type == 'regression':
        builders = {
            'random_forest':
            lambda: RandomForestRegressor(n_estimators=args.num_trees,
                                          n_jobs=-1),
            'svm':
            SVR,
        }
    elif args.dataset_type == 'classification':
        builders = {
            'random_forest':
            lambda: RandomForestClassifier(n_estimators=args.num_trees,
                                           n_jobs=-1,
                                           class_weight=args.class_weight),
            'svm':
            SVC,
        }
    else:
        raise ValueError(f'Dataset type "{args.dataset_type}" not supported')

    if args.model_type not in builders:
        raise ValueError(f'Model type "{args.model_type}" not supported')
    model = builders[args.model_type]()

    debug(model)

    # Stash the training configuration on the model so it travels with the pickle
    model.train_args = args.as_dict()

    debug('Training')
    trainer = single_task_sklearn if args.single_task else multi_task_sklearn
    scores = trainer(model=model,
                     train_data=train_data,
                     test_data=test_data,
                     metric_func=scorer,
                     args=args,
                     logger=logger)

    info(f'Test {args.metric} = {np.nanmean(scores)}')

    return scores