def test_wrong_num_mol(self):
    """Test that an error is raised when the wrong number of molecules is provided."""
    with self.assertRaises(ValueError):
        preprocess_smiles_columns(
            path='dummy_path.txt',
            smiles_columns=['column3'],
            number_of_molecules=2,
        )
def test_smiles_not_in_file(self):
    """Test that an error is raised when the provided SMILES columns are not in the file."""
    with self.assertRaises(ValueError):
        preprocess_smiles_columns(
            path='dummy_path.txt',
            smiles_columns=['column3', 'not_in_file'],
            number_of_molecules=2,
        )
def test_input_not_list(self):
    """Test the case where smiles_columns is a single string rather than a list."""
    smiles_columns = preprocess_smiles_columns(
        path='dummy_path.txt',
        smiles_columns='column3',
        number_of_molecules=1,
    )
    self.assertEqual(smiles_columns, ['column3'])
def test_out_of_order_smiles(self):
    """Test that SMILES columns specified in a non-default order are preserved."""
    smiles_columns = preprocess_smiles_columns(
        path='dummy_path.txt',
        smiles_columns=['column3', 'column2'],
        number_of_molecules=2,
    )
    self.assertEqual(smiles_columns, ['column3', 'column2'])
def test_specified_smiles(self):
    """Test a specified SMILES column."""
    smiles_columns = preprocess_smiles_columns(
        path='dummy_path.txt',
        smiles_columns=['column3'],
        number_of_molecules=1,
    )
    self.assertEqual(smiles_columns, ['column3'])
def test_2mol(self):
    """Test the 2-molecule case."""
    smiles_columns = preprocess_smiles_columns(
        path='dummy_path.txt',
        smiles_columns=None,
        number_of_molecules=2,
    )
    self.assertEqual(smiles_columns, ['column0', 'column1'])
def test_basecase_1mol(self):
    """Test the base case input with 1 molecule."""
    smiles_columns = preprocess_smiles_columns(
        path='dummy_path.txt',
        smiles_columns=None,
        number_of_molecules=1,
    )
    self.assertEqual(smiles_columns, ['column0'])
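# The tests above pin down the contract of preprocess_smiles_columns. The sketch
# below is a hypothetical reference implementation satisfying that contract (the
# name preprocess_smiles_columns_sketch is ours, not chemprop's), assuming the
# data file's header row supplies the column names; chemprop's actual
# implementation handles additional cases such as headerless files.
import csv
from typing import List, Optional, Union


def preprocess_smiles_columns_sketch(path: str,
                                     smiles_columns: Optional[Union[str, List[str]]] = None,
                                     number_of_molecules: int = 1) -> List[str]:
    with open(path) as f:
        header = next(csv.reader(f))  # column names from the CSV header row

    if smiles_columns is None:
        # Default: the first number_of_molecules columns hold the SMILES.
        return header[:number_of_molecules]

    if not isinstance(smiles_columns, list):
        # A bare string is wrapped into a single-element list.
        smiles_columns = [smiles_columns]

    if len(smiles_columns) != number_of_molecules:
        raise ValueError('Length of smiles_columns must match number_of_molecules.')
    if any(column not in header for column in smiles_columns):
        raise ValueError('Provided smiles_columns were not found in the data file.')

    # Columns are returned in the caller-specified order.
    return smiles_columns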
# Imports assumed for save_smiles_splits (module paths follow the chemprop v1.x
# layout; adjust to your version of the package).
import csv
import logging
import os
import pickle
from typing import List

from tqdm import tqdm

from chemprop.data import MoleculeDataset
from chemprop.data.utils import get_task_names, preprocess_smiles_columns
from chemprop.utils import makedirs


def save_smiles_splits(
    data_path: str,
    save_dir: str,
    task_names: List[str] = None,
    features_path: List[str] = None,
    train_data: MoleculeDataset = None,
    val_data: MoleculeDataset = None,
    test_data: MoleculeDataset = None,
    logger: logging.Logger = None,
    smiles_columns: List[str] = None,
) -> None:
    """
    Saves a CSV file with the train/val/test splits of the target data and additional features.
    Also saves the indices of the train/val/test split as a pickle file. The pickle file does
    not support repeated entries with the same SMILES, or entries drawn from a path other than
    the main data path, such as a separate test path.

    :param data_path: Path to data CSV file.
    :param save_dir: Path where pickle files will be saved.
    :param task_names: List of target names for the model as from the function
                       get_task_names(). If not provided, will use the data file header entries.
    :param features_path: List of path(s) to files with additional molecule features.
    :param train_data: Train :class:`~chemprop.data.data.MoleculeDataset`.
    :param val_data: Validation :class:`~chemprop.data.data.MoleculeDataset`.
    :param test_data: Test :class:`~chemprop.data.data.MoleculeDataset`.
    :param logger: A logger for recording output.
    :param smiles_columns: The names of the columns containing SMILES. By default, uses the
                           first column.
    """
    makedirs(save_dir)
    info = logger.info if logger is not None else print
    save_split_indices = True

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(path=data_path, smiles_columns=smiles_columns)

    # Map each SMILES tuple to its row index in the original data file. Repeated
    # SMILES make the mapping ambiguous, so split indices are not saved in that case.
    with open(data_path) as f:
        reader = csv.DictReader(f)

        indices_by_smiles = {}
        for i, row in enumerate(tqdm(reader)):
            smiles = tuple([row[column] for column in smiles_columns])
            if smiles in indices_by_smiles:
                save_split_indices = False
                info(
                    "Warning: Repeated SMILES found in data, pickle file of split indices "
                    "cannot distinguish entries and will not be generated."
                )
                break
            indices_by_smiles[smiles] = i

    if task_names is None:
        task_names = get_task_names(path=data_path, smiles_columns=smiles_columns)

    # Collect the combined header of all additional-features files.
    features_header = []
    if features_path is not None:
        for feat_path in features_path:
            with open(feat_path, "r") as f:
                reader = csv.reader(f)
                feat_header = next(reader)
                features_header.extend(feat_header)

    all_split_indices = []
    for dataset, name in [(train_data, "train"), (val_data, "val"), (test_data, "test")]:
        if dataset is None:
            continue

        # SMILES-only CSV for this split.
        with open(os.path.join(save_dir, f"{name}_smiles.csv"), "w") as f:
            writer = csv.writer(f)
            if smiles_columns[0] == "":
                writer.writerow(["smiles"])
            else:
                writer.writerow(smiles_columns)
            for smiles in dataset.smiles():
                writer.writerow(smiles)

        # Full CSV (SMILES plus targets) for this split.
        with open(os.path.join(save_dir, f"{name}_full.csv"), "w") as f:
            writer = csv.writer(f)
            writer.writerow(smiles_columns + task_names)
            dataset_targets = dataset.targets()
            for i, smiles in enumerate(dataset.smiles()):
                writer.writerow(smiles + dataset_targets[i])

        if features_path is not None:
            dataset_features = dataset.features()
            with open(os.path.join(save_dir, f"{name}_features.csv"), "w") as f:
                writer = csv.writer(f)
                writer.writerow(features_header)
                writer.writerows(dataset_features)

        if save_split_indices:
            split_indices = []
            for smiles in dataset.smiles():
                index = indices_by_smiles.get(tuple(smiles))
                if index is None:
                    save_split_indices = False
                    info(
                        f"Warning: SMILES string in {name} could not be found in data file, and "
                        "likely came from a secondary data file. The pickle file of split indices "
                        "can only indicate indices for a single file and will not be generated."
                    )
                    break
                split_indices.append(index)
            else:
                # The else clause runs only when the loop finishes without a break,
                # i.e. every SMILES in this split was found in the main data file.
                split_indices.sort()
                all_split_indices.append(split_indices)

        if name == "train":
            data_weights = dataset.data_weights()
            if any(w != 1 for w in data_weights):
                with open(os.path.join(save_dir, f"{name}_weights.csv"), "w") as f:
                    writer = csv.writer(f)
                    writer.writerow(["data weights"])
                    for weight in data_weights:
                        writer.writerow([weight])

    if save_split_indices:
        with open(os.path.join(save_dir, "split_indices.pckl"), "wb") as f:
            pickle.dump(all_split_indices, f)
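# Minimal usage sketch for save_smiles_splits. The paths and column name are
# illustrative placeholders; split_data here refers to chemprop.data.utils.split_data,
# though any source of train/val/test MoleculeDatasets works:
#
#     train_data, val_data, test_data = split_data(data)
#     save_smiles_splits(
#         data_path='data.csv',
#         save_dir='splits',
#         train_data=train_data,
#         val_data=val_data,
#         test_data=test_data,
#         smiles_columns=['smiles'],
#     )
#
# The saved split_indices.pckl holds one sorted list of row indices per split
# and can be read back with pickle:
#
#     with open('splits/split_indices.pckl', 'rb') as f:
#         train_idx, val_idx, test_idx = pickle.load(f)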