Example #1
 def test_wrong_num_mol(self):
     """Test that error is raised when wrong number of molecules provided"""
     with self.assertRaises(ValueError):
         smiles_columns = preprocess_smiles_columns(
             path='dummy_path.txt',
             smiles_columns=['column3'],
             number_of_molecules=2,
         )
Example #2
 def test_smiles_not_in_file(self):
     """Test that error is raised whgen the provided smiles columns are not in the file"""
     with self.assertRaises(ValueError):
         smiles_columns = preprocess_smiles_columns(
             path='dummy_path.txt',
             smiles_columns=['column3', 'not_in_file'],
             number_of_molecules=2,
         )
Example #3
 def test_input_not_list(self):
     """Test case where smiles_columns provided are not a list"""
     smiles_columns = preprocess_smiles_columns(
         path='dummy_path.txt',
         smiles_columns='column3',
         number_of_molecules=1,
     )
     self.assertEqual(smiles_columns, ['column3'])
Example #4
 def test_out_of_order_smiles(self):
     """Test specified smiles columns provided in a different order"""
     smiles_columns = preprocess_smiles_columns(
         path='dummy_path.txt',
         smiles_columns=['column3', 'column2'],
         number_of_molecules=2,
     )
     self.assertEqual(smiles_columns, ['column3', 'column2'])
Example #5
 def test_specified_smiles(self):
     """Test specified smiles column"""
     smiles_columns = preprocess_smiles_columns(
         path='dummy_path.txt',
         smiles_columns=['column3'],
         number_of_molecules=1,
     )
     self.assertEqual(smiles_columns, ['column3'])
Example #6
 def test_2mol(self):
     """Test 2 molecule case"""
     smiles_columns = preprocess_smiles_columns(
         path='dummy_path.txt',
         smiles_columns=None,
         number_of_molecules=2,
     )
     self.assertEqual(smiles_columns, ['column0', 'column1'])
Example #7
 def test_basecase_1mol(self):
     """Test base case input with 1 molecule"""
     smiles_columns = preprocess_smiles_columns(
         path='dummy_path.txt',
         smiles_columns=None,
         number_of_molecules=1,
     )
     self.assertEqual(smiles_columns, ['column0'])
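
The seven test methods above all point at a file, dummy_path.txt, that never exists on disk, so they only run inside a TestCase that stubs out the filesystem lookups. Below is a minimal harness sketch; the patch targets chemprop.data.utils.get_header and os.path.isfile are assumptions about how preprocess_smiles_columns resolves the header, so adjust them to the real module paths.

import unittest
from unittest import mock

from chemprop.data.utils import preprocess_smiles_columns


class TestPreprocessSmilesColumns(unittest.TestCase):
    """Harness for the test methods in Examples #1-#7."""

    def setUp(self):
        # Assumption: preprocess_smiles_columns reads the CSV header via a
        # get_header() helper in the same module. Stub it with four columns.
        self.header_patch = mock.patch(
            "chemprop.data.utils.get_header",
            return_value=["column0", "column1", "column2", "column3"],
        )
        self.header_patch.start()
        self.addCleanup(self.header_patch.stop)

        # Pretend dummy_path.txt exists so any path check passes.
        self.isfile_patch = mock.patch("os.path.isfile", return_value=True)
        self.isfile_patch.start()
        self.addCleanup(self.isfile_patch.stop)

    # ... paste the test methods from Examples #1-#7 here ...


if __name__ == "__main__":
    unittest.main()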
Example #8
import csv
import logging
import os
import pickle
from typing import List

from tqdm import tqdm

# The chemprop imports below assume the v1.x package layout.
from chemprop.data import MoleculeDataset
from chemprop.data.utils import get_task_names, preprocess_smiles_columns
from chemprop.utils import makedirs


def save_smiles_splits(
    data_path: str,
    save_dir: str,
    task_names: List[str] = None,
    features_path: List[str] = None,
    train_data: MoleculeDataset = None,
    val_data: MoleculeDataset = None,
    test_data: MoleculeDataset = None,
    logger: logging.Logger = None,
    smiles_columns: List[str] = None,
) -> None:
    """
    Saves CSV files with the train/val/test splits of target data and additional features.
    Also saves the indices of the train/val/test split as a pickle file. The pickle file cannot be
    generated for data with repeated SMILES entries, or for entries loaded from a path other than
    the main data path, such as a separate test path.

    :param data_path: Path to data CSV file.
    :param save_dir: Path where pickle files will be saved.
    :param task_names: List of target names for the model as from the function get_task_names().
        If not provided, will use datafile header entries.
    :param features_path: List of path(s) to files with additional molecule features.
    :param train_data: Train :class:`~chemprop.data.data.MoleculeDataset`.
    :param val_data: Validation :class:`~chemprop.data.data.MoleculeDataset`.
    :param test_data: Test :class:`~chemprop.data.data.MoleculeDataset`.
    :param logger: A logger for recording output.
    :param smiles_columns: The names of the columns containing SMILES. By default, uses the first column.
    """
    makedirs(save_dir)

    info = logger.info if logger is not None else print
    save_split_indices = True

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(
            path=data_path, smiles_columns=smiles_columns)

    with open(data_path) as f:
        reader = csv.DictReader(f)

        indices_by_smiles = {}
        for i, row in enumerate(tqdm(reader)):
            smiles = tuple([row[column] for column in smiles_columns])
            if smiles in indices_by_smiles:
                save_split_indices = False
                info(
                    "Warning: Repeated SMILES found in data, pickle file of split indices cannot distinguish entries and will not be generated."
                )
                break
            indices_by_smiles[smiles] = i

    if task_names is None:
        task_names = get_task_names(path=data_path,
                                    smiles_columns=smiles_columns)

    features_header = []
    if features_path is not None:
        for feat_path in features_path:
            with open(feat_path, "r") as f:
                reader = csv.reader(f)
                feat_header = next(reader)
                features_header.extend(feat_header)

    all_split_indices = []
    for dataset, name in [(train_data, "train"), (val_data, "val"),
                          (test_data, "test")]:
        if dataset is None:
            continue

        with open(os.path.join(save_dir, f"{name}_smiles.csv"), "w") as f:
            writer = csv.writer(f)
            if smiles_columns[0] == "":
                writer.writerow(["smiles"])
            else:
                writer.writerow(smiles_columns)
            for smiles in dataset.smiles():
                writer.writerow(smiles)

        with open(os.path.join(save_dir, f"{name}_full.csv"), "w") as f:
            writer = csv.writer(f)
            writer.writerow(smiles_columns + task_names)
            dataset_targets = dataset.targets()
            for i, smiles in enumerate(dataset.smiles()):
                writer.writerow(smiles + dataset_targets[i])

        if features_path is not None:
            dataset_features = dataset.features()
            with open(os.path.join(save_dir, f"{name}_features.csv"),
                      "w") as f:
                writer = csv.writer(f)
                writer.writerow(features_header)
                writer.writerows(dataset_features)

        if save_split_indices:
            split_indices = []
            for smiles in dataset.smiles():
                index = indices_by_smiles.get(tuple(smiles))
                if index is None:
                    save_split_indices = False
                    info(
                        f"Warning: SMILES string in {name} could not be found in data file, and "
                        "likely came from a secondary data file. The pickle file of split indices "
                        "can only indicate indices for a single file and will not be generated."
                    )
                    break
                split_indices.append(index)
            else:
                split_indices.sort()
                all_split_indices.append(split_indices)

        if name == "train":
            data_weights = dataset.data_weights()
            if any([w != 1 for w in data_weights]):
                with open(os.path.join(save_dir, f"{name}_weights.csv"),
                          "w") as f:
                    writer = csv.writer(f)
                    writer.writerow(["data weights"])
                    for weight in data_weights:
                        writer.writerow([weight])

    if save_split_indices:
        with open(os.path.join(save_dir, "split_indices.pckl"), "wb") as f:
            pickle.dump(all_split_indices, f)
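
A minimal usage sketch for the function above, assuming chemprop v1's MoleculeDatapoint(smiles=..., targets=...) constructor; data.csv and splits are hypothetical placeholders for the original data file and the output directory.

from chemprop.data import MoleculeDataset, MoleculeDatapoint

# Hypothetical splits; data.csv is assumed to be the CSV these rows came from.
train = MoleculeDataset([MoleculeDatapoint(smiles=["CCO"], targets=[1.0])])
val = MoleculeDataset([MoleculeDatapoint(smiles=["CCN"], targets=[0.0])])

save_smiles_splits(
    data_path="data.csv",
    save_dir="splits",
    train_data=train,
    val_data=val,
)

This writes train_smiles.csv, train_full.csv, val_smiles.csv, and val_full.csv into splits, and, if every SMILES maps back to a unique row of data.csv, split_indices.pckl as well.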
Example #9
def save_smiles_splits(data_path: str,
                       save_dir: str,
                       task_names: List[str] = None,
                       features_path: List[str] = None,
                       train_data: MoleculeDataset = None,
                       val_data: MoleculeDataset = None,
                       test_data: MoleculeDataset = None,
                       smiles_columns: List[str] = None) -> None:
    """
    Saves CSV files with the train/val/test splits of target data and additional features.
    Also saves the indices of the train/val/test split as a pickle file. The pickle file cannot be generated for data with repeated SMILES entries.

    :param data_path: Path to data CSV file.
    :param save_dir: Path where pickle files will be saved.
    :param task_names: List of target names for the model as from the function get_task_names().
        If not provided, will use datafile header entries.
    :param features_path: List of path(s) to files with additional molecule features.
    :param train_data: Train :class:`~chemprop.data.data.MoleculeDataset`.
    :param val_data: Validation :class:`~chemprop.data.data.MoleculeDataset`.
    :param test_data: Test :class:`~chemprop.data.data.MoleculeDataset`.
    :param smiles_columns: The names of the columns containing SMILES. By default, uses the first column.
    """
    makedirs(save_dir)

    if not isinstance(smiles_columns, list):
        smiles_columns = preprocess_smiles_columns(
            path=data_path, smiles_columns=smiles_columns)

    with open(data_path) as f:
        reader = csv.DictReader(f)

        indices_by_smiles = {}
        for i, row in enumerate(tqdm(reader)):
            smiles = tuple([row[column] for column in smiles_columns])
            indices_by_smiles[smiles] = i

    if task_names is None:
        task_names = get_task_names(path=data_path,
                                    smiles_columns=smiles_columns)

    features_header = []
    if features_path is not None:
        for feat_path in features_path:
            with open(feat_path, 'r') as f:
                reader = csv.reader(f)
                feat_header = next(reader)
                features_header.extend(feat_header)

    all_split_indices = []
    for dataset, name in [(train_data, 'train'), (val_data, 'val'),
                          (test_data, 'test')]:
        if dataset is None:
            continue

        with open(os.path.join(save_dir, f'{name}_smiles.csv'),
                  'w',
                  newline='') as f:
            writer = csv.writer(f)
            if smiles_columns[0] == '':
                writer.writerow(['smiles'])
            else:
                writer.writerow(smiles_columns)
            for smiles in dataset.smiles():
                writer.writerow(smiles)

        with open(os.path.join(save_dir, f'{name}_full.csv'), 'w',
                  newline='') as f:
            writer = csv.writer(f)
            writer.writerow(smiles_columns + task_names)
            dataset_targets = dataset.targets()
            for i, smiles in enumerate(dataset.smiles()):
                writer.writerow(smiles + dataset_targets[i])

        if features_path is not None:
            dataset_features = dataset.features()
            with open(os.path.join(save_dir, f'{name}_features.csv'),
                      'w',
                      newline='') as f:
                writer = csv.writer(f)
                writer.writerow(features_header)
                writer.writerows(dataset_features)

        split_indices = []
        for smiles in dataset.smiles():
            split_indices.append(indices_by_smiles.get(tuple(smiles)))
        # Sort the collected indices once, after the loop.
        split_indices.sort()
        all_split_indices.append(split_indices)

    with open(os.path.join(save_dir, 'split_indices.pckl'), 'wb') as f:
        pickle.dump(all_split_indices, f)
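
Reading the resulting pickle back is a single pickle.load. The sketch below assumes all three datasets were passed to the call above, so the loaded list holds the train, val, and test index lists in that order.

import os
import pickle

with open(os.path.join("splits", "split_indices.pckl"), "rb") as f:
    train_indices, val_indices, test_indices = pickle.load(f)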