def test_filter_cli(openff_methane: Molecule, runner):
    # Create an SDF file to filter.
    with stream_to_file("molecules.sdf") as writer:
        writer(Molecule.from_smiles("C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)[O-].[Na+]"))
        writer(Molecule.from_smiles("CCC(C)(C)C(F)(F)CCCCC(F)(F)C(C)(C)CC"))

    arguments = [
        "--input", "molecules.sdf", "--output", "filtered.sdf", "--strip-ions"
    ]

    result = runner.invoke(filter_cli, arguments)

    if result.exit_code != 0:
        raise result.exception

    assert os.path.isfile("filtered.sdf")

    filtered_molecules = [molecule for molecule in stream_from_file("filtered.sdf")]
    assert len(filtered_molecules) == 1

    filtered_molecule = filtered_molecules[0]

    assert (
        filtered_molecule.to_smiles(toolkit_registry=RDKitToolkitWrapper())
        == "[O-][c]1[c]([Cl])[c]([Cl])[c]([Cl])[c]([Cl])[c]1[Cl]"
    )
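# The ``runner`` fixture used by these tests is assumed to be a Click test
# runner that executes each test from a scratch directory, since the tests
# read and write SDF files at relative paths. A minimal sketch of such a
# fixture (the name and exact setup are assumptions, not the repository's
# actual conftest):
import pytest
from click.testing import CliRunner


@pytest.fixture
def runner(tmp_path, monkeypatch) -> CliRunner:
    # Run each test from an empty temporary directory so files such as
    # "molecules.sdf" do not leak between tests.
    monkeypatch.chdir(tmp_path)
    return CliRunner()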
def filter_cli(
    input_path: str,
    output_path: str,
    n_processes: int,
    strip_ions: bool,
):
    print(" - Filtering molecules")

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:
            with Pool(processes=n_processes) as pool:
                # Apply the filter to each molecule in parallel, writing out
                # only those molecules which pass.
                for molecule, should_include in tqdm(
                    pool.imap(
                        functools.partial(apply_filter, retain_largest=strip_ions),
                        stream_from_file(input_path),
                    ),
                ):
                    if not should_include:
                        continue

                    writer(molecule)
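# ``apply_filter`` is defined elsewhere in the package and is not shown here.
# The sketch below is an illustrative assumption about its shape, inferred
# from how it is called above: it takes a molecule, optionally strips
# counter-ions by keeping the largest covalent component, and returns the
# (possibly modified) molecule together with a keep/drop flag. The real
# implementation evidently applies further chemistry filters as well (the
# fluorocarbon in the test above is rejected by one of them, e.g. a cap on
# the number of rotatable bonds); the placeholder criterion here is not it.
from typing import Tuple

from openff.toolkit.topology import Molecule


def apply_filter(molecule: Molecule, retain_largest: bool) -> Tuple[Molecule, bool]:
    # Hypothetical sketch: split the SMILES on "." (a simplification that
    # assumes disconnected components) and keep the fragment with the most
    # atoms, i.e. strip counter-ions such as [Na+].
    fragments = molecule.to_smiles().split(".")

    if retain_largest and len(fragments) > 1:
        largest = max(
            fragments,
            key=lambda pattern: Molecule.from_smiles(
                pattern, allow_undefined_stereo=True
            ).n_atoms,
        )
        molecule = Molecule.from_smiles(largest, allow_undefined_stereo=True)

    # Placeholder keep/drop criterion; the real filter is stricter.
    return molecule, molecule.n_atoms > 1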
def test_enumerate_cli(openff_methane: Molecule, runner):
    # Create an SDF file to enumerate.
    buteneol = Molecule.from_smiles(r"C/C=C(/C)\O")

    with stream_to_file("molecules.sdf") as writer:
        writer(buteneol)
        writer(buteneol)

    arguments = ["--input", "molecules.sdf", "--output", "tautomers.sdf", "--tautomers"]

    result = runner.invoke(enumerate_cli, arguments)

    if result.exit_code != 0:
        raise result.exception

    assert os.path.isfile("tautomers.sdf")

    tautomers = [molecule for molecule in stream_from_file("tautomers.sdf")]
    assert len(tautomers) == 4

    assert {
        tautomer.to_smiles(
            explicit_hydrogens=False, toolkit_registry=RDKitToolkitWrapper()
        )
        for tautomer in tautomers
    } == {"C/C=C(/C)O", "C=C(O)CC", "CCC(C)=O", "CC=C(C)O"}
def enumerate_cli(
    input_path: str,
    output_path: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
    n_processes: int,
):
    from openff.toolkit.topology import Molecule

    print(
        f" - Enumerating"
        f"{' tautomers' if enumerate_tautomers else ''}"
        f"{'/' if enumerate_protomers and enumerate_tautomers else ''}"
        f"{' protomers' if enumerate_protomers else ''}"
    )

    unique_molecules = set()

    with capture_toolkit_warnings():
        with stream_to_file(output_path) as writer:
            with Pool(processes=n_processes) as pool:
                # Enumerate each input molecule in parallel, then de-duplicate
                # the results by fixed-hydrogen InChI key before writing.
                for smiles in tqdm(
                    pool.imap(
                        functools.partial(
                            _enumerate_tautomers,
                            enumerate_tautomers=enumerate_tautomers,
                            max_tautomers=max_tautomers,
                            enumerate_protomers=enumerate_protomers,
                            max_protomers=max_protomers,
                        ),
                        stream_from_file(input_path, as_smiles=True),
                    ),
                ):
                    for pattern in smiles:
                        molecule: Molecule = Molecule.from_smiles(
                            pattern, allow_undefined_stereo=True
                        )

                        inchi_key = molecule.to_inchikey(fixed_hydrogens=True)

                        if inchi_key in unique_molecules:
                            continue

                        writer(molecule)
                        unique_molecules.add(inchi_key)
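# ``_enumerate_tautomers`` is defined elsewhere; the sketch below is an
# assumption about its shape, inferred from the call above. It maps a SMILES
# pattern to the set of SMILES of the molecule itself plus any enumerated
# tautomers / protomers, using the OpenFF toolkit's enumeration methods:
from typing import Set

from openff.toolkit.topology import Molecule


def _enumerate_tautomers(
    smiles: str,
    enumerate_tautomers: bool,
    max_tautomers: int,
    enumerate_protomers: bool,
    max_protomers: int,
) -> Set[str]:
    molecule = Molecule.from_smiles(smiles, allow_undefined_stereo=True)

    # Always retain the input molecule itself.
    found = {molecule.to_smiles()}

    if enumerate_tautomers:
        found.update(
            tautomer.to_smiles()
            for tautomer in molecule.enumerate_tautomers(max_states=max_tautomers)
        )
    if enumerate_protomers:
        # ``Molecule.enumerate_protomers`` currently requires the OpenEye
        # toolkit to be installed and licensed.
        found.update(
            protomer.to_smiles()
            for protomer in molecule.enumerate_protomers(max_states=max_protomers)
        )

    return found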
def test_read_write_streams():
    molecules = [Molecule.from_smiles("C"), Molecule.from_smiles("CO")]

    with temporary_cd():
        with stream_to_file("molecules.sdf") as writer:
            for molecule in molecules:
                writer(molecule)

        loaded_molecules = [*stream_from_file("molecules.sdf")]

    assert len(molecules) == len(loaded_molecules)
    assert {molecule.to_smiles() for molecule in molecules} == {
        molecule.to_smiles() for molecule in loaded_molecules
    }
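# ``stream_to_file`` / ``stream_from_file`` are the package's SDF streaming
# helpers, used throughout the tests and CLI commands above. A minimal sketch
# of the pair, assuming RDKit handles the serialisation (the real
# implementations may differ in detail):
from contextlib import contextmanager

from openff.toolkit.topology import Molecule
from rdkit import Chem


@contextmanager
def stream_to_file(file_path: str):
    writer = Chem.SDWriter(file_path)

    try:
        # Yield a callable that appends one molecule per call to the SDF.
        yield lambda molecule: writer.write(molecule.to_rdkit())
    finally:
        writer.close()


def stream_from_file(file_path: str, as_smiles: bool = False):
    # Lazily yield molecules (or their SMILES) one record at a time.
    for rd_molecule in Chem.SDMolSupplier(file_path, removeHs=False):
        if rd_molecule is None:
            continue

        molecule = Molecule.from_rdkit(rd_molecule, allow_undefined_stereo=True)
        yield molecule.to_smiles() if as_smiles else molecule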
def label_cli(
    input_path: str,
    output_path: str,
    guess_stereo: bool,
    rms_cutoff: float,
    worker_type: str,
    n_workers: int,
    batch_size: int,
    lsf_memory: int,
    lsf_walltime: str,
    lsf_queue: str,
    lsf_env: str,
):
    from dask import distributed

    root_logger: logging.Logger = logging.getLogger("nagl")
    root_logger.setLevel(logging.INFO)

    root_handler = logging.StreamHandler()
    root_handler.setFormatter(logging.Formatter("%(message)s"))
    root_logger.addHandler(root_handler)

    _logger.info("Labeling molecules")

    with capture_toolkit_warnings():
        all_smiles = [
            smiles
            for smiles in tqdm(
                stream_from_file(input_path, as_smiles=True),
                desc="loading molecules",
                ncols=80,
            )
        ]

    unique_smiles = sorted({*all_smiles})

    if len(unique_smiles) != len(all_smiles):
        _logger.warning(
            f"{len(all_smiles) - len(unique_smiles)} duplicate molecules were ignored"
        )

    n_batches = int(math.ceil(len(unique_smiles) / batch_size))

    if n_workers < 0:
        n_workers = n_batches

    if n_workers > n_batches:
        _logger.warning(
            f"More workers were requested than there are batches to compute. Only "
            f"{n_batches} workers will be requested."
        )
        n_workers = n_batches

    # Set up dask to distribute the processing.
    if worker_type == "lsf":
        dask_cluster = setup_dask_lsf_cluster(
            n_workers, lsf_queue, lsf_memory, lsf_walltime, lsf_env
        )
    elif worker_type == "local":
        dask_cluster = setup_dask_local_cluster(n_workers)
    else:
        raise NotImplementedError(f"unsupported worker type: {worker_type}")

    _logger.info(
        f"{len(unique_smiles)} molecules will be labelled in {n_batches} batches "
        f"across {n_workers} workers\n"
    )

    dask_client = distributed.Client(dask_cluster)

    # Submit the tasks to be computed in chunked batches.
    def batch(iterable):
        n_iterables = len(iterable)

        for i in range(0, n_iterables, batch_size):
            yield iterable[i : min(i + batch_size, n_iterables)]

    futures = [
        dask_client.submit(
            functools.partial(
                label_molecules,
                guess_stereochemistry=guess_stereo,
                partial_charge_methods=["am1", "am1bcc"],
                bond_order_methods=["am1"],
                rms_cutoff=rms_cutoff,
            ),
            batched_molecules,
        )
        for batched_molecules in batch(unique_smiles)
    ]

    # Create a database to store the labelled molecules in and store general
    # provenance information.
    storage = MoleculeStore(output_path)
    storage.set_provenance(
        general_provenance={
            "date": datetime.now().strftime("%d-%m-%Y"),
        },
        software_provenance=get_labelling_software_provenance(),
    )

    # Save out the molecules as they are ready.
    error_file_path = output_path.replace(".sqlite", "-errors.log")

    with open(error_file_path, "w") as file:
        for future in tqdm(
            distributed.as_completed(futures, raise_errors=False),
            total=n_batches,
            desc="labelling molecules",
            ncols=80,
        ):
            for molecule_record, error in tqdm(
                future.result(),
                desc="storing batch",
                ncols=80,
            ):
                try:
                    with capture_toolkit_warnings():
                        if molecule_record is not None and error is None:
                            storage.store(molecule_record)
                except BaseException as e:
                    formatted_traceback = "".join(
                        traceback.format_exception(type(e), e, e.__traceback__)
                    )
                    error = f"Could not store record: {formatted_traceback}"

                if error is not None:
                    # Separate error entries in the log with a rule.
                    file.write("=" * 79 + "\n")
                    file.write(error + "\n")
                    file.flush()

                    continue

            future.release()

    if worker_type == "lsf":
        dask_cluster.scale(n=0)
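# ``setup_dask_lsf_cluster`` and ``setup_dask_local_cluster`` are defined
# elsewhere; the sketch below shows one plausible shape for them using
# ``distributed`` and ``dask_jobqueue``. The argument names, the GB memory
# unit, and the conda-activation convention for ``lsf_env`` are assumptions:
from dask import distributed


def setup_dask_local_cluster(n_workers: int) -> distributed.LocalCluster:
    # One single-threaded worker process per requested worker.
    return distributed.LocalCluster(
        n_workers=n_workers, threads_per_worker=1, processes=True
    )


def setup_dask_lsf_cluster(
    n_workers: int,
    queue: str,
    memory_gb: int,
    walltime: str,
    env: str,
):
    from dask_jobqueue import LSFCluster

    cluster = LSFCluster(
        queue=queue,
        cores=1,
        memory=f"{memory_gb}GB",
        walltime=walltime,
        # Activate the requested conda environment on each worker; older
        # dask-jobqueue versions spell this option ``env_extra``.
        job_script_prologue=[f"conda activate {env}"],
    )
    cluster.scale(n=n_workers)

    return cluster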