def test_multi_file_multi_mol(self, tmpdir):
    """Validate two multi-molecule input files in one CLI invocation.

    Expects five conformer files for molecule index 0, eight
    single-conformer molecules after it, and no error-molecule files.
    """
    with tmpdir.as_cwd():
        out_dir = '1-validate_and_assign'
        sdf_inputs = [
            get_data_file_path('input_five_confs_flexible.sdf'),
            get_data_file_path('input_eight_stereoisomers.sdf'),
        ]
        cli_args = ["preprocess", "validate", "-g", "BBB", "-o", out_dir]
        cli_args.extend(sdf_inputs)
        result = runner.invoke(cli, cli_args, catch_exceptions=False)

        written = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(out_dir, '*.sdf')))
        expected = [
            # The first input has 5 confs of the same molecule
            'BBB-00000-00.sdf',
            'BBB-00000-01.sdf',
            'BBB-00000-02.sdf',
            'BBB-00000-03.sdf',
            'BBB-00000-04.sdf',
            # Then there are 8 different stereoisomers with 1 conf each
            'BBB-00001-00.sdf',
            'BBB-00002-00.sdf',
            'BBB-00003-00.sdf',
            'BBB-00004-00.sdf',
            'BBB-00005-00.sdf',
            'BBB-00006-00.sdf',
            'BBB-00007-00.sdf',
            'BBB-00008-00.sdf',
        ]
        assert written == expected

        failed = [
            os.path.basename(path)
            for path in glob.glob(
                os.path.join(out_dir, 'error_mols', '*.sdf'))
        ]
        assert failed == []
def test_multi_file_multi_mol_duplicates(self, tmpdir):
    """Validate a one-stereoisomer file together with an eight-stereoisomer file.

    Expects eight single-conformer output molecules (indices 0-7) and
    exactly one SDF file in the ``error_mols`` folder.
    """
    with tmpdir.as_cwd():
        out_dir = '1-validate_and_assign'
        sdf_inputs = [
            get_data_file_path('input_one_stereoisomer.sdf'),
            get_data_file_path('input_eight_stereoisomers.sdf'),
        ]
        cli_args = ["preprocess", "validate", "-g", "BBB", "-o", out_dir]
        cli_args.extend(sdf_inputs)
        result = runner.invoke(cli, cli_args, catch_exceptions=False)

        written = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(out_dir, '*.sdf')))
        assert written == [
            'BBB-00000-00.sdf',
            'BBB-00001-00.sdf',
            'BBB-00002-00.sdf',
            'BBB-00003-00.sdf',
            'BBB-00004-00.sdf',
            'BBB-00005-00.sdf',
            'BBB-00006-00.sdf',
            'BBB-00007-00.sdf',
        ]

        failed_paths = glob.glob(
            os.path.join(out_dir, 'error_mols', '*.sdf'))
        assert len(failed_paths) == 1
def test_multiple_files(n_procs):
    """Build a coverage report from two molecules read from separate SDF files.

    Both molecules are expected to pass, with none reported as errors.
    """
    sdf_names = (
        '1-validate_and_assign_graphs_and_confs/BBB-00000-00.sdf',
        '1-validate_and_assign_graphs_and_confs/BBB-00001-00.sdf',
    )
    molecules = [
        Molecule.from_file(get_data_file_path(name), "sdf")
        for name in sdf_names
    ]

    coverage, passed, failed = generate_coverage_report(
        input_molecules=molecules,
        forcefield_name='openff_unconstrained-1.3.0.offxml',
        processors=n_procs)

    assert len(passed) == 2
    assert len(failed) == 0
    assert coverage["passed_unique_molecules"] == 2
    assert coverage["total_unique_molecules"] == 2
def test_cli_add_molecules_error(tmpdir):
    """Check that ``--add`` with a failing molecule leaves the passed results untouched.

    The added molecule must end up in ``error_mols``. The number of entries
    in the output folder, the passed count, the force field name and the
    parameter counts must not change, while the total unique molecule count
    must grow.
    """
    with tmpdir.as_cwd():
        report_dir = '3-coverage_report'
        input_dir = "1-validate_and_assign_graphs_and_confs"
        report_path = os.path.join(report_dir, "coverage_report.json")

        # Copy the input files into the working directory; a new molecule
        # is written into this copy further down.
        shutil.copytree(get_data_file_path(input_dir), input_dir)

        # First run: produce the baseline coverage report.
        first_run = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "-o", report_dir,
             input_dir],
            catch_exceptions=False)
        entries_before = len(os.listdir(report_dir))
        with open(report_path) as handle:
            baseline = json.load(handle)

        # Add one more molecule to the input directory.
        extra = Molecule.from_file(
            get_data_file_path("missing_valence_params.sdf"))
        extra.properties["group_name"] = "BBB"
        extra.properties["molecule_index"] = "99999"
        extra.to_file(os.path.join(input_dir, "BBB-99999-00.sdf"), "sdf")

        # Second run, this time with --add.
        second_run = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "--add", "-o",
             report_dir, input_dir],
            catch_exceptions=False)

        # No new entries may appear in the output folder ...
        entries_after = len(os.listdir(report_dir))
        assert entries_after == entries_before
        # ... and the new molecule must be in the error folder.
        error_sdfs = glob.glob(os.path.join(report_dir, "error_mols", "*.sdf"))
        assert len(error_sdfs) == 1

        with open(report_path) as handle:
            updated = json.load(handle)
        assert (updated.pop("total_unique_molecules")
                > baseline.pop("total_unique_molecules"))
        assert (updated.pop("passed_unique_molecules")
                == baseline.pop("passed_unique_molecules"))
        assert updated.pop("forcefield_name") == baseline.pop("forcefield_name")
        # Only the parameter counts remain; they must be identical.
        assert updated == baseline
def test_cli_add_no_molecules(tmpdir):
    """Check the CLI message and outputs when ``--add`` finds no new input files."""
    with tmpdir.as_cwd():
        report_dir = '3-coverage_report'
        input_folder = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs')

        # First run creates the coverage report.
        first_run = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "-o", report_dir,
             input_folder],
            catch_exceptions=False)
        entries_before = len(os.listdir(report_dir))

        # Second run on the same inputs, now with the add flag.
        second_run = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "-o", report_dir,
             "--add", input_folder],
            catch_exceptions=False)
        expected_output = f"No new files found in {input_folder}, the coverage report was not changed.\n"
        assert second_run.output == expected_output

        # The output folder must hold the same number of entries as before.
        entries_after = len(os.listdir(report_dir))
        assert entries_before == entries_after
def test_cli_move_all_confs(tmpdir):
    """Check that every input SDF file shows up in the coverage-report output.

    Assumes no molecule fails: the output SDF count must equal the input SDF
    count, the error folder must be empty, and the report must list five
    unique molecules as both passed and total.
    """
    with tmpdir.as_cwd():
        report_dir = '3-coverage_report'
        input_folder = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs')

        # Count the input SDF files (molecules and conformers).
        input_sdfs = glob.glob(os.path.join(input_folder, "*.sdf"))
        n_inputs = len(input_sdfs)

        result = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "-o", report_dir,
             input_folder],
            catch_exceptions=False)

        n_outputs = len(glob.glob(os.path.join(report_dir, "*.sdf")))
        assert n_inputs == n_outputs

        n_errors = len(
            glob.glob(os.path.join(report_dir, "error_mols", "*.sdf")))
        assert n_errors == 0

        # The report must contain the expected unique molecule counts.
        report_path = os.path.join(report_dir, "coverage_report.json")
        with open(report_path, "r") as handle:
            report = json.load(handle)
        assert report["passed_unique_molecules"] == 5
        assert report["total_unique_molecules"] == 5
def test_generate_conformers(tmpdir):
    """Run ``generate-conformers`` and check the conformer file count per molecule."""
    with tmpdir.as_cwd():
        input_dir = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs')
        output_dir = '2-generate_conformers'

        result = runner.invoke(
            cli,
            ["preprocess", "generate-conformers", "-o", output_dir, input_dir],
            catch_exceptions=False)

        def count_confs(pattern):
            """Count the output files matching ``pattern`` in ``output_dir``."""
            return len(glob.glob(os.path.join(output_dir, pattern)))

        # BBB-00000 starts with two conformers, so many more conformers
        # should be created.
        assert count_confs('BBB-00000-*.sdf') > 3

        # BBB-00001 starts with one conformer, so many more conformers
        # should be created.
        assert count_confs('BBB-00001-*.sdf') > 2

        # BBB-00002 starts with one conformer. It is rigid, so only one
        # conformer should be created.
        assert count_confs('BBB-00002-*.sdf') == 1

        # BBB-00003 starts with 12 conformers. All 12 should be present in
        # the output, since user conformers are NEVER deleted.
        assert count_confs('BBB-00003-*.sdf') == 12
def test_single_file_single_mol(self, tmpdir):
    """Validate one single-molecule file and inspect the written SDF tags.

    Expects exactly one output file, ``BBB-00000-00.sdf``, containing the
    group name, molecule index and conformer index tags, and no
    error-molecule files.
    """
    with tmpdir.as_cwd():
        test_dir = '1-validate_and_assign'
        input_mols = [get_data_file_path('input_single_mol_rigid.sdf')]
        input_mols = [
            os.path.abspath(input_mol) for input_mol in input_mols
        ]
        response = runner.invoke(cli, [
            "preprocess", "validate", "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)

        output_files = glob.glob(os.path.join(test_dir, '*.sdf'))
        output_files = [os.path.basename(fname) for fname in output_files]
        assert 'BBB-00000-00.sdf' in output_files
        assert len(output_files) == 1

        # Read the file inside a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(os.path.join(test_dir, 'BBB-00000-00.sdf')) as sdf_file:
            file_text = sdf_file.read()
        # NOTE(review): the three literals below are reproduced exactly as
        # they appear in the reviewed text, where whitespace may have been
        # collapsed; confirm their line breaks against the SDF tag layout.
        assert """ > <group_name> (1) BBB""" in file_text
        assert """ > <molecule_index> (1) 0""" in file_text
        assert """ > <conformer_index> (1) 0""" in file_text

        error_files = glob.glob(
            os.path.join(test_dir, 'error_mols', '*.sdf'))
        error_files = [os.path.basename(fname) for fname in error_files]
        assert error_files == []
def test_generate_conformers_add(tmpdir):
    """Check that ``generate-conformers --add`` adds conformers for a new molecule.

    After adding one very flexible molecule to the input directory, the
    output must contain ten more ``BBB-*.sdf`` files than before, including
    conformers ``00`` and ``09`` of the new molecule.
    """
    with tmpdir.as_cwd():
        input_dir = '1-validate_and_assign_graphs_and_confs'
        output_dir = '2-generate_conformers'
        # Make a copy of this directory, otherwise the original would be
        # contaminated when the new molecule is added for the test.
        shutil.copytree(get_data_file_path(input_dir), input_dir)

        first_run = runner.invoke(
            cli,
            ["preprocess", "generate-conformers", "-o", output_dir, input_dir],
            catch_exceptions=False)
        confs_before = [
            os.path.basename(path)
            for path in glob.glob(os.path.join(output_dir, 'BBB-*.sdf'))
        ]

        # Now add a new, ridiculously flexible molecule to the directory.
        flexible = Molecule.from_smiles('CCCCC[C@H](COCOC)COCCOCCCCCCC')
        flexible.generate_conformers()
        flexible.properties["group_name"] = "BBB"
        flexible.properties["molecule_index"] = "99999"
        flexible.to_file(os.path.join(input_dir, "BBB-99999-00.sdf"), "sdf")

        second_run = runner.invoke(
            cli,
            ["preprocess", "generate-conformers", "-o", output_dir, "--add",
             input_dir],
            catch_exceptions=False)
        confs_after = [
            os.path.basename(path)
            for path in glob.glob(os.path.join(output_dir, 'BBB-*.sdf'))
        ]

        assert 'BBB-99999-00.sdf' in confs_after
        assert 'BBB-99999-09.sdf' in confs_after
        assert len(confs_after) == len(confs_before) + 10
def test_single_file(n_procs):
    """Build a coverage report for a single molecule and expect it to pass.

    On failure the assertion message shows the second element of the first
    error entry when one exists.
    """
    molecules = Molecule.from_file(
        get_data_file_path(
            '1-validate_and_assign_graphs_and_confs/BBB-00000-00.sdf'),
        "sdf")
    coverage, success_mols, error_mols = generate_coverage_report(
        input_molecules=molecules,
        forcefield_name='openff_unconstrained-1.3.0.offxml',
        processors=n_procs)

    # Guard the message: indexing an empty ``error_mols`` would raise
    # IndexError and hide the assertion failure being reported.
    failure_detail = (error_mols[0][1] if error_mols
                      else "no error molecules were reported")
    assert len(success_mols) == 1, failure_detail
    assert len(error_mols) == 0
def test_single_file_multi_mol(self, tmpdir):
    """Validate one input file that holds several molecules and conformers.

    Expects one single-conformer molecule (index 0), one molecule with five
    conformers (index 1), and no error-molecule files.
    """
    with tmpdir.as_cwd():
        out_dir = '1-validate_and_assign'
        sdf_inputs = [
            get_data_file_path(
                'input_one_stereoisomer_and_multi_conf_flexible.sdf')
        ]
        cli_args = ["preprocess", "validate", "-g", "BBB", "-o", out_dir]
        cli_args.extend(sdf_inputs)
        result = runner.invoke(cli, cli_args, catch_exceptions=False)

        written = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(out_dir, '*.sdf')))
        assert written == [
            'BBB-00000-00.sdf',
            'BBB-00001-00.sdf',
            'BBB-00001-01.sdf',
            'BBB-00001-02.sdf',
            'BBB-00001-03.sdf',
            'BBB-00001-04.sdf',
        ]

        failed = [
            os.path.basename(path)
            for path in glob.glob(
                os.path.join(out_dir, 'error_mols', '*.sdf'))
        ]
        assert failed == []
def test_cli_error_mol(tmpdir):
    """Check that a molecule which fails the coverage report lands in ``error_mols``."""
    with tmpdir.as_cwd():
        report_dir = '3-coverage_report'
        input_folder = "2-generate_conformers"
        os.mkdir(input_folder)

        # Write the only input molecule into the freshly created folder.
        failing = Molecule.from_file(
            get_data_file_path('missing_valence_params.sdf'),
            "sdf",
            allow_undefined_stereo=True)
        failing.properties["group_name"] = "OFF"
        failing.properties["molecule_index"] = "00001"
        failing.to_file(os.path.join(input_folder, "OFF-00001-00.sdf"), "sdf")

        result = runner.invoke(
            cli,
            ["preprocess", "coverage-report", "-p", 1, "-o", report_dir,
             input_folder],
            catch_exceptions=False)

        passed_sdfs = glob.glob(os.path.join(report_dir, "*.sdf"))
        assert len(passed_sdfs) == 0
        error_sdfs = glob.glob(os.path.join(report_dir, "error_mols", "*.sdf"))
        assert len(error_sdfs) == 1

        # The report must count one molecule in total and none as passed.
        report_path = os.path.join(report_dir, "coverage_report.json")
        with open(report_path, "r") as handle:
            report = json.load(handle)
        assert report["passed_unique_molecules"] == 0
        assert report["total_unique_molecules"] == 1
def test_error_uncovered_antechamber_param():
    """Expect a sodium carbide molecule to fail with an antechamber command error."""
    sdf_path = get_data_file_path('sodium_carbide.sdf')
    molecule = Molecule.from_file(sdf_path, "sdf")

    coverage, passed, failed = generate_coverage_report(
        input_molecules=molecule,
        forcefield_name='openff_unconstrained-1.3.0.offxml')

    assert len(passed) == 0
    assert len(failed) == 1
    assert coverage["passed_unique_molecules"] == 0
    assert coverage["total_unique_molecules"] == 1
    # The second element of the error entry carries the failure details.
    failure_text = str(failed[0][1])
    assert "Command '['antechamber'" in failure_text
def test_error_missing_valence_param(n_procs):
    """Expect a molecule with missing valence parameters to fail with a BondHandler error."""
    sdf_path = get_data_file_path('missing_valence_params.sdf')
    molecules = Molecule.from_file(sdf_path, "sdf")

    coverage, passed, failed = generate_coverage_report(
        input_molecules=molecules,
        forcefield_name='openff_unconstrained-1.3.0.offxml',
        processors=n_procs)

    assert len(passed) == 0
    assert len(failed) == 1
    assert coverage["passed_unique_molecules"] == 0
    assert coverage["total_unique_molecules"] == 1
    # The second element of the error entry carries the failure details.
    failure_text = str(failed[0][1])
    assert "BondHandler was not able to find parameters" in failure_text
def test_add_and_delete_existing_error(tmpdir):
    """Expect an error when ``--add`` and ``--delete-existing`` are passed together."""
    with tmpdir.as_cwd():
        out_dir = '1-validate_and_assign'
        sdf_inputs = [get_data_file_path('input_one_stereoisomer.sdf')]
        cli_args = [
            "preprocess", "validate", "--add", "--delete-existing",
            "-g", "BBB", "-o", out_dir,
        ]
        cli_args.extend(sdf_inputs)
        # catch_exceptions=False is passed so the exception reaches
        # pytest.raises.
        with pytest.raises(Exception, match='Can not specify BOTH'):
            runner.invoke(cli, cli_args, catch_exceptions=False)
def test_do_overwrite_output_directory(tmpdir):
    """Run ``validate`` twice on one output folder, the second time with ``--delete-existing``.

    The test makes no assertions; it passes as long as neither invocation
    raises.
    """
    with tmpdir.as_cwd():
        out_dir = '1-validate_and_assign'
        sdf_inputs = [get_data_file_path('input_single_mol_rigid.sdf')]

        # First run populates the output directory.
        first_args = ["preprocess", "validate", "-g", "BBB", "-o", out_dir]
        first_args.extend(sdf_inputs)
        first_run = runner.invoke(cli, first_args, catch_exceptions=False)

        # Second run targets the same directory with --delete-existing.
        second_args = [
            "preprocess", "validate", "-g", "BBB", "-o", out_dir,
            "--delete-existing",
        ]
        second_args.extend(sdf_inputs)
        second_run = runner.invoke(cli, second_args, catch_exceptions=False)
def test_double_smirks():
    """Filter five molecules with two SMIRKS patterns at once."""
    mol_indices = (0, 1, 2, 3, 5)
    molecules = [
        Molecule.from_file(
            get_data_file_path(
                f'1-validate_and_assign_graphs_and_confs/BBB-0000{i}-00.sdf'
            ),
            "sdf")
        for i in mol_indices
    ]

    # The P filter should match one molecule and the F filter another one.
    result = smirks_filter(
        input_molecules=molecules,
        filtered_smirks=["[P:1]", "[F:1]"],
        processors=1)

    assert result.n_filtered == 2
    assert result.n_molecules == 3
def test_multi_file_single_mol_redundant_conf(self, tmpdir):
    """Validate a molecule together with a translated copy of it.

    Expects a single output file, one error SDF named ``error_mol_0.sdf``,
    and an error text mentioning a duplicate conformer.
    """
    with tmpdir.as_cwd():
        test_dir = '1-validate_and_assign'
        input_mols = [
            get_data_file_path('input_single_mol_rigid.sdf'),
            get_data_file_path('input_single_mol_rigid_translated.sdf')
        ]
        response = runner.invoke(cli, [
            "preprocess", "validate", "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)

        output_files = glob.glob(os.path.join(test_dir, '*.sdf'))
        output_files = [os.path.basename(fname) for fname in output_files]
        assert sorted(output_files) == ['BBB-00000-00.sdf']

        error_files = glob.glob(
            os.path.join(test_dir, 'error_mols', '*.sdf'))
        error_files = [os.path.basename(fname) for fname in error_files]
        assert error_files == ['error_mol_0.sdf']

        # Read each error text inside a context manager so that no file
        # handle is left open.
        error_txts = []
        for fname in glob.glob(
                os.path.join(test_dir, 'error_mols', '*.txt')):
            with open(fname) as error_file:
                error_txts.append(error_file.read())
        assert 'Duplicate molecule conformer input detected' in error_txts[0]
def test_cli_move_molecules(tmpdir):
    """Check that the SMIRKS filter CLI sorts molecules into passed and error folders."""
    with tmpdir.as_cwd():
        input_folder = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs')
        input_sdfs = glob.glob(os.path.join(input_folder, "*.sdf"))
        n_inputs = len(input_sdfs)
        filter_dir = '5-smirks_filter'

        result = runner.invoke(
            cli,
            ["filter", "smirks", input_folder, filter_dir, "-p", 1, "-s",
             "[P:1]"],
            catch_exceptions=False)

        # Only one molecule, which has 2 conformers, should be removed.
        n_passed = len(glob.glob(os.path.join(filter_dir, "*.sdf")))
        assert n_passed == n_inputs - 2

        n_errors = len(
            glob.glob(os.path.join(filter_dir, "error_mols", "*.sdf")))
        assert n_errors == 2
def test_dont_overwrite_output_directory(tmpdir):
    """Expect an error when ``generate-conformers`` is rerun on an existing output folder."""
    with tmpdir.as_cwd():
        # The output folder is nested under a directory named after this
        # test function.
        this_test = inspect.stack()[0].function
        input_dir = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs')
        output_dir = os.path.join(this_test, '2-generate_conformers')
        cli_args = [
            "preprocess", "generate-conformers", "-o", output_dir, input_dir
        ]

        # First run creates the output directory.
        first_run = runner.invoke(cli, cli_args, catch_exceptions=False)

        # An identical second run must raise, because --delete-existing
        # was not given.
        with pytest.raises(Exception,
                           match='Specify `--delete-existing` to remove'):
            second_run = runner.invoke(cli, cli_args, catch_exceptions=False)
def test_bad_macrocycle(tmpdir):
    """Expect exactly one output conformer file for the JAN-00203 macrocycle."""
    with tmpdir.as_cwd():
        input_dir = get_data_file_path(
            '1-validate_and_assign_graphs_and_confs_bad_macrocycle')
        output_dir = '2-generate_conformers'

        result = runner.invoke(
            cli,
            ["preprocess", "generate-conformers", "-o", output_dir, input_dir],
            catch_exceptions=False)

        # JAN_00203 has a macrocycle that RDKit generates bad conformers
        # for. These conformers have twisted double bonds and can't be
        # parsed by subsequent processing steps.
        macrocycle_confs = glob.glob(
            os.path.join(output_dir, 'JAN-00203-*.sdf'))
        assert len(macrocycle_confs) == 1
def test_add_doesnt_overwrite_error_mols(self, tmpdir):
    """
    Run add multiple times, such that error mols are generated by two
    separate invocations. Then, make sure that the error outputs don't
    overwrite each other
    """
    with tmpdir.as_cwd():
        test_dir = '1-validate_and_assign'
        input_mols = [get_data_file_path('input_single_mol_rigid.sdf')]

        # Initial run, followed by two --add runs with the same input.
        response = runner.invoke(cli, [
            "preprocess", "validate", "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)
        response = runner.invoke(cli, [
            "preprocess", "validate", '--add', "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)
        response = runner.invoke(cli, [
            "preprocess", "validate", '--add', "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)

        output_files = glob.glob(os.path.join(test_dir, '*.sdf'))
        output_files = [os.path.basename(fname) for fname in output_files]
        assert sorted(output_files) == ['BBB-00000-00.sdf']

        error_files = glob.glob(
            os.path.join(test_dir, 'error_mols', '*.sdf'))
        error_files = [os.path.basename(fname) for fname in error_files]
        assert sorted(error_files) == [
            'error_mol_0.sdf', 'error_mol_1.sdf'
        ]

        # Read each error text inside a context manager so that no file
        # handle is left open.
        error_txts = []
        for fname in sorted(
                glob.glob(os.path.join(test_dir, 'error_mols', '*.txt'))):
            with open(fname) as error_file:
                error_txts.append(error_file.read())
        assert ("Input molecule graph is already present in output"
                in error_txts[0])
        assert ("Input molecule graph is already present in output"
                in error_txts[1])
def test_add(self, tmpdir):
    """Validate one file, then ``--add`` a partially overlapping second file.

    After each run the test checks the output SDF files, the error files,
    and the rows of ``name_assignments.csv``.
    """
    with tmpdir.as_cwd():
        test_dir = '1-validate_and_assign'
        input_mols = [get_data_file_path('input_one_stereoisomer.sdf')]
        response = runner.invoke(cli, [
            "preprocess", "validate", "-g", "BBB", "-o", test_dir,
            *input_mols
        ], catch_exceptions=False)

        output_files = glob.glob(os.path.join(test_dir, '*.sdf'))
        output_files = [os.path.basename(fname) for fname in output_files]
        assert sorted(output_files) == ['BBB-00000-00.sdf']

        error_files = glob.glob(
            os.path.join(test_dir, 'error_mols', '*.sdf'))
        error_files = [os.path.basename(fname) for fname in error_files]
        assert error_files == []

        # Test that output names were correctly assigned. Every cell is
        # reduced to its basename before the comparison.
        output_name_assignments = []
        with open(os.path.join(test_dir, 'name_assignments.csv')) as of:
            csv_reader = csv.reader(of)
            for row in csv_reader:
                new_row = [os.path.basename(i) for i in row]
                output_name_assignments.append(new_row)
        assert output_name_assignments == [
            ['orig_name', 'orig_file', 'orig_file_index', 'out_file_name'],
            [
                'PDB_DB00136_99', 'input_one_stereoisomer.sdf', '0',
                'BBB-00000-00'
            ],
        ]

        # Run again, with a partially overlapping set of molecules
        input_mols2 = [get_data_file_path('input_eight_stereoisomers.sdf')]
        response = runner.invoke(cli, [
            "preprocess", "validate", "--add", "-g", "BBB", "-o", test_dir,
            *input_mols2
        ], catch_exceptions=False)

        output_files = glob.glob(os.path.join(test_dir, '*.sdf'))
        output_files = [os.path.basename(fname) for fname in output_files]
        assert sorted(output_files) == [
            'BBB-00000-00.sdf',
            'BBB-00001-00.sdf',
            'BBB-00002-00.sdf',
            'BBB-00003-00.sdf',
            'BBB-00004-00.sdf',
            'BBB-00005-00.sdf',
            'BBB-00006-00.sdf',
            'BBB-00007-00.sdf',
        ]

        error_files = glob.glob(
            os.path.join(test_dir, 'error_mols', '*.sdf'))
        error_files = [os.path.basename(fname) for fname in error_files]
        assert error_files == ['error_mol_0.sdf']

        # Read each error text inside a context manager so that no file
        # handle is left open.
        error_txts = []
        for fname in glob.glob(
                os.path.join(test_dir, 'error_mols', '*.txt')):
            with open(fname) as error_file:
                error_txts.append(error_file.read())
        assert ("Input molecule graph is already present in output"
                in error_txts[0])

        # Test that output names were correctly assigned
        output_name_assignments = []
        with open(os.path.join(test_dir, 'name_assignments.csv')) as of:
            csv_reader = csv.reader(of)
            for row in csv_reader:
                new_row = [os.path.basename(i) for i in row]
                output_name_assignments.append(new_row)
        assert output_name_assignments == [
            ['orig_name', 'orig_file', 'orig_file_index', 'out_file_name'],
            [
                'PDB_DB00136_99', 'input_one_stereoisomer.sdf', '0',
                'BBB-00000-00'
            ],
            [
                'PDB_DB00136_01', 'input_eight_stereoisomers.sdf', '1',
                'BBB-00001-00'
            ],
            [
                'PDB_DB00136_02', 'input_eight_stereoisomers.sdf', '2',
                'BBB-00002-00'
            ],
            [
                'PDB_DB00136_03', 'input_eight_stereoisomers.sdf', '3',
                'BBB-00003-00'
            ],
            [
                'PDB_DB00136_04', 'input_eight_stereoisomers.sdf', '4',
                'BBB-00004-00'
            ],
            [
                'PDB_DB00136_05', 'input_eight_stereoisomers.sdf', '5',
                'BBB-00005-00'
            ],
            [
                'PDB_DB00136_06', 'input_eight_stereoisomers.sdf', '6',
                'BBB-00006-00'
            ],
            [
                'PDB_DB00136_07', 'input_eight_stereoisomers.sdf', '7',
                'BBB-00007-00'
            ],
        ]