def testB1BondGenMol(self):
    """Compare RDKit's fragment count with the adjacency-matrix analysis.

    Runs a seeded KMC simulation from five starting monomers, checks that
    ``generate_mol`` writes nothing to stderr, and asserts that the number
    of fragments RDKit finds equals the sum of the ``CHAIN_LEN`` values.
    """
    seed = 55
    monomer_cap = 12
    sg_ratio = 1.0
    starting_types = [S, S, S, G, S]

    initial_monomers = []
    for idx, mono_type in enumerate(starting_types):
        initial_monomers.append(Monomer(mono_type, idx))

    initial_events = create_initial_events(initial_monomers, DEF_RXN_RATES)
    initial_events.append(Event(GROW, [], rate=1e4))
    initial_state = create_initial_state(initial_events, initial_monomers)

    result = run_kmc(DEF_RXN_RATES, initial_state, initial_events,
                     n_max=monomer_cap, t_max=2, random_seed=seed,
                     sg_ratio=sg_ratio)
    nodes = result[MONO_LIST]
    adj = result[ADJ_MATRIX]

    # Molecule generation is expected to be silent on stderr
    with capture_stderr(generate_mol, adj, nodes) as output:
        self.assertFalse(output)

    fragments = GetMolFrags(MolFromMolBlock(generate_mol(adj, nodes)))
    chain_lengths = analyze_adj_matrix(adj)[CHAIN_LEN]

    # Make sure there are the same number of separate fragments calculated by RDKIT
    # as we get from just separating the alternate B1
    expected_fragments = np.sum(list(chain_lengths.values()))
    self.assertEqual(expected_fragments, len(fragments))
def test_embed_r_groups__ROR(bax_mol):
    """Check ``embed_r_groups`` on an R-O-R fragment.

    Both R atoms start at the origin in the input mol block; after
    ``embed_r_groups(fragment, bax_mol)`` the fragment must serialise to
    exactly the ``expected`` mol block, in which the R atoms carry non-zero
    coordinates.
    """
    # NOTE(review): V2000 mol blocks are line- and column-sensitive; the literals
    # below are kept exactly as found -- confirm their layout before relying on them.
    fragment = MolFromMolBlock(''' RDKit 3D 3 2 0 0 0 0 0 0 0 0999 V2000 0.0000 0.0000 0.0000 R 0 0 0 0 0 1 0 0 0 0 0 0 4.4640 1.0880 19.5620 O 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 0.0000 0.0000 R 0 0 0 0 0 1 0 0 0 0 0 0 1 2 1 0 3 2 1 0 M END ''')
    # Mutates `fragment` in place (its return value is not used here)
    embed_r_groups(fragment, bax_mol)
    expected = ''' RDKit 3D 3 2 0 0 0 0 0 0 0 0999 V2000 3.7070 1.4910 20.6340 R 0 0 0 0 0 1 0 0 0 0 0 0 4.4640 1.0880 19.5620 O 0 0 0 0 0 0 0 0 0 0 0 0 4.1550 0.2360 18.5580 R 0 0 0 0 0 1 0 0 0 0 0 0 1 2 1 0 3 2 1 0 M END '''
    assert MolToMolBlock(fragment) == expected
def protonate_molecule(mol_in: Mol, ph=7.4) -> Mol:
    """Add hydrogens to a molecule by round-tripping it through Open Babel.

    The molecule is written to a mol block, hydrogens are added with
    ``OBMol.AddHydrogens(False, True, ph)`` and the result is read back
    unsanitized. If sanitization then fails with ``ValueError``, the whole
    round trip is repeated with ``AddHydrogens(False, False)`` (no pH
    correction) and sanitized again; a failure on that second attempt
    propagates to the caller.

    Args:
        mol_in: Molecule to protonate.
        ph: pH value handed to Open Babel on the first attempt.

    Returns:
        The sanitized molecule with hydrogens kept (``removeHs=False``).
    """
    source_block = MolToMolBlock(mol_in)

    def _with_hydrogens(*add_h_args):
        # Always start from the untouched input mol block
        ob_mol = pybel.readstring('mol', source_block)
        ob_mol.OBMol.AddHydrogens(*add_h_args)
        return MolFromMolBlock(ob_mol.write('mol'), removeHs=False, sanitize=False)

    mol = _with_hydrogens(False, True, ph)
    try:
        SanitizeMol(mol)
    except ValueError:
        # Try again, but without ph correction
        mol = _with_hydrogens(False, False)
        SanitizeMol(mol)
    return mol
def test_embed_r_groups__noparentatom(bax_mol):
    """Check that ``embed_r_groups`` raises ``LookupError`` for this fragment.

    The error text must contain 'group not found in parent'.
    """
    # NOTE(review): V2000 mol blocks are line- and column-sensitive; the literal
    # below is kept exactly as found -- confirm its layout before relying on it.
    # NOTE(review): the first N atom (1.1111 1.1111 11.1111) looks like the
    # deliberately unmatched atom -- confirm against embed_r_groups.
    fragment = MolFromMolBlock(''' RDKit 3D 8 7 0 0 0 0 0 0 0 0999 V2000 0.0000 0.0000 0.0000 R 0 0 0 0 0 1 0 0 0 0 0 0 1.1111 1.1111 11.1111 N 0 0 0 0 0 0 0 0 0 0 0 0 0.4480 2.7040 24.2850 C 0 0 0 0 0 0 0 0 0 0 0 0 2.3160 3.2190 24.7430 H 0 0 0 0 0 0 0 0 0 0 0 0 -0.0220 3.4870 25.2640 N 0 0 0 0 0 0 0 0 0 0 0 0 -0.2620 1.9460 23.6160 O 0 0 0 0 0 0 0 0 0 0 0 0 0.0000 0.0000 0.0000 R 0 0 0 0 0 1 0 0 0 0 0 0 0.5870 4.1890 25.5880 H 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 3 2 1 0 2 4 1 0 3 5 1 0 3 6 1 0 7 5 1 0 5 8 1 0 M END''')
    with pytest.raises(LookupError) as e:
        embed_r_groups(fragment, bax_mol)
    # Checked after the `with` block: code placed after the raising call inside it would never run
    assert 'group not found in parent' in str(e.value)
def convert_molblockgz(molgz):
    """Build an RDKit molecule from a zlib-compressed mol block.

    Args:
        molgz: zlib-compressed mol block, in whatever form
            ``zlib.decompress`` accepts.

    Returns:
        The result of ``MolFromMolBlock`` applied to the decompressed data.
    """
    molblock = zlib.decompress(molgz)
    return MolFromMolBlock(molblock)
def test0InchiWritePubChem(self):
    """Compare InChIs written by RDKit with the reference InChIs per dataset.

    Every molecule is classified as identical (``same``), an explainable
    mismatch (``reasonable``) or an unexplained mismatch (``diff``, printed
    with details). The per-dataset totals are then asserted against fixed
    expected counts.
    """
    for fp, f in self.dataset.items():
        # Reference InChIs for this dataset, looked up by PubChem compound id
        inchi_db = self.dataset_inchi[fp]
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            ref_inchi = inchi_db[m.GetProp('PUBCHEM_COMPOUND_CID')]
            x, y = MolToInchi(m), ref_inchi
            if x != y:
                # print("---------------")
                # print(m.GetProp('PUBCHEM_COMPOUND_CID'))
                # print(MolToSmiles(m))
                # print(y)
                # print(x)
                # Mismatches whose generated InChI matches the ClO4 pattern are tolerated
                if re.search(r'.[1-9]?ClO4', x) is not None:
                    reasonable += 1
                    continue
                SanitizeMol(m)
                # Intended to tolerate molecules with rings of 8 or more atoms.
                # NOTE(review): on Python 3 filter() returns a lazy iterator, which is
                # always truthy, so this branch is taken for every mismatch that reaches
                # it and the code below never runs. The expected counts asserted at the
                # end may have been recorded with that behaviour -- confirm before
                # switching to any(...).
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # if it is because RDKit does not think the bond is stereo
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(m)))
                if y != z and inchiDiffPrefix(y, z) == 'b':
                    reasonable += 1
                    continue
                # some warning
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error:
                        reasonable += 1
                        continue
                # Nothing explained the difference: count and report it
                diff += 1
                print('InChI mismatch for PubChem Compound ' + m.GetProp('PUBCHEM_COMPOUND_CID'))
                print(MolToSmiles(m, True))
                print(inchiDiff(x, y))
                print()
            else:
                same += 1
        fmt = "\n{0}InChI write Summary: {1} identical, {2} suffix variance, {3} reasonable{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 1162)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 19)
def calculate_coords(apps, schema_editor):
    """Data migration step: recompute 2D coordinates for every compound batch.

    For each ``CBHCompoundBatch`` row the stored ctab is parsed, 2D
    coordinates are computed and the ctab is rewritten with stereo
    information. If writing the mol block fails, the row keeps its previous
    ctab and a message is printed; the row is saved either way.

    Args:
        apps: Migration app registry, used to obtain the historical model.
        schema_editor: Unused; part of the call signature.
    """
    # We can't import the CBHCompoundBatch model directly as it may be a newer
    # version than this migration expects. We use the historical version.
    Batch = apps.get_model("cbh_chembl_model_extension", "CBHCompoundBatch")
    for batch in Batch.objects.all():
        mol = MolFromMolBlock(batch.ctab)
        AllChem.Compute2DCoords(mol)
        try:
            batch.ctab = MolToMolBlock(mol, includeStereo=True)
        except Exception as exc:
            # Best effort, as before: report the failure and keep the old ctab.
            # The call form of print works on both Python 2 and Python 3.
            print("Could not regenerate ctab for batch {0}: {1}".format(batch.pk, exc))
        batch.save()
def generate_lignin(num_monomers: int = 1) -> Chem.Mol:
    """Simulate lignin growth and return the molecule with explicit hydrogens.

    Parameters
    ----------
    num_monomers : int
        Maximum number of monomers, passed to ``run_kmc`` as ``n_max``.

    Returns
    -------
    Chem.Mol
        Molecule parsed from the generated mol block, after ``Chem.AddHs``.
    """
    # S-to-G ratio is fixed at 0 here, which makes the S fraction 0 as well
    sg_ratio = 0
    pct_s = sg_ratio / (1 + sg_ratio)

    n_start = 1              # monomers present before growth begins
    n_limit = num_monomers   # upper bound on monomers modeled
    sim_time = 1             # maximum simulated time, seconds
    growth_rate = 1e4        # monomers/second

    # A random draw together with pct_s decides the initial monolignol types
    draw = np.random.rand(n_start)
    monomers = create_initial_monomers(pct_s, draw)

    # The initial state is built from the starting events only; GROW is
    # appended afterwards so that further monomers can be added at the
    # given rate during the simulation.
    events = create_initial_events(monomers, rxn_rates)
    state = create_initial_state(events, monomers)
    events.append(Event(GROW, [], rate=growth_rate))

    # simulate lignin creation
    kmc_out = run_kmc(rxn_rates, state, events, n_max=n_limit, t_max=sim_time,
                      sg_ratio=sg_ratio)

    # using RDKit
    mol_block = generate_mol(kmc_out[ADJ_MATRIX], kmc_out[MONO_LIST])
    return Chem.AddHs(MolFromMolBlock(mol_block))
def testMakePNG(self):
    """Smoke-test PNG creation from a sample KMC result."""
    # smoke test only--that it doesn't fail, not that it looks correct (that's outside the scope of this package)
    # The choices shown resulted (at last check) in 3 fragments, one of which has a branch
    try:
        silent_remove(TEST_PNG)
        kmc_result = create_sample_kmc_result(num_initial_monos=24, max_monos=24, seed=1,
                                              max_time=SHORT_TIME)
        adjacency = kmc_result[ADJ_MATRIX]
        adj_analysis_to_stdout(analyze_adj_matrix(adjacency), break_co_bonds=False)

        mol = MolFromMolBlock(generate_mol(adjacency, kmc_result[MONO_LIST]))
        Compute2DCoords(mol)
        MolToFile(mol, TEST_PNG, size=(2000, 1200))
        self.assertTrue(os.path.isfile(TEST_PNG))
    finally:
        # Clean up the image unless removal is disabled
        silent_remove(TEST_PNG, disable=DISABLE_REMOVE)
def produce_output(adj_matrix, mono_list, cfg):
    """Write or print the requested representations of a simulated structure.

    When SMILES output is suppressed and none of JSON/PNG/SVG is requested,
    only the TCL format is considered and no RDKit molecule is built.
    Otherwise the molecule is built from the adjacency matrix, its SMILES
    string is saved (if ``cfg[SAVE_SMI]``) or printed, and each enabled
    format among TCL, JSON, PNG and SVG is written to a file.

    Args:
        adj_matrix: Adjacency matrix passed to ``generate_mol`` and ``gen_tcl``.
        mono_list: Monomer list passed to ``generate_mol`` and ``gen_tcl``.
        cfg: Mapping of configuration values, indexed with the module's
            option constants (``SAVE_*``, ``BASENAME``, ``OUT_DIR``, ...).

    Raises:
        InvalidDataError: If the SMILES string cannot be produced.
    """
    if cfg[SUPPRESS_SMI] and not (cfg[SAVE_JSON] or cfg[SAVE_PNG] or cfg[SAVE_SVG]):
        format_list = [SAVE_TCL]
        mol = None  # Make IDE happy
    else:
        # Default out is SMILES, which requires getting an rdKit molecule object; also required for everything
        #    except the TCL format
        format_list = [SAVE_TCL, SAVE_JSON, SAVE_PNG, SAVE_SVG]
        block = generate_mol(adj_matrix, mono_list)
        mol = MolFromMolBlock(block)
        try:
            smiles = MolToSmiles(mol)
        except Exception as e:
            # Narrowed from a bare except so Ctrl-C/SystemExit are not masked;
            # the original error stays attached as the cause.
            raise InvalidDataError("Error in producing SMILES string.") from e
        # if SMI is to be saved, don't output to stdout
        if cfg[SAVE_SMI]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=SAVE_SMI)
            str_to_file(smiles + '\n', fname, print_info=True)
        else:
            print("\nSMILES representation: \n", smiles, "\n")
        if cfg[SAVE_PNG] or cfg[SAVE_SVG] or cfg[SAVE_JSON]:
            # PNG and SVG make 2D images and thus need coordinates
            # JSON will save coordinates--zero's if not computed; might as well compute and save non-zero values
            Compute2DCoords(mol)

    for save_format in format_list:
        if cfg[save_format]:
            fname = create_out_fname(cfg[BASENAME], base_dir=cfg[OUT_DIR], ext=save_format)
            if save_format == SAVE_TCL:
                gen_tcl(adj_matrix, mono_list, tcl_fname=fname, chain_id=cfg[CHAIN_ID],
                        psf_fname=cfg[PSF_FNAME], toppar_dir=cfg[TOPPAR_DIR], out_dir=cfg[OUT_DIR])
            if save_format == SAVE_JSON:
                json_str = MolToJSON(mol)
                str_to_file(json_str + '\n', fname)
            elif save_format == SAVE_PNG or save_format == SAVE_SVG:
                MolToFile(mol, fname, size=cfg[IMAGE_SIZE])
                print(f"Wrote file: {fname}")
# Keep only validation rows that have both the target property and a molUFF entry
valid = valid[valid[[prop, 'molUFF']].notna().all(1)]

# Rescale Y matrix
y_train_raw = train[[prop]].values
y_valid_raw = valid[[prop]].values

y_scaler = RobustNanScaler()
y_train_scaled = y_scaler.fit_transform(y_train_raw)
y_valid_scaled = y_scaler.transform(y_valid_raw)

from rdkit.Chem import MolFromMolBlock

# Transform MOLs into X matrix
preprocessor = MolPreprocessor(n_neighbors=48)
# Iterate the Series values directly: Series.iteritems() was removed in pandas 2.0
train_inputs = preprocessor.fit((MolFromMolBlock(mol) for mol in train.molUFF))
# NOTE(review): the validation set also goes through `fit`; if MolPreprocessor
# learns state in `fit`, a non-fitting transform is probably what is wanted
# here -- confirm against the MolPreprocessor API.
valid_inputs = preprocessor.fit((MolFromMolBlock(mol) for mol in valid.molUFF))


def rbf_expansion(distances, mu=0, delta=0.2, kmax=150):
    """Expand distances into ``kmax`` radial basis function values each.

    For every distance ``d`` and ``k = 0 .. kmax - 1`` the value is
    ``exp(-(d - (-mu + delta * k)) ** 2 / delta)``.

    Parameters
    ----------
    distances : array-like
        Distances to expand; a 1-D input of length ``n`` gives an
        ``(n, kmax)`` result.
    mu : float
        Offset of the first centre (the centres are ``-mu + delta * k``).
    delta : float
        Spacing between centres, also used as the width divisor.
    kmax : int
        Number of basis functions.

    Returns
    -------
    numpy.ndarray
        The expanded values.
    """
    k = np.arange(0, kmax)
    logits = -(np.atleast_2d(distances).T - (-mu + delta * k))**2 / delta
    return np.exp(logits)


def precalc_rbfs(inputs):
    """Replace each item's 'distance' entry with its RBF expansion, in place.

    Every item gets a 'distance_rbf' key holding ``rbf_expansion`` of its
    'distance' value, and the 'distance' key is deleted.

    Returns the same ``inputs`` object that was passed in.
    """
    # NOTE(review): if `inputs` is a one-shot iterator, it is exhausted by this
    # loop before being returned -- confirm that callers pass a list.
    for item in tqdm(inputs):
        item['distance_rbf'] = rbf_expansion(item['distance'])
        del item['distance']
    return inputs
import numpy as np
import pandas as pd
from rdkit.Chem import MolFromMolBlock

parser = ArgumentParser()
parser.add_argument('--removeHs', '-k', action='store_true',
                    help='remove H atoms when generating mol')

if __name__ == '__main__':
    # Read every structure file, parse it with RDKit and pickle an
    # (ids, mols) DataFrame. Files that cannot be read or parsed are stored as NaN.
    args = parser.parse_args()

    ids = []
    mols = []
    for i in range(1, 133886):
        mol_id = f'dsgdb9nsd_{i:06}'
        ids.append(mol_id)
        try:
            with open(f'./data/input/structures/{mol_id}.mol', 'r') as mol_file:
                mols.append(MolFromMolBlock(mol_file.read(), removeHs=args.removeHs))
        except Exception:
            # Still best effort, but no longer a bare except: Ctrl-C and
            # SystemExit can now interrupt this long loop.
            mols.append(np.nan)

    df = pd.DataFrame()
    df['ids'] = ids
    df['mols'] = mols

    # The output file name records whether hydrogens were removed
    if args.removeHs:
        out_path = './data/input/mols_without_Hs.pickle'
    else:
        out_path = './data/input/mols_with_Hs.pickle'
    with open(out_path, 'wb') as f:
        pickle.dump(df, f)
def bax_mol():
    """Return the molecule parsed from the embedded V2000 mol block.

    The counts line of the block declares 48 atoms and 50 bonds.
    """
    # NOTE(review): V2000 mol blocks are line- and column-sensitive; the literal
    # below is kept exactly as found -- confirm its layout before relying on it.
    return MolFromMolBlock(''' RDKit 3D 48 50 0 0 0 0 0 0 0 0999 V2000 -1.9780 2.2590 25.9480 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.2280 3.4280 25.8570 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.6980 4.5910 26.4520 C 0 0 0 0 0 0 0 0 0 0 0 0 -2.9200 4.6600 27.1000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.4480 2.7040 24.2850 C 0 0 0 0 0 0 0 0 0 0 0 0 2.3380 2.3780 22.9000 C 0 0 0 0 0 0 0 0 0 0 0 0 1.6560 2.2710 21.6650 C 0 0 0 0 0 0 0 0 0 0 0 0 2.3330 1.8220 20.5270 C 0 0 0 0 0 0 0 0 0 0 0 0 3.7070 1.4910 20.6340 C 0 0 0 0 0 0 0 0 0 0 0 0 4.3820 1.5940 21.8510 C 0 0 0 0 0 0 0 0 0 0 0 0 3.6920 2.0440 22.9670 C 0 0 0 0 0 0 0 0 0 0 0 0 4.4640 1.0880 19.5620 O 0 0 0 0 0 0 0 0 0 0 0 0 3.7130 -1.0290 18.8880 C 0 0 0 0 0 0 0 0 0 0 0 0 3.4170 -1.9280 17.8630 C 0 0 0 0 0 0 0 0 0 0 0 0 4.0410 -0.3090 16.2360 C 0 0 1 0 0 0 0 0 0 0 0 0 4.3280 0.6230 17.2290 C 0 0 0 0 0 0 0 0 0 0 0 0 4.1010 -1.0340 13.8920 N 0 0 0 0 0 0 0 0 0 0 0 0 3.2530 -1.0040 12.7050 C 0 0 0 0 0 0 0 0 0 0 0 0 -3.7100 3.5350 27.1680 C 0 0 1 0 0 0 0 0 0 0 0 0 -3.2190 2.2650 26.5840 C 0 0 0 0 0 0 0 0 0 0 0 0 -4.0500 0.9970 26.6980 C 0 0 1 0 0 0 0 0 0 0 0 0 -5.2380 1.1420 26.2110 F 0 0 0 0 0 0 0 0 0 0 0 0 -3.4800 0.0010 26.1220 F 0 0 0 0 0 0 0 0 0 0 0 0 -4.2250 0.6490 27.9280 F 0 0 0 0 0 0 0 0 0 0 0 0 -5.2760 3.5990 28.0420 Cl 0 0 0 0 0 0 0 0 0 0 0 0 -0.0220 3.4870 25.2640 N 0 0 0 0 0 0 0 0 0 0 0 0 1.7480 2.8090 24.0510 N 0 0 0 0 0 0 0 0 0 0 0 0 -0.2620 1.9460 23.6160 O 0 0 0 0 0 0 0 0 0 0 0 0 4.1550 0.2360 18.5580 C 0 0 0 0 0 0 0 0 0 0 0 0 3.5810 -1.5370 16.5780 N 0 0 0 0 0 0 0 0 0 0 0 0 4.1310 0.0000 14.7620 C 0 0 1 0 0 0 0 0 0 0 0 0 4.1550 1.1730 14.3910 O 0 0 0 0 0 0 0 0 0 0 0 0 -1.1120 5.4400 26.4100 H 0 0 0 0 0 0 0 0 0 0 0 0 2.3160 3.2190 24.7430 H 0 0 0 0 0 0 0 0 0 0 0 0 0.5870 4.1890 25.5880 H 0 0 0 0 0 0 0 0 0 0 0 0 3.0780 -2.8780 18.0820 H 0 0 0 0 0 0 0 0 0 0 0 0 4.6590 1.5710 16.9900 H 0 0 0 0 0 0 0 0 0 0 0 0 4.6630 -1.8240 14.0630 H 0 0 0 0 0 0 0 0 0 0 0 0 0.6580 2.5250 21.6040 H 0 0 0 0 0 0 0 0 0 0 0 0 -3.2350 5.5450 27.5270 
H 0 0 0 0 0 0 0 0 0 0 0 0 2.7140 -0.0800 12.6750 H 0 0 0 0 0 0 0 0 0 0 0 0 2.5610 -1.8190 12.7400 H 0 0 0 0 0 0 0 0 0 0 0 0 3.8620 -1.0910 11.8290 H 0 0 0 0 0 0 0 0 0 0 0 0 4.1900 2.1340 23.8660 H 0 0 0 0 0 0 0 0 0 0 0 0 5.3800 1.3390 21.9200 H 0 0 0 0 0 0 0 0 0 0 0 0 1.8410 1.7330 19.6240 H 0 0 0 0 0 0 0 0 0 0 0 0 -1.6140 1.3830 25.5420 H 0 0 0 0 0 0 0 0 0 0 0 0 3.6030 -1.3050 19.8760 H 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 1 20 1 0 1 47 1 0 2 3 1 0 2 26 1 0 3 4 1 0 3 33 1 0 4 19 1 0 4 40 1 0 5 26 1 0 5 27 1 0 5 28 1 0 6 7 1 0 6 11 1 0 6 27 1 0 7 8 1 0 7 39 1 0 8 9 1 0 8 46 1 0 9 10 1 0 9 12 1 0 10 11 1 0 10 45 1 0 11 44 1 0 12 29 1 0 13 14 1 0 13 29 1 0 13 48 1 0 14 30 1 0 14 36 1 0 15 16 1 0 15 30 1 1 15 31 1 0 16 29 1 0 16 37 1 0 17 18 1 0 17 31 1 0 17 38 1 0 18 41 1 0 18 42 1 0 18 43 1 0 19 20 1 0 19 25 1 1 20 21 1 0 21 22 1 6 21 23 1 0 21 24 1 0 26 35 1 0 27 34 1 0 31 32 1 6 M END ''')
def process(
        self,
        input_file: str,
        output_file: str = "",
        output_file_sdf: str = "",
        sdf_append: bool = False,
        #images_prefix: str = "",
        format_output: bool = True,
        write_header: bool = True,
        osra_output_format: str = "",
        output_formats: list = None,
        dry_run: bool = False,
        csv_delimiter: str = ";",
        use_gm: bool = True,
        gm_dpi: int = 300,
        gm_trim: bool = True,
        n_jobs: int = -1,
        input_type: str = "",
        standardize_mols: bool = True,
        annotate: bool = True,
        chemspider_token: str = "",
        custom_page: int = 0,
        continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input file with OSRA.

    Parameters
    ----------
    input_file : str
        Path to file to be processed by OSRA.
    output_file : str
        File to write output in.
    output_file_sdf : str
        | File to write SDF output in. "sdf" output format hasn't to be in `output_formats` to write SDF output.
        | If "sdf_osra" output format is requested, suffix "-osra.sdf" will be added.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts.
        | If True and `output_file` is set, the CSV file will be written.
        | If False, the value of "content" key of returned dict will be None.
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    osra_output_format : str
        | Output format from OSRA. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "smi", "can", "sdf"
        | If "sdf", additional information like coordinates cannot be retrieved (not implemented yet).
    output_formats : list
        | If `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OSRA. This format must be also set with
          `output_format` in __init__ or with `osra_output_format` here.
        | When output produced by OSRA is unreadable by RDKit, you can at least have that output from OSRA.
        | Default value: ["smiles"]

        ===============  ============  ============================================================
        Value            Source        Note
        ===============  ============  ============================================================
        smiles           RDKit         canonical
        smiles_osra      OSRA ("smi")  SMILES
        smiles_can_osra  OSRA ("can")  canonical SMILES
        inchi            RDKit         Not every molecule can be converted to InChI
                                       (it doesn`t support wildcard characters etc.)
        inchikey         RDKit         The same applies as for "inchi".
        sdf              RDKit         If present, an additional SDF file will be created.
        sdf_osra         OSRA ("sdf")  If present, an additional SDF file will be created.
        ===============  ============  ============================================================
    dry_run : bool
        If True, only return the OSRA command line (arguments joined by spaces into one string).
    csv_delimiter : str
        Delimiter for output CSV file.
    use_gm : bool
        | If True, use GraphicsMagick to convert PDF to temporary PNG images before processing.
        | If False, OSRA will use it's own conversion of PDF to image.
        | Using gm is more reliable since OSRA (v2.1.0) is showing wrong information
          when converting directly from PDF (namely: coordinates, bond length and possibly more ones) and also there are
          sometimes incorrectly recognised structures.
    gm_dpi : int
        How many DPI will temporary PNG images have.
    gm_trim : bool
        If True, gm will trim the temporary PNG images.
    n_jobs : int
        | If `use_gm` and input file is PDF, how many jobs to use for OSRA processing of temporary PNG images.
        | If -1 all CPUs are used.
        | If 1 is given, no parallel computing code is used at all, which is useful for debugging.
        | For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
    input_type : str
        | When empty, input (MIME) type will be determined from magic bytes.
        | Or you can specify "pdf" or "image" and magic bytes check will be skipped.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    annotate : bool
        | If True, try to annotate entities in PubChem and ChemSpider. Compound IDs will be assigned by searching with
          each identifier, separately for SMILES, InChI etc.
        | If entity has InChI key yet, prefer it in searching.
        | If "*" is present in SMILES, skip annotation.
    chemspider_token : str
        Your personal token for accessing the ChemSpider API. Make account there to obtain it.
    custom_page : int
        When set (non-zero), this page number is assigned to all extracted compounds.
    continue_on_failure : bool
        | If True, continue running even if OSRA returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: list of str ... standard output from OSRA
        - stderr: list of str ... standard error output from OSRA
        - exit_code: list of int ... exit code from OSRA
        - pages: list ... page of each OSRA run
        - content:

          - list of OrderedDicts ... when `format_output` is True and OSRA produced output.
          - None ... otherwise

        | If `osra_output_format` is "sdf", additional information like 'bond_length' cannot be retrieved.
        | If `use_gm` is True then stdout, stderr and exit_code contain items from each
          temporary image processed by OSRA.
        | If `dry_run` is True, a string with the command line is returned instead.

    Raises
    ------
    ValueError
        If the OSRA output format or an explicitly given `input_type` is not supported.

    Notes
    -----
    Only with `format_output` set to True you can use molecule standardization and more molecule formats.
    Otherwise you will only get raw stdout from OSRA (which can also be written to file if `output_file` is set).
    """
    options_internal = self.options_internal.copy()
    osra_smiles_outputs = ["smi", "can"]

    # OSRA output format check
    if osra_output_format:
        options_internal["output_format"] = osra_output_format
    else:
        osra_output_format = options_internal["output_format"]

    # Maps each OSRA output format to the name of the output column carrying OSRA's raw result
    osra_valid_output_formats = {
        "can": "smiles_can_osra",
        "smi": "smiles_osra",
        "sdf": "sdf_osra"
    }
    if osra_output_format not in osra_valid_output_formats:
        # List the accepted formats (the dict keys), not the column names they map to
        raise ValueError(
            "Unknown OSRA output format. Possible values: {}".format(
                ", ".join(osra_valid_output_formats)))

    if osra_output_format == "sdf":
        self.logger.warning(
            "OSRA's output format is set to \"sdf\" so additional information like coordinates cannot be retrieved."
        )

    # output formats check
    is_output_sdf = False
    is_output_sdf_osra = False
    if not output_formats:
        output_formats = ["smiles"]
    else:
        output_formats = sorted(set(output_formats))
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        # Keep RDKit formats plus the single OSRA format matching the OSRA output in use
        output_formats = [
            x for x in output_formats if x in possible_output_formats
            or x == osra_valid_output_formats[osra_output_format]
        ]

    if ("sdf" in output_formats or "sdf_osra" in output_formats) and not output_file_sdf:
        self.logger.warning(
            "Cannot write SDF output: 'output_file_sdf' is not set.")
    if output_file_sdf:
        is_output_sdf = True
    if "sdf_osra" in output_formats and osra_output_format == "sdf" and output_file_sdf:
        is_output_sdf_osra = True
    if ("smiles_osra" in output_formats or "smiles_can_osra" in output_formats) and osra_output_format == "sdf":
        for unavailable in ("smiles_osra", "smiles_can_osra"):
            try:
                output_formats.remove(unavailable)
            except ValueError:
                pass
        self.logger.warning(
            "SMILES or canonical SMILES output from OSRA is requested, but OSRA's output format is \"{}\"."
            .format(osra_output_format))

    # input file type check
    possible_input_types = ["pdf", "image"]
    if not input_type:
        input_type = get_input_file_type(input_file)
        if input_type not in possible_input_types:
            # Unknown type: hand the file to OSRA directly instead of converting it with gm
            use_gm = False
            self.logger.warning(
                "Input file MIME type ('{}') is not one of {}. You can specify 'input_type' directly (see docstring)."
                .format(input_type, possible_input_types))
    elif input_type not in possible_input_types:
        raise ValueError("Possible 'input_type' values are {}".format(
            possible_input_types))

    #options = ChainMap({k: v for k, v in {"images_prefix": images_prefix}.items() if v},
    #                   options_internal)

    if annotate:
        if not chemspider_token:
            self.logger.warning(
                "Cannot perform annotation in ChemSpider: 'chemspider_token' is empty."
            )
        # Annotation searches by these identifiers, so make sure they are produced
        for required_format in ["smiles", "inchi", "inchikey"]:
            if required_format not in output_formats:
                output_formats.append(required_format)
        output_formats = sorted(output_formats)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)
    commands.extend(
        ["--bond", "--coordinates", "--page", "--guess", "--print"])

    if dry_run:
        return " ".join(commands)

    osra_output_list = []
    if input_type == "image" or not use_gm:
        osra_output_list.append(
            self._process(input_file,
                          commands,
                          page=custom_page if custom_page else 1))
    elif input_type == "pdf":
        with tempfile.TemporaryDirectory() as temp_dir:
            # Only the generated image files are used; the converter's own output is ignored
            pdf_to_images(input_file, temp_dir, dpi=gm_dpi, trim=gm_trim)
            osra_output_list = Parallel(n_jobs=n_jobs)(
                delayed(self._process)(
                    temp_image_file, commands, page=page)
                for temp_image_file, page in get_temp_images(temp_dir))

    # summarize OSRA results
    to_return = {
        "stdout": [],
        "stderr": [],
        "exit_code": [],
        "content": None,
        "pages": []
    }

    for result in osra_output_list:
        # NOTE(review): "stdout" only receives non-empty outputs while "pages" receives every
        # result, yet both lists are zipped below -- confirm that they cannot get out of step.
        if result["stdout"]:
            to_return["stdout"].append(result["stdout"])
        to_return["stderr"].append(result["stderr"])
        to_return["exit_code"].append(result["exit_code"])
        to_return["pages"].append(result["page"])

    if not continue_on_failure:
        errors = [(page + 1, error)
                  for page, (exit_code, error) in enumerate(
                      zip(to_return["exit_code"], to_return["stderr"]))
                  if exit_code > 0]
        if errors:
            self.logger.warning("OSRA errors:")
            for page, error in errors:
                eprint("\tError on page {}:".format(page))
                eprint("\n\t\t".join("\n{}".format(error).splitlines()))
            return to_return

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write("\n".join(to_return["stdout"]))
        return to_return

    # Column positions of the extra values OSRA prints after the SMILES string
    output_cols = OrderedDict([("bond_length", 1), ("resolution", 2),
                               ("confidence", 3), ("page", 4),
                               ("coordinates", 5)])

    if osra_output_format in osra_smiles_outputs:
        compound_template_dict = OrderedDict.fromkeys(
            output_formats + list(output_cols.keys()))
    else:
        compound_template_dict = OrderedDict.fromkeys(["page"] +
                                                      output_formats)

    if any(to_return["stdout"]):
        if standardize_mols:
            standardizer = Standardizer()

        compounds = []

        if is_output_sdf:
            if sdf_append:
                if not os.path.isfile(output_file_sdf):
                    open(output_file_sdf, mode="w", encoding="utf-8").close()
                writer = SDWriter(
                    open(output_file_sdf, mode="a", encoding="utf-8"))
            else:
                writer = SDWriter(output_file_sdf)

        for output, page in zip(to_return["stdout"], to_return["pages"]):
            if osra_output_format in osra_smiles_outputs:
                lines = [x.strip() for x in output.split("\n") if x]
            else:
                lines = [x for x in output.split("$$$$") if x.strip()]

            for line in lines:
                # OSRA's --learn output (e.g. "1,2,2,2 1") contains spaces, so a plain
                # whitespace split cannot parse it; that option is not handled here.
                if not line:
                    continue

                if osra_output_format in osra_smiles_outputs:
                    line = [x.strip() for x in line.split()]
                    if custom_page:
                        line[output_cols["page"]] = custom_page
                    elif use_gm:
                        line[output_cols["page"]] = page
                    mol = MolFromSmiles(
                        line[0],
                        sanitize=False if standardize_mols else True)
                elif osra_output_format == "sdf":
                    line = "\n" + line.strip()
                    mol = MolFromMolBlock(
                        line,
                        strictParsing=False,
                        sanitize=False if standardize_mols else True,
                        removeHs=False if standardize_mols else True)

                if mol:
                    compound = compound_template_dict.copy()

                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as e:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(e)))

                    for f in output_formats:
                        if f == "smiles":
                            compound["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif f == "smiles_osra" and osra_output_format == "smi":
                            compound["smiles_osra"] = line[0]
                        elif f == "smiles_can_osra" and osra_output_format == "can":
                            compound["smiles_can_osra"] = line[0]
                        elif f == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchi"] = inchi
                            else:
                                compound["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        MolToSmiles(mol)))
                        elif f == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                compound["inchikey"] = InchiToInchiKey(
                                    inchi)
                            else:
                                compound["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}"
                                    .format(MolToSmiles(mol)))
                        elif f == "sdf":
                            compound["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                        elif f == "sdf_osra":
                            compound["sdf_osra"] = line
                    if is_output_sdf:
                        writer.write(mol)

                    if osra_output_format in osra_smiles_outputs:
                        compound.update([(x[0], x[1]) for x in zip(
                            list(output_cols.keys()), line[1:])])
                    else:
                        compound[
                            "page"] = page if use_gm else custom_page if custom_page else 1

                    compounds.append(compound)
                else:
                    # `line` is a token list for SMILES output but a single string for SDF
                    # output; indexing the string would log only its first character.
                    if osra_output_format in osra_smiles_outputs:
                        unparsed = line[0]
                    else:
                        unparsed = line.strip()
                    self.logger.warning("Cannot convert to RDKit mol: " +
                                        unparsed)

        if is_output_sdf_osra:
            with open(output_file_sdf + "-osra.sdf",
                      mode="w",
                      encoding="utf-8") as f:
                f.write("".join(to_return["stdout"]))

        to_return["content"] = sorted(compounds, key=lambda x: x["page"])

        if annotate:
            chemspider = ChemSpider(
                chemspider_token) if chemspider_token else None

            for i, ent in enumerate(to_return["content"]):
                self.logger.info("Annotating entity {}/{}...".format(
                    i + 1, len(to_return["content"])))
                ent.update(
                    OrderedDict([("pch_cids_by_inchikey", ""),
                                 ("chs_cids_by_inchikey", ""),
                                 ("pch_cids_by_smiles", ""),
                                 ("chs_cids_by_smiles", ""),
                                 ("pch_cids_by_inchi", ""),
                                 ("chs_cids_by_inchi", ""),
                                 ("pch_iupac_name", ""),
                                 ("chs_common_name", ""),
                                 ("pch_synonyms", "")]))

                results = []

                # prefer InChI key
                if "inchikey" in ent and ent["inchikey"]:
                    try:
                        results = get_compounds(ent["inchikey"], "inchikey")
                        if results:
                            # Names are only taken over when the match is unambiguous
                            if len(results) == 1:
                                result = results[0]
                                synonyms = result.synonyms
                                if synonyms:
                                    ent["pch_synonyms"] = "\"{}\"".format(
                                        "\",\"".join(synonyms))
                                ent["pch_iupac_name"] = result.iupac_name
                            ent["pch_cids_by_inchikey"] = "\"{}\"".format(
                                ",".join([str(c.cid) for c in results]))
                    except (BadRequestError, NotFoundError,
                            PubChemHTTPError, ResponseParseError,
                            ServerError, TimeoutError, PubChemPyError):
                        pass

                    results = chemspider.search(
                        ent["inchikey"]) if chemspider_token else []
                    if results:
                        if len(results) == 1:
                            result = results[0]
                            ent["chs_common_name"] = result.common_name
                        ent["chs_cids_by_inchikey"] = "\"{}\"".format(
                            ",".join([str(c.csid) for c in results]))
                else:
                    for search_field, col_pch, col_chs in [
                        ("smiles", "pch_cids_by_smiles",
                         "chs_cids_by_smiles"),
                        ("inchi", "pch_cids_by_inchi", "chs_cids_by_inchi")
                    ]:
                        results_pch = []
                        results_chs = []

                        if search_field == "smiles" and "smiles" in ent and ent[
                                "smiles"] and "*" not in ent["smiles"]:
                            try:
                                results_pch = get_compounds(
                                    ent["smiles"], "smiles")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["smiles"]) if chemspider_token else []
                        elif search_field == "inchi" and "inchi" in ent and ent[
                                "inchi"]:
                            try:
                                results_pch = get_compounds(
                                    ent["inchi"], "inchi")
                            except (BadRequestError, NotFoundError,
                                    PubChemHTTPError, ResponseParseError,
                                    ServerError, TimeoutError,
                                    PubChemPyError):
                                pass
                            results_chs = chemspider.search(
                                ent["inchi"]) if chemspider_token else []

                        if results_pch:
                            ent[col_pch] = "\"{}\"".format(",".join(
                                [str(c.cid) for c in results_pch]))
                        if results_chs:
                            ent[col_chs] = "\"{}\"".format(",".join(
                                [str(c.csid) for c in results_chs]))

                # Pause between entities before issuing further lookups
                sleep(0.5)

        if output_file:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)

        if is_output_sdf:
            writer.close()
    elif not any(to_return["stdout"]) and output_file:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(compound_template_dict.keys()),
                         write_header=write_header)

    return to_return
def test1InchiReadPubChem(self):
    """Round-trip every molecule through InChI and classify the outcome.

    For each molecule the InChI ``x`` is written, read back and written
    again (via SMILES) as ``y``. Molecules are counted as identical
    (``same``), as an explainable difference (``reasonable``) or as an
    unexplained difference (``diff``, printed with details). The
    per-dataset totals are asserted against fixed expected counts.
    """
    for f in self.dataset.values():
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # Silence RDKit error logging while parsing the InChI
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue

                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                      cid + '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): on Python 3 filter() returns a lazy iterator, which is
                # always truthy, so this branch is taken for every mismatch that reaches
                # it and the code below never runs. The expected counts asserted at the
                # end may have been recorded with that behaviour -- confirm before
                # switching to any(...).
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)
                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue

                diff += 1
                # Look the id up here: it was previously only assigned in the
                # empty-mol branch, so this path would raise NameError or
                # report another molecule's id.
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()
            else:
                same += 1
        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 621)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 560)
def save(self, force_insert=False, force_update=False, *args, **kwargs):
    """Save the structure, refreshing derived identifiers beforehand.

    When ``settings.OPEN_SOURCE`` is set and a molfile is present, the
    InChIKey is recomputed from ``standard_inchi`` (if that InChI can be
    parsed) and the molfile is re-written through RDKit.  Otherwise the
    InChI, InChIKey and canonical SMILES are taken from
    ``getStructure(self.molfile)``.

    If anything changed, the related molecule is updated and saved, and
    ``structureChanged`` is sent.

    Raises:
        NoStandardInchi: if no standard InChI is available where one is
            required (see the checks below).
    """
    changed = False
    new = not bool(CompoundStructures.objects.filter(pk=self.pk).count())

    if settings.OPEN_SOURCE:
        if self.molfile:
            if not new:
                # The structure already exists and we only want to modify it.
                # This should trigger CMPD_STR_UPDATE_TRIG, which deletes
                # compound images and properties and nulls standard inchi,
                # key, smiles, and molformula.
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
                changed = True

            # A missing InChI used to surface as AttributeError from
            # `.encode`, because the NoStandardInchi check sat behind a
            # successful parse and could therefore never fire.
            if self.standard_inchi is None:
                raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))

            inchi = self.standard_inchi.encode("ascii")
            mol = MolFromInchi(inchi)
            if mol:
                newInchiKey = InchiToInchiKey(inchi)
                if self.standard_inchi_key != newInchiKey:
                    self.standard_inchi_key = newInchiKey
                    changed = True
                # This is how we do kekulisation in RDKit...
                self.molfile = MolToMolBlock(MolFromMolBlock(str(self.molfile)))

        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
    else:
        if self.molfile:
            if not new:
                # The structure already exists and we only want to modify it.
                # This should trigger CMPD_STR_UPDATE_TRIG, which deletes
                # compound images and properties and nulls standard inchi,
                # key, smiles, and molformula.
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
                changed = True
            data = getStructure(self.molfile)
            newInchi = data['InChI']
            if newInchi != self.standard_inchi:
                self.standard_inchi = newInchi
                self.standard_inchi_key = data['InChIKey']
                self.canonical_smiles = data['Canonical_Smiles']
                changed = True

        if not self.standard_inchi:
            raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))
        if not self.standard_inchi_key:
            self.standard_inchi_key = InchiToInchiKey(self.standard_inchi.encode("ascii"))

        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)

    if changed:
        # Propagate the new structure key to the owning molecule and notify
        # listeners.
        self.molecule.structure_key = self.standard_inchi_key
        self.molecule.structure_type = "MOL"
        self.molecule.molfile_update = datetime.now()
        self.molecule.save()
        structureChanged.send(sender=self.__class__, instance=self)
# Rescale Y matrix
y_train_raw = train[[prop]].values
y_valid_raw = valid[[prop]].values
y_scaler = RobustNanScaler()
# The scaler is fitted on the training targets only and then applied to the
# validation targets.
y_train_scaled = y_scaler.fit_transform(y_train_raw)
y_valid_scaled = y_scaler.transform(y_valid_raw)

from rdkit.Chem import MolFromMolBlock

# Transform MOLs into X matrix
preprocessor = MolPreprocessor(n_neighbors=48)
# `Series.items()` replaces `Series.iteritems()`, which was removed in
# pandas 2.0.
train_inputs = preprocessor.fit(
    (MolFromMolBlock(mol) for _, mol in train.mol.items()))
# NOTE(review): the validation molecules also go through `fit`, unlike the
# Y scaler above which only `transform`s them -- confirm that
# MolPreprocessor.fit keeps no state that should come from the training set.
valid_inputs = preprocessor.fit(
    (MolFromMolBlock(mol) for _, mol in valid.mol.items()))


def rbf_expansion(distances, mu=0, delta=0.2, kmax=150):
    """Expand distances over ``kmax`` Gaussian radial basis functions.

    Feature ``k`` (``k = 0 .. kmax - 1``) of a distance ``d`` is
    ``exp(-(d - (-mu + delta * k)) ** 2 / delta)``.

    Parameters
    ----------
    distances : array_like
        Distances to expand.  A 1-D input of length ``n`` is turned into a
        column so that the result has shape ``(n, kmax)``.
    mu : float
        Offset of the first centre; the centres are ``-mu + delta * k``.
    delta : float
        Spacing between centres, also used as the divisor in the exponent.
    kmax : int
        Number of basis functions.

    Returns
    -------
    numpy.ndarray
        The expanded features.
    """
    k = np.arange(0, kmax)
    logits = -(np.atleast_2d(distances).T - (-mu + delta * k))**2 / delta
    return np.exp(logits)


def precalc_rbfs(inputs):
    """Store the RBF expansion of each item's distances on the item itself.

    Every item's ``'distance'`` entry is expanded with ``rbf_expansion``
    (default parameters) and written to its ``'distance_rbf'`` entry.  The
    items are modified in place; nothing is returned.
    """
    for item in tqdm(inputs):
        item['distance_rbf'] = rbf_expansion(item['distance'])
def predict(request: HttpRequest) -> Union[HttpResponseBadRequest, JsonResponse]:
    """
    Runs the requested models on the submitted molecules and returns the predictions as JSON.

    Required POST parameters are exactly one of 'molblocks' (a list of molblocks) or 'smiles'
    (a list of SMILES), plus 'models' with the names of the models to use for the predictions.

    Parameters
    ----------
    request : HttpRequest
        Incoming POST request as HttpRequest instance

    Returns
    -------
    Union[HttpResponseBadRequest, JsonResponse]
        A HttpResponseBadRequest instance if a required parameter is missing, a model is not
        available, the molecule list is empty or contains an invalid entry, or every molecule
        leads to bad descriptor values. Otherwise a JsonResponse mapping each remaining row index
        to a dictionary of model name -> prediction values. Molecules with bad descriptor values
        are dropped and do not appear in the response.
    """
    post = request.POST
    has_molblocks = 'molblocks' in post
    has_smiles = 'smiles' in post
    # Both present or both absent is an error: exactly one input kind is expected.
    if has_molblocks == has_smiles:
        return HttpResponseBadRequest('molblocks OR smiles have to be provided')
    if 'models' not in post:
        return HttpResponseBadRequest('model(s) have to be provided')

    # Collect the descriptors and fingerprints needed by the requested models.
    model_names = set(post.getlist('models'))
    descs = []
    fps = []
    for name in model_names:
        if name not in model_manager.models:
            return HttpResponseBadRequest(f'{name} not available')
        requested = model_manager.models[name]
        descs.extend(requested.descriptors)
        fps.extend(requested.fingerprints)
    descs = list(set(descs))
    fps = list(set(fps))

    # Pick the input list, its parser and its "empty" message in one place.
    if has_molblocks:
        raw_entries = post.getlist('molblocks')
        empty_message = 'List of molblocks is empty'
        parse = MolFromMolBlock
    else:
        raw_entries = post.getlist('smiles')
        empty_message = 'List of SMILES is empty'
        parse = MolFromSmiles
    if not raw_entries or raw_entries == ['']:
        return HttpResponseBadRequest(empty_message)

    mols = []
    for raw in raw_entries:
        if raw == '':
            # An empty entry is invalid; record it and stop parsing.
            mols.append(None)
            break
        mols.append(parse(raw))
    if None in mols:
        return HttpResponseBadRequest('molblocks or smiles contain invalid entries')

    df = pd.DataFrame.from_dict({'ROMol': mols})
    df = rdkit_support.compute_descriptors(df, descs)
    bad_ix, _ = rdkit_support.filter_descriptor_values(df[descs])
    if len(bad_ix) > 0:
        df.drop(index=bad_ix, inplace=True)
    if not len(df):
        return HttpResponseBadRequest('Every molecule leads to bad descriptor values')
    df = fplib.compute_fingerprints(df, fps)
    df.drop(columns='ROMol', inplace=True)

    # Columns after the first len(descs) ones are searched for fingerprint columns.
    fp_columns = df.columns[len(descs):]
    res = {ix: {} for ix in df.index}
    for name in model_names:
        m = model_manager.models[name]
        cols = copy(m.descriptors)
        for fp in m.fingerprints:
            prefix = f'{fp.alias}['
            cols.extend(column for column in fp_columns if column.startswith(prefix))
        pred = m.predict(df[cols])
        # Strip the '<model>_' prefix from the prediction column names.
        pred.columns = [col.replace(f'{name}_', '') for col in pred.columns]
        for ix, row in pred.to_dict('index').items():
            res[ix][name] = row
    return JsonResponse(res)