def inchi_to_graph(inchi, max_atomic_number=118, device=torch.device('cpu')):
    """
    Converts an inchi string to a DGL Graph object and associate the one hot
    encoding features for each node.

    :param inchi: An inchi string
    :param max_atomic_number: The max_atomic_number determines the final size
        (number of columns) of the nodes feature matrix. The atomic number of
        an atom is used directly as its column index, so every atom must have
        an atomic number strictly lower than this value.
    :param device: torch device the nodes feature matrix is moved to
    :return: DGL.Graph
    :raises ValueError: if the inchi string cannot be parsed
    """
    mol = MolFromInchi(inchi)
    if mol is None:
        # Fail early with an explicit message instead of an AttributeError
        # on the first method call below.
        raise ValueError('Could not parse InChI: "{}"'.format(inchi))
    num_atoms = mol.GetNumAtoms()

    # DGLGraph creation from rdkit mol object
    graph = dgl.DGLGraph()
    graph.add_nodes(num_atoms)
    for bond in mol.GetBonds():
        src = bond.GetBeginAtomIdx()
        dest = bond.GetEndAtomIdx()
        graph.add_edge(src, dest)
        # Edges in DGL are directional, to ensure bidirectionality, add reverse edge
        graph.add_edge(dest, src)

    # One hot encoding for nodes features: one row per atom, a single 1 in
    # the column given by the atomic number. The explicit dtype and shape
    # keep the index tensor valid even when the molecule has no atom.
    one_hot_indexes = torch.tensor(
        [mol.GetAtomWithIdx(atom_index).GetAtomicNum()
         for atom_index in range(num_atoms)],
        dtype=torch.long).reshape(num_atoms, 1)
    graph.ndata['x'] = torch.zeros(num_atoms, max_atomic_number) \
        .scatter_(1, one_hot_indexes, 1).to(device)
    return graph
def test_sequence_minimal():
    """Check the default Standardizer sequence on violacein."""
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    violacein_smiles = 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'

    # Violacein: the InChI is expected to survive the standardisation as is
    parsed = MolFromInchi(violacein_inchi)
    standardized = Standardizer().compute(parsed)
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == violacein_smiles

    # L-Lactate
    parsed = MolFromInchi('')
def test_keep_biggest():
    """Check Filters.keep_biggest against the expected depiction of each input."""
    # (reader, input depiction, writer, expected depiction of the result)
    cases = [
        (MolFromSmiles, 'CCCC.CC', MolToSmiles, 'CCCC'),
        (MolFromSmiles, 'CCCCC.CC.[H].CCC', MolToSmiles, 'CCCCC'),
        (MolFromInchi,
         'InChI=1S/C5H12N2O2.C4H7NO4/c6-3-1-2-4(7)5(8)9;5-2(4(8)9)1-3(6)7/h4H,1-3,6-7H2,(H,8,9);2H,1,5H2,(H,6,7)(H,8,9)/t4-;2-/m00/s1',
         MolToInchi,
         'InChI=1S/C4H7NO4/c5-2(4(8)9)1-3(6)7/h2H,1,5H2,(H,6,7)(H,8,9)/t2-/m0/s1'),
        (MolFromInchi, 'InChI=1S/Mo.4O/q;;;2*-1', MolToInchi, 'InChI=1S/Mo'),
    ]
    for reader, depiction, writer, expected in cases:
        kept = Filters.keep_biggest(reader(depiction))
        assert writer(kept) == expected
def annotate_chemical_svg(network):
    """Annotate chemical nodes with SVGs depiction.

    Every node of type "chemical" that carries a non-None InChI gets a
    ``svg`` entry in its ``data`` dict: a URL-quoted SVG data URI on success,
    ``None`` when the depiction fails (the failure is logged as a warning).
    Other nodes are left untouched. The network is modified in place.

    :param network: dict, network of elements as outputted by the sbml_to_json method
    :return: dict, network annotated
    """
    from rdkit.Chem import MolFromInchi
    from rdkit.Chem.Draw import rdMolDraw2D
    from rdkit.Chem.AllChem import Compute2DCoords
    from urllib import parse

    for node in network['elements']['nodes']:
        data = node['data']
        if data['type'] != 'chemical' or data['inchi'] is None:
            continue
        inchi = data['inchi']
        try:
            mol = MolFromInchi(inchi)
            if mol is None:
                raise ValueError('RDKit could not build a molecule from the InChI')
            Compute2DCoords(mol)
            drawer = rdMolDraw2D.MolDraw2DSVG(200, 200)
            drawer.DrawMolecule(mol)
            drawer.FinishDrawing()
            svg_draft = drawer.GetDrawingText().replace("svg:", "")
            data['svg'] = 'data:image/svg+xml;charset=utf-8,' + parse.quote(svg_draft)
        except Exception as e:
            # Best effort: a failed depiction must not abort the annotation.
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit still propagate.
            msg = 'SVG depiction failed from inchi: "{}"'.format(inchi)
            logging.warning(msg)
            logging.warning("Below the RDKit backtrace...")
            logging.warning(e)
            data['svg'] = None
    return network
def drawChemicalList(self, id_inchi, subplot_size=[200, 200]):
    """Build one SVG string per identifier of ``id_inchi``.

    For each distinct InChI a one-column grid image of all the molecules is
    drawn with that molecule in first position. The SVG text is then
    filtered: every line holding a decimal number greater than the cell
    height is dropped and the ``height='…`` value is rewritten to the cell
    height (presumably to keep only the first cell of the grid).

    :param id_inchi: dict mapping an identifier to an InChI string
    :param subplot_size: [width, height] of one molecule cell
    :return: dict mapping each identifier to its SVG string
    """
    from rdkit.Chem import MolFromInchi
    from rdkit.Chem import Draw

    # Raw strings: "\d" in a regular string literal is an invalid escape
    # sequence. Compiled once instead of once per SVG line.
    decimal_re = re.compile(r"(\d+\.\d+)")
    height_re = re.compile(r"height='\d+")
    cell_width, cell_height = subplot_size[0], subplot_size[1]

    toRet = {}
    inchi_list = list({id_inchi[key] for key in id_inchi})
    list_mol = [MolFromInchi(inchi) for inchi in inchi_list]
    for i, inchi in enumerate(inchi_list):
        # The molecule of interest goes first, followed by copies of the others
        others = copy.deepcopy(list_mol[:i] + list_mol[i + 1:])
        img = Draw.MolsToGridImage([list_mol[i]] + others,
                                   molsPerRow=1,
                                   subImgSize=(cell_width, cell_height),
                                   useSVG=True)
        #add the groups tag with the id's of the reactions -- should have be size width=subplot_size[0] height=subplot_size[1]*len(list_mol)
        kept_lines = []
        for line in img.splitlines():
            if any(float(number) > cell_height
                   for number in decimal_re.findall(line)):
                continue
            kept_lines.append(
                height_re.sub("height='" + str(cell_height), line))
        svg_str = ''.join(line + '\n' for line in kept_lines)
        for key in id_inchi:
            if id_inchi[key] == inchi:
                toRet[key] = svg_str
    return toRet
def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}): # Import (if needed) if itype == 'smiles': rdmol = MolFromSmiles(idepic, sanitize=True) elif itype == 'inchi': rdmol = MolFromInchi(idepic, sanitize=True) else: raise NotImplementedError( '"{}" is not a valid input type'.format(itype)) if rdmol is None: # Check imprt raise self.DepictionError( 'Import error from depiction "{}" of type "{}"'.format( idepic, itype)) # Export odepic = dict() for item in otype: if item == 'smiles': odepic[item] = MolToSmiles( rdmol ) # MolToSmiles is tricky, one mays want to check the possible options.. elif item == 'inchi': odepic[item] = MolToInchi(rdmol) elif item == 'inchikey': odepic[item] = MolToInchiKey(rdmol) else: raise NotImplementedError( '"{}" is not a valid output type'.format(otype)) return odepic
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert chemical depiction to others type of depictions

    :param idepic: string depiction to be converted, str
    :param itype: type of depiction provided as input, str
    :param otype: types of depiction to be generated, {"", "", ..}
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: on an unknown input or output type
    :raises Exception: if the input depiction cannot be imported

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    # Import (if needed)
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import
        raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
    # Export
    odepic = dict()
    for item in otype:
        if item == 'smiles':
            # MolToSmiles is tricky, one may want to check the possible options..
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item, not the whole set of requested types
            raise NotImplementedError('"{}" is not a valid output type'.format(item))
    return odepic
def _transform(self, x):
    """Return the Morgan fingerprint bits set for one record.

    The molecule is built from ``x['standard_inchi']``; when that fails
    (exception or no molecule returned) ``x['Compound_SMILES']`` is used
    instead.

    :param x: mapping with the keys 'standard_inchi' and 'Compound_SMILES'
    :return: list of the bit identifiers reported in the fingerprint bitInfo
    """
    try:
        mol = MolFromInchi(x['standard_inchi'])
    except Exception:
        mol = None
    if mol is None:
        # A parser that reports failure by returning None instead of raising
        # must trigger the SMILES fallback as well.
        mol = MolFromSmiles(x['Compound_SMILES'])
    info = {}
    # Only the bitInfo side output is used; the bit vector itself is discarded
    AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, self.dim, bitInfo=info)
    return list(info.keys())
def test_remove_stereo():
    """Check Filters.remove_stereo, then the effect of an InChI round trip."""
    lactate = MolFromSmiles('C[C@@H](C(=O)[O-])O')
    flat_lactate = Filters.remove_stereo(lactate)
    assert MolToSmiles(flat_lactate) == 'CC(O)C(=O)[O-]'

    violacein = MolFromInchi(
        'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
    flat_violacein = Filters.remove_stereo(violacein)
    assert MolToSmiles(flat_violacein) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(O)=Nc2ccccc21'

    # Expected to change tautomerism
    commuted = Filters.commute_inchi(flat_violacein)
    assert MolToSmiles(commuted) == 'O=C1NC(C2=CNC3=C2C=C(O)C=C3)=CC1=C1C(=O)NC2=CC=CC=C21'
def add_exact_mass(specs):
    """Store the exact molecular weight of each item of ``specs``.

    The molecule is built from the item's 'smiles' entry, falling back to
    its 'inchi' entry when no molecule is obtained. The computed mass is
    stored under 'exact_mass'; items whose mass differs from their
    'parent_mass' (0.0 when absent) by more than 1 are printed.

    :param specs: iterable of objects exposing ``get(key[, default])`` and
        ``set(key, value)``
    :return: None, the items are updated in place
    """
    for s in specs:
        mol = MolFromSmiles(s.get('smiles'))
        if mol is None:
            mol = MolFromInchi(s.get('inchi'))
        exact_mass_smi = CalcExactMolWt(mol)
        # The comparison must be applied to the absolute difference; taking
        # abs() of the boolean `diff > 1` silently ignored negative deviations.
        if abs(exact_mass_smi - s.get('parent_mass', 0.0)) > 1:
            print(exact_mass_smi, s.get('parent_mass'))
        s.set('exact_mass', exact_mass_smi)
def standarize_mol_by_inchi(mol, neutralize=True):
    """Rebuild a molecule from the InChI generated for it.

    :param mol: molecule to standardise
    :param neutralize: when true, the generated InChI goes through
        neutralize_inchi before the molecule is rebuilt
    :return: molecule rebuilt from the InChI, with hydrogens added
    """
    hydrogenated = AddHs(mol)
    # generate_inchi yields three values; only the InChI itself is used
    sinchi, _code, _msg = generate_inchi(hydrogenated, FixedH=False, RecMet=False)
    nsinchi = neutralize_inchi(sinchi) if neutralize else sinchi
    rebuilt = MolFromInchi(nsinchi, removeHs=False)
    return AddHs(rebuilt, explicitOnly=True)
def test_sequence_rr_legacy():
    """Check the 'sequence_rr_legacy' Standardizer sequence on violacein."""
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    expected_smiles = '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'

    # Violacein: same InChI expected back, SMILES with explicit hydrogens
    parsed = MolFromInchi(violacein_inchi)
    standardized = Standardizer(sequence_fun='sequence_rr_legacy').compute(parsed)
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == expected_smiles
def commute_inchi(cls, mol_in):
    """Round-trip an RDKit compound through its InChI depiction.

    The compound is written to InChI and a new compound is read back from
    that string; the properties of the initial compound are then copied to
    the new one.

    :param mol_in: RDKit Mol
    :return mol_out: RDKit Mol
    :raises ValueError: if no compound is obtained back from the InChI
    """
    # logLevel=None because the InChI writer is talkative
    mol_out = MolFromInchi(
        MolToInchi(mol_in, logLevel=None),
        sanitize=False,
        removeHs=False,
        logLevel=None,
        treatWarningAsError=False)
    if mol_out:
        # Copy the properties
        cls._copy_properties(mol_in, mol_out)
        return mol_out
    raise ValueError("Failed InChi validity filter.")
def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}): """Convert chemical depiction to others type of depictions Usage example: - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'}) - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'}) :param idepic: Input string :param itype: The type of input :param otype: Type of output. Valid options: inchi, smiles, inchikey :type idepic: str :type itype: str :type otype: dict :rtype: dict :return: Dictionnary of results """ # Import (if needed) if itype == 'smiles': rdmol = MolFromSmiles(idepic, sanitize=True) elif itype == 'inchi': rdmol = MolFromInchi(idepic, sanitize=True) else: raise NotImplementedError('"{}" is not a valid input type'.format(itype)) if rdmol is None: # Check imprt raise self.DepictionError('Import error from depiction "{}" of type "{}"'.format(idepic, itype)) # Export odepic = dict() for item in otype: if item == 'smiles': odepic[item] = MolToSmiles(rdmol) # MolToSmiles is tricky, one mays want to check the possible options.. elif item == 'inchi': odepic[item] = MolToInchi(rdmol) elif item == 'inchikey': odepic[item] = MolToInchiKey(rdmol) else: raise NotImplementedError('"{}" is not a valid output type'.format(otype)) return odepic
def rdmols_from_document(document, build_from="inchi", add_hs=True):
    """Rebuild the nested lists of rdmols described by a document.

    :param document: dict holding 'list_stoechiometry' and, depending on
        build_from, 'list_list_inchis' or 'list_list_smiles'
    :param build_from: the type of depiction to be used to build back the
        rdmols, str in ["inchi", "smiles"]
    :param add_hs: add Hs to RDKit mol object, default is True
    :returns: (list of list of rdmols, document['list_stoechiometry'])
    """
    assert build_from in ["inchi", "smiles"]
    assert add_hs in [True, False]

    def _read(depiction):
        # Parser matching the requested depiction type
        if build_from == 'inchi':
            return MolFromInchi(depiction, sanitize=True)
        return MolFromSmiles(depiction, sanitize=True)

    stoechiometries = document['list_stoechiometry']

    if build_from == 'inchi':
        source_key = 'list_list_inchis'
    elif build_from == 'smiles':
        source_key = 'list_list_smiles'
    else:
        raise NotImplementedError()

    nested_rdmols = []
    for depictions in document[source_key]:
        group = []
        for depiction in depictions:
            rd_mol = _read(depiction)
            if add_hs:
                rd_mol = AddHs(rd_mol)
            group.append(rd_mol)
        nested_rdmols.append(group)
    return nested_rdmols, stoechiometries
def generate_structure_and_dictionary(batch):
    """
    Adding the structure data to a compound batch object

    A batch that already has an id is returned unchanged. A batch without a
    ctab is treated as a blinded compound: it only receives a generated
    identifier. Otherwise the structure fields of the batch are filled in
    and, when the batch is not yet linked to a molecule dictionary entry,
    one is looked up (or created) and linked.

    :param batch: compound batch object; it is modified and saved
    :return: the same batch object
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # listing without indentation -- confirm it against the original file,
    # in particular the position of the CompoundStructures creation.
    chirality = "1"  # only referenced by the commented-out keyword below
    if batch.id:
        print "not updating"
        # currently we dont update existing compound records
    else:
        if not batch.ctab:
            #blinded compound
            uox_id = generate_uox_id()
            batch.blinded_batch_id = uox_id
            batch.save(validate=False)
        else:
            if not batch.canonical_smiles or not batch.related_molregno_id:
                # Best effort: any failure while deriving the SMILES / cdxml
                # is ignored.
                # NOTE(review): the bare except also hides programming errors.
                try:
                    pybelmol = readstring("mol", str(batch.ctab).encode("ascii"))
                    batch.canonical_smiles = pybelmol.write("can").split(
                        "\t")[0]
                    batch.properties["cdxml"] = pybelmol.write("cdxml")
                except:
                    pass
                # Best effort as well: std_ctab is only set when a molecule
                # could be built from the standard InChI.
                try:
                    mol = MolFromInchi(
                        batch.standard_inchi.encode('ascii', 'ignore'))
                    if mol:
                        batch.std_ctab = MolToMolBlock(mol, includeStereo=True)
                except:
                    pass
                inchi_key = batch.standard_inchi_key
                inchi = batch.standard_inchi
            if not batch.related_molregno_id:
                try:
                    # Reuse the dictionary entry of the project that has the
                    # same structure key, if there is one
                    moldict = MoleculeDictionary.objects.get(
                        project=batch.project,
                        structure_type="MOL",
                        # chirality=chirality,
                        structure_key=batch.standard_inchi_key)
                except ObjectDoesNotExist:
                    uox_id = None
                    forced_uox_id = batch.warnings.get(
                        "original_uox_id", None)
                    if forced_uox_id:
                        # The requested identifier is only honoured when no
                        # existing batch uses it
                        count_existing_objects = CBHCompoundBatch.objects.filter(
                            related_molregno__chembl__chembl_id=
                            forced_uox_id).count()
                        count_existing_objects += CBHCompoundBatch.objects.filter(
                            blinded_batch_id=forced_uox_id).count()
                        if count_existing_objects == 0:
                            uox_id = forced_uox_id
                            #Now check if there is a chembl and remove if so
                            ChemblIdLookup.objects.filter(
                                chembl_id=uox_id).delete()
                        else:
                            print(
                                "Had to generate a new compound ID for %s" %
                                forced_uox_id)
                    if not uox_id:
                        uox_id = generate_uox_id()
                    # Temporary negative entity_id, replaced by the molregno
                    # once the dictionary entry exists
                    rnd = random.randint(-1000000000, -2)
                    uox_id_lookup = ChemblIdLookup.objects.create(
                        chembl_id=uox_id, entity_type="COMPOUND", entity_id=rnd)
                    moldict = MoleculeDictionary.objects.get_or_create(
                        chembl=uox_id_lookup,
                        project=batch.project,
                        structure_type="MOL",
                        structure_key=batch.standard_inchi_key)[0]
                    uox_id_lookup.entity_id = moldict.molregno
                    uox_id_lookup.save()

                    structure = CompoundStructures(
                        molecule=moldict,
                        molfile=batch.std_ctab,
                        standard_inchi_key=inchi_key,
                        standard_inchi=inchi)
                    structure.save()
                    if structure.molecule_id:
                        generateCompoundPropertiesTask(structure)
                batch.related_molregno = moldict
            batch.save(validate=False)
    return batch
def test_commute_inchi():
    """Check that Filters.commute_inchi gives back the same InChI."""
    expected = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1'
    parsed = MolFromInchi(expected)
    commuted = Filters.commute_inchi(parsed)
    assert MolToInchi(commuted) == expected
def test1InchiReadPubChem(self):
    """Round-trip every molecule of the datasets through InChI.

    For each molecule ``m`` an InChI ``x`` is written, read back, and the
    molecule read back is written to InChI again (through SMILES) as ``y``.
    Each molecule is counted as ``same`` (x == y), ``reasonable`` (a
    difference explained by one of the checks below) or ``diff``. The three
    counters are reset for every entry of ``self.dataset`` and compared to
    fixed expected values.
    """
    # NOTE(review): nesting reconstructed from a listing without
    # indentation -- confirm against the original file.
    for f in self.dataset.values():
        same, diff, reasonable = 0, 0, 0
        for m in f:
            if m is None:  # pragma: nocover
                continue
            x = MolToInchi(m)
            y = None
            # Error logging is switched off only around the read-back
            RDLogger.DisableLog('rdApp.error')
            mol = MolFromInchi(x)
            RDLogger.EnableLog('rdApp.error')
            if mol is not None:
                y = MolToInchi(
                    MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
            if y is None:
                # metal involved?
                try:
                    MolToInchi(m, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    _, error = inst.args
                    if 'Metal' in error or \
                            'Charges were rearranged' in error:
                        reasonable += 1
                        continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # RDKit does not like the SMILES? use MolBlock instead
                inchiMol = MolFromInchi(x)
                if inchiMol:
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(
                        MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue
                # InChI messed up the radical?
                # Compares the sums of (radical electrons * atomic number)
                # of the source molecule and of the unsanitized read-back.
                unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                if sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in m.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]) != sum([
                        a.GetNumRadicalElectrons() * a.GetAtomicNum()
                        for a in unsanitizedInchiMol.GetAtoms()
                        if a.GetNumRadicalElectrons() != 0
                ]):
                    reasonable += 1
                    continue

                diff += 1
                cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                      cid + '\n' + COLOR_RESET)
                continue
            if x != y:
                # if there was warning in the first place, then this is
                # tolerable
                try:
                    MolToInchi(m, treatWarningAsError=True)
                    MolFromInchi(x, treatWarningAsError=True)
                except InchiReadWriteError as inst:
                    reasonable += 1
                    continue
                # or if there are big rings
                SanitizeMol(m)
                # NOTE(review): on Python 3 `filter` returns a lazy iterator,
                # which is always truthy, so this branch is taken by every
                # molecule reaching it. The counts asserted at the end may
                # depend on that -- confirm before changing it.
                if filter(lambda i: i >= 8, [len(r) for r in m.GetRingInfo().AtomRings()]):
                    reasonable += 1
                    continue
                # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                # or if RDKit loses bond stereo
                s = MolToSmiles(m, True)
                if MolToSmiles(MolFromSmiles(s), True) != s:
                    reasonable += 1
                    continue
                # or if it is RDKit SMILES writer unhappy about the mol
                inchiMol = MolFromInchi(x)

                rdDepictor.Compute2DCoords(inchiMol)
                z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                if x == z:
                    reasonable += 1
                    continue

                diff += 1
                # NOTE(review): `cid` is only assigned in the `y is None`
                # branch above; here it is either unbound or left over from
                # an earlier molecule.
                print(COLOR_GREEN + 'Molecule mismatch for PubChem Compound ' +
                      cid + COLOR_RESET)
                print(inchiDiff(x, y))
                print()

            else:
                same += 1

        fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
        print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
        self.assertEqual(same, 621)
        self.assertEqual(diff, 0)
        self.assertEqual(reasonable, 560)
def save(self, force_insert=False, force_update=False, *args, **kwargs):
    """Save the compound structure after refreshing its derived fields.

    Two paths are selected by ``settings.OPEN_SOURCE``: the first derives
    the InChI key with InchiToInchiKey and rewrites the molfile, the second
    reads InChI, InChI key and canonical SMILES from ``getStructure``. Both
    raise NoStandardInchi when no standard InChI is available, then validate
    and save the record. When something changed, the related molecule is
    updated and saved and the ``structureChanged`` signal is sent.

    :param force_insert: forwarded to the parent save
    :param force_update: forwarded to the parent save
    :raises NoStandardInchi: if the record has no standard InChI
    """
    # NOTE(review): nesting reconstructed from a listing without
    # indentation -- confirm against the original file, in particular what
    # belongs to the body of `if mol:` below.
    changed = False
    # "new" means that no row with this primary key exists yet
    new = not bool(CompoundStructures.objects.filter(pk=self.pk).count())
    if settings.OPEN_SOURCE:
        if self.molfile:
            if not new:  # The structure already exists and we only want to modify it
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)  # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                changed = True
            # newInchi = inchiFromPipe(self.molfile, settings.INCHI_BINARIES_LOCATION['1.02'])
            #if newInchi != self.standard_inchi:
            #    self.standard_inchi = newInchi
            #    changed = True
            # NOTE(review): standard_inchi is dereferenced here before the
            # "no standard InChI" check below; a None value would fail with
            # an AttributeError instead of NoStandardInchi.
            mol = MolFromInchi(self.standard_inchi.encode("ascii"))
            if mol:
                # self.canonical_smiles = MolToSmiles(mol)
                if not self.standard_inchi:
                    raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))
        newInchiKey = InchiToInchiKey(self.standard_inchi.encode("ascii"))
        if self.standard_inchi_key != newInchiKey:
            self.standard_inchi_key = newInchiKey
            mol = MolFromInchi(self.standard_inchi.encode("ascii"))
            # self.canonical_smiles = MolToSmiles(mol)
            changed = True
            self.molfile = MolToMolBlock(MolFromMolBlock(str(self.molfile)))  # This is how we do kekulisation in RDKit...

        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
    else:
        if self.molfile:
            if not new:  # The structure already exists and we only want to modify it
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)  # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                changed = True
            data = getStructure(self.molfile)
            newInchi = data['InChI']
            if newInchi != self.standard_inchi:
                # The derived fields are only refreshed when the InChI changed
                self.standard_inchi = newInchi
                self.standard_inchi_key = data['InChIKey']
                #self.molformula = data['Molecular_Formula']
                self.canonical_smiles = data['Canonical_Smiles']
                changed = True
        if not self.standard_inchi:
            raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))
        if not self.standard_inchi_key:
            self.standard_inchi_key = InchiToInchiKey(self.standard_inchi.encode("ascii"))
        self.clean_fields()
        self.validate_unique()
        super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)
    if changed:
        # Propagate the structure change to the related molecule record
        self.molecule.structure_key = self.standard_inchi_key
        self.molecule.structure_type = "MOL"
        self.molecule.molfile_update = datetime.now()
        self.molecule.save()
        structureChanged.send(sender=self.__class__, instance=self)
# Smoke check: build an RDKit molecule from an InChI that carries two
# double-bond stereo descriptors (the "/b3-1+,4-2+" layer).
from rdkit.Chem import MolFromInchi

inchi = 'InChI=1S/C6H6O4/c7-5(8)3-1-2-4-6(9)10/h1-4H,(H,7,8)(H,9,10)/b3-1+,4-2+'
mol = MolFromInchi(inchi)
# Fails when no molecule was obtained (falsy result)
assert mol
def get_charge_from_inchi(inchi, removeHs=False):
    """Return the formal charge of the molecule described by an InChI.

    :param inchi: InChI string
    :param removeHs: forwarded to MolFromInchi
    :return: value returned by GetFormalCharge for the parsed molecule
    """
    return GetFormalCharge(MolFromInchi(inchi, removeHs=removeHs))
# Using production model print("Production model running...") w_path = os.path.join(MODELS_PATH, f"{data}_noHs.pt") model = MPNNPredictor( node_in_feats=49, edge_in_feats=10, global_feats=4, n_tasks=1, output_f=output_f, ).to(DEVICE) model.load_state_dict(torch.load(w_path, map_location=DEVICE)) gis = [ molecule_importance(MolFromInchi(inchi), model)[4] for inchi in tqdm(inchis) ] global_importances = np.vstack(gis) np.save(os.path.join(DATA_PATH, f"importances{data}.npy"), arr=global_importances) # Using oof models global_importances_oof = [] kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED) for idx_split, (_, idx_test) in enumerate(kf.split(inchis)): print("Split {}/{} running...".format(idx_split + 1, N_FOLDS)) inchis_test, values_test = ( inchis[idx_test].tolist(),
def test_sequence_tunable():
    """Check the defaults of sequence_tunable and its effect on two compounds."""
    # Check default arguments
    spec = inspect.getfullargspec(sequence_tunable)
    default_params = dict(zip(spec.args[-len(spec.defaults):], spec.defaults))
    assert default_params == {
        'OP_REMOVE_ISOTOPE': True,
        'OP_NEUTRALISE_CHARGE': True,
        'OP_REMOVE_STEREO': False,
        'OP_COMMUTE_INCHI': False,
        'OP_KEEP_BIGGEST': True,
        'OP_ADD_HYDROGEN': True,
        'OP_KEKULIZE': True,
        'OP_NEUTRALISE_CHARGE_LATE': True
    }

    violacein = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    violacein_flat = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
    lactate = 'C[C@@H](C(=O)[O-])O'
    lactate_inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
    lactate_inchi_flat = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'

    # (reader, input depiction, params or None for the defaults,
    #  expected InChI, expected SMILES)
    cases = [
        # Violacein, default parameter
        (MolFromInchi, violacein, None, violacein,
         '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'),
        # Violacein, strip stereo
        (MolFromInchi, violacein, {'OP_REMOVE_STEREO': True}, violacein_flat,
         '[H]OC1=C([H])C2=C(C([H])=C1[H])N([H])C([H])=C2C1=C([H])C(=C2C(=O)N([H])C3=C([H])C([H])=C([H])C([H])=C23)C(=O)N1[H]'),
        # Violacein, implicit Hs
        (MolFromInchi, violacein, {'OP_ADD_HYDROGEN': False}, violacein,
         'OC1=CC2=C(C=C1)NC=C2C1=C/C(=C2/C3=CC=CC=C3N=C2O)C(O)=N1'),
        # Violacein, no kekulization
        (MolFromInchi, violacein, {'OP_KEKULIZE': False}, violacein,
         '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'),
        # Violacein, strip stereo & implicit Hs & no kekulization
        (MolFromInchi, violacein,
         {'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False, 'OP_KEKULIZE': False},
         violacein_flat,
         'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'),
        # Lactate, default parameter
        (MolFromSmiles, lactate, None, lactate_inchi,
         '[H]OC(=O)[C@@]([H])(O[H])C([H])([H])[H]'),
        # L-lactate, implicit Hs
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False}, lactate_inchi,
         'C[C@H](O)C(=O)O'),
        # L-lactate, no stereo
        (MolFromSmiles, lactate, {'OP_REMOVE_STEREO': True}, lactate_inchi_flat,
         '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'),
        # L-lactate, no charge neutralisation
        (MolFromSmiles, lactate,
         {'OP_NEUTRALISE_CHARGE': False, 'OP_NEUTRALISE_CHARGE_LATE': False},
         'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1',
         '[H]O[C@]([H])(C(=O)[O-])C([H])([H])[H]'),
        # L-lactate, implicit Hs & no stereo
        (MolFromSmiles, lactate,
         {'OP_ADD_HYDROGEN': False, 'OP_REMOVE_STEREO': True},
         lactate_inchi_flat,
         'CC(O)C(=O)O'),
    ]

    for reader, depiction, params, expected_inchi, expected_smiles in cases:
        # A fresh molecule is parsed for every case
        mol = reader(depiction)
        if params is None:
            standardizer = Standardizer(sequence_fun='sequence_tunable')
        else:
            standardizer = Standardizer(sequence_fun='sequence_tunable', params=params)
        ans = standardizer.compute(mol)
        assert MolToInchi(ans) == expected_inchi
        assert MolToSmiles(ans) == expected_smiles
def process(self,
            input: Union[str, list] = "",
            input_file: str = "",
            output_file: str = "",
            output_file_sdf: str = "",
            output_file_cml: str = "",
            sdf_append: bool = False,
            format_output: bool = True,
            opsin_output_format: str = "",
            output_formats: list = None,
            write_header: bool = True,
            dry_run: bool = False,
            csv_delimiter: str = ";",
            standardize_mols: bool = True,
            normalize_plurals: bool = True,
            continue_on_failure: bool = False) -> OrderedDict:
    r"""
    Process the input (IUPAC names) with OPSIN.

    Parameters
    ----------
    input : str or list
        | str: String with IUPAC names, one per line.
        | list: List of IUPAC names.
        | If both `input` and `input_file` are given, `input` wins.
    input_file : str
        Path to file to be processed by OPSIN. One IUPAC name per line.
    output_file : str
        File to write output in.
    output_file_sdf : str
        File to write SDF output in.
    output_file_cml : str
        | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
        | Not supported by RDKit so standardization and conversion to other formats cannot be done.
    sdf_append : bool
        If True, append new molecules to existing SDF file or create new one if doesn't exist.
    format_output : bool
        | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
        | "iupac", <output formats>, ..., "error"
        | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
        | If False, the value of "content" key of returned dict will be None.
    opsin_output_format : str
        | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
        | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
    output_formats : list
        | If set and `format_output` is True, this specifies which molecule formats will be output.
        | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
          or with `opsin_output_format` here.
        | Default value: ["smiles"]

        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |         Value         |         Source        |                                            Note                                            |
        +=======================+=======================+============================================================================================+
        |         smiles        |         RDKit         | canonical                                                                                  |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |      smiles_opsin     |     OPSIN ("smi")     | SMILES                                                                                     |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        | smiles_extended_opsin | OPSIN ("extendedsmi") | Extended SMILES. Not supported by RDKit.                                                   |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn't support wildcard characters etc.) |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |      inchi_opsin      |    OPSIN ("inchi")    | InChI                                                                                      |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |     stdinchi_opsin    |   OPSIN ("stdinchi")  | standard InChI                                                                             |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |        inchikey       |         RDKit         | The same applies as for "inchi". Also molecule cannot be created from InChI-key.           |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |   stdinchikey_opsin   | OPSIN ("stdinchikey") | Standard InChI-key. Cannot be used by RDKit to create molecule.                            |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
        |          sdf          |         RDKit         | If present, an additional SDF file will be created.                                        |
        +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
    write_header : bool
        If True and if `output_file` is set and `format_output` is True, write a CSV header.
    dry_run : bool
        If True, only return the command line (a single space-joined string) that would be
        passed to subprocess; OPSIN itself is not executed.
    csv_delimiter : str
        Delimiter for output CSV file.
    standardize_mols : bool
        If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
    normalize_plurals : bool
        | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can set your own regex pattern with `plural_patterns` in __init__.
    continue_on_failure : bool
        | If True, continue running even if OPSIN returns non-zero exit code.
        | If False and error occurs, print it and return.

    Returns
    -------
    dict
        Keys:

        - stdout: str ... standard output from OPSIN
        - stderr: str ... standard error output from OPSIN
        - exit_code: int ... exit code from OPSIN
        - content:

          - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
          - None ... when format_output is False

        When `dry_run` is True a `str` with the command line is returned instead.

    Raises
    ------
    ValueError
        If neither `input` nor `input_file` is set, or the OPSIN output format is unknown.
    UserWarning
        If the input is empty once it has been prepared for OPSIN.
    """
    options_internal = self.options_internal.copy()
    opsin_nonreadable_formats = ["cml", "stdinchikey"]

    if input and input_file:
        input_file = ""
        self.logger.warning(
            "Both 'input' and 'input_file' are set, but 'input' will be prefered."
        )
    elif not input and not input_file:
        raise ValueError("One of 'input' or 'input_file' must be set.")

    # OPSIN output format check
    if opsin_output_format:
        options_internal["output_format"] = opsin_output_format
    else:
        opsin_output_format = options_internal["output_format"]

    opsin_valid_output_formats = {
        "cml": "cml_opsin",
        "smi": "smiles_opsin",
        "extendedsmi": "smiles_extended_opsin",
        "inchi": "inchi_opsin",
        "stdinchi": "stdinchi_opsin",
        "stdinchikey": "stdinchikey_opsin"
    }

    if opsin_output_format not in opsin_valid_output_formats:
        raise ValueError(
            "Unknown OPSIN output format. Possible values: {}".format(
                list(opsin_valid_output_formats.keys())))

    if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
        self.logger.warning(
            "OPSIN output format is \"{}\", which cannot be used by RDKit."
            .format(opsin_output_format))

    # output formats check
    if not output_formats:
        output_formats = ["smiles"]
    elif opsin_output_format == "stdinchikey":
        output_formats = ["stdinchikey_opsin"]
    elif opsin_output_format == "extendedsmi":
        output_formats = ["smiles_extended_opsin"]
    else:
        possible_output_formats = ["smiles", "inchi", "inchikey", "sdf"]
        opsin_column = opsin_valid_output_formats[opsin_output_format]
        # Deduplicate, order deterministically and drop anything that is neither
        # an RDKit-derived format nor the column of the active OPSIN format.
        output_formats = [
            x for x in sorted(set(output_formats))
            if x in possible_output_formats or x == opsin_column
        ]

    if normalize_plurals:
        # Normalization works on the text itself, so a file input is read here
        # and handed to OPSIN through stdin afterwards.
        if input_file:
            with open(input_file, mode="r", encoding="utf-8") as f:
                input = "\n".join([x.strip() for x in f.readlines()])
            input_file = ""
        input = self.normalize_iupac(input)

    commands, _, _ = self.build_commands(options_internal,
                                         self._OPTIONS_REAL,
                                         self.path_to_binary)

    if input_file:
        # The file path is passed as an argument (`input` is empty in this branch).
        commands.append(input_file)
    elif input:
        if isinstance(input, list):
            input = "\n".join([x.strip() for x in input])
    else:
        raise UserWarning("Input is empty.")

    # A dry run must return before anything is executed.
    if dry_run:
        return " ".join(commands)

    if input_file:
        stdout, stderr, exit_code = common_subprocess(commands)
    else:
        stdout, stderr, exit_code = common_subprocess(commands, stdin=input)

    to_return = {
        "stdout": stdout,
        "stderr": stderr,
        "exit_code": exit_code,
        "content": None
    }

    if not continue_on_failure and exit_code > 0:
        self.logger.warning("OPSIN error:")
        eprint("\n\t".join("\n{}".format(stderr).splitlines()))
        return to_return

    if output_file_cml and opsin_output_format == "cml":
        with open(output_file_cml, mode="w", encoding="utf-8") as f:
            f.write(stdout)
        return to_return
    elif output_file_cml and opsin_output_format != "cml":
        self.logger.warning(
            "Output file for CML is requested, but OPSIN output format is '{}'"
            .format(opsin_output_format))

    if not format_output:
        if output_file:
            with open(output_file, mode="w", encoding="utf-8") as f:
                f.write(stdout)
        return to_return

    compounds = []
    standardizer = Standardizer()
    empty_cols = OrderedDict([(x, "") for x in output_formats])

    stdout_lines = stdout.split("\n")
    del stdout_lines[-1]  # drop the element that follows the final newline
    # remove first line of stderr because there is OPSIN message
    stderr_lines = [x.strip() for x in stderr.split("\n")[1:] if x]

    if input_file:
        with open(input_file, mode="r", encoding="utf-8") as f:
            lines = f.readlines()
    else:
        lines = input.split("\n")

    mol_output_template = OrderedDict.fromkeys(["iupac"] + output_formats +
                                               ["error"])

    writer = None
    sdf_handle = None
    try:
        if output_file_sdf:
            if sdf_append:
                # Mode "a" creates the file when it does not exist yet.
                sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        # Index of the next unread stderr message. Kept under its own name so
        # that no `except ... as` clause can unbind it.
        error_idx = 0
        for i, line in enumerate(lines):
            line = line.strip()
            # OPSIN may return fewer lines than were sent (e.g. trailing blank
            # line in the input); treat a missing line as a failed conversion.
            converted = stdout_lines[i].strip() if i < len(stdout_lines) else ""
            mol_output = mol_output_template.copy()

            if converted:
                # Formats RDKit cannot read are passed through untouched.
                if opsin_output_format == "stdinchikey":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("stdinchikey_opsin", converted),
                                     ("error", "")]))
                    continue
                elif opsin_output_format == "extendedsmi":
                    compounds.append(
                        OrderedDict([("iupac", line),
                                     ("smiles_extended_opsin", converted),
                                     ("error", "")]))
                    continue

                # Reset per row: formats without an RDKit reader (e.g. "cml")
                # must not reuse the previous molecule or hit an unbound name.
                mol = None
                if opsin_output_format == "smi":
                    mol = MolFromSmiles(converted,
                                        sanitize=not standardize_mols)
                elif opsin_output_format in ["inchi", "stdinchi"]:
                    mol = MolFromInchi(converted,
                                       sanitize=not standardize_mols,
                                       removeHs=not standardize_mols)

                if mol:
                    if standardize_mols:
                        try:
                            mol = standardizer.standardize(mol)
                        except ValueError as err:
                            self.logger.warning(
                                "Cannot standardize '{}': {}".format(
                                    MolToSmiles(mol), str(err)))

                    for fmt in output_formats:
                        if fmt == "smiles":
                            mol_output["smiles"] = MolToSmiles(
                                mol, isomericSmiles=True)
                        elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                            mol_output["smiles_opsin"] = converted
                        elif fmt == "inchi":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchi"] = inchi
                            else:
                                mol_output["inchi"] = ""
                                self.logger.warning(
                                    "Cannot convert to InChI: {}".format(
                                        converted))
                        elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                            mol_output["inchi_opsin"] = converted
                        elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                            mol_output["stdinchi_opsin"] = converted
                        elif fmt == "inchikey":
                            inchi = MolToInchi(mol)
                            if inchi:
                                mol_output["inchikey"] = InchiToInchiKey(inchi)
                            else:
                                mol_output["inchikey"] = ""
                                self.logger.warning(
                                    "Cannot create InChI-key from InChI: {}".
                                    format(converted))
                        elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                            mol_output["stdinchikey_opsin"] = converted
                        elif fmt == "sdf":
                            mol_output["sdf"] = MolToMolBlock(
                                mol, includeStereo=True)
                            if writer is not None:
                                writer.write(mol)

                    mol_output.update(
                        OrderedDict([("iupac", line), ("error", "")]))
                else:
                    mol_output.update([
                        ("iupac", line),
                        ("error",
                         "Cannot convert to RDKit mol: {}".format(converted))
                    ])
                    mol_output.update(empty_cols)
                    # Log the error of the row being built; it is appended below.
                    self.logger.warning(mol_output["error"])
            else:
                # No output for this name: pair it with the next stderr message.
                try:
                    error = stderr_lines[error_idx].strip()
                except IndexError:
                    error = ""
                mol_output.update([("iupac", line), ("error", error)])
                mol_output.update(empty_cols)
                error_idx += 1

            compounds.append(mol_output)
    finally:
        # Flush and release the SDF output even if a conversion failed midway.
        if writer is not None:
            writer.close()
        if sdf_handle is not None:
            sdf_handle.close()

    to_return["content"] = compounds

    if output_file and compounds:
        dict_to_csv(to_return["content"],
                    output_file=output_file,
                    csv_delimiter=csv_delimiter,
                    write_header=write_header)
    elif output_file and not compounds:
        write_empty_file(output_file,
                         csv_delimiter=csv_delimiter,
                         header=list(mol_output_template.keys()),
                         write_header=write_header)

    return to_return