def get_formula(smiles, inchi):
    """Derive molecular formulas from a SMILES string and an InChI string.

    Each input is converted independently. An input of five characters or
    fewer is not attempted, and any failure while converting (unparsable
    text, a non-string argument, an RDKit error) yields an empty formula for
    that input instead of propagating.

    A formula only counts as usable when it is longer than two characters.

    Returns:
        A 2-tuple of strings:
          * both usable   -> (formula_from_smiles, formula_from_inchi)
          * only SMILES   -> (formula_from_smiles, "")
          * only InChI    -> (formula_from_inchi, "")  -- note the InChI
            formula is placed in the FIRST slot in this case
          * neither       -> ("", "")
    """
    formula_from_smiles = ""
    formula_from_inchi = ""

    # `except Exception` (not a bare except) keeps the best-effort behaviour
    # while letting KeyboardInterrupt / SystemExit propagate.
    try:
        if len(smiles) > 5:
            formula_from_smiles = str(
                CalcMolFormula(Chem.MolFromSmiles(smiles)))
    except Exception:
        formula_from_smiles = ""

    try:
        if len(inchi) > 5:
            formula_from_inchi = str(
                CalcMolFormula(Chem.MolFromInchi(inchi)))
    except Exception:
        formula_from_inchi = ""

    smiles_ok = len(formula_from_smiles) > 2
    inchi_ok = len(formula_from_inchi) > 2

    if smiles_ok and inchi_ok:
        return formula_from_smiles, formula_from_inchi
    if smiles_ok:
        return formula_from_smiles, ""
    if inchi_ok:
        return formula_from_inchi, ""
    return "", ""
def syncProperties(smiles):
    """Calculate the charge-free molecular formula and formal charge of a SMILES.

    Args:
        smiles: SMILES string of the structure.

    Returns:
        On success ``(formula, charge, 'calculated properties from structure')``
        where ``formula`` has its trailing charge suffix removed and ``charge``
        is the total formal charge.
        On any failure ``(False, False, 'property calculation error')``.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        formula = CalcMolFormula(mol)
        charge = GetFormalCharge(mol)
        # Strip only the trailing charge suffix. Replacing str(charge)
        # everywhere in the string would delete every "0" from a neutral
        # formula (C10H20 -> C1H2) and every "1" from a +1 ion.
        # Assumes RDKit writes the charge as a sign followed by the magnitude
        # when it exceeds 1 (e.g. "+", "-", "+2", "-3") -- the endswith guard
        # leaves the formula untouched if that is not the case.
        if charge:
            sign = '+' if charge > 0 else '-'
            magnitude = str(abs(charge)) if abs(charge) > 1 else ''
            suffix = sign + magnitude
            if formula.endswith(suffix):
                formula = formula[:-len(suffix)]
        return formula, charge, 'calculated properties from structure'
    except Exception:
        return False, False, 'property calculation error'
def filter_pubchem(ms):
    """Return the molecular formulas of the molecules in *ms* that pass all filters.

    A molecule is kept when:
      * its exact molecular weight is within [100, 1500],
      * its formal charge is zero,
      * its atom symbols include both 'C' and 'H',
      * it has at most five atoms each of S, Cl, Br and B,
      * every atom symbol is one of C, H, O, N, S, P, Cl, B, Br, Se.

    Args:
        ms: iterable of RDKit molecules.

    Returns:
        list of formula strings (one per kept molecule, in input order).
    """
    allowed = {'C', 'H', 'O', 'N', 'S', 'P', 'Cl', 'B', 'Br', 'Se'}
    capped = ('S', 'Cl', 'Br', 'B')
    kept = []
    for mol in ms:
        mass = CalcExactMolWt(mol)
        if mass < 100 or mass > 1500:
            continue
        if GetFormalCharge(mol) != 0:
            continue
        counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
        if 'C' not in counts or 'H' not in counts:
            continue
        # Counter yields 0 for absent symbols without inserting them
        if any(counts[symbol] > 5 for symbol in capped):
            continue
        if set(counts) <= allowed:
            kept.append(CalcMolFormula(mol))
    return kept
def count_struct_isomers(smiles_list):
    """
    Counts the number of molecules with the same molecular formula

    Keyword arguments:
    smiles_list -- a list of smiles strings of the set/subset of molecules to look at

    Returns:
    dict mapping the exact molecular weight calculated by RDKit to the number
    of input molecules with that weight (empty dict for an empty input)
    """
    # formula: isomer count
    dict_isomers = {}
    # formula : smiles list
    dict_smiles = {}
    # weight : isomer count
    dict_exactwt = {}
    for mol_smiles in smiles_list:
        mol = MolFromSmiles(mol_smiles)
        formula = CalcMolFormula(mol)
        weight = ExactMolWt(mol)
        # Each tally is updated independently of the others. Keying the weight
        # update on whether the *formula* had been seen raised KeyError (or
        # reset the count to 1) whenever formula and exact weight disagreed,
        # e.g. for isotope-labelled molecules.
        dict_isomers[formula] = dict_isomers.get(formula, 0) + 1
        # These are MOD's smiles, not RDKit's
        dict_smiles.setdefault(formula, []).append(mol_smiles)
        # Weight calculated by RDKit, not MOD's in-built
        dict_exactwt[weight] = dict_exactwt.get(weight, 0) + 1
    return dict_exactwt  # modify this as per your needs
def smi_to_formula(smi_str):
    """
    Given a SMILES string, return RDKit's Kekulized SMILES for it together with the molecular formula
    and masses computed from the first mass listed per element in LIGNIN_ISOTOPE_DICT
    :param smi_str: str, SMILES string
    :return: tuple of (rd_smi, mol_formula, mol_mass, mw_deprot, mw_prot, dbe), where the three masses are
        rounded to MAX_SIG_FIGS, mw_deprot/mw_prot are the mass minus/plus one hydrogen mass, and dbe is
        the value returned by calc_dbe for the parsed formula
    :raises InvalidDataError: if RDKit cannot parse smi_str
    """
    # Round-trip through RDKit so any given molecule always yields the same SMILES string
    mol = Chem.MolFromSmiles(smi_str)
    if mol is None:
        raise InvalidDataError(
            f"The input SMILES string '{smi_str}' could not be recognized by RDKit"
        )
    Chem.Kekulize(mol)
    rd_smi = Chem.MolToSmiles(mol, kekuleSmiles=True)

    mol_formula = CalcMolFormula(mol)
    stoich_dict = parse_stoich(mol_formula)
    dbe = calc_dbe(stoich_dict)

    mol_mass = sum(LIGNIN_ISOTOPE_DICT[element][MASS][0] * count
                   for element, count in stoich_dict.items())

    hydrogen_mass = LIGNIN_ISOTOPE_DICT[HYDROG][MASS][0]
    mw_deprot = round(mol_mass - hydrogen_mass, MAX_SIG_FIGS)
    mw_prot = round(mol_mass + hydrogen_mass, MAX_SIG_FIGS)
    rounded_mass = round(mol_mass, MAX_SIG_FIGS)
    return rd_smi, mol_formula, rounded_mass, mw_deprot, mw_prot, dbe
def _desc_list(mol, names):
    """Compute the descriptors in ``Descriptors.descList`` for *mol*.

    Args:
        mol: molecule passed to every descriptor function.
        names: container of descriptor names to compute; when falsy, every
            descriptor in ``Descriptors.descList`` is computed.

    Returns:
        dict mapping descriptor name to value. A 'MolecularFormula' entry
        (from CalcMolFormula) is added unless one is already present.
    """
    computed = {
        desc_name: desc_fn(mol)
        for desc_name, desc_fn in Descriptors.descList
        if not names or desc_name in names
    }
    if 'MolecularFormula' not in computed:
        computed['MolecularFormula'] = CalcMolFormula(mol)
    return computed
def get_molecular_formula(self, smi):
    """Return the molecular formula for a SMILES string.

    Args:
        smi: SMILES string.

    Returns:
        The formula string, or '-' when the formula cannot be computed.
    """
    try:
        m = Chem.MolFromSmiles(smi)
        return CalcMolFormula(m)
    except Exception:
        # The handler used to contain only the bare expression '-', so the
        # function fell through and returned None instead of the placeholder.
        return '-'
def annotate_ms(ms_pred, smi, ion_mode='+', treeDepth=2):
    """Annotate the peaks of a predicted spectrum with candidate neutral losses.

    Args:
        ms_pred: mapping with 'mz' and 'intensity' sequences of equal length.
        smi: SMILES of the parent molecule.
        ion_mode: '+' adds 1.0032 to the exact molecular weight to get the
            precursor m/z; any other value subtracts it.
        treeDepth: depth passed to generateFragments.

    Returns:
        pandas.DataFrame with columns 'mz', 'intensity', 'annotate_loss' and
        'exact_mass', one row per peak. For a peak within 0.5 of the precursor
        the annotation is ['precursor']; otherwise it is the array of loss
        formulas whose mass is within 0.5 of (precursor - mz), or '' when
        there is none.
    """
    mzs = np.array(ms_pred['mz'])
    intensities = np.array(ms_pred['intensity'])
    mol = Chem.MolFromSmiles(smi)

    # only M+H and M-H is considered now.
    if ion_mode == '+':
        precursor = CalcExactMolWt(mol) + 1.0032
    else:
        precursor = CalcExactMolWt(mol) - 1.0032
    formula = CalcMolFormula(mol)

    # Honour the caller's treeDepth (it used to be hard-coded to 2 here).
    frags = np.unique(generateFragments(smi, treeDepth=treeDepth))
    frag_mols = [Chem.MolFromSmiles(s) for s in frags]
    frags_formula = np.unique([CalcMolFormula(f) for f in frag_mols])

    # Candidate losses: parent minus fragment, plus the +/- one hydrogen variants.
    loss_formula = []
    for f in frags_formula:
        loss = subtract_formula(formula, f)
        if loss == '':
            continue
        if check_formula(loss):
            loss_formula.append(loss)
        add_H = add_formula(loss, 'H')
        de_H = subtract_formula(loss, 'H')
        if check_formula(add_H):
            loss_formula.append(add_H)
        if check_formula(de_H):
            loss_formula.append(de_H)
    loss_formula = np.unique(loss_formula)
    loss_mass = np.array([getFormulaExactMass(f) for f in loss_formula])

    ms_new = pd.DataFrame(columns=['mz', 'intensity', 'annotate_loss', 'exact_mass'])
    for mz, intensity in zip(mzs, intensities):
        diff = precursor - mz
        if abs(diff) < 0.5:
            # Precursor peaks keep their own annotation; previously the
            # following if/else always overwrote it.
            annotate_loss = ['precursor']
            accurate_mass = [precursor]
        else:
            # An empty match also covers the no-candidate-loss case, where
            # min() on an empty array used to raise ValueError.
            match = np.where(np.abs(loss_mass - diff) < 0.5)[0]
            if match.size:
                annotate_loss = loss_formula[match]
                accurate_mass = precursor - loss_mass[match]
            else:
                annotate_loss = ''
                accurate_mass = ''
        ms_new.loc[len(ms_new)] = [mz, intensity, annotate_loss, accurate_mass]
    return ms_new
def to_formula(self):
    """ str: the chemical formula of the molecule.

    Raises:
        ValueError: if CalcMolFormula raises RuntimeError for this molecule."""

    # The formula can be undefined when atoms are uncertainly typed,
    # e.g. when the molecule was initialized through SMARTS.
    try:
        with Suppressor():
            return CalcMolFormula(self)
    except RuntimeError:
        message = 'Formula is undefined for {}'.format(self)
        raise ValueError(message)
def process(fname):
    """Filter the compounds stored in a JSON file and rewrite the file with annotated records.

    The file name (without '.json') must be an integer; it becomes the
    'label' of every record. Entries are dropped when their SMILES is None,
    fails MolVS validation, or (after MolVS standardization) is already in
    ``pubchem``. For each remaining SMILES an ATC code is predicted, a PNG
    image is saved under ``images_dir`` (named by the MD5 of the SMILES) and
    a record is built. The records replace the contents of *fname*.
    """
    results = []
    stem = os.path.basename(fname).replace('.json', '')
    label = int(stem)
    with open(fname, 'r') as src:
        entries = json.load(src)

    kept = []
    for entry in entries:
        raw = entry['smiles']
        # Drop missing SMILES and anything MolVS reports validation errors for
        if raw is None or molvs.validate_smiles(raw):
            continue
        standardized = molvs.standardize_smiles(raw)
        # Only keep compounds that are not already known
        if standardized not in pubchem:
            kept.append(standardized)

    predictions = atc_model.predict(kept)
    atc_codes = [atc_lookup[p] for p in predictions]

    for smi, atc_code in zip(kept, atc_codes):
        mol = Chem.MolFromSmiles(smi)
        formula = CalcMolFormula(mol)
        digest = md5(smi.encode('utf8')).hexdigest()
        im_path = os.path.join(images_dir, '{}.png'.format(digest))
        Draw.MolToImage(mol).save(im_path)
        record = {
            'label': label,
            'smiles': smi,
            'formula': formula,
            'image': im_path,
            'atc_code': atc_code,
            'created_at': datetime.utcnow().isoformat()
        }
        results.append(record)

    # Overwrite the input file with the generated compounds
    with open(fname, 'w') as dst:
        json.dump(results, dst)
def make_image(smi, base=False):
    """Render a molecule with its formula captioned beneath the drawing.

    Args:
        smi: SMILES string of the molecule.
        base: when True, ' (base)' is appended to the caption.

    Returns:
        A white RGB PIL image of size (w, h) with the (w, h_) molecule
        drawing pasted at the top-left and the formula centred horizontally,
        ``vpadding`` pixels above the bottom edge.
    """
    mol = Chem.MolFromSmiles(smi)
    formula = CalcMolFormula(mol)
    if base:
        formula = '{} (base)'.format(formula)
    mol_im = Draw.MolToImage(mol, size=(w, h_))
    im = Image.new('RGB', (w, h), color='white')
    im.paste(mol_im)
    draw = ImageDraw.Draw(im)
    # Measure with the same font that is used for drawing, otherwise the
    # centring is computed for the default font. ImageDraw.textsize was
    # removed in Pillow 10, so prefer textbbox and fall back for old Pillow.
    if hasattr(draw, 'textbbox'):
        left, top, right, bottom = draw.textbbox((0, 0), formula, font=font)
        tw, th = right - left, bottom - top
    else:
        tw, th = draw.textsize(formula, font=font)
    draw.text((w / 2 - tw / 2, h - th - vpadding),
              formula,
              font=font,
              fill='black')
    return im
def canon(df, idx):
    """Canonicalize the SMILES in row *idx* of *df* and record the result in place.

    On success the row gets 'c_smiles' (SMILES of the molecule with explicit
    hydrogens added), 'status' = 0 and 'Formula' (with the chlorine term
    moved to the end). On any failure 'status' is set to -2 and the problem
    is printed.

    Args:
        df: DataFrame with a 'smiles' column (and a 'Name' column used in the
            failure message).
        idx: index label of the row to process.

    Returns:
        The same DataFrame, modified in place.
    """
    import re  # local import so the block does not depend on file-level imports

    print('trying to canonize smile for idx: {}'.format(idx))
    # Bound before the try so the failure message below can always be built,
    # even when reading the 'smiles' cell itself is what failed.
    smile = None
    try:
        smile = df.loc[idx, 'smiles']
        m = Chem.MolFromSmiles(smile)
        m = Chem.AddHs(m)
        c_smile = Chem.MolToSmiles(m)
        df.loc[idx, 'c_smiles'] = c_smile
        df.loc[idx, 'status'] = 0
        formula = CalcMolFormula(m)
        # Move the whole chlorine term (symbol AND count) to the end. Moving
        # only the letters left the count behind: C2H4Cl2 -> C2H42Cl.
        cl_term = re.search(r'Cl\d*', formula)
        if cl_term:
            formula = (formula[:cl_term.start()] + formula[cl_term.end():] +
                       cl_term.group())
        df.loc[idx, 'Formula'] = formula
    except Exception as e:
        df.loc[idx, 'status'] = -2
        print("could not convert smile {} of molecule {} : {}".format(
            smile, idx, df.loc[idx].get('Name')))
        print('Exception: {}'.format(e))
    return df
def _chembl_desc_list(mol):
    """Compute the descriptors named in CBL_DESC_LIST plus derived values for *mol*.

    The molecule is first passed through ``_neutralise_sulphoxide``. The
    result contains every descriptor from ``Descriptors.descList`` whose name
    is in CBL_DESC_LIST, a 'MolecularFormula', and the derived 'Ro3Pass' and
    'NumRo5' entries. 'ExactMolWt' is reported under the key
    'MonoisotopicMolWt'.

    Returns:
        dict mapping descriptor name to value.
    """
    mol = _neutralise_sulphoxide(mol)
    values = {}
    for desc_name, desc_fn in Descriptors.descList:
        if desc_name not in CBL_DESC_LIST:
            continue
        if desc_name == 'ExactMolWt':
            # Isotope information is removed from a copy before this
            # descriptor; the copy stays bound to `mol`, so descriptors
            # later in the list and the formula below use it as well.
            mol = _remove_isotope_info(deepcopy(mol))
        values[desc_name] = desc_fn(mol)

    if 'MolecularFormula' not in values:
        values['MolecularFormula'] = CalcMolFormula(mol)

    mol_wt = values['MolWt']
    acceptors = values['NumHAcceptors']
    donors = values['NumHDonors']
    log_p = values['MolLogP']
    values['Ro3Pass'] = _ro3_pass(mol_wt, acceptors, donors, log_p,
                                  values['NumRotatableBonds'], values['TPSA'])
    values['NumRo5'] = _num_ro5_violations(log_p, mol_wt, acceptors, donors)
    values['MonoisotopicMolWt'] = values.pop('ExactMolWt')
    return values
def process_compound(line):
    """Parse one tab-separated 'id<TAB>smiles<TAB>label' line into a compound record.

    Also writes 'img/<id>.png' for the compound if that file does not exist.

    Returns:
        (label, record): ``label`` is the entry of ``labels`` at the given
        integer position with '/' replaced by '_'; ``record`` is a dict with
        keys 'id', 'name', 'formula' and 'atc_codes' (each ATC code cut to
        its first five characters).
    """
    compound_id, smi, label_field = line.split('\t')
    codes = atcs.get(compound_id, set())
    label = labels[int(label_field)].replace('/', '_')
    name = names.get(compound_id)
    mol = Chem.MolFromSmiles(smi)
    formula = CalcMolFormula(mol)

    # Render every image up front so later steps never have to
    im_fname = 'img/{}.png'.format(compound_id)
    if not os.path.exists(im_fname):
        Draw.MolToImage(mol).save(im_fname)

    record = {
        'id': compound_id,
        'name': name,
        'formula': formula,
        'atc_codes': [code[:5] for code in codes]
    }
    return label, record
summary = pd.DataFrame({'smiles': smiles, 'ion_mode': modes, 'energy': energies}) # example 1 idx = 551 smi = smiles[idx] mol = Chem.MolFromSmiles(smi) ms_pred = model_predict(smi, model) ms_real = ms[idx] # annotation mzs = np.array(ms_pred['mz']) intensities = np.array(ms_pred['intensity']) mol = Chem.MolFromSmiles(smi) precursor = CalcExactMolWt(mol) - 1.0032 formula = CalcMolFormula(mol) frags = np.unique(generateFragments(smi, treeDepth=2)) frags_new = [Chem.MolFromSmiles(s) for s in frags] frags_formula = np.unique([CalcMolFormula(f) for f in frags_new]) loss_formula = [] for f in frags_formula: l = subtract_formula(formula, f) if l == '': continue if check_formula(l): loss_formula.append(l) add_H = add_formula(l, 'H') de_H = subtract_formula(l, 'H') if check_formula(add_H): loss_formula.append(add_H) if check_formula(de_H):
def getMoleculeFeatures(self):
    """Get the essential features of the constructed rdMol for the input component.

    Returns:
        dict: with keys
            "details"     -- ComponentDetails(ccId, formula, ifCharge)
            "descriptors" -- ComponentDescriptors(smiles, isoSmiles, inchi, inchiKey)
            "atoms"       -- dict of atom name -> ComponentAtom
            "bonds"       -- dict of (begin atom name, end atom name) -> ComponentBond

    Raises:
        KeyError: if a bond has no "_MolFileBondType" property.
    """
    mD = self.__rdMol.GetPropsAsDict()
    logger.debug("mol props %r", mD.items())
    #
    formula = CalcMolFormula(self.__rdMol)
    ccId = self.__ccId
    ifCharge = Chem.rdmolops.GetFormalCharge(self.__rdMol)
    #
    inchiKey = Chem.inchi.MolToInchiKey(self.__rdMol)
    inchi = Chem.inchi.MolToInchi(self.__rdMol)
    # Canonical SMILES without and with stereo/isotope information
    smiles = Chem.rdmolfiles.MolToSmiles(self.__rdMol, isomericSmiles=False, canonical=True)
    isoSmiles = Chem.rdmolfiles.MolToSmiles(self.__rdMol, isomericSmiles=True, canonical=True)
    logger.debug("%s formula %s", ccId, formula)
    details = ComponentDetails(ccId=ccId, formula=formula, ifCharge=ifCharge)
    descriptors = ComponentDescriptors(smiles=smiles, isoSmiles=isoSmiles, inchi=inchi, inchiKey=inchiKey)
    #
    typeCounts = defaultdict(int)  # running count of atoms seen per element symbol
    ccAtomD = {}  # atom name -> ComponentAtom
    ccAtomIdD = {}  # RDKit atom index -> atom name (used to name bond endpoints below)
    for ii, at in enumerate(self.__rdMol.GetAtoms(), 1):
        atIdx = at.GetIdx()
        aType = at.GetSymbol()
        typeCounts[aType] += 1
        # Use the name registered for this 1-based atom position when there is one,
        # otherwise fall back to element symbol + running count (e.g. C1, C2, N1)
        atName = self.__atomIdxD[ii] if ii in self.__atomIdxD else aType + str(typeCounts[aType])
        #
        atNo = at.GetAtomicNum()
        isAromatic = at.GetIsAromatic()
        isChiral = at.GetChiralTag() > 0
        iCharge = at.GetFormalCharge()
        # cipStereo = at.GetProp("_CIPCode")
        # Read the CIP code from the property dict so a missing property yields None
        atD = at.GetPropsAsDict(includePrivate=True, includeComputed=True)
        cipStereo = None
        if "_CIPCode" in atD:
            cipStereo = atD["_CIPCode"]
        if cipStereo and cipStereo not in ["S", "R"]:
            logger.error("%s (%s): Unexpected atom CIP stereo setting %r", ccId, atName, cipStereo)
        #
        ccAtomD[atName] = ComponentAtom(name=atName, aType=aType, isAromatic=isAromatic, isChiral=isChiral, CIP=cipStereo, fCharge=iCharge)
        ccAtomIdD[atIdx] = atName
        # nL = at.GetProp(includePrivate=True, includeComputed=True)
        atD = at.GetPropsAsDict(includePrivate=True, includeComputed=True)
        logger.debug("%s Atom %s %s %r %r %s", ccId, atName, aType, isAromatic, isChiral, cipStereo)
    #
    ccBondD = {}  # (begin atom name, end atom name) -> ComponentBond
    for bnd in self.__rdMol.GetBonds():
        atI = bnd.GetBeginAtomIdx()
        atJ = bnd.GetEndAtomIdx()
        atNameI = ccAtomIdD[atI]
        atNameJ = ccAtomIdD[atJ]
        isAromatic = bnd.GetIsAromatic()
        #
        # bType = bnd.GetBondType()
        # iType = 0
        # Only E/Z double-bond stereo is reported; every other stereo value maps to None
        cipStereo = None
        tS = bnd.GetStereo()
        if tS == Chem.rdchem.BondStereo.STEREOE:
            cipStereo = "E"
        elif tS == Chem.rdchem.BondStereo.STEREOZ:
            cipStereo = "Z"
        # bL = bnd.GetPropNames(includePrivate=True, includeComputed=True)
        bD = bnd.GetPropsAsDict(includePrivate=True, includeComputed=True)
        # NOTE(review): presumably this property is only present when the molecule
        # was read from a mol block -- confirm; a missing key raises KeyError here
        iType = bD["_MolFileBondType"]
        logger.debug("Bond %s %s iType %r cipStereo %r aromatic %r", atNameI, atNameJ, iType, cipStereo, isAromatic)
        #
        # cipStereo can only be None, "E" or "Z" at this point, so this guard cannot fire as written
        if cipStereo and cipStereo not in ["E", "Z"]:
            logger.error("%s (%s %s): Unexpected bond CIP stereo setting %r", ccId, atNameI, atNameJ, cipStereo)
        #
        ccBondD[(atNameI, atNameJ)] = ComponentBond(iType=iType, isAromatic=isAromatic, CIP=cipStereo)
    #
    ccD = {"details": details, "descriptors": descriptors, "atoms": ccAtomD, "bonds": ccBondD}
    return ccD
# Command-line script: print the molecular formula of the SMILES given as the
# first argument, with the element terms separated by spaces
# (e.g. "C2H6O" is printed as "C2 H6 O").
from rdkit import Chem
import sys
import re
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

if len(sys.argv) < 2:
    sys.exit('usage: %s SMILES' % sys.argv[0])

mol = Chem.MolFromSmiles(sys.argv[1])
if mol is None:
    # MolFromSmiles returns None for unparsable input; fail with a clear
    # message instead of an argument error from CalcMolFormula.
    sys.exit('could not parse SMILES: %s' % sys.argv[1])

formula = CalcMolFormula(mol)
# Insert a space before each capital letter (i.e. before each element symbol),
# then drop the leading space this produces. print() with a single argument
# behaves the same on Python 2 and Python 3.
print(re.sub(r'([a-z]*)([A-Z])', r'\1 \2', formula).lstrip())
def parse_f(f):
    """Build and print one tab-separated database record for the mol file *f*.

    The CAS number is taken from the file name (the second '/'-separated
    component when the path contains '/', with everything after the first '.'
    removed). The structure is read from the mol file unless the user data in
    ``syn_data`` specifies a PubChem id or a formula for this CAS, in which
    case (or when reading fails) the identifiers come from PubChem/``mycache``
    or from the user data instead.

    The printed record is
    ``cid, CAS, formula, mw, smiles, inchi, inchikey, iupac_name`` followed by
    the accepted names, all tab-separated.

    Returns:
        None in every case; the record is only printed. Nothing is printed
        when the CAS is in ``ignored_CASs`` or when the mol file could not be
        used and no user data exists for the CAS.
    """
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    # Any exception inside this block (including the deliberate `1 / 0`
    # ZeroDivisionErrors) jumps to the handler below, which falls back to
    # PubChem or user-provided data.
    try:
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
            assert mol is not None
        except:
            print('Cannot read %s' % f)
            1 / 0  # force the outer handler
        try:
            inchi_val = inchi.MolToInchi(mol)
        except:
            print('BAILING ON %s' % f)
            1 / 0  # force the outer handler
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            1 / 0  # force the outer handler
    except:
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                # Entries keyed by PubChem cid hold the full 8-field record
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula
                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                # No PubChem id: use whatever the user data provides; 'MW' is required
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None
    if not failed_mol:
        # The mol file was usable: derive every identifier from the structure
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        # for i in mol.GetAtoms():
        #     if i.GetIsotope():
        #         mw = Descriptors.ExactMolWt(mol)
        #         break
        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    try:
        if not failed_mol:
            # Entries keyed by InChIKey hold only (cid, iupac_name, names)
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except:
                    # Remember the failed lookup so it is not repeated
                    mycache[str(inchikey)] = (-1, '', [''])
    except:
        cid = -1
        iupac_name = ''
        names = ['']
    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        if 'Other Names' in d:
            syns = d['Other Names']
        else:
            syns = []
        # The pdf name becomes the IUPAC name only when none was found;
        # otherwise it is kept as the first synonym
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])
        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            else:
                # Discard it otherwise
                pass
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any syns for this cas number, add those names if the name hasn't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)
    # Drop empty names
    actual_names = [i for i in actual_names if i]
    if inchi_val is not None:
        # The InChI is stored without its standard prefix
        inchi_val = inchi_val.replace('InChI=1S/', '')
    formula = serialize_formula(formula)
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)
    s += '\t'.join(actual_names)
    print(s)
    return None
def process(init_data, use_cache=True):
    '''Assemble the identifiers of one chemical from user data, Common
    Chemistry and PubChem.

    The structure is taken from the first source that yields a molecule, in
    this order: user 'mol' file, user 'smiles', user 'inchi', Common Chemistry
    SMILES, Common Chemistry InChI, PubChem SMILES, PubChem InChI. SMILES,
    InChI, InChIKey, formula and MW are then computed from that structure,
    after which any of 'formula', 'MW', 'smiles', 'inchi' and 'inchikey'
    present in `init_data` override the computed values.

    Parameters
    ----------
    init_data : dict
        User-supplied data; keys read here are 'CAS', 'CASRN', 'mol',
        'pubchem', 'smiles', 'inchi', 'inchikey', 'formula', 'MW', 'name'
        and 'synonyms'. The dict is copied, not modified.
    use_cache : bool
        Passed through to `find_pubchem_from_ids`.

    Returns
    -------
    dict
        With keys 'cid', 'CAS', 'formula', 'MW', 'smiles', 'inchi',
        'inchikey', 'name' and 'synonyms'.

    Raises
    ------
    ValueError
        If no structure, CAS number or name can be determined, or if any of
        the final inchikey, smiles or inchi equals '*'.

    Examples
    --------
    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    # print(locals())
    init_data = init_data.copy()
    cc = cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            cc = common_chemistry_data(init_data['CAS'])
            cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms, cc_deprecated_CASs = cc
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass
    cid = iupac_name = p_MW = p_inchi = p_inchikey = p_smiles = p_formula = p_synonyms = None
    if init_data.get('mol', None) is not None:
        # If not in common chemistry or no InChi there, but if we have a mol file, get the inchi and inchikey for the
        # pubchem lookup
        mol = Chem.MolFromMolFile(init_data['mol'])
        if mol is not None:
            init_data['inchi'] = MolToInchi(mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])
    # NOTE(review): the CAS number is read from key 'CAS' elsewhere in this function
    # but from 'CASRN' here -- confirm which key callers supply
    can_search_pubchem = (init_data.get('pubchem') is not None
                          or init_data.get('CASRN', cc_CAS) is not None
                          or init_data.get('inchi', cc_inchi) is not None
                          or init_data.get('inchikey', cc_inchikey) is not None
                          or init_data.get('smiles', cc_smiles) is not None)
    if can_search_pubchem:
        try:
            p = find_pubchem_from_ids(
                pubchem=init_data.get('pubchem'),
                CASRN=init_data.get('CASRN', cc_CAS),
                inchi=init_data.get('inchi', cc_inchi),
                inchikey=init_data.get('inchikey', cc_inchikey),
                smiles=init_data.get('smiles', cc_smiles),
                use_cache=use_cache)
        except Exception as e:
            # A failed PubChem lookup is not fatal; continue without PubChem data
            p = None
            print(e, 'exception')
        if p is not None:
            cid, iupac_name, p_MW, p_inchi, p_inchikey, p_smiles, p_formula, p_synonyms = p
    # print(locals())
    mol = None
    # Be aware some smiles descriptions are wrong
    # Start with user overriding
    if 'mol' in init_data:
        mol = Chem.MolFromMolFile(init_data['mol'])
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        # InChI strings may be stored without the "InChI=1S/" prefix
        mol = MolFromInchi(
            init_data['inchi']) if init_data['inchi'].startswith(
                "InChI=1S/") else MolFromInchi("InChI=1S/" +
                                               init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = MolFromInchi(cc_inchi) if cc_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + cc_inchi)
    # Did we pull up the structure from pubchem??
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = MolFromInchi(p_inchi) if p_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + p_inchi)
    if mol is None:
        raise ValueError("No structure found")
    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    #MW = Descriptors.ExactMolWt(mol)
    formula = CalcMolFormula(mol, True, True)
    formula = serialize_formula(formula)
    # The molecular weight is computed from the serialized formula, not from the mol object
    MW = molecular_weight(nested_formula_parser(formula))
    # print(inchi, cc_inchi, p_inchi)
    # print(inchikey, cc_inchikey, p_inchikey)
    # print(smiles, cc_smiles, p_smiles)
    # output values
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        cid = -1
    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")
    if 'formula' in init_data:
        # Override rdkit
        formula = init_data['formula']
    if 'MW' in init_data:
        # Override rdkit
        MW = init_data['MW']
    if 'smiles' in init_data:
        smiles = init_data['smiles']
    if 'inchi' in init_data:
        inchi = init_data['inchi']
    if 'inchikey' in init_data:
        inchikey = init_data['inchikey']
    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")
    # Do we have a name specified in the settings?
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")
    # Merge synonyms from every source, de-duplicate, and exclude the primary name
    synonyms = []
    if cc_synonyms is not None:
        synonyms += cc_synonyms
    if cc_deprecated_CASs is not None:
        synonyms += cc_deprecated_CASs
    if p_synonyms is not None:
        synonyms += p_synonyms
    if 'synonyms' in init_data:
        synonyms += init_data['synonyms']
    synonyms = list(set(synonyms))
    if name in synonyms:
        synonyms.remove(name)
    if synonyms:
        # Shortest first, ties broken case-insensitively, so the set order does not leak into the output
        def key_sort_str(s):
            return len(s), s.lower()

        synonyms = sorted(synonyms, key=key_sort_str)
        # synonyms = natsorted(synonyms)
    # synonyms = []
    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms
    }
def get_chemicalFormula(mol):
    '''
    Return the chemical formula of *mol* as computed by CalcMolFormula
    '''
    chemical_formula = CalcMolFormula(mol)
    return chemical_formula
if val.isalpha(): break else: checkIsItFirst = True num += str(val) secondPart = secondPart[1:] if not checkIsItFirst: num = 1 return num for index, row in df.iterrows(): smiles = row['smiles'] readedFormula = row['stoichiometry'] molObj = Chem.MolFromSmiles(smiles) formula = CalcMolFormula(molObj) atoms = rdkit.Chem.rdchem.Mol.GetAtoms(molObj) if isnan(formula) or isnan(readedFormula): print("*************** NAN VALUE : " + str(readedFormula) + " - " + str(formula)) else: if readedFormula != formula: for atom in atoms: atomSymbol = rdkit.Chem.rdchem.Atom.GetSymbol(atom) atomSymbolSTR = str(atomSymbol) readedNumber = GetNumber(str(readedFormula), atomSymbolSTR) createdNumber = GetNumber(str(formula), atomSymbolSTR) if readedNumber != createdNumber: print(readedFormula) print(formula) print(count)
df = pd.DataFrame(all_dicts) return df print('doing biocyc') molecules = [] for path in biocyc_paths: mol_files = glob.glob(os.path.join(path, 'MetaCyc-MOLfiles/*.mol')) for f in mol_files: cpd_id = os.path.basename(f).replace('.mol', '') with open(f, 'r', encoding='utf-8', errors='ignore') as fid: t = fid.read() name = t.split('\n')[0].strip().strip('"') mol = Chem.MolFromMolFile(f, sanitize=True) if mol is not None: formula = CalcMolFormula(mol) try: Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True) smiles = Chem.MolToSmiles(mol, isomericSmiles=True) original_smiles = smiles except: original_smiles = None molecules.append({ 'original_id': str(cpd_id), 'name': str(name), 'source': str('BioCyc'), 'formula': str(formula), 'original_smiles': str(original_smiles), 'unique_id': str(uuid.uuid4()) }) else:
mol, status = neutralise_charges(mol) mol, status = desalt(mol) SanitizeMol(mol) mol, status = neutralise_charges(mol) SanitizeMol(mol) Kekulize(mol, clearAromaticFlags=True) new_smiles = MolToSmiles(mol, isomericSmiles=True) new_inchikey = MolToInchiKey(mol) new_inchi = MolToInchi(mol) mw = ExactMolWt(mol) sdf.loc[i, 'smiles'] = new_smiles sdf.loc[i, 'inchi_key'] = new_inchikey sdf.loc[i, 'inchi'] = new_inchi sdf.loc[i, 'neutral_mass'] = mw sdf.loc[i, 'formula'] = CalcMolFormula(mol) setup_cols = { 'inchi_key': 'metatlas_inchikey', 'inchi': 'metatlas_inchi', 'formula': 'metatlas_formula', 'neutral_mass': 'metatlas_mw', 'code': 'original_id', 'library': 'source' } sdf.rename(columns=setup_cols).to_csv( '/Users/bpb/Downloads/Tim-Tec-Compounds.tab', sep='\t', index=None) df = pd.merge(sdf[[ 'code', 'inchi_key', 'inchi', 'name', 'neutral_mass', 'original_smiles' ]],