Beispiel #1
0
def get_formula(smiles, inchi):
    """Derive molecular formulas from a SMILES string and an InChI string.

    An identifier is only parsed when it is longer than five characters.
    Any error raised while measuring, parsing or computing the formula
    (including ``None`` input) results in an empty formula for that
    identifier; this function never raises for bad input.

    Returns:
        A 2-tuple of strings. Formulas of length <= 2 are treated as unusable.

        * both usable      -> ``(formula_from_smiles, formula_from_inchi)``
        * only SMILES      -> ``(formula_from_smiles, "")``
        * only InChI       -> ``(formula_from_inchi, "")`` (note: first slot)
        * neither usable   -> ``("", "")``
    """
    formula_from_smiles = ""
    formula_from_inchi = ""

    # Best-effort: a parse failure simply leaves the formula empty.
    # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        if len(smiles) > 5:
            formula_from_smiles = str(
                CalcMolFormula(Chem.MolFromSmiles(smiles)))
    except Exception:
        formula_from_smiles = ""

    try:
        if len(inchi) > 5:
            formula_from_inchi = str(CalcMolFormula(Chem.MolFromInchi(inchi)))
    except Exception:
        formula_from_inchi = ""

    smiles_ok = len(formula_from_smiles) > 2
    inchi_ok = len(formula_from_inchi) > 2

    if smiles_ok and inchi_ok:
        return formula_from_smiles, formula_from_inchi
    if smiles_ok:
        return formula_from_smiles, ""
    if inchi_ok:
        # The single usable formula is always reported in the first slot.
        return formula_from_inchi, ""
    return "", ""
Beispiel #2
0
def syncProperties(smiles):
    """Compute the charge-free molecular formula and formal charge of a SMILES.

    Returns:
        ``(formula, charge, 'calculated properties from structure')`` on
        success, where ``formula`` has its trailing charge annotation removed
        because the charge is reported separately; or
        ``(False, False, 'property calculation error')`` if anything fails.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        formula = CalcMolFormula(mol)
        charge = GetFormalCharge(mol)
        if charge != 0:
            # Strip only the trailing charge suffix (a sign, optionally
            # followed by the magnitude). A plain
            # ``formula.replace(str(charge), '')`` would also delete matching
            # digits from the element counts (e.g. every '0' for charge 0).
            formula = formula.rstrip('0123456789').rstrip('+-')
        return formula, charge, 'calculated properties from structure'
    except Exception:
        return False, False, 'property calculation error'
Beispiel #3
0
def filter_pubchem(ms):
    """Return the molecular formulas of the molecules in ``ms`` that pass all filters.

    A molecule is skipped when its exact weight is below 100 or above 1500,
    when its formal charge is non-zero, when its atom symbols do not include
    both 'C' and 'H', when it has more than five S, Cl, Br or B atoms, or when
    it contains an element outside C, H, O, N, S, P, Cl, B, Br, Se.
    """
    allowed_elements = {'C', 'H', 'O', 'N', 'S', 'P', 'Cl', 'B', 'Br', 'Se'}
    capped_elements = ('S', 'Cl', 'Br', 'B')
    kept_formulas = []

    for mol in ms:
        weight = CalcExactMolWt(mol)
        if weight < 100 or weight > 1500:
            continue
        if GetFormalCharge(mol) != 0:
            continue

        symbol_counts = Counter(atom.GetSymbol() for atom in mol.GetAtoms())
        if 'C' not in symbol_counts or 'H' not in symbol_counts:
            continue
        # Counter yields 0 for absent symbols, so no membership test is needed.
        if any(symbol_counts[el] > 5 for el in capped_elements):
            continue
        if set(symbol_counts) <= allowed_elements:
            kept_formulas.append(CalcMolFormula(mol))

    return kept_formulas
Beispiel #4
0
def count_struct_isomers(smiles_list):
    """Count the molecules that share a molecular formula / exact weight.

    Keyword arguments:
    smiles_list -- a list of smiles strings of the set/subset of molecules to look at

    Returns:
    dict mapping each exact molecular weight (as computed by RDKit) to the
    number of input molecules having that weight. Per-formula counts and
    per-formula SMILES lists are also collected locally; return one of those
    instead if that is what you need.
    """
    # formula: isomer count
    dict_isomers = {}
    # formula : smiles list
    dict_smiles = {}
    # weight : isomer count
    dict_exactwt = {}

    for mol_smiles in smiles_list:
        mol = MolFromSmiles(mol_smiles)
        formula = CalcMolFormula(mol)
        weight = ExactMolWt(mol)  # Weight calculated by RDKit, not MOD's in-built

        dict_isomers[formula] = dict_isomers.get(formula, 0) + 1
        # These are MOD's smiles, not RDKit's
        dict_smiles.setdefault(formula, []).append(mol_smiles)
        # Count the weight independently of the formula: a formula seen before
        # does not guarantee the weight key exists (e.g. isotopologues share a
        # formula but not an exact weight), which used to raise KeyError.
        dict_exactwt[weight] = dict_exactwt.get(weight, 0) + 1

    return dict_exactwt  # modify this as per your needs
Beispiel #5
0
def smi_to_formula(smi_str):
    """
    Given a SMILES string, return the kekulized SMILES produced by RDKit together with
        formula and mass information computed from the most abundant isotopes
    :param smi_str: str, standard SMILES format
    :return: tuple of (RDKit SMILES, molecular formula, molecular mass, mass minus one
        hydrogen, mass plus one hydrogen, value returned by calc_dbe); the three masses
        are rounded to MAX_SIG_FIGS
    :raises InvalidDataError: if RDKit cannot parse smi_str
    """
    mol = Chem.MolFromSmiles(smi_str)
    if mol is None:
        raise InvalidDataError(
            f"The input SMILES string '{smi_str}' could not be recognized by RDKit"
        )

    # Round-trip through RDKit so any given SMILES entry maps to one string
    Chem.Kekulize(mol)
    rd_smi = Chem.MolToSmiles(mol, kekuleSmiles=True)

    mol_formula = CalcMolFormula(mol)
    stoich_dict = parse_stoich(mol_formula)
    dbe = calc_dbe(stoich_dict)

    mol_mass = 0
    for element, count in stoich_dict.items():
        mol_mass += LIGNIN_ISOTOPE_DICT[element][MASS][0] * count

    hydrogen_mass = LIGNIN_ISOTOPE_DICT[HYDROG][MASS][0]
    mw_deprot = round(mol_mass - hydrogen_mass, MAX_SIG_FIGS)
    mw_prot = round(mol_mass + hydrogen_mass, MAX_SIG_FIGS)
    rounded_mass = round(mol_mass, MAX_SIG_FIGS)

    return rd_smi, mol_formula, rounded_mass, mw_deprot, mw_prot, dbe
Beispiel #6
0
def _desc_list(mol, names):
    """Evaluate the descriptors in ``Descriptors.descList`` for ``mol``.

    When ``names`` is empty/falsy every descriptor is computed; otherwise only
    those whose name is in ``names``. A 'MolecularFormula' entry is added from
    ``CalcMolFormula`` if none of the computed descriptors has that name.
    """
    compute_all = not names
    descriptors = {
        name: fn(mol)
        for name, fn in Descriptors.descList
        if compute_all or name in names
    }
    if 'MolecularFormula' not in descriptors:
        descriptors['MolecularFormula'] = CalcMolFormula(mol)
    return descriptors
Beispiel #7
0
    def get_molecular_formula(self, smi):
        """Return the molecular formula for the SMILES ``smi``.

        Returns the placeholder ``'-'`` when the SMILES cannot be parsed or
        the formula cannot be computed.
        """
        try:
            m = Chem.MolFromSmiles(smi)
            return CalcMolFormula(m)
        except Exception:
            # The original evaluated the bare expression '-' here without
            # returning it, so callers received None instead of the placeholder.
            return '-'
Beispiel #8
0
def annotate_ms(ms_pred, smi, ion_mode='+', treeDepth=2):
    """Annotate predicted peaks with the precursor or a matching neutral loss.

    Parameters:
        ms_pred: mapping with 'mz' and 'intensity' sequences of equal length.
        smi: SMILES of the molecule the spectrum belongs to.
        ion_mode: '+' adds 1.0032 to the exact molecular weight to get the
            precursor m/z; any other value subtracts it.
        treeDepth: depth passed to ``generateFragments``.

    Returns:
        pandas.DataFrame with columns 'mz', 'intensity', 'annotate_loss' and
        'exact_mass', one row per input peak. A peak within 0.5 of the
        precursor is annotated ``['precursor']``; otherwise it receives the
        loss formulas whose mass is within 0.5 of ``precursor - mz``; peaks
        with no match get empty strings.
    """
    mzs = np.array(ms_pred['mz'])
    intensities = np.array(ms_pred['intensity'])
    mol = Chem.MolFromSmiles(smi)
    # only M+H and M-H is considered now.
    if ion_mode == '+':
        precursor = CalcExactMolWt(mol) + 1.0032
    else:
        precursor = CalcExactMolWt(mol) - 1.0032
    formula = CalcMolFormula(mol)

    # Honour the caller's treeDepth (it was previously hard-coded to 2).
    frags = np.unique(generateFragments(smi, treeDepth=treeDepth))
    frag_mols = [Chem.MolFromSmiles(s) for s in frags]
    frags_formula = np.unique([CalcMolFormula(f) for f in frag_mols])

    # Candidate losses: parent minus fragment, plus the +H / -H variants.
    loss_formula = []
    for f in frags_formula:
        loss = subtract_formula(formula, f)
        if loss == '':
            continue
        if check_formula(loss):
            loss_formula.append(loss)
        add_H = add_formula(loss, 'H')
        de_H = subtract_formula(loss, 'H')
        if check_formula(add_H):
            loss_formula.append(add_H)
        if check_formula(de_H):
            loss_formula.append(de_H)
    loss_formula = np.unique(loss_formula)
    loss_mass = np.array([getFormulaExactMass(f) for f in loss_formula])

    ms_new = pd.DataFrame(columns=['mz', 'intensity', 'annotate_loss', 'exact_mass'])
    for i, mz in enumerate(mzs):
        intensity = intensities[i]
        diff = precursor - mz
        # Computed via np.where so an empty candidate list yields "no match"
        # instead of min() raising on an empty sequence.
        match = np.where(np.abs(loss_mass - diff) < 0.5)[0]
        if abs(diff) < 0.5:
            annotate_loss = ['precursor']
            accurate_mass = [precursor]
        elif match.size > 0:
            # `elif`: with a plain `if` the precursor annotation above was
            # always overwritten by one of the branches below.
            annotate_loss = loss_formula[match]
            accurate_mass = precursor - loss_mass[match]
        else:
            annotate_loss = ''
            accurate_mass = ''
        ms_new.loc[len(ms_new)] = [mz, intensity, annotate_loss, accurate_mass]
    return ms_new
Beispiel #9
0
    def to_formula(self):
        """ str: the chemical formula of the molecule.

        Raises:
            ValueError: if computing the formula raises ``RuntimeError``."""

        # The formula can be undefined when atoms are uncertainly typed,
        # e.g. if the molecule was initialized through SMARTS.
        try:
            with Suppressor():
                formula = CalcMolFormula(self)
        except RuntimeError:
            raise ValueError('Formula is undefined for {}'.format(self))
        return formula
Beispiel #10
0
def process(fname):
    """Filter, annotate and rewrite the compounds stored in the JSON file ``fname``.

    The file's base name (minus '.json') must be an integer; it becomes the
    'label' of every result. Entries are dropped when their SMILES is None,
    fails ``molvs`` validation, or (after standardization) is found in
    ``pubchem``. For each remaining SMILES an ATC code is predicted, the
    formula is computed and a PNG is saved under ``images_dir``. ``fname`` is
    then overwritten with the list of result records.
    """
    label = int(os.path.basename(fname).replace('.json', ''))
    with open(fname, 'r') as f:
        data = json.load(f)

    ok = []
    for entry in data:
        raw_smi = entry['smiles']
        # Skip missing SMILES and anything molvs reports validation errors for
        if raw_smi is None or molvs.validate_smiles(raw_smi):
            continue

        std_smi = molvs.standardize_smiles(raw_smi)

        # Keep only compounds not already present in pubchem
        if std_smi not in pubchem:
            ok.append(std_smi)

    atc_codes = [atc_lookup[i] for i in atc_model.predict(ok)]

    results = []
    for smi, atc_code in zip(ok, atc_codes):
        mol = Chem.MolFromSmiles(smi)
        formula = CalcMolFormula(mol)

        # The image file name is the MD5 hex digest of the SMILES
        digest = md5(smi.encode('utf8')).hexdigest()
        im_path = os.path.join(images_dir, '{}.png'.format(digest))
        image = Draw.MolToImage(mol)
        image.save(im_path)

        record = {
            'label': label,
            'smiles': smi,
            'formula': formula,
            'image': im_path,
            'atc_code': atc_code,
            'created_at': datetime.utcnow().isoformat()
        }
        results.append(record)

    # Save generated compounds
    with open(fname, 'w') as f:
        json.dump(results, f)
Beispiel #11
0
def make_image(smi, base=False):
    """Render the molecule for ``smi`` with its formula centred underneath.

    Relies on module-level settings not defined in this function: ``w`` and
    ``h`` (canvas size), ``h_`` (height of the molecule drawing),
    ``vpadding`` (gap between the text and the bottom edge) and ``font``.

    Parameters:
        smi: SMILES string of the molecule.
        base: when true, ' (base)' is appended to the formula caption.

    Returns:
        The composed RGB image.
    """
    mol = Chem.MolFromSmiles(smi)
    formula = CalcMolFormula(mol)
    if base:
        formula = '{} (base)'.format(formula)
    mol_im = Draw.MolToImage(mol, size=(w, h_))
    im = Image.new('RGB', (w, h), color='white')
    im.paste(mol_im)
    draw = ImageDraw.Draw(im)
    # Measure with the same font that is used for drawing; measuring with the
    # default font gives the wrong width/height and mis-centres the caption.
    tw, th = draw.textsize(formula, font=font)
    draw.text((w / 2 - tw / 2, h - th - vpadding),
              formula,
              font=font,
              fill='black')
    return im
def canon(df, idx):
    """Fill in canonical SMILES and formula for row ``idx`` of ``df`` in place.

    Reads ``df.loc[idx]['smiles']``, adds explicit hydrogens and writes:
      * 'c_smiles' -- the SMILES RDKit produces for the molecule,
      * 'status'   -- 0 on success, -2 if any step raised,
      * 'Formula'  -- the molecular formula with the chlorine term (symbol
        and count together) moved to the end.

    Failures are reported on stdout and never raised. Returns ``df``.
    """
    import re

    print('trying to canonize smile for idx: {}'.format(idx))
    # Bound before the try so the error report below can always reference it,
    # even when reading the 'smiles' column is what failed.
    smile = None
    try:
        smile = df.loc[idx]['smiles']
        m = Chem.MolFromSmiles(smile)
        m = Chem.AddHs(m)
        c_smile = Chem.MolToSmiles(m)
        df.loc[idx, 'c_smiles'] = c_smile
        df.loc[idx, 'status'] = 0

        formula = CalcMolFormula(m)
        # Move 'Cl' together with its count. Removing only the symbol left the
        # count glued to the preceding element (C6H4Cl2 -> C6H42Cl).
        chlorine = re.search(r'Cl\d*', formula)
        if chlorine:
            formula = (formula[:chlorine.start()] + formula[chlorine.end():] +
                       chlorine.group())

        df.loc[idx, 'Formula'] = formula

    except Exception as e:
        df.loc[idx, 'status'] = -2
        print("could not convert smile {} of molecule {} : {}".format(
            smile, idx, df.loc[idx].get('Name')))
        print('Exception: {}'.format(e))
    return df
Beispiel #13
0
def _chembl_desc_list(mol):
    """Compute the descriptors named in ``CBL_DESC_LIST`` plus derived fields.

    ``mol`` is first passed through ``_neutralise_sulphoxide``. Every entry of
    ``Descriptors.descList`` whose name is in ``CBL_DESC_LIST`` is evaluated;
    'MolecularFormula', 'Ro3Pass' and 'NumRo5' are then added and the
    'ExactMolWt' entry is renamed to 'MonoisotopicMolWt'.

    The lookups below raise ``KeyError`` unless 'MolWt', 'NumHAcceptors',
    'NumHDonors', 'MolLogP', 'NumRotatableBonds', 'TPSA' and 'ExactMolWt' were
    all computed, i.e. are present in both lists.
    """
    mol = _neutralise_sulphoxide(mol)
    descriptors = dict()
    for name, fn in Descriptors.descList:
        if name in CBL_DESC_LIST:
            if name == 'ExactMolWt':
                # `mol` is rebound here, so this descriptor, every later one in
                # descList order, and the MolecularFormula fallback below are
                # all computed on the result of _remove_isotope_info.
                # NOTE(review): confirm that affecting the later descriptors
                # is intended.
                mol = _remove_isotope_info(deepcopy(mol))
            descriptors[name] = fn(mol)
    if 'MolecularFormula' not in descriptors:
        descriptors['MolecularFormula'] = CalcMolFormula(mol)
    descriptors['Ro3Pass'] = _ro3_pass(descriptors['MolWt'],
                                       descriptors['NumHAcceptors'],
                                       descriptors['NumHDonors'],
                                       descriptors['MolLogP'],
                                       descriptors['NumRotatableBonds'],
                                       descriptors['TPSA'])
    descriptors['NumRo5'] = _num_ro5_violations(descriptors['MolLogP'],
                                                descriptors['MolWt'],
                                                descriptors['NumHAcceptors'],
                                                descriptors['NumHDonors'])
    # Expose the exact weight under its reporting name only.
    descriptors['MonoisotopicMolWt'] = descriptors.pop('ExactMolWt')
    return descriptors
Beispiel #14
0
def process_compound(line):
    """Turn one tab-separated ``id<TAB>smiles<TAB>label-index`` line into a record.

    Looks up ATC codes, label text and name in the module-level ``atcs``,
    ``labels`` and ``names``, computes the molecular formula, and writes
    ``img/<id>.png`` if that file does not exist yet.

    Returns:
        ``(label, record)`` where ``label`` has '/' replaced by '_' and
        ``record`` holds 'id', 'name', 'formula' and 'atc_codes' (each ATC
        code cut to its first five characters).
    """
    compound_id, smi, label_index = line.split('\t')

    atc_set = atcs.get(compound_id, set())
    label_text = labels[int(label_index)].replace('/', '_')
    compound_name = names.get(compound_id)
    mol = Chem.MolFromSmiles(smi)
    formula = CalcMolFormula(mol)

    # Render every image up front so later steps never have to
    image_path = 'img/{}.png'.format(compound_id)
    if not os.path.exists(image_path):
        Draw.MolToImage(mol).save(image_path)

    record = {
        'id': compound_id,
        'name': compound_name,
        'formula': formula,
        'atc_codes': [code[:5] for code in atc_set]
    }
    return label_text, record
# Overview table with one row per spectrum: SMILES, ion mode and energy.
summary = pd.DataFrame({'smiles': smiles, 'ion_mode': modes, 'energy': energies})


# example 1
# Pick one entry by index and get both the predicted and the stored spectrum.
idx = 551
smi = smiles[idx]
mol = Chem.MolFromSmiles(smi)
ms_pred = model_predict(smi, model)
ms_real = ms[idx]

# annotation
mzs = np.array(ms_pred['mz'])
intensities = np.array(ms_pred['intensity'])
mol = Chem.MolFromSmiles(smi)
# Precursor m/z taken as exact molecular weight minus 1.0032
# (presumably the [M-H]- ion -- confirm against the ion mode of this entry).
precursor = CalcExactMolWt(mol) - 1.0032
formula = CalcMolFormula(mol)
# Unique fragments of the molecule and the unique formulas of those fragments.
frags = np.unique(generateFragments(smi, treeDepth=2))
frags_new = [Chem.MolFromSmiles(s) for s in frags]
frags_formula = np.unique([CalcMolFormula(f) for f in frags_new])
# Collect candidate loss formulas: parent formula minus fragment formula,
# plus the variants with one H added or removed, each kept only if
# check_formula accepts it.
loss_formula = []
for f in frags_formula:
    l = subtract_formula(formula, f)
    if l == '':
        # Nothing left after subtraction; skip this fragment.
        continue
    if check_formula(l):
        loss_formula.append(l)
    add_H = add_formula(l, 'H')
    de_H = subtract_formula(l, 'H')
    if check_formula(add_H):
        loss_formula.append(add_H)
    if check_formula(de_H):
    def getMoleculeFeatures(self):
        """Collect identifiers, descriptors, atoms and bonds of the constructed rdMol.

        Returns a dict with the keys "details" (ComponentDetails),
        "descriptors" (ComponentDescriptors), "atoms" (atom name ->
        ComponentAtom) and "bonds" ((atom name, atom name) -> ComponentBond).
        """
        rdMol = self.__rdMol
        molProps = rdMol.GetPropsAsDict()
        logger.debug("mol props %r", molProps.items())
        #
        formula = CalcMolFormula(rdMol)
        ccId = self.__ccId
        ifCharge = Chem.rdmolops.GetFormalCharge(rdMol)
        #
        inchiKey = Chem.inchi.MolToInchiKey(rdMol)
        inchi = Chem.inchi.MolToInchi(rdMol)
        smiles = Chem.rdmolfiles.MolToSmiles(rdMol,
                                             isomericSmiles=False,
                                             canonical=True)
        isoSmiles = Chem.rdmolfiles.MolToSmiles(rdMol,
                                                isomericSmiles=True,
                                                canonical=True)
        logger.debug("%s formula %s", ccId, formula)
        details = ComponentDetails(ccId=ccId,
                                   formula=formula,
                                   ifCharge=ifCharge)
        descriptors = ComponentDescriptors(smiles=smiles,
                                           isoSmiles=isoSmiles,
                                           inchi=inchi,
                                           inchiKey=inchiKey)
        #
        # --- atoms ---
        symbolCounts = defaultdict(int)
        ccAtomD = {}
        nameByIdx = {}
        for position, atom in enumerate(rdMol.GetAtoms(), 1):
            atomIdx = atom.GetIdx()
            symbol = atom.GetSymbol()
            symbolCounts[symbol] += 1
            # Use the stored name for this 1-based position when there is one,
            # otherwise fall back to symbol + running count of that symbol.
            if position in self.__atomIdxD:
                atName = self.__atomIdxD[position]
            else:
                atName = symbol + str(symbolCounts[symbol])
            aromatic = atom.GetIsAromatic()
            chiral = atom.GetChiralTag() > 0
            formalCharge = atom.GetFormalCharge()
            atomProps = atom.GetPropsAsDict(includePrivate=True,
                                            includeComputed=True)
            cip = atomProps["_CIPCode"] if "_CIPCode" in atomProps else None
            if cip and cip not in ["S", "R"]:
                logger.error("%s (%s): Unexpected atom CIP stereo setting %r",
                             ccId, atName, cip)
            #
            ccAtomD[atName] = ComponentAtom(name=atName,
                                            aType=symbol,
                                            isAromatic=aromatic,
                                            isChiral=chiral,
                                            CIP=cip,
                                            fCharge=formalCharge)
            nameByIdx[atomIdx] = atName
            # Second property fetch kept from the original; its result is unused.
            atomProps = atom.GetPropsAsDict(includePrivate=True,
                                            includeComputed=True)
            logger.debug("%s Atom %s %s %r %r %s", ccId, atName, symbol,
                         aromatic, chiral, cip)
        #
        # --- bonds ---
        ccBondD = {}
        for bond in rdMol.GetBonds():
            beginName = nameByIdx[bond.GetBeginAtomIdx()]
            endName = nameByIdx[bond.GetEndAtomIdx()]
            aromatic = bond.GetIsAromatic()
            #
            stereo = bond.GetStereo()
            if stereo == Chem.rdchem.BondStereo.STEREOE:
                cip = "E"
            elif stereo == Chem.rdchem.BondStereo.STEREOZ:
                cip = "Z"
            else:
                cip = None

            bondProps = bond.GetPropsAsDict(includePrivate=True,
                                            includeComputed=True)
            bondType = bondProps["_MolFileBondType"]
            logger.debug("Bond %s %s iType %r cipStereo %r aromatic %r",
                         beginName, endName, bondType, cip, aromatic)
            #
            if cip and cip not in ["E", "Z"]:
                logger.error(
                    "%s (%s %s): Unexpected bond CIP stereo setting %r", ccId,
                    beginName, endName, cip)
            #
            ccBondD[(beginName, endName)] = ComponentBond(iType=bondType,
                                                          isAromatic=aromatic,
                                                          CIP=cip)
        #
        return {
            "details": details,
            "descriptors": descriptors,
            "atoms": ccAtomD,
            "bonds": ccBondD
        }
from rdkit import Chem
import sys
import re
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
# Compute the formula of the SMILES given as the first command-line argument.
formula = CalcMolFormula(Chem.MolFromSmiles(sys.argv[1]))
# Insert a space before every capital letter (i.e. before each element
# symbol) and drop the leading space. print() is called as a function so the
# line is valid on Python 3; the single-argument form prints the same text on
# Python 2.
print(re.sub(r'([a-z]*)([A-Z])', r'\1 \2', formula).lstrip())
Beispiel #18
0
def parse_f(f):
    """Print one tab-separated metadata line for the mol file path ``f``.

    The CAS number is taken from the path: the second '/'-separated component
    when the path contains '/', otherwise the whole string, cut at the first
    '.'. Structure data comes from the mol file when it can be read and
    round-tripped through InChI; otherwise from the user-supplied ``syn_data``
    entry for that CAS (optionally via a PubChem lookup, presumably through
    pubchempy -- confirm). Names are merged from PubChem, ``pdf_data`` and
    ``syn_data``.

    Output line fields: cid, CAS, formula, mw, smiles, inchi, inchikey,
    iupac_name, followed by the accepted names.

    Returns:
        Always ``None``; the result is printed. Also returns ``None`` without
        printing a data line when the CAS is in ``ignored_CASs`` or when
        neither the mol file nor ``syn_data`` yields data.
    """
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    try:
        # User-provided 'pubchem' or 'formula' data takes precedence over the
        # mol file: raising here jumps to the fallback branch below.
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
            assert mol is not None
        except:
            print('Cannot read %s' % f)
            # Deliberate ZeroDivisionError: forces the outer except to run.
            1 / 0
        try:
            inchi_val = inchi.MolToInchi(mol)
        except:
            print('BAILING ON %s' % f)
            1 / 0
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            1 / 0
    except:
        # Fallback path: the mol file was skipped or unusable, so all fields
        # must come from syn_data (directly or via the stored PubChem id).
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula

                    # Remember the lookup under the stringified PubChem id.
                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                # Purely user-specified entry; 'MW' is required here, every
                # other field defaults to an empty value.
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None

    if not failed_mol:
        # Derive all structure fields from the InChI-round-tripped molecule.
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        #        for i in mol.GetAtoms():
        #            if i.GetIsotope():
        #                mw = Descriptors.ExactMolWt(mol)
        #                break

        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    try:
        if not failed_mol:
            # Look up cid / IUPAC name / synonyms by InChIKey, caching both
            # hits and misses (a miss is cached as (-1, '', [''])).
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except:
                    mycache[str(inchikey)] = (-1, '', [''])
    except:
        cid = -1
        iupac_name = ''
        names = ['']

    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        if 'Other Names' in d:
            syns = d['Other Names']
        else:
            syns = []
        # The pdf_data name becomes the IUPAC name when none is known yet;
        # otherwise it is put first among the synonyms.
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])

        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            else:
                # Discard it otherwise
                pass
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any synonyms for this CAS number, add those names if they haven't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)

    # Drop empty names (e.g. the [''] placeholders used above).
    actual_names = [i for i in actual_names if i]

    if inchi_val is not None:
        # The InChI is emitted without its 'InChI=1S/' prefix.
        inchi_val = inchi_val.replace('InChI=1S/', '')

    formula = serialize_formula(formula)
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)

    s += '\t'.join(actual_names)
    print(s)
    return None
Beispiel #19
0
def process(init_data, use_cache=True):
    '''Assemble one compound record from user data, Common Chemistry and PubChem.

    ``init_data`` is a dict of user-specified fields; the keys read here are
    'CAS', 'mol', 'pubchem', 'CASRN', 'inchi', 'inchikey', 'smiles',
    'formula', 'MW', 'name' and 'synonyms'. A copy is made, so the caller's
    dict is not modified. ``use_cache`` is forwarded to
    ``find_pubchem_from_ids``.

    A structure is taken from the first source that parses, in this order:
    user mol file, user SMILES, user InChI, Common Chemistry SMILES/InChI,
    PubChem SMILES/InChI. Values computed from it are then overridden by any
    matching user-specified field.

    Returns a dict with the keys 'cid', 'CAS', 'formula', 'MW', 'smiles',
    'inchi', 'inchikey', 'name' and 'synonyms'.

    Raises ValueError when no structure, CAS or name can be determined, or
    when the smiles, inchi or inchikey equals '*'.

    Examples
    --------
    
    >>> res = process({'CAS': '10170-69-1', 'synonyms': ['14267-36-8', 'NSC 22319'], 'name': 'Manganese, decacarbonyldi-, (Mn-Mn)'})
    >>> res['inchi'], res['smiles'], res['cid'], res['CAS']
    ('InChI=1S/10CO.2Mn/c10*1-2;;', '[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[C-]#[O+].[Mn].[Mn]', 517769, '10170-69-1')
    '''
    # print(locals())
    init_data = init_data.copy()
    cc = cc_CAS = cc_name = cc_inchi = cc_inchikey = cc_smiles = cc_synonyms = cc_deprecated_CASs = None
    if 'CAS' in init_data:
        try:
            cc = common_chemistry_data(init_data['CAS'])
            cc_CAS, cc_name, cc_inchi, cc_inchikey, cc_smiles, cc_synonyms, cc_deprecated_CASs = cc
        except ValueError:
            # Compound is not in common chemistry; this is OK
            pass

    cid = iupac_name = p_MW = p_inchi = p_inchikey = p_smiles = p_formula = p_synonyms = None

    if init_data.get('mol', None) is not None:
        # If not in common chemistry or no InChi there, but if we have a mol file, get the inchi and inchikey for the
        # pubchem lookup
        mol = Chem.MolFromMolFile(init_data['mol'])
        if mol is not None:
            init_data['inchi'] = MolToInchi(mol)
            init_data['inchikey'] = InchiToInchiKey(init_data['inchi'])

    # NOTE(review): the CAS is read from init_data under 'CAS' above but under
    # 'CASRN' here and in the lookup below -- confirm which key callers use.
    can_search_pubchem = (init_data.get('pubchem') is not None
                          or init_data.get('CASRN', cc_CAS) is not None
                          or init_data.get('inchi', cc_inchi) is not None
                          or init_data.get('inchikey', cc_inchikey) is not None
                          or init_data.get('smiles', cc_smiles) is not None)

    if can_search_pubchem:
        try:
            p = find_pubchem_from_ids(
                pubchem=init_data.get('pubchem'),
                CASRN=init_data.get('CASRN', cc_CAS),
                inchi=init_data.get('inchi', cc_inchi),
                inchikey=init_data.get('inchikey', cc_inchikey),
                smiles=init_data.get('smiles', cc_smiles),
                use_cache=use_cache)
        except Exception as e:
            # A failed PubChem lookup is reported but not fatal.
            p = None
            print(e, 'exception')
        if p is not None:
            cid, iupac_name, p_MW, p_inchi, p_inchikey, p_smiles, p_formula, p_synonyms = p
    # print(locals())
    mol = None
    # Be aware some smiles descriptions are wrong
    # Start with user overriding
    if 'mol' in init_data:
        mol = Chem.MolFromMolFile(init_data['mol'])
    if mol is None and 'smiles' in init_data:
        mol = Chem.MolFromSmiles(init_data['smiles'])
    if mol is None and 'inchi' in init_data:
        # InChI strings may be given with or without the "InChI=1S/" prefix.
        mol = MolFromInchi(
            init_data['inchi']) if init_data['inchi'].startswith(
                "InChI=1S/") else MolFromInchi("InChI=1S/" +
                                               init_data['inchi'])
    # Trust common chemistry next
    if mol is None and cc_smiles is not None:
        mol = Chem.MolFromSmiles(cc_smiles)
    if mol is None and cc_inchi is not None:
        mol = MolFromInchi(cc_inchi) if cc_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + cc_inchi)
    # Did we pull up the structure from pubchem??
    if mol is None and p_smiles is not None:
        mol = Chem.MolFromSmiles(p_smiles)
    if mol is None and p_inchi is not None:
        mol = MolFromInchi(p_inchi) if p_inchi.startswith(
            "InChI=1S/") else MolFromInchi("InChI=1S/" + p_inchi)
    if mol is None:
        raise ValueError("No structure found")

    # Values computed from the chosen structure; user data may override below.
    smiles = Chem.MolToSmiles(mol, True)
    inchi = MolToInchi(mol)
    inchikey = InchiToInchiKey(inchi)
    #MW = Descriptors.ExactMolWt(mol)
    formula = CalcMolFormula(mol, True, True)
    formula = serialize_formula(formula)
    # The molecular weight is computed from the serialized formula.
    MW = molecular_weight(nested_formula_parser(formula))

    # print(inchi, cc_inchi, p_inchi)
    # print(inchikey, cc_inchikey, p_inchikey)
    # print(smiles, cc_smiles, p_smiles)

    # output values
    if 'pubchem' in init_data:
        cid = init_data['pubchem']
    elif cid is None:
        # -1 marks "no PubChem cid known".
        cid = -1

    if cc_CAS is not None:
        CAS = cc_CAS
    elif 'CAS' in init_data:
        CAS = init_data['CAS']
    else:
        raise ValueError("CAS culd not be found")

    if 'formula' in init_data:
        # Override rdkit
        formula = init_data['formula']

    if 'MW' in init_data:
        # Override rdkit
        MW = init_data['MW']

    # User-specified identifiers also override the computed ones.
    if 'smiles' in init_data:
        smiles = init_data['smiles']
    if 'inchi' in init_data:
        inchi = init_data['inchi']
    if 'inchikey' in init_data:
        inchikey = init_data['inchikey']

    if inchikey == '*' or smiles == '*' or inchi == '*':
        raise ValueError("Failure in rdkit")

    # Do we have a name specified in the settings?
    if 'name' in init_data:
        name = init_data['name']
    elif cc_name is not None:
        name = cc_name
    elif iupac_name is not None:
        name = iupac_name
    else:
        raise ValueError("There is no name for this compound")

    # Merge synonyms from every source, de-duplicate, and drop the chosen name.
    synonyms = []
    if cc_synonyms is not None:
        synonyms += cc_synonyms
    if cc_deprecated_CASs is not None:
        synonyms += cc_deprecated_CASs
    if p_synonyms is not None:
        synonyms += p_synonyms
    if 'synonyms' in init_data:
        synonyms += init_data['synonyms']
    synonyms = list(set(synonyms))
    if name in synonyms:
        synonyms.remove(name)
    if synonyms:

        # Order by length first, then case-insensitively, for a stable output.
        def key_sort_str(s):
            return len(s), s.lower()

        synonyms = sorted(synonyms, key=key_sort_str)
        # synonyms = natsorted(synonyms)
    # synonyms = []

    return {
        'cid': cid,
        'CAS': CAS,
        'formula': formula,
        'MW': MW,
        'smiles': smiles,
        'inchi': inchi,
        'inchikey': inchikey,
        'name': name,
        'synonyms': synonyms
    }
Beispiel #20
0
def get_chemicalFormula(mol):
    """Return the chemical formula that ``CalcMolFormula`` computes for ``mol``."""
    chemical_formula = CalcMolFormula(mol)
    return chemical_formula
Beispiel #21
0
            if val.isalpha():
                break
            else:
                checkIsItFirst = True
                num += str(val)
                secondPart = secondPart[1:]
        if not checkIsItFirst:
            num = 1
    return num


# For every row, compare the stored 'stoichiometry' string with the formula
# RDKit computes from the row's SMILES and print the rows whose per-element
# counts disagree.
for index, row in df.iterrows():
    smiles = row['smiles']
    readedFormula = row['stoichiometry']
    molObj = Chem.MolFromSmiles(smiles)
    formula = CalcMolFormula(molObj)
    atoms = rdkit.Chem.rdchem.Mol.GetAtoms(molObj)
    # NOTE(review): `formula` comes from CalcMolFormula and `readedFormula`
    # from the data frame -- confirm that the `isnan` in scope accepts both.
    if isnan(formula) or isnan(readedFormula):
        print("*************** NAN VALUE : " + str(readedFormula) + " - " +
              str(formula))
    else:
        if readedFormula != formula:
            # The strings differ; compare the count GetNumber extracts for
            # each atom's symbol from both formulas.
            for atom in atoms:
                atomSymbol = rdkit.Chem.rdchem.Atom.GetSymbol(atom)
                atomSymbolSTR = str(atomSymbol)
                readedNumber = GetNumber(str(readedFormula), atomSymbolSTR)
                createdNumber = GetNumber(str(formula), atomSymbolSTR)
                if readedNumber != createdNumber:
                    print(readedFormula)
                    print(formula)
                    # NOTE(review): `count` is not assigned in the visible
                    # code -- confirm it is defined earlier in the file.
                    print(count)
Beispiel #22
0
    df = pd.DataFrame(all_dicts)
    return df


print('doing biocyc')
# One record per readable MetaCyc mol file found under each BioCyc path.
molecules = []
for path in biocyc_paths:
    mol_files = glob.glob(os.path.join(path, 'MetaCyc-MOLfiles/*.mol'))
    for f in mol_files:
        # The compound id is the file name without its '.mol' extension.
        cpd_id = os.path.basename(f).replace('.mol', '')
        with open(f, 'r', encoding='utf-8', errors='ignore') as fid:
            t = fid.read()
        # The name is the first line of the file, stripped of whitespace and
        # surrounding double quotes.
        name = t.split('\n')[0].strip().strip('"')
        mol = Chem.MolFromMolFile(f, sanitize=True)
        if mol is not None:
            formula = CalcMolFormula(mol)
            try:
                Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
                smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
                original_smiles = smiles
            except:
                # Kekulization or SMILES generation failed; keep the record
                # but without a SMILES (stored below as the string 'None').
                original_smiles = None
            molecules.append({
                'original_id': str(cpd_id),
                'name': str(name),
                'source': str('BioCyc'),
                'formula': str(formula),
                'original_smiles': str(original_smiles),
                'unique_id': str(uuid.uuid4())
            })
        else:
Beispiel #23
0
    mol, status = neutralise_charges(mol)
    mol, status = desalt(mol)
    SanitizeMol(mol)
    mol, status = neutralise_charges(mol)
    SanitizeMol(mol)

    Kekulize(mol, clearAromaticFlags=True)
    new_smiles = MolToSmiles(mol, isomericSmiles=True)
    new_inchikey = MolToInchiKey(mol)
    new_inchi = MolToInchi(mol)
    mw = ExactMolWt(mol)
    sdf.loc[i, 'smiles'] = new_smiles
    sdf.loc[i, 'inchi_key'] = new_inchikey
    sdf.loc[i, 'inchi'] = new_inchi
    sdf.loc[i, 'neutral_mass'] = mw
    sdf.loc[i, 'formula'] = CalcMolFormula(mol)

# Column renames applied only to the exported copy of `sdf`
# (`rename` returns a new frame, so `sdf` itself keeps its column names).
setup_cols = {
    'inchi_key': 'metatlas_inchikey',
    'inchi': 'metatlas_inchi',
    'formula': 'metatlas_formula',
    'neutral_mass': 'metatlas_mw',
    'code': 'original_id',
    'library': 'source'
}
# Write the renamed table as a tab-separated file.
# NOTE(review): the output path is a hard-coded absolute path.
sdf.rename(columns=setup_cols).to_csv(
    '/Users/bpb/Downloads/Tim-Tec-Compounds.tab', sep='\t', index=None)

df = pd.merge(sdf[[
    'code', 'inchi_key', 'inchi', 'name', 'neutral_mass', 'original_smiles'
]],