Exemple #1
0
def inchi_to_graph(inchi, max_atomic_number=118, device=torch.device('cpu')):
    """
    Converts an inchi string to a DGL Graph object and associate the one hot encoding features for each node.

    Each bond of the molecule is stored as two directed edges (one per direction).
    The node features are stored under ``graph.ndata['x']``: row ``i`` is zero everywhere
    except for a 1 in the column whose index is the atomic number of atom ``i``.

    :param inchi: An inchi string
    :param max_atomic_number: The max_atomic_number determines the final size of the nodes feature matrix
        (its number of columns). The atomic number is used directly as the column index, so every
        atomic number in the molecule has to be strictly smaller than this value.
    :param device: torch device the node feature matrix is moved to
    :return: DGL.Graph
    :raises ValueError: if the inchi string could not be converted to a molecule
    """
    mol = MolFromInchi(inchi)
    if mol is None:
        # Fail with an explicit message instead of an AttributeError on None below.
        raise ValueError('Could not build a molecule from inchi: "{}"'.format(inchi))
    num_atoms = mol.GetNumAtoms()

    # DGLGraph creation from rdkit mol object
    graph = dgl.DGLGraph()
    graph.add_nodes(num_atoms)
    for bond in mol.GetBonds():
        src = bond.GetBeginAtomIdx()
        dest = bond.GetEndAtomIdx()
        graph.add_edge(src, dest)
        # Edges in DGL are directional, to ensure bidirectionality, add reverse edge
        graph.add_edge(dest, src)

    # One hot encoding for nodes features.
    # The index tensor is built with an explicit integer dtype and an explicit
    # (num_atoms, 1) shape so that scatter_ also works for a molecule without atoms.
    atomic_numbers = [
        mol.GetAtomWithIdx(atom_index).GetAtomicNum()
        for atom_index in range(num_atoms)
    ]
    one_hot_indexes = torch.tensor(atomic_numbers, dtype=torch.long).unsqueeze(1)
    graph.ndata['x'] = torch.zeros(num_atoms, max_atomic_number) \
        .scatter_(1, one_hot_indexes, 1).to(device)

    return graph
Exemple #2
0
def test_sequence_minimal():
    """Run a Standardizer built with no arguments on violacein and check both exports."""
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    expected_smiles = 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'

    # Violacein: the InChI is expected to come back unchanged
    standardized = Standardizer().compute(MolFromInchi(violacein_inchi))
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == expected_smiles

    # L-Lactate
    mol = MolFromInchi('')
Exemple #3
0
def test_keep_biggest():
    """Check the single fragment returned by Filters.keep_biggest for multi-fragment inputs."""
    # (parser, exporter, multi-fragment input, expected export of the kept fragment)
    cases = (
        (MolFromSmiles, MolToSmiles, 'CCCC.CC', 'CCCC'),
        (MolFromSmiles, MolToSmiles, 'CCCCC.CC.[H].CCC', 'CCCCC'),
        (MolFromInchi, MolToInchi,
         'InChI=1S/C5H12N2O2.C4H7NO4/c6-3-1-2-4(7)5(8)9;5-2(4(8)9)1-3(6)7/h4H,1-3,6-7H2,(H,8,9);2H,1,5H2,(H,6,7)(H,8,9)/t4-;2-/m00/s1',
         'InChI=1S/C4H7NO4/c5-2(4(8)9)1-3(6)7/h2H,1,5H2,(H,6,7)(H,8,9)/t2-/m0/s1'),
        (MolFromInchi, MolToInchi, 'InChI=1S/Mo.4O/q;;;2*-1', 'InChI=1S/Mo'),
    )
    for parse, export, depiction, expected in cases:
        kept = Filters.keep_biggest(parse(depiction))
        assert export(kept) == expected
def annotate_chemical_svg(network):
    """Annotate chemical nodes with SVGs depiction.

    For every node whose ``data['type']`` is ``'chemical'`` and whose
    ``data['inchi']`` is not None, a 200x200 SVG drawing is generated and stored
    under ``node['data']['svg']`` as a URL-quoted ``data:image/svg+xml`` URI.
    When the depiction fails, warnings are logged and ``node['data']['svg']`` is
    set to None. The nodes are modified in place.

    :param network: dict, network of elements as outputted by the sbml_to_json method
    :return: dict, network annotated
    """
    from rdkit.Chem import MolFromInchi
    from rdkit.Chem.Draw import rdMolDraw2D
    from rdkit.Chem.AllChem import Compute2DCoords
    from urllib import parse

    for node in network['elements']['nodes']:
        data = node['data']
        if data['type'] != 'chemical' or data['inchi'] is None:
            continue
        inchi = data['inchi']
        try:
            mol = MolFromInchi(inchi)
            if mol is None:
                # Make the parsing failure explicit instead of relying on the
                # drawing calls below to choke on None.
                raise ValueError('Mol is None')
            Compute2DCoords(mol)
            drawer = rdMolDraw2D.MolDraw2DSVG(200, 200)
            drawer.DrawMolecule(mol)
            drawer.FinishDrawing()
            svg_draft = drawer.GetDrawingText().replace("svg:", "")
            svg = 'data:image/svg+xml;charset=utf-8,' + parse.quote(
                svg_draft)
            data['svg'] = svg
        except Exception as e:
            # Exception (not BaseException): a failed depiction is best-effort,
            # but KeyboardInterrupt / SystemExit must still propagate.
            msg = 'SVG depiction failed from inchi: "{}"'.format(inchi)
            logging.warning(msg)
            logging.warning("Below the RDKit backtrace...")
            logging.warning(e)
            data['svg'] = None

    return network
Exemple #5
0
 def drawChemicalList(self, id_inchi, subplot_size=(200, 200)):
     """Build one SVG string per entry of ``id_inchi``.

     For each distinct InChI a one-column grid image is drawn with that
     molecule in first position, then the SVG text is post-processed line by
     line: any line containing a decimal number greater than
     ``subplot_size[1]`` is dropped and the ``height='...'`` attribute is
     rewritten to ``subplot_size[1]``.
     NOTE(review): presumably this crops the grid to its first cell only --
     confirm against the SVG actually produced by the RDKit version in use.

     :param id_inchi: mapping of identifier -> InChI string
     :param subplot_size: (width, height) of a single molecule drawing
     :return: dict mapping each identifier of ``id_inchi`` to its SVG string
     """
     from rdkit.Chem import MolFromInchi
     from rdkit.Chem import Draw
     # Raw-string patterns, compiled once: the former non-raw literals relied
     # on invalid escape sequences such as "\d".
     decimal_re = re.compile(r"(\d+\.\d+)")
     height_re = re.compile(r"height='\d+")
     max_height = subplot_size[1]
     toRet = {}
     inchi_list = list({id_inchi[i] for i in id_inchi})
     list_mol = [MolFromInchi(inchi) for inchi in inchi_list]
     for i, current_inchi in enumerate(inchi_list):
         # The other molecules are deep-copied on every pass, as before.
         # NOTE(review): likely avoidable if MolsToGridImage leaves its inputs untouched -- confirm.
         cp_list_mol = copy.deepcopy(list_mol)
         cp_list_mol.pop(i)
         tmp_list_mol = [list_mol[i]]+cp_list_mol
         img = Draw.MolsToGridImage(tmp_list_mol, molsPerRow=1, subImgSize=(subplot_size[0], subplot_size[1]), useSVG=True)
         #add the groups tag with the id's of the reactions -- should have be size width=subplot_size[0] height=subplot_size[1]*len(list_mol)
         kept_lines = []
         for line in img.splitlines():
             # The filter is evaluated on the line as emitted by RDKit,
             # before the height attribute is rewritten.
             if any(float(y) > max_height for y in decimal_re.findall(line)):
                 continue
             kept_lines.append(height_re.sub("height='"+str(max_height), line))
         svg_str = ''.join(kept + '\n' for kept in kept_lines)
         for y in id_inchi:
             if id_inchi[y] == current_inchi:
                 toRet[y] = svg_str
     return toRet
Exemple #6
0
 def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
     # Import (if needed)
     if itype == 'smiles':
         rdmol = MolFromSmiles(idepic, sanitize=True)
     elif itype == 'inchi':
         rdmol = MolFromInchi(idepic, sanitize=True)
     else:
         raise NotImplementedError(
             '"{}" is not a valid input type'.format(itype))
     if rdmol is None:  # Check imprt
         raise self.DepictionError(
             'Import error from depiction "{}" of type "{}"'.format(
                 idepic, itype))
     # Export
     odepic = dict()
     for item in otype:
         if item == 'smiles':
             odepic[item] = MolToSmiles(
                 rdmol
             )  # MolToSmiles is tricky, one mays want to check the possible options..
         elif item == 'inchi':
             odepic[item] = MolToInchi(rdmol)
         elif item == 'inchikey':
             odepic[item] = MolToInchiKey(rdmol)
         else:
             raise NotImplementedError(
                 '"{}" is not a valid output type'.format(otype))
     return odepic
Exemple #7
0
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert chemical depiction to others type of depictions
    
    :param  idepic: string depiction to be converted, str
    :param   itype: type of depiction provided as input, str ('smiles' or 'inchi')
    :param   otype: types of depiction to be generated, {"", "", ..}
                    (valid items: 'smiles', 'inchi', 'inchikey'; the default set is only read)
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: if itype or one of the requested output types is unknown
    :raises Exception: if the input depiction cannot be parsed
    
    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    # Import (if needed)
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    elif itype == 'inchi':
        rdmol = MolFromInchi(idepic, sanitize=True)
    else:
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    if rdmol is None:  # Check import
        raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
    
    # Export
    odepic = dict()
    for item in otype:
        if item == 'smiles':
            odepic[item] = MolToSmiles(rdmol)  # MolToSmiles is tricky, one may want to check the possible options..
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        elif item == 'inchikey':
            odepic[item] = MolToInchiKey(rdmol)
        else:
            # Report the offending item, not the whole collection of requested types.
            raise NotImplementedError('"{}" is not a valid output type'.format(item))

    return odepic
Exemple #8
0
 def _transform(self, x):
     """Return the Morgan fingerprint bit ids that are set for one record.

     The molecule is built from ``x['standard_inchi']``; when that lookup or
     parsing raises, or yields no molecule, ``x['Compound_SMILES']`` is used
     instead. Uses ``self.radius`` and ``self.dim`` as fingerprint radius and size.

     :param x: mapping-like record with 'standard_inchi' and 'Compound_SMILES' entries
     :return: list of the bit ids collected in the fingerprint's bitInfo
     """
     try:
         mol = MolFromInchi(x['standard_inchi'])
     except Exception:
         # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
         mol = None
     if mol is None:
         # MolFromInchi can signal a failure by returning None rather than raising,
         # so the SMILES fallback has to cover that case as well.
         mol = MolFromSmiles(x['Compound_SMILES'])
     info = {}
     # Only the bitInfo side output is needed; the bit vector itself is discarded.
     AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, self.dim, bitInfo=info)
     return list(info.keys())
Exemple #9
0
def test_remove_stereo():
    """Check Filters.remove_stereo on a SMILES input and on an InChI input."""
    from_smiles = Filters.remove_stereo(MolFromSmiles('C[C@@H](C(=O)[O-])O'))
    assert MolToSmiles(from_smiles) == 'CC(O)C(=O)[O-]'

    stereo_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    from_inchi = Filters.remove_stereo(MolFromInchi(stereo_inchi))
    assert MolToSmiles(from_inchi) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(O)=Nc2ccccc21'

    # Expected to change tautomerism
    commuted = Filters.commute_inchi(from_inchi)
    assert MolToSmiles(commuted) == 'O=C1NC(C2=CNC3=C2C=C(O)C=C3)=CC1=C1C(=O)NC2=CC=CC=C21'
def add_exact_mass(specs):
    """Compute the exact mass of each spectrum's molecule and store it as 'exact_mass'.

    The molecule is built from the 'smiles' entry and, when that yields no
    molecule, from the 'inchi' entry. When the computed mass differs from the
    'parent_mass' entry (0.0 if absent) by more than 1 in either direction,
    both values are printed.

    :param specs: iterable of objects exposing ``get(key[, default])`` and ``set(key, value)``
    :return: None, the objects are updated in place
    """
    for s in specs:
        mol = MolFromSmiles(s.get('smiles'))
        if mol is None:
            mol = MolFromInchi(s.get('inchi'))
        exact_mass_smi = CalcExactMolWt(mol)
        # abs() must wrap the difference only; it previously wrapped the whole
        # comparison, so negative deviations were never reported.
        if abs(exact_mass_smi - s.get('parent_mass', 0.0)) > 1:
            print(exact_mass_smi, s.get('parent_mass'))
        s.set('exact_mass', exact_mass_smi)
def standarize_mol_by_inchi(mol, neutralize=True):
    """Rebuild a molecule from the InChI generated for it.

    Hydrogens are added to ``mol``, an InChI is generated with
    ``FixedH=False, RecMet=False``, optionally passed through
    ``neutralize_inchi``, and a new molecule is parsed from it.

    :param mol: input molecule
    :param neutralize: when true, the generated InChI goes through neutralize_inchi first
    :return: the molecule parsed from the InChI, after ``AddHs(..., explicitOnly=True)``
    """
    hydrogenated = AddHs(mol)
    # generate_inchi yields three values; only the InChI string is used here.
    sinchi, _code, _msg = generate_inchi(hydrogenated, FixedH=False, RecMet=False)
    nsinchi = neutralize_inchi(sinchi) if neutralize else sinchi
    rebuilt = MolFromInchi(nsinchi, removeHs=False)
    return AddHs(rebuilt, explicitOnly=True)
Exemple #12
0
def test_sequence_rr_legacy():
    """Run the 'sequence_rr_legacy' Standardizer sequence on violacein and check both exports."""
    violacein_inchi = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    expected_smiles = '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'

    # Violacein: the InChI is expected to come back unchanged
    standardizer = Standardizer(sequence_fun='sequence_rr_legacy')
    standardized = standardizer.compute(MolFromInchi(violacein_inchi))
    assert MolToInchi(standardized) == violacein_inchi
    assert MolToSmiles(standardized) == expected_smiles
Exemple #13
0
    def commute_inchi(cls, mol_in):
        """Round-trip an RDKit compound through its InChI string.

        The compound is exported to InChI and a new compound is parsed back
        from that string (without sanitization and without removing Hs);
        the properties of the input are then copied onto the new compound.

        :param   mol_in:  RDKit Mol
        :return  mol_out: RDKit Mol
        :raises ValueError: if no compound could be built back from the InChI
        """
        # logLevel=None because the InChI export is talkative
        inchi_string = MolToInchi(mol_in, logLevel=None)
        parse_options = dict(
            sanitize=False,
            removeHs=False,
            logLevel=None,
            treatWarningAsError=False,
        )
        mol_out = MolFromInchi(inchi_string, **parse_options)
        if mol_out:
            # Copy the properties
            cls._copy_properties(mol_in, mol_out)
            return mol_out
        raise ValueError("Failed InChi validity filter.")
Exemple #14
0
    def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
        """Convert chemical depiction to others type of depictions

        Usage example:
         - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
         - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})

        :param idepic: Input string
        :param itype: The type of input
        :param otype: Type of output. Valid options: inchi, smiles, inchikey

        :type idepic: str 
        :type itype: str
        :type otype: dict

        :rtype: dict
        :return: Dictionnary of results
        """
        # Import (if needed)
        if itype == 'smiles':
            rdmol = MolFromSmiles(idepic, sanitize=True)
        elif itype == 'inchi':
            rdmol = MolFromInchi(idepic, sanitize=True)
        else:
            raise NotImplementedError('"{}" is not a valid input type'.format(itype))
        if rdmol is None:  # Check imprt
            raise self.DepictionError('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
        # Export
        odepic = dict()
        for item in otype:
            if item == 'smiles':
                odepic[item] = MolToSmiles(rdmol)  # MolToSmiles is tricky, one mays want to check the possible options..
            elif item == 'inchi':
                odepic[item] = MolToInchi(rdmol)
            elif item == 'inchikey':
                odepic[item] = MolToInchiKey(rdmol)
            else:
                raise NotImplementedError('"{}" is not a valid output type'.format(otype))
        return odepic
Exemple #15
0
def rdmols_from_document(document, build_from="inchi", add_hs=True):
    """
    Build RDKit mols back from the depictions stored in a document.

    The depictions are read from ``document['list_list_inchis']`` or
    ``document['list_list_smiles']`` depending on ``build_from``; the nesting of
    the lists is preserved in the result.

    :param document: dict holding 'list_stoechiometry' and the list of lists of depictions
    :param build_from: the type of depiction to be used to build back the rdmols, str in ["inchi", "smiles"]
    :param add_hs: add Hs to RDKit mol object, default is True
    :returns: tuple (list of list of rdmols, document['list_stoechiometry'])
    """
    assert build_from in ["inchi", "smiles"]
    assert add_hs in [True, False]

    def _build(depiction):
        # Parse a single depiction with the reader matching build_from.
        if build_from == 'inchi':
            rd_mol = MolFromInchi(depiction, sanitize=True)
        else:
            rd_mol = MolFromSmiles(depiction, sanitize=True)
        return AddHs(rd_mol) if add_hs else rd_mol

    list_stoechiometry = document['list_stoechiometry']
    if build_from == 'inchi':
        depiction_groups = document['list_list_inchis']
    elif build_from == 'smiles':
        depiction_groups = document['list_list_smiles']
    else:
        raise NotImplementedError()

    list_list_rdmols = [
        [_build(depiction) for depiction in group]
        for group in depiction_groups
    ]
    return list_list_rdmols, list_stoechiometry
Exemple #16
0
def generate_structure_and_dictionary(batch):
    """
    Adding the structure data to a compound batch object

    Behaviour, as visible in this function:
    - a batch that already has an ``id`` is left untouched;
    - a batch without ``ctab`` gets a freshly generated id stored in
      ``blinded_batch_id`` and is saved without validation;
    - otherwise, when ``canonical_smiles`` or ``related_molregno_id`` is
      missing, the structure fields are filled in on a best-effort basis, the
      batch is linked to a MoleculeDictionary entry (looked up, or created
      together with its ChemblIdLookup and CompoundStructures rows) and saved
      without validation.

    NOTE(review): this is Python 2 code (print statement below).

    :param batch: compound batch object -- presumably a CBHCompoundBatch, confirm against callers
    :return: the same batch object
    """
    # NOTE(review): only referenced by the commented-out lookup filter further down.
    chirality = "1"
    if batch.id:
        print "not updating"
        # currently we dont update existing compound records
    else:
        if not batch.ctab:
            #blinded compound

            uox_id = generate_uox_id()
            batch.blinded_batch_id = uox_id
            batch.save(validate=False)

        else:
            if not batch.canonical_smiles or not batch.related_molregno_id:
                # Best effort: derive canonical SMILES and CDXML from the ctab.
                # Any failure here is silently ignored.
                try:

                    pybelmol = readstring("mol",
                                          str(batch.ctab).encode("ascii"))
                    batch.canonical_smiles = pybelmol.write("can").split(
                        "\t")[0]
                    batch.properties["cdxml"] = pybelmol.write("cdxml")
                except:
                    pass
                # Best effort: build the standardised mol block from the standard InChI.
                # Any failure here is silently ignored as well.
                try:
                    mol = MolFromInchi(
                        batch.standard_inchi.encode('ascii', 'ignore'))
                    if mol:
                        batch.std_ctab = MolToMolBlock(mol, includeStereo=True)
                except:
                    pass
                inchi_key = batch.standard_inchi_key
                inchi = batch.standard_inchi
                if not batch.related_molregno_id:
                    try:
                        # Reuse the dictionary entry of this project that has the same InChI key.
                        moldict = MoleculeDictionary.objects.get(
                            project=batch.project,
                            structure_type="MOL",
                            # chirality=chirality,
                            structure_key=batch.standard_inchi_key)
                    except ObjectDoesNotExist:
                        uox_id = None
                        # An id requested through the batch warnings is honoured only
                        # when no existing batch already uses it.
                        forced_uox_id = batch.warnings.get(
                            "original_uox_id", None)
                        if forced_uox_id:
                            count_existing_objects = CBHCompoundBatch.objects.filter(
                                related_molregno__chembl__chembl_id=
                                forced_uox_id).count()
                            count_existing_objects += CBHCompoundBatch.objects.filter(
                                blinded_batch_id=forced_uox_id).count()
                            if count_existing_objects == 0:
                                uox_id = forced_uox_id
                                #Now check if there is a chembl and remove if so
                                ChemblIdLookup.objects.filter(
                                    chembl_id=uox_id).delete()
                            else:
                                print(
                                    "Had to generate a new compound ID for %s"
                                    % forced_uox_id)

                        if not uox_id:
                            uox_id = generate_uox_id()
                        # Temporary random negative entity_id; it is replaced by the
                        # real molregno once the dictionary entry exists (see below).
                        rnd = random.randint(-1000000000, -2)
                        uox_id_lookup = ChemblIdLookup.objects.create(
                            chembl_id=uox_id,
                            entity_type="COMPOUND",
                            entity_id=rnd)

                        moldict = MoleculeDictionary.objects.get_or_create(
                            chembl=uox_id_lookup,
                            project=batch.project,
                            structure_type="MOL",
                            structure_key=batch.standard_inchi_key)[0]
                        uox_id_lookup.entity_id = moldict.molregno
                        uox_id_lookup.save()
                        structure = CompoundStructures(
                            molecule=moldict,
                            molfile=batch.std_ctab,
                            standard_inchi_key=inchi_key,
                            standard_inchi=inchi)
                        structure.save()
                        if structure.molecule_id:
                            generateCompoundPropertiesTask(structure)
                    batch.related_molregno = moldict
                batch.save(validate=False)

    return batch
Exemple #17
0
def test_commute_inchi():
    """The InChI exported after Filters.commute_inchi must equal the input InChI."""
    expected = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1'
    commuted = Filters.commute_inchi(MolFromInchi(expected))
    assert MolToInchi(commuted) == expected
Exemple #18
0
    def test1InchiReadPubChem(self):
        # Round-trip check of InChI reading over every molecule collection in
        # self.dataset: each molecule is written to InChI (x), read back, passed
        # through SMILES and written to InChI again (y). Each molecule is then
        # counted as "same" (x == y), "reasonable" (a known, tolerated cause of
        # variance was identified) or "diff" (unexplained variance). The three
        # counters are compared to fixed expected values at the end.
        for f in self.dataset.values():
            same, diff, reasonable = 0, 0, 0
            for m in f:
                if m is None:  # pragma: nocover
                    continue
                x = MolToInchi(m)
                y = None
                # Silence RDKit error logging while the InChI is read back.
                RDLogger.DisableLog('rdApp.error')
                mol = MolFromInchi(x)
                RDLogger.EnableLog('rdApp.error')
                if mol is not None:
                    y = MolToInchi(
                        MolFromSmiles(MolToSmiles(mol, isomericSmiles=True)))
                if y is None:
                    # metal involved?
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        _, error = inst.args
                        if 'Metal' in error or \
                                'Charges were rearranged' in error:
                            reasonable += 1
                            continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # RDKit does not like the SMILES? use MolBlock instead
                    inchiMol = MolFromInchi(x)
                    if inchiMol:
                        rdDepictor.Compute2DCoords(inchiMol)
                        z = MolToInchi(MolFromMolBlock(
                            MolToMolBlock(inchiMol)))
                        if x == z:
                            reasonable += 1
                            continue
                    # InChI messed up the radical?
                    # Compare the sum of (radical electrons * atomic number) over
                    # the radical atoms of the original and of the re-read molecule.
                    unsanitizedInchiMol = MolFromInchi(x, sanitize=False)
                    if sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in m.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]) != sum([
                            a.GetNumRadicalElectrons() * a.GetAtomicNum()
                            for a in unsanitizedInchiMol.GetAtoms()
                            if a.GetNumRadicalElectrons() != 0
                    ]):
                        reasonable += 1
                        continue

                    diff += 1
                    cid = m.GetProp('PUBCHEM_COMPOUND_CID')
                    print(COLOR_GREEN + 'Empty mol for PubChem Compound ' +
                          cid + '\n' + COLOR_RESET)
                    continue
                if x != y:
                    # if there was warning in the first place, then this is
                    # tolerable
                    try:
                        MolToInchi(m, treatWarningAsError=True)
                        MolFromInchi(x, treatWarningAsError=True)
                    except InchiReadWriteError as inst:
                        reasonable += 1
                        continue
                    # or if there are big rings
                    SanitizeMol(m)
                    # NOTE(review): under Python 3, filter() returns an iterator
                    # object that is always truthy, so this branch would be taken
                    # for every molecule reaching it -- confirm the interpreter
                    # and whether the expected counts below rely on it.
                    if filter(lambda i: i >= 8,
                              [len(r) for r in m.GetRingInfo().AtomRings()]):
                        reasonable += 1
                        continue
                    # THERE ARE NO EXAMPLES FOR THE FOLLOWING (no coverage)
                    # or if RDKit loses bond stereo
                    s = MolToSmiles(m, True)
                    if MolToSmiles(MolFromSmiles(s), True) != s:
                        reasonable += 1
                        continue
                    # or if it is RDKit SMILES writer unhappy about the mol
                    inchiMol = MolFromInchi(x)
                    rdDepictor.Compute2DCoords(inchiMol)
                    z = MolToInchi(MolFromMolBlock(MolToMolBlock(inchiMol)))
                    if x == z:
                        reasonable += 1
                        continue

                    diff += 1
                    # NOTE(review): cid is only assigned in the "y is None" branch
                    # above; here it is either undefined or left over from an
                    # earlier molecule.
                    print(COLOR_GREEN +
                          'Molecule mismatch for PubChem Compound ' + cid +
                          COLOR_RESET)
                    print(inchiDiff(x, y))
                    print()
                else:
                    same += 1
            fmt = "\n{0}InChI read Summary: {1} identical, {2} variance, {3} reasonable variance{4}"
            print(fmt.format(COLOR_GREEN, same, diff, reasonable, COLOR_RESET))
            # Expected counts for the dataset.
            self.assertEqual(same, 621)
            self.assertEqual(diff, 0)
            self.assertEqual(reasonable, 560)
Exemple #19
0
    def save(self, force_insert=False, force_update=False, *args, **kwargs):
        """Save the compound structure, refreshing the InChI-derived fields first.

        Two code paths, selected by ``settings.OPEN_SOURCE``:
        - OPEN_SOURCE: the InChI key is recomputed from ``standard_inchi`` and the
          molfile is round-tripped through RDKit when the key changed;
        - otherwise: InChI, InChI key and canonical SMILES are taken from
          ``getStructure(self.molfile)`` when the InChI differs.
        When something changed, the related molecule is updated and the
        ``structureChanged`` signal is sent.

        :raises NoStandardInchi: when no standard InChI is available at the point it is checked
        """

        changed = False
        # True when no row with this primary key exists yet.
        new  =  not bool(CompoundStructures.objects.filter(pk=self.pk).count())
        if settings.OPEN_SOURCE:
            if self.molfile:
                if not new: # The structure already exists and we only want to modify it
                    super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                    changed = True
             #   newInchi = inchiFromPipe(self.molfile, settings.INCHI_BINARIES_LOCATION['1.02'])
                #if newInchi != self.standard_inchi:
                 #   self.standard_inchi = newInchi
                  #  changed = True
            # NOTE(review): .encode() is called before the emptiness check below, so a
            # None standard_inchi fails here with AttributeError, not NoStandardInchi.
            mol = MolFromInchi(self.standard_inchi.encode("ascii"))
            # NOTE(review): when mol is falsy, this branch performs no validation and
            # no save (apart from the "not new" save above).
            if mol:
            # self.canonical_smiles = MolToSmiles(mol)
                if not self.standard_inchi:
                    raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))

                newInchiKey = InchiToInchiKey(self.standard_inchi.encode("ascii"))
                if self.standard_inchi_key != newInchiKey:
                    self.standard_inchi_key = newInchiKey
                    # NOTE(review): mol is rebuilt here but not read again in this method.
                    mol = MolFromInchi(self.standard_inchi.encode("ascii"))
                    # self.canonical_smiles = MolToSmiles(mol)
                    changed = True
                    self.molfile = MolToMolBlock(MolFromMolBlock(str(self.molfile))) # This is how we do kekulisation in RDKit...

                self.clean_fields()
                self.validate_unique()
                super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)

        else:
            if self.molfile:
                if not new: # The structure already exists and we only want to modify it
                    super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs) # this should trigger CMPD_STR_UPDATE_TRIG, which deletes compound images and properties and nulls standard inchi, key, smiles, and molformula
                    changed = True

                data = getStructure(self.molfile)

                newInchi = data['InChI']
                if newInchi != self.standard_inchi:
                    self.standard_inchi = newInchi
                    self.standard_inchi_key = data['InChIKey']
                    #self.molformula = data['Molecular_Formula']
                    self.canonical_smiles = data['Canonical_Smiles']
                    changed = True

            if not self.standard_inchi:
                raise NoStandardInchi("for CompundStructure, pk = " + str(self.pk))

            # Derive the key from the InChI when it is still missing.
            if not self.standard_inchi_key:
                self.standard_inchi_key = InchiToInchiKey(self.standard_inchi.encode("ascii"))

            self.clean_fields()
            self.validate_unique()
            super(CompoundStructures, self).save(force_insert, force_update, *args, **kwargs)

        # Propagate the change to the related molecule and notify listeners.
        if changed:
            self.molecule.structure_key = self.standard_inchi_key
            self.molecule.structure_type = "MOL"
            self.molecule.molfile_update = datetime.now()
            self.molecule.save()
            structureChanged.send(sender=self.__class__, instance=self)
Exemple #20
0
from rdkit.Chem import MolFromInchi
# Smoke check: parse a fixed InChI (with double-bond stereo layer) and fail
# with an AssertionError if MolFromInchi returns a falsy result.
inchi = 'InChI=1S/C6H6O4/c7-5(8)3-1-2-4-6(9)10/h1-4H,(H,7,8)(H,9,10)/b3-1+,4-2+'
mol = MolFromInchi(inchi)
assert mol
def get_charge_from_inchi(inchi, removeHs=False):
    """Return the formal charge of the molecule parsed from an InChI string.

    :param inchi: InChI string
    :param removeHs: forwarded to MolFromInchi
    :return: value of GetFormalCharge for the parsed molecule
    """
    return GetFormalCharge(MolFromInchi(inchi, removeHs=removeHs))
Exemple #22
0
        # Using production model
        print("Production model running...")
        w_path = os.path.join(MODELS_PATH, f"{data}_noHs.pt")

        model = MPNNPredictor(
            node_in_feats=49,
            edge_in_feats=10,
            global_feats=4,
            n_tasks=1,
            output_f=output_f,
        ).to(DEVICE)

        model.load_state_dict(torch.load(w_path, map_location=DEVICE))

        gis = [
            molecule_importance(MolFromInchi(inchi), model)[4]
            for inchi in tqdm(inchis)
        ]
        global_importances = np.vstack(gis)
        np.save(os.path.join(DATA_PATH, f"importances{data}.npy"),
                arr=global_importances)

        # Using oof models
        global_importances_oof = []

        kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

        for idx_split, (_, idx_test) in enumerate(kf.split(inchis)):
            print("Split {}/{} running...".format(idx_split + 1, N_FOLDS))
            inchis_test, values_test = (
                inchis[idx_test].tolist(),
Exemple #23
0
def test_sequence_tunable():
    """Check the defaults of ``sequence_tunable`` and a series of parameter overrides.

    The first part inspects the signature of ``sequence_tunable`` and compares its
    defaulted arguments with the expected switches. The second part runs
    ``Standardizer(sequence_fun='sequence_tunable', ...)`` on violacein (parsed from
    InChI) and on a lactate SMILES, comparing the InChI and SMILES of each result
    with the expected strings.
    """
    # Defaulted arguments are the trailing ones in the signature.
    spec = inspect.getfullargspec(sequence_tunable)
    switches = dict(zip(spec.args[-len(spec.defaults):], spec.defaults))
    assert switches == {
        'OP_REMOVE_ISOTOPE': True,
        'OP_NEUTRALISE_CHARGE': True,
        'OP_REMOVE_STEREO': False,
        'OP_COMMUTE_INCHI': False,
        'OP_KEEP_BIGGEST': True,
        'OP_ADD_HYDROGEN': True,
        'OP_KEKULIZE': True,
        'OP_NEUTRALISE_CHARGE_LATE': True,
    }

    violacein = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
    violacein_no_stereo = 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
    lactate = 'C[C@@H](C(=O)[O-])O'
    lactate_inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
    lactate_inchi_no_stereo = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'
    lactate_inchi_charged = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1'

    # Each row: (parser, input text, params override or None, expected InChI, expected SMILES)
    cases = [
        # Violacein, default parameters
        (MolFromInchi, violacein, None,
         violacein,
         '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'),
        # Violacein, strip stereo
        (MolFromInchi, violacein, {'OP_REMOVE_STEREO': True},
         violacein_no_stereo,
         '[H]OC1=C([H])C2=C(C([H])=C1[H])N([H])C([H])=C2C1=C([H])C(=C2C(=O)N([H])C3=C([H])C([H])=C([H])C([H])=C23)C(=O)N1[H]'),
        # Violacein, implicit Hs
        (MolFromInchi, violacein, {'OP_ADD_HYDROGEN': False},
         violacein,
         'OC1=CC2=C(C=C1)NC=C2C1=C/C(=C2/C3=CC=CC=C3N=C2O)C(O)=N1'),
        # Violacein, no kekulization
        (MolFromInchi, violacein, {'OP_KEKULIZE': False},
         violacein,
         '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'),
        # Violacein, strip stereo & implicit Hs & no kekulization
        (MolFromInchi, violacein,
         {'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False, 'OP_KEKULIZE': False},
         violacein_no_stereo,
         'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'),
        # L-lactate, default parameters
        (MolFromSmiles, lactate, None,
         lactate_inchi,
         '[H]OC(=O)[C@@]([H])(O[H])C([H])([H])[H]'),
        # L-lactate, implicit Hs
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False},
         lactate_inchi,
         'C[C@H](O)C(=O)O'),
        # L-lactate, no stereo
        (MolFromSmiles, lactate, {'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'),
        # L-lactate, no charge neutralisation
        (MolFromSmiles, lactate,
         {'OP_NEUTRALISE_CHARGE': False, 'OP_NEUTRALISE_CHARGE_LATE': False},
         lactate_inchi_charged,
         '[H]O[C@]([H])(C(=O)[O-])C([H])([H])[H]'),
        # L-lactate, implicit Hs & no stereo
        (MolFromSmiles, lactate, {'OP_ADD_HYDROGEN': False, 'OP_REMOVE_STEREO': True},
         lactate_inchi_no_stereo,
         'CC(O)C(=O)O'),
    ]

    for parse, text, overrides, expected_inchi, expected_smiles in cases:
        # Only pass ``params`` when a case overrides something, so the default
        # cases call Standardizer exactly as a bare invocation would.
        extra = {} if overrides is None else {'params': overrides}
        ans = Standardizer(sequence_fun='sequence_tunable', **extra).compute(parse(text))
        assert MolToInchi(ans) == expected_inchi
        assert MolToSmiles(ans) == expected_smiles
Exemple #24
0
    def process(self,
                input: Union[str, list] = "",
                input_file: str = "",
                output_file: str = "",
                output_file_sdf: str = "",
                output_file_cml: str = "",
                sdf_append: bool = False,
                format_output: bool = True,
                opsin_output_format: str = "",
                output_formats: list = None,
                write_header: bool = True,
                dry_run: bool = False,
                csv_delimiter: str = ";",
                standardize_mols: bool = True,
                normalize_plurals: bool = True,
                continue_on_failure: bool = False) -> OrderedDict:
        r"""
        Process the input file with OPSIN.

        Parameters
        ----------
        input : str or list
            | str: String with IUPAC names, one per line.
            | list: List of IUPAC names.
            | If both `input` and `input_file` are given, `input` is used and a warning is logged.
        input_file : str
            Path to file to be processed by OPSIN. One IUPAC name per line.
        output_file : str
            File to write output in.
        output_file_sdf : str
            File to write SDF output in.
        output_file_cml : str
            | File to write CML (Chemical Markup Language) output in. `opsin_output_format` must be "cml".
            | Not supported by RDKit so standardization and conversion to other formats cannot be done.
        sdf_append : bool
            If True, append new molecules to existing SDF file or create new one if doesn't exist.
        format_output : bool
            | If True, the value of "content" key of returned dict will be list of OrderedDicts with keys:
            | "iupac", <output formats>, ..., "error"
            | If True and `output_file` is set it will be created as CSV file with columns: "iupac", <output formats>, ..., "error"
            | If False, the value of "content" key of returned dict will be None.
        opsin_output_format : str
            | Output format from OPSIN. Temporarily overrides the option `output_format` set during instantiation (in __init__).
            | Choices: "cml", "smi", "extendedsmi", "inchi", "stdinchi", "stdinchikey"
        output_formats : list
            | If `format_output` is True, this specifies which molecule formats will be output.
            | You can specify more than one format, but only one format from OPSIN. This format must be also set with `output_format` in __init__
              or with `opsin_output_format` here.
            | Default value: ["smiles"]

            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         Value         |         Source        |                                            Note                                            |
            +=======================+=======================+============================================================================================+
            |         smiles        |         RDKit         |                                          canonical                                         |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      smiles_opsin     |     OPSIN ("smi")     |                                           SMILES                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            | smiles_extended_opsin | OPSIN ("extendedsmi") |                          Extended SMILES. Not supported by RDKit.                          |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |         inchi         |         RDKit         | Not every molecule can be converted to InChI (it doesn`t support wildcard characters etc.) |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |      inchi_opsin      |    OPSIN ("inchi")    |                                            InChI                                           |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |     stdinchi_opsin    |   OPSIN ("stdinchi")  |                                       standard InChI                                       |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |        inchikey       |         RDKit         |      The same applies as for "inchi". Also molecule cannot be created from InChI-key.      |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |   stdinchikey_opsin   | OPSIN ("stdinchikey") |               Standard InChI-key. Cannot be used by RDKit to create molecule.              |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+
            |          sdf          |         RDKit         |                     If present, an additional SDF file will be created.                    |
            +-----------------------+-----------------------+--------------------------------------------------------------------------------------------+

        write_header : bool
            If True and if `output_file` is set and `format_output` is True, write a CSV header.
        dry_run : bool
            If True, do not run OPSIN; only return the command line (arguments joined by spaces) that would be executed.
        csv_delimiter : str
            Delimiter for output CSV file.
        standardize_mols : bool
            If True and `format_output` is also True, use molvs (https://github.com/mcs07/MolVS) to standardize molecules.
        normalize_plurals : bool
            | If True, normalize plurals ("nitrates" -> "nitrate"). See OPSIN.PLURAL_PATTERNS for relating plurals. You can
              set your own regex pattern with `plural_patterns` in __init__.
        continue_on_failure : bool
            | If True, continue running even if OPSIN returns non-zero exit code.
            | If False and error occurs, print it and return.

        Returns
        -------
        dict
            Keys:

            - stdout: str ... standard output from OPSIN
            - stderr: str ... standard error output from OPSIN
            - exit_code: int ... exit code from OPSIN
            - content:

              - list of OrderedDicts ... when format_output is True. Fields: "iupac", <output formats>, ..., "error"
              - None ... when format_output is False

            When `dry_run` is True a `str` with the command line is returned instead.

        Raises
        ------
        ValueError
            If neither `input` nor `input_file` is set, or if the OPSIN output format is unknown.
        UserWarning
            If the input is empty after plural normalization.
        """

        options_internal = self.options_internal.copy()
        opsin_nonreadable_formats = ["cml", "stdinchikey"]

        if input and input_file:
            input_file = ""
            self.logger.warning(
                "Both 'input' and 'input_file' are set, but 'input' will be prefered."
            )
        elif not input and not input_file:
            raise ValueError("One of 'input' or 'input_file' must be set.")

        # OPSIN output format check
        if opsin_output_format:
            options_internal["output_format"] = opsin_output_format
        else:
            opsin_output_format = options_internal["output_format"]

        opsin_valid_output_formats = {
            "cml": "cml_opsin",
            "smi": "smiles_opsin",
            "extendedsmi": "smiles_extended_opsin",
            "inchi": "inchi_opsin",
            "stdinchi": "stdinchi_opsin",
            "stdinchikey": "stdinchikey_opsin"
        }

        if opsin_output_format not in opsin_valid_output_formats:
            raise ValueError(
                "Unknown OPSIN output format. Possible values: {}".format(
                    list(opsin_valid_output_formats.keys())))

        if standardize_mols and opsin_output_format in opsin_nonreadable_formats:
            self.logger.warning(
                "OPSIN output format is \"{}\", which cannot be used by RDKit."
                .format(opsin_output_format))

        # output formats check
        if not output_formats:
            output_formats = ["smiles"]
        else:
            if opsin_output_format == "stdinchikey":
                output_formats = ["stdinchikey_opsin"]
            elif opsin_output_format == "extendedsmi":
                output_formats = ["smiles_extended_opsin"]
            else:
                output_formats = sorted(list(set(output_formats)))
                possible_output_formats = [
                    "smiles", "inchi", "inchikey", "sdf"
                ]
                output_formats = [
                    x for x in output_formats if x in possible_output_formats
                    or x == opsin_valid_output_formats[opsin_output_format]
                ]

        if normalize_plurals:
            # Normalization works on text, so a file input is read into
            # memory and from here on handled like a string input.
            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    input = "\n".join([x.strip() for x in f.readlines()])
                input_file = ""
            input = self.normalize_iupac(input)

        commands, _, _ = self.build_commands(options_internal,
                                             self._OPTIONS_REAL,
                                             self.path_to_binary)

        if input_file:
            # The file path (not the empty `input`) is the positional argument.
            commands.append(input_file)
        elif input:
            if isinstance(input, list):
                input = "\n".join([x.strip() for x in input])
        else:
            raise UserWarning("Input is empty.")

        # A dry run must not execute OPSIN, so return before the subprocess call.
        if dry_run:
            return " ".join(commands)

        if input_file:
            stdout, stderr, exit_code = common_subprocess(commands)
        else:
            stdout, stderr, exit_code = common_subprocess(commands,
                                                          stdin=input)

        to_return = {
            "stdout": stdout,
            "stderr": stderr,
            "exit_code": exit_code,
            "content": None
        }

        if not continue_on_failure and exit_code > 0:
            self.logger.warning("OPSIN error:")
            eprint("\n\t".join("\n{}".format(stderr).splitlines()))
            return to_return

        if output_file_cml and opsin_output_format == "cml":
            with open(output_file_cml, mode="w", encoding="utf-8") as f:
                f.write(stdout)
            return to_return
        elif output_file_cml and opsin_output_format != "cml":
            self.logger.warning(
                "Output file for CML is requested, but OPSIN output format is '{}'"
                .format(opsin_output_format))

        if not format_output:
            if output_file:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(stdout)
            return to_return

        compounds = []
        standardizer = Standardizer()
        empty_cols = OrderedDict([(x, "") for x in output_formats])

        writer = None
        sdf_handle = None
        if output_file_sdf:
            if sdf_append:
                # Mode "a" creates the file when it does not exist yet.
                sdf_handle = open(output_file_sdf, mode="a", encoding="utf-8")
                writer = SDWriter(sdf_handle)
            else:
                writer = SDWriter(output_file_sdf)

        try:
            stdout = stdout.split("\n")
            del stdout[-1]
            stderr = [
                x.strip() for x in stderr.split("\n")[1:] if x
            ]  # remove first line of stderr because there is OPSIN message (y u du dis...)

            if input_file:
                with open(input_file, mode="r", encoding="utf-8") as f:
                    lines = f.readlines()
            else:
                lines = input.split("\n")

            mol_output_template = OrderedDict.fromkeys(["iupac"] +
                                                       output_formats +
                                                       ["error"])

            # Index of the next unread stderr line; advanced once per failed name.
            # Kept distinct from the exception variable below, which Python
            # unbinds when its `except` block ends.
            error_index = 0
            for i, line in enumerate(lines):
                line = line.strip()
                converted = stdout[i].strip()
                mol_output = mol_output_template.copy()

                if converted:
                    if opsin_output_format == "stdinchikey":
                        compounds.append(
                            OrderedDict([("iupac", line),
                                         ("stdinchikey_opsin", converted),
                                         ("error", "")]))
                        continue
                    elif opsin_output_format == "extendedsmi":
                        compounds.append(
                            OrderedDict([("iupac", line),
                                         ("smiles_extended_opsin", converted),
                                         ("error", "")]))
                        continue

                    # Reset per row so an unreadable format (e.g. "cml") is
                    # reported as a conversion failure rather than reusing the
                    # previous molecule or hitting an unbound name.
                    mol = None
                    if opsin_output_format == "smi":
                        mol = MolFromSmiles(converted,
                                            sanitize=not standardize_mols)
                    elif opsin_output_format in ["inchi", "stdinchi"]:
                        mol = MolFromInchi(converted,
                                           sanitize=not standardize_mols,
                                           removeHs=not standardize_mols)

                    if mol:
                        if standardize_mols:
                            try:
                                mol = standardizer.standardize(mol)
                            except ValueError as exc:
                                self.logger.warning(
                                    "Cannot standardize '{}': {}".format(
                                        MolToSmiles(mol), str(exc)))

                        for fmt in output_formats:
                            if fmt == "smiles":
                                mol_output["smiles"] = MolToSmiles(
                                    mol, isomericSmiles=True)
                            elif fmt == "smiles_opsin" and opsin_output_format == "smi":
                                mol_output["smiles_opsin"] = converted
                            elif fmt == "inchi":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    mol_output["inchi"] = inchi
                                else:
                                    mol_output["inchi"] = ""
                                    self.logger.warning(
                                        "Cannot convert to InChI: {}".format(
                                            converted))
                            elif fmt == "inchi_opsin" and opsin_output_format == "inchi":
                                mol_output["inchi_opsin"] = converted
                            elif fmt == "stdinchi_opsin" and opsin_output_format == "stdinchi":
                                mol_output["stdinchi_opsin"] = converted
                            elif fmt == "inchikey":
                                inchi = MolToInchi(mol)
                                if inchi:
                                    mol_output["inchikey"] = InchiToInchiKey(
                                        inchi)
                                else:
                                    mol_output["inchikey"] = ""
                                    self.logger.warning(
                                        "Cannot create InChI-key from InChI: {}".
                                        format(converted))
                            elif fmt == "stdinchikey_opsin" and opsin_output_format == "stdinchikey":
                                mol_output["stdinchikey_opsin"] = converted
                            elif fmt == "sdf":
                                mol_output["sdf"] = MolToMolBlock(
                                    mol, includeStereo=True)

                        if writer is not None:
                            writer.write(mol)

                        mol_output.update(
                            OrderedDict([("iupac", line), ("error", "")]))
                    else:
                        mol_output.update([
                            ("iupac", line),
                            ("error",
                             "Cannot convert to RDKit mol: {}".format(
                                 converted))
                        ])
                        mol_output.update(empty_cols)
                        # Log the error of the row being built; it is not in
                        # `compounds` yet.
                        self.logger.warning(mol_output["error"])
                else:
                    try:
                        error = stderr[error_index].strip()
                    except IndexError:
                        error = ""

                    mol_output.update([("iupac", line), ("error", error)])
                    mol_output.update(empty_cols)
                    error_index += 1
                compounds.append(mol_output)
        finally:
            # Flush and release the SDF output even if a row fails.
            if writer is not None:
                writer.close()
            if sdf_handle is not None:
                sdf_handle.close()

        to_return["content"] = compounds

        if output_file and compounds:
            dict_to_csv(to_return["content"],
                        output_file=output_file,
                        csv_delimiter=csv_delimiter,
                        write_header=write_header)
        elif output_file and not compounds:
            write_empty_file(output_file,
                             csv_delimiter=csv_delimiter,
                             header=list(mol_output_template.keys()),
                             write_header=write_header)

        return to_return