Beispiel #1
0
def predict():
    """Query the remote predictor and return its answer as a JSON string.

    Reads ``post`` (the input reaction string), ``beamSize`` (converted with
    ``int``) and ``model`` from the request's query arguments, sends them to
    the ``Dmitry_BV/predictor/1.1.2`` algorithm and post-processes the
    ``"product"`` entry of the result according to ``model``:

    * ``"cgr"``: the answer is parsed and decomposed; the resulting SMILES
      and SVG list are returned as ``decomposed_smiles`` / ``decomposed_svg``.
    * ``"smiles"``: the answer is appended to the input as ``post>>answer``.

    :return:
        JSON string with the keys ``prediction`` and ``reaction`` (``null``
        when ``model`` is neither "cgr" nor "smiles"), plus the two
        ``decomposed_*`` keys for the "cgr" model.
    """
    # NOTE(review): the API key is hard-coded here; consider loading it from
    # configuration instead of keeping it in the source.
    client = Algorithmia.client('simtds2YG9Ed/wd5xucmvHy+U8G1')
    algo = client.algo('Dmitry_BV/predictor/1.1.2')
    algo.set_options(timeout=100)  # optional

    # Example input:
    # "C12(C(CCC1C3C(C4(C(C(C3)=C)=CC(C[->=]C4)=O)CC#C)CC2)=O)C.[O-]"
    smi = request.args.get('post')
    beamSize = request.args.get('beamSize')
    model = request.args.get('model')

    input_query = {"reaction": smi, 'beamWidth': int(beamSize), "model": model}
    answers = algo.pipe(input_query).result["product"]

    result_dict = {}
    # Default for an unrecognised model: previously ``img`` stayed unbound and
    # the return statement failed with UnboundLocalError.
    img = None
    if model == "cgr":
        # A CGR string was received: decompose it into reactant and product
        # SMILES and generate SVG images for them.
        decomposed = smiles(answers)
        decomposed_smiles, svg_list = decompose_cgr(decomposed)
        img = get_svg(decomposed)
        result_dict['decomposed_smiles'] = decomposed_smiles
        result_dict['decomposed_svg'] = svg_list
    elif model == "smiles":
        answers = smi + ">>" + answers
        img = get_svg(smiles(answers))

    result_dict['prediction'] = answers
    result_dict['reaction'] = img
    return json.dumps(result_dict)
Beispiel #2
0
def convert(dct):
    """Parse and canonicalize every SMILES string of a route dictionary.

    :param dct:
        mapping ``smiles -> (((prob, (smiles, ...)), ...), value)``
    :return:
        dict of the same shape in which every SMILES string is replaced by
        the canonicalized object returned by ``smiles``
    """

    def canonical(text):
        # parse, then canonicalize in place
        parsed = smiles(text)
        parsed.canonicalize()
        return parsed

    converted = {}
    for key_text, (variants, value) in dct.items():
        key = canonical(key_text)
        routes = tuple((prob, tuple(canonical(text) for text in texts))
                       for prob, texts in variants)
        converted[key] = (routes, value)
    return converted
Beispiel #3
0
def get_smiles():
    """
    Get SMILES string for IUPAC names
    Web translation API: SYSTRAN.io

    Reads ``post`` (names separated by "," or ".") and ``lang`` from the
    request's query arguments. When ``lang`` is not "en", every name is first
    translated to English. Names for which ``get_compounds`` returns nothing
    are skipped.

    :return:
        JSON string with ``smiles`` (the found SMILES joined by ".") and
        ``img`` (the result of ``get_svg`` for that joined SMILES)
    """
    names = request.args.get('post').replace(",", ".").split(".")
    source_lang = request.args.get('lang')
    url = "https://systran-systran-platform-for-language-processing-v1.p.rapidapi.com/translation/text/translate"
    # NOTE(review): the API key is hard-coded here; consider loading it from
    # configuration instead of keeping it in the source.
    headers = {
        'x-rapidapi-key':
        "28bb9cac94msh0d21dd8efbc884ep1172f0jsndc7797830a0e",
        'x-rapidapi-host':
        "systran-systran-platform-for-language-processing-v1.p.rapidapi.com"
    }

    smiles_list = []
    for name in names:
        if source_lang != "en":
            # If source lang. is not English, perform translation
            querystring = {
                "source": source_lang,
                "target": "en",
                "input": name
            }
            response = requests.request("GET",
                                        url,
                                        headers=headers,
                                        params=querystring)
            translated = json.loads(response.text)['outputs'][0]['output']
            name = translated.lower().lstrip(" ")
            # Drop a leading article. ``str.lstrip("the")`` strips any run of
            # the characters t/h/e, which turned e.g. "ethanol" into "anol",
            # so the prefix is checked explicitly instead.
            if name.startswith("the "):
                name = name[len("the "):].lstrip(" ")
        try:
            smi = str(
                smiles(get_compounds(name, "name")[0].canonical_smiles))
        except IndexError:
            # nothing found for this name
            continue
        smiles_list.append(smi)

    smiles_list = ".".join(smiles_list)
    img = get_svg(smiles(smiles_list))

    return json.dumps({
        'smiles': smiles_list,
        'img': img
    })
Beispiel #4
0
def test_good_smiles():
    """Valid SMILES / CGR SMILES strings must parse into the expected containers."""
    # two-atom molecules: each bond notation gives the expected bond order
    notations = 'CN C-N C=N C#N C:N cn'.split()
    expected_orders = (1, 1, 2, 3, 4, 4)
    for text, order in zip(notations, expected_orders):
        mol = smiles(text)
        assert isinstance(mol, MoleculeContainer)
        assert mol.bond(1, 2).order == order

    # dot-separated atoms (with trailing space) are parsed without a bond
    separated = smiles('C.N ')
    assert isinstance(separated, MoleculeContainer)
    assert not separated.has_bond(1, 2)

    # "[x>y]" bond notation gives a CGRContainer with order and p_order
    dynamic = 'C[->.]N C[->=]N C[=>#]N C[#>:]N C[:>.]N [O->0][.>-][Na+>0]'.split(
    )
    orders = (1, 1, 2, 3, 4, None)
    p_orders = (None, 2, 3, 4, None, 1)
    for text, order, p_order in zip(dynamic, orders, p_orders):
        cgr = smiles(text)
        assert isinstance(cgr, CGRContainer)
        bond = cgr.bond(1, 2)
        assert bond.order == order
        assert cgr.bond(1, 2).p_order == p_order

    # bracket atoms: isotope, charge, atom map, stereo marks, hydrogens
    bracket_atoms = '[Fe] [13C] [C+4] [C:14] [C--:12] [C@@H] [C@H] [OH2] [OH3+]'.split(
    )
    for text in bracket_atoms:
        assert isinstance(smiles(text), MoleculeContainer)

    # branches with and without explicit single bonds: always two bonds
    branched = 'C(C)C C(-C)C C(C)-C'.split()
    for text in branched:
        assert smiles(text).bonds_count == 2

    # ring-closure notations and the number of rings they produce
    rings = 'C1CC1 C-1CC-1 C%10CC%10 C-%11CC-%11 C12CC1C2 C-1-2CC-1C-2'.split(
    )
    ring_counts = (1, 1, 1, 1, 2, 2)
    for text, count in zip(rings, ring_counts):
        assert smiles(text).rings_count == count

    # "[x>y]" charge / radical atom notation gives a CGRContainer
    dynamic_atoms = (
        '[C+>-] [C++>--] [C-3>+2] [C0>+] [C-->0] [C*>^] [C^>*] [C+>-*>^] [C-3>+2^>*] '
        '[C0>-*>^] [C+2>0^>*] [n+>0]').split()
    for text in dynamic_atoms:
        assert isinstance(smiles(text), CGRContainer)
Beispiel #5
0
def image():
    """Render the structure given in the ``post`` query argument.

    :return:
        JSON string with ``img`` (result of ``get_svg``); when the parsed
        structure is a ``CGRContainer`` it also carries ``decomposed_smiles``
        and ``decomposed_svg`` from ``decompose_cgr``
    """
    structure = smiles(request.args.get('post'))
    payload = {'img': get_svg(structure)}
    if not isinstance(structure, CGRContainer):
        return json.dumps(payload)

    decomposed_smiles, svg_list = decompose_cgr(structure)
    payload['decomposed_smiles'] = decomposed_smiles
    payload['decomposed_svg'] = svg_list
    return json.dumps(payload)
Beispiel #6
0
 def __new__(cls, molecule, *args, **kwargs):
     """Create the instance, initializing the class-level data on first use.

     While ``cls.__bb__`` is None, the building blocks are read from the
     packaged ``data/building_blocks.smiles`` resource and one ``Reactor`` per
     entry of ``rules`` is created; both are stored on the class.

     ``molecule`` is accepted here but not forwarded to ``super().__new__``.
     """
     if cls.__bb__ is None:
         from .rules import rules
         # Close the resource explicitly instead of leaving the open handle
         # to the garbage collector.
         with TextIOWrapper(resource_stream(__name__, 'data/building_blocks.smiles')) as lines:
             bb = [smiles(x.strip()) for x in lines]
         for b in bb:  # recalculate canonic forms. prevent errors when CGRtools rules set changes.
             b.canonicalize()
         cls.__bb__ = frozenset(str(b) for b in bb)
         cls.__reactors__ = tuple((1., Reactor(x, delete_atoms=True, automorphism_filter=False, one_shot=False))
                                  for x in rules)
     return super().__new__(cls, *args, **kwargs)
Beispiel #7
0
def run(index, smi=None, **kwargs):
    """
    perform calculations for one reaction
    :param index:
        index of the reaction, to keep tracking initial order of tasks
    :param smi:
        Reaction SMILES
    :param crest_speed:
        speed and precision of CREST calculations, possible options :
        - quick
        - squick
        - mquick (default for this project)
    :param dft:
        define to perform
    :return:
        ReactionComponents named tuple
    :raises NotImplementedError:
        when no reaction SMILES is given (other inputs are not supported yet)

    ``crest_speed`` and ``dft`` are taken through ``**kwargs`` and forwarded
    unchanged to ``best_conformers``.
    """
    start = time()
    print(smi)
    if not smi:
        # TO DO: accept RDF as alternative
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produced a TypeError instead of the intended error.
        raise NotImplementedError
    reaction = smiles(smi)
    # TO DO: add check for reaction container
    if not reaction:
        return ReactionComponents(index, smi, None, None, None, 'problem: reaction smiles empty or incorrect',
                                  spent_time(start))
    if not reaction.reactants:
        return ReactionComponents(index, smi, None, None, None, 'problem: with reactants', spent_time(start))
    if not reaction.products:
        return ReactionComponents(index, smi, None, None, None, 'problem: with products', spent_time(start))

    reactants = best_conformers(reaction.reactants, **kwargs)
    if not reactants:
        return ReactionComponents(index, smi, None, None, None,
                                  'anomaly terminated calculations for all of reactants', spent_time(start))
    if any(isinstance(x, FailReport) for x in reactants):
        return ReactionComponents(index, smi, reactants, None, None,
                                  'anomaly terminated calculations for one of reactants', spent_time(start))

    products = best_conformers(reaction.products, **kwargs)
    if not products:
        # The original built this report without returning it, so execution
        # fell through to the energy calculation with no products.
        return ReactionComponents(index, smi, None, None, None,
                                  'anomaly terminated calculations for all of products', spent_time(start))
    if any(isinstance(x, FailReport) for x in products):
        return ReactionComponents(index, smi, reactants, products, None,
                                  'anomaly terminated calculations for one of products', spent_time(start))

    try:
        energy_dif = sum(x.min_energy for x in products) - sum(x.min_energy for x in reactants)
    except TypeError:
        # raised when a min_energy value cannot be summed
        return ReactionComponents(index, smi, reactants, products,
                                  None, 'min energy read error', spent_time(start))
    return ReactionComponents(index, smi, reactants, products,
                              energy_dif, 'terminated normally', spent_time(start))
Beispiel #8
0
def convert(dct):
    """Turn a route dictionary of SMILES strings into reactor-based entries.

    :param dct:
        mapping ``smiles -> (((prob, (smiles, ...)), ...), value)``
    :return:
        dict keyed by the canonicalized parsed key; each value is
        ``(((prob, Reactor, set_of_molecules), ...), value)`` where the
        reactor is built from a template restricted to the first extended
        reaction center
    """

    def canonical(text):
        # parse, then canonicalize in place
        parsed = smiles(text)
        parsed.canonicalize()
        return parsed

    converted = {}
    for key_text, (variants, value) in dct.items():
        target = canonical(key_text)
        entries = []
        for prob, texts in variants:
            precursors = [canonical(text) for text in texts]
            reaction = ReactionContainer((target,), precursors)
            center = set(reaction.extended_centers_list[0])
            # keep only the atoms of each molecule that belong to the center
            query_target = target.substructure(center.intersection(target), as_query=True)
            query_precursors = [mol.substructure(center.intersection(mol), as_query=True)
                                for mol in precursors]
            template = ReactionContainer((query_target,), query_precursors)
            entries.append((prob, Reactor(template), set(precursors)))
        converted[target] = (tuple(entries), value)
    return converted
Beispiel #9
0
from ThetaSynthesis.synthon import RolloutSynthon
from pickle import dump

# Parse 'test.smiles' into a list of (target, reactions) records.
# Within a record the first non-comment line is the target, every following
# line is added to the reactions set, and a '$$$$' line closes the record.
data = []
target = None
reactions = set()
# Context manager so the file handle is closed once parsing is done
# (the bare open() in the loop header was never closed).
with open('test.smiles', 'r') as source:
    for line in source:
        line = line.strip()
        if line == '$$$$':
            data.append((target, reactions))
            target = None
            reactions = set()
        elif line.startswith('#'):
            continue
        elif target is None:
            target = smiles(line)
            target.canonicalize()
        else:
            r = smiles(line)
            r.canonicalize()
            reactions.add(r)

for num in [.01, .1, 1., 10., 100.]:
    results = []
    for target, reactions in data:
        found = []
        tree = RetroTree(target,
                         synthon_class=RolloutSynthon,
                         size=10000,
                         c_puct=num)
        for node in tree:
Beispiel #10
0
    Test synthon for Acetaminophen.
    """
    __slots__ = ()

    def __iter__(self):
        """Yield ``(prob, synthons)`` pairs taken from ``data`` for this molecule.

        Every molecule of an entry is wrapped into an instance of this
        object's own class.
        """
        wrap = type(self)
        entries = data[self._molecule][0]
        for prob, molecules in entries:
            children = tuple(wrap(mol) for mol in molecules)
            yield prob, children

    def __bool__(self):
        """Return True when the wrapped molecule is one of ``building_blocks``."""
        molecule = self._molecule
        return molecule in building_blocks

    def __float__(self):
        """Return the value stored in ``data`` for the wrapped molecule."""
        _, value = data[self._molecule][0:2]
        return value


# Molecules for which the synthon's __bool__ above returns True.
building_blocks = {smiles('Oc1ccccc1')}
pre_data = {
    'CC(=O)Nc1ccc(O)cc1': (((.25, ('Oc1ccc(N)cc1',),),
                            (.15, ('C(Nc1ccc(OC)cc1)(C)=O',)),
                            (.2, ('ON=C(C)c1ccc(O)cc1',)),
                            (.25, ('Oc1ccc(O)cc1',)),
                            (.15, ('C(Nc1ccc(OC2OCCCC2)cc1)(C)=O',))), 1.),
    'Oc1ccc(N)cc1': (((.3, ('O=N(=O)c1ccc(O)cc1',)),
                      (.4, ('c1(N)ccc(OC)cc1',)),
                      (.3, ('c1(O)ccc(F)cc1',))), 1.),
    'O=N(=O)c1ccc(O)cc1': (((.35, ('Oc1ccccc1',)),
                            (.35, ('N(c1ccc(N)cc1)(=O)=O',)),
                            (.3, ('c1(N(=O)=O)cc(c(cc1)O)Br',))), 1.),
    'Oc1ccccc1': ((), 1.),
    'C(Nc1ccc(OC)cc1)(C)=O': ((), -1.),
    'Oc1ccc(O)cc1': ((), 0.),
Beispiel #11
0
def test_invalid_ring_closure_ignoring():
    """With ``ignore=True`` the reader must still return the expected molecules
    for records that have questionable ring closures."""
    data = 'C1CC1C1CC1\nC=1CC1'
    expected = ('C1CC1C2CC2', 'C1=CC1')
    with StringIO(data) as f, SMILESRead(f, ignore=True, store_log=True) as r:
        # zip_longest makes a differing number of records fail the test
        for parsed, reference in zip_longest(r, expected):
            assert parsed == smiles(reference)
Beispiel #12
0
def load_nicklaus_tautomers(*,
                            return_X_y=False,
                            as_frame=False,
                            as_regression=False):
    """Load and return Nicklaus's tautomers dataset (Regression and Classification).

    ==================   ==============
    Samples total         5960
    Samples Regression    2824
    Data                  molecules, type: MoleculeContainer
    Targets               real ratio 0.0 - 1.0, type: float (Regression)
    Classes               5
    ==================   ==============

    Molecules have a .meta attribute which returns a dict with additional data:
    structure_id: row in original file (1-based)
    tautomer_id: id of structure of tautomer in row
    category: value of the Prevalence_Category_{n} column (classification target)
    ratio (optional): value of the Quantitative_ratio_{n} column (regression target)
    additive.{n}: solvent name. {n} is the id of the solvent, starting from 1.
        Mixtures have more additive keys, e.g. additive.2, additive.3 ...
    amount.{n}: amount of additive (proportions normalized to a sum of 1).
    prevalence (optional): Qualitative category of tautomer reported in the publication (lower-cased).
    temperature (optional) in Kelvin
    pH (optional)

    For Regression: The numeric proportion of tautomer based on its quantitative ratio and qualitative prevalence.

    For Classification: Quantitative ratio of tautomer compared to other tautomers.

    Numeric classification of qualitative prevalence:
    0: Not observed
    1: Less favored, less stable, minor, observed
    2: Equally, favored, major, in equilibrium,  preferred, similar spectra
    3: More favored, more stable, predominant, strongly favored
    4: Exclusively observed, only observed, only tautomer, identical tautomer

    Numeric classification of quantitative amount of tautomers:
    0: ratio = 0.0 - 0.0099
    1: ratio =  0.01 - 0.30
    2: ratio = 0.31 - 0.69
    3: ratio =  0.70 - 0.99
    4: ratio = 1

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

    as_regression : bool, default=False
        If True, returns regression subset instead of classes

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (n, )
            The data array.
        target : ndarray of shape (n, )
            The regression or classification target.
        feature_names: list
            The name of the dataset (['Tautomers']).
        target_names: list
            The name of target (['ratio'] or ['category']).
        frame: DataFrame of shape (n, 2)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

    (data, target) : tuple if ``return_X_y`` is True

    Raises
    ------
    ValueError
        If a solvent mixture has a different number of names and proportions,
        or its proportion cell is neither a string nor NaN.
    """
    # The spreadsheet is shipped as package data; 'nul' cells are treated as
    # missing values and 'yes'/'no' cells as booleans.
    with resource_stream('CIMtools.datasets',
                         'data/tautomer_database_release_3a.xlsx') as s:
        data = read_excel(s,
                          na_values='nul',
                          true_values=['yes'],
                          false_values=['no'])

    parsed = []
    for record, mol in data.iterrows():
        # metadata shared by all tautomers of this row
        meta = {}
        # n / m count the parsed solvent names / proportions and are compared below
        n = m = 0
        # number of SMILES_{n} columns to read for this row
        c = mol['Size']

        sol = mol['Solvent']
        if mol['Solvent_Mixture']:
            # comma-separated solvent names, numbered from 1
            for n, sol in enumerate(sol.split(','), 1):
                meta[f'additive.{n}'] = sol.strip()

            sol_prop = mol['Solvent_Proportion']
            if isinstance(sol_prop, str):
                # colon-separated proportions, normalized to a sum of 1
                sol_prop = [float(x) for x in sol_prop.split(':')]
                sum_prop = sum(sol_prop)
                for m, prop in enumerate(sol_prop, 1):
                    meta[f'amount.{m}'] = prop / sum_prop
                if n != m:
                    # number of names and proportions differ
                    raise ValueError
            elif not isnan(sol_prop):
                # a non-string proportion is accepted only when it is missing (NaN)
                raise ValueError
        else:
            # single solvent: the whole amount belongs to it
            meta['additive.1'] = sol
            meta['amount.1'] = 1.

        # temperature and pH are stored only for non-string, non-NaN cells
        temp = mol['Temperature']
        if not isinstance(temp, str) and not isnan(temp):
            meta['temperature'] = temp
        ph = mol['pH']
        if not isinstance(ph, str) and not isnan(ph):
            meta['pH'] = ph

        # Tautomers of this row are collected first and added to the result
        # only if every one of them passes the valence check (for/else below).
        tmp = []
        for n in range(1, c + 1):
            # note: ``s`` is rebound here; the resource stream of the same
            # name is already closed at this point
            s = smiles(mol[f'SMILES_{n}'])
            s.kekule()
            s.standardize(fix_stereo=False)
            if s.check_valence():  # skip invalid records
                break
            s.implicify_hydrogens(fix_stereo=False)
            s.thiele()

            s.meta['structure_id'] = record + 1
            s.meta['tautomer_id'] = n
            s.meta['category'] = mol[f'Prevalence_Category_{n}']

            # ratio is stored only for non-string, non-NaN cells
            rat = mol[f'Quantitative_ratio_{n}']
            if not isinstance(rat, str) and not isnan(rat):
                s.meta['ratio'] = rat
            prev = mol[f'Qualitative_prevalence_{n}']
            if isinstance(prev, str):
                s.meta['prevalence'] = prev.lower()
            s.meta.update(meta)
            tmp.append(s)
        else:
            # reached only when the loop finished without ``break``
            parsed.extend(tmp)

    if as_regression:
        # regression subset: only molecules that have a 'ratio' value
        data = array([x for x in parsed if 'ratio' in x.meta])
        target = array([x.meta['ratio'] for x in data])
        target_names = ['ratio']
    else:
        data = array(parsed)
        target = array([x.meta['category'] for x in data], dtype=int)
        target_names = ['category']

    feature_names = ['Tautomers']

    if as_frame:
        frame, data, target = _convert_data_dataframe(
            caller_name='load_nicklaus_tautomers',
            data=data,
            target=target,
            feature_names=feature_names,
            target_names=target_names)
    else:
        frame = None

    if return_X_y:
        return data, target
    return Bunch(data=data,
                 target=target,
                 frame=frame,
                 target_names=target_names,
                 feature_names=feature_names)