def predict():
    """Run the remote reaction predictor and return its answer as JSON.

    Query-string parameters read from ``request.args``:

    * ``post`` -- input SMILES string sent as ``reaction``;
    * ``beamSize`` -- beam width, converted with ``int``;
    * ``model`` -- either ``"cgr"`` or ``"smiles"``.

    The ``product`` field of the algorithm's result is post-processed
    depending on the model:

    * ``"cgr"``: the answer is parsed with ``smiles`` and passed to
      ``decompose_cgr``; the resulting SMILES and SVG lists are returned
      under ``decomposed_smiles`` and ``decomposed_svg``.
    * ``"smiles"``: the answer is appended to the input as
      ``<input>>><answer>``.

    :return: JSON string with ``prediction`` and ``reaction`` (SVG) keys,
        plus the two ``decomposed_*`` keys for the CGR model.
    :raises ValueError: if ``model`` is neither ``"cgr"`` nor ``"smiles"``.
    """
    smi = request.args.get('post')
    beamSize = request.args.get('beamSize')
    model = request.args.get('model')

    # Reject unsupported models before the remote call: previously such
    # requests went through the whole pipeline and then failed with a
    # NameError because no image had been rendered.
    if model not in ("cgr", "smiles"):
        raise ValueError(
            f"unsupported model {model!r}; expected 'cgr' or 'smiles'")

    # NOTE(review): hard-coded credential -- should be moved to configuration.
    client = Algorithmia.client('simtds2YG9Ed/wd5xucmvHy+U8G1')
    algo = client.algo('Dmitry_BV/predictor/1.1.2')
    algo.set_options(timeout=100)  # optional

    # Example CGR input for manual testing:
    # "C12(C(CCC1C3C(C4(C(C(C3)=C)=CC(C[->=]C4)=O)CC#C)CC2)=O)C.[O-]"
    input_query = {"reaction": smi, 'beamWidth': int(beamSize), "model": model}
    answers = algo.pipe(input_query).result["product"]

    result_dict = {}
    if model == "cgr":
        # A CGR string was received: decompose it into reactant/product
        # SMILES strings and their SVG images.
        decomposed = smiles(answers)
        decomposed_smiles, svg_list = decompose_cgr(decomposed)
        img = get_svg(decomposed)
        result_dict['decomposed_smiles'] = decomposed_smiles
        result_dict['decomposed_svg'] = svg_list
    else:  # model == "smiles"
        answers = smi + ">>" + answers
        img = get_svg(smiles(answers))

    result_dict['prediction'] = answers
    result_dict['reaction'] = img
    return json.dumps(result_dict)
def convert(dct):
    """Rebuild *dct* with every SMILES string parsed and canonicalized.

    *dct* maps a target SMILES string to ``(routes, value)`` where *routes*
    is an iterable of ``(prob, mols)`` pairs and *mols* an iterable of
    SMILES strings.  The returned dict has the same layout, but the key and
    every molecule are objects produced by ``smiles`` on which
    ``canonicalize()`` has been called, and the containers are tuples.
    """

    def _canonical(text):
        # Parse one SMILES string and canonicalize it in place.
        parsed = smiles(text)
        parsed.canonicalize()
        return parsed

    converted = {}
    for raw_target, (routes, value) in dct.items():
        target = _canonical(raw_target)
        canonical_routes = tuple(
            (prob, tuple(_canonical(item) for item in mols))
            for prob, mols in routes
        )
        converted[target] = (canonical_routes, value)
    return converted
def get_smiles():
    """Resolve compound names to SMILES and return them with an SVG as JSON.

    Query-string parameters read from ``request.args``:

    * ``post`` -- compound names separated by ``,`` or ``.``;
    * ``lang`` -- source language code; anything other than ``"en"`` is
      first translated to English through the SYSTRAN.io web API.

    Each (translated) name is looked up with ``get_compounds(name, "name")``
    and the first hit's ``canonical_smiles`` is re-parsed with ``smiles``.
    Names with no hit are skipped silently.

    :return: JSON string with ``smiles`` (dot-joined SMILES of all resolved
        names) and ``img`` (SVG rendering of that combined SMILES).
    """
    names = request.args.get('post').replace(",", ".").split(".")
    source_lang = request.args.get('lang')

    url = "https://systran-systran-platform-for-language-processing-v1.p.rapidapi.com/translation/text/translate"
    # NOTE(review): hard-coded credential -- should be moved to configuration.
    headers = {
        'x-rapidapi-key': "28bb9cac94msh0d21dd8efbc884ep1172f0jsndc7797830a0e",
        'x-rapidapi-host': "systran-systran-platform-for-language-processing-v1.p.rapidapi.com"
    }

    smiles_list = []
    for name in names:
        if source_lang != "en":
            # Source language is not English: translate the name first.
            querystring = {
                "source": source_lang,
                "target": "en",
                "input": name
            }
            response = requests.request("GET", url, headers=headers,
                                        params=querystring)
            translated = json.loads(response.text)['outputs'][0]['output']
            name = translated.lower().lstrip(" ")
            # Drop a leading article only.  str.lstrip("the") removes any
            # run of the characters t/h/e and would turn "ethanol" into
            # "anol", so the prefix is checked explicitly.
            if name.startswith("the "):
                name = name[len("the "):].lstrip(" ")
        try:
            smi = str(smiles(get_compounds(name, "name")[0].canonical_smiles))
        except IndexError:
            # No compound found for this name.
            continue
        smiles_list.append(smi)

    smiles_list = ".".join(smiles_list)
    img = get_svg(smiles(smiles_list))
    return json.dumps({
        'smiles': smiles_list,
        'img': img
    })
def test_good_smiles():
    """Check that well-formed SMILES / CGR-SMILES strings parse as expected."""
    # Plain two-atom molecules: every bond notation maps to its order.
    plain_bonds = 'CN C-N C=N C#N C:N cn'.split()
    plain_orders = (1, 1, 2, 3, 4, 4)
    for text, expected_order in zip(plain_bonds, plain_orders):
        mol = smiles(text)
        assert isinstance(mol, MoleculeContainer)
        assert mol.bond(1, 2).order == expected_order

    # A dot separates the atoms: no bond between atoms 1 and 2.
    disconnected = smiles('C.N ')
    assert isinstance(disconnected, MoleculeContainer)
    assert not disconnected.has_bond(1, 2)

    # Dynamic bonds: order is checked against the first table, p_order
    # against the second.
    dynamic_bonds = 'C[->.]N C[->=]N C[=>#]N C[#>:]N C[:>.]N [O->0][.>-][Na+>0]'.split()
    orders = (1, 1, 2, 3, 4, None)
    p_orders = (None, 2, 3, 4, None, 1)
    for text, expected_order, expected_p_order in zip(dynamic_bonds, orders, p_orders):
        cgr = smiles(text)
        assert isinstance(cgr, CGRContainer)
        assert cgr.bond(1, 2).order == expected_order
        assert cgr.bond(1, 2).p_order == expected_p_order

    # Bracket atoms.
    bracket_atoms = '[Fe] [13C] [C+4] [C:14] [C--:12] [C@@H] [C@H] [OH2] [OH3+]'.split()
    for text in bracket_atoms:
        assert isinstance(smiles(text), MoleculeContainer)

    # Branches: three atoms, two bonds, whatever the notation.
    for text in 'C(C)C C(-C)C C(C)-C'.split():
        assert smiles(text).bonds_count == 2

    # Ring closures in every supported spelling.
    ring_cases = 'C1CC1 C-1CC-1 C%10CC%10 C-%11CC-%11 C12CC1C2 C-1-2CC-1C-2'.split()
    ring_counts = (1, 1, 1, 1, 2, 2)
    for text, expected_rings in zip(ring_cases, ring_counts):
        assert smiles(text).rings_count == expected_rings

    # Dynamic bracket atoms must produce a CGR.
    dynamic_atoms = (
        '[C+>-] [C++>--] [C-3>+2] [C0>+] [C-->0] [C*>^] [C^>*] [C+>-*>^] [C-3>+2^>*] '
        '[C0>-*>^] [C+2>0^>*] [n+>0]').split()
    for text in dynamic_atoms:
        assert isinstance(smiles(text), CGRContainer)
def image():
    """Render the structure given in the ``post`` query parameter as JSON.

    The result always has an ``img`` key with the SVG of the parsed
    structure.  When the parsed object is a ``CGRContainer``, the output of
    ``decompose_cgr`` is added under ``decomposed_smiles`` and
    ``decomposed_svg``.
    """
    structure = smiles(request.args.get('post'))
    payload = {'img': get_svg(structure)}
    if isinstance(structure, CGRContainer):
        # decompose_cgr yields a (smiles, svg list) pair.
        payload['decomposed_smiles'], payload['decomposed_svg'] = decompose_cgr(structure)
    return json.dumps(payload)
def __new__(cls, molecule, *args, **kwargs):
    """Create an instance, loading the shared class data on first use.

    On the first call (``cls.__bb__`` is ``None``) two class attributes are
    filled in:

    * ``__bb__`` -- frozenset of the string forms of the building blocks
      read from ``data/building_blocks.smiles`` (one SMILES per line);
    * ``__reactors__`` -- tuple of ``(1., Reactor)`` pairs, one per entry
      of ``rules`` imported from ``.rules``.

    ``molecule`` is accepted but not forwarded to ``super().__new__``.
    """
    if cls.__bb__ is None:
        from .rules import rules

        # Close the packaged resource as soon as it has been read instead
        # of leaving the handle to the garbage collector.
        with TextIOWrapper(resource_stream(__name__, 'data/building_blocks.smiles')) as lines:
            bb = [smiles(x.strip()) for x in lines]
        for b in bb:
            # Recalculate canonic forms; prevents errors when the CGRtools
            # rules set changes.
            b.canonicalize()
        cls.__bb__ = frozenset(str(b) for b in bb)
        cls.__reactors__ = tuple((1., Reactor(x, delete_atoms=True, automorphism_filter=False, one_shot=False))
                                 for x in rules)
    return super().__new__(cls, *args, **kwargs)
def run(index, smi=None, **kwargs):
    """
    Perform calculations for one reaction.

    :param index: index of the reaction, to keep tracking initial order of tasks
    :param smi: Reaction SMILES
    :param crest_speed: speed and precision of CREST calculations
        (passed through ``kwargs``), possible options:
        - quick
        - squick
        - mquick (default for this project)
    :param dft: define to perform (passed through ``kwargs``)
    :return: ReactionComponents named tuple; its status string tells
        whether the calculation terminated normally or why it did not
    :raises NotImplementedError: if ``smi`` is empty -- reaction SMILES is
        the only supported input so far
    """
    start = time()
    print(smi)

    def report(status, reactants=None, products=None, energy=None):
        # Every exit path returns the same tuple shape; only these fields vary.
        return ReactionComponents(index, smi, reactants, products, energy,
                                  status, spent_time(start))

    if not smi:
        # TODO: accept RDF as alternative
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it fails with a TypeError.
        raise NotImplementedError('only reaction SMILES input is supported')
    reaction = smiles(smi)

    # TODO: add check for reaction container
    if not reaction:
        return report('problem: reaction smiles empty or incorrect')
    if not reaction.reactants:
        return report('problem: with reactants')
    if not reaction.products:
        return report('problem: with products')

    reactants = best_conformers(reaction.reactants, **kwargs)
    if not reactants:
        return report('anomaly terminated calculations for all of reactants')
    if any(isinstance(x, FailReport) for x in reactants):
        return report('anomaly terminated calculations for one of reactants',
                      reactants)

    products = best_conformers(reaction.products, **kwargs)
    if not products:
        # The result used to be built here but never returned, so execution
        # fell through and reported a bogus energy as 'terminated normally'.
        return report('anomaly terminated calculations for all of products')
    if any(isinstance(x, FailReport) for x in products):
        return report('anomaly terminated calculations for one of products',
                      reactants, products)

    try:
        energy_dif = sum([x.min_energy for x in products]) - sum([x.min_energy for x in reactants])
    except TypeError:
        return report('min energy read error', reactants, products)
    return report('terminated normally', reactants, products, energy_dif)
def convert(dct):
    """Turn a SMILES-keyed route table into one holding Reactor templates.

    *dct* maps a target SMILES string to ``(routes, value)`` where *routes*
    is an iterable of ``(prob, mols)`` pairs and *mols* an iterable of
    precursor SMILES strings.  In the result the key is the parsed,
    canonicalized target and every route becomes
    ``(prob, Reactor(template), set_of_precursors)``.  The template is a
    ``ReactionContainer`` built from query substructures of the target and
    of each precursor, restricted to the atoms listed in the first entry of
    the reaction's ``extended_centers_list``.
    """

    def _parse(text):
        # Parse one SMILES string and canonicalize it in place.
        parsed = smiles(text)
        parsed.canonicalize()
        return parsed

    result = {}
    for target_smiles, (routes, value) in dct.items():
        target = _parse(target_smiles)
        entries = []
        for prob, precursor_smiles in routes:
            precursors = [_parse(text) for text in precursor_smiles]
            reaction = ReactionContainer((target,), precursors)
            center = set(reaction.extended_centers_list[0])
            target_query = target.substructure(center.intersection(target), as_query=True)
            precursor_queries = [
                mol.substructure(center.intersection(mol), as_query=True)
                for mol in precursors
            ]
            rule = ReactionContainer((target_query,), precursor_queries)
            entries.append((prob, Reactor(rule), set(precursors)))
        result[target] = (tuple(entries), value)
    return result
from ThetaSynthesis.synthon import RolloutSynthon from pickle import dump data = [] target = None reactions = set() for line in open('test.smiles', 'r'): line = line.strip() if line == '$$$$': data.append((target, reactions)) target = None reactions = set() elif line.startswith('#'): continue elif target is None: target = smiles(line) target.canonicalize() else: r = smiles(line) r.canonicalize() reactions.add(r) for num in [.01, .1, 1., 10., 100.]: results = [] for target, reactions in data: found = [] tree = RetroTree(target, synthon_class=RolloutSynthon, size=10000, c_puct=num) for node in tree:
Test synthon for Acetaminophen. """ __slots__ = () def __iter__(self): for prob, molecules in data[self._molecule][0]: yield prob, tuple(type(self)(mol) for mol in molecules) def __bool__(self): return self._molecule in building_blocks def __float__(self): return data[self._molecule][1] building_blocks = {smiles('Oc1ccccc1')} pre_data = { 'CC(=O)Nc1ccc(O)cc1': (((.25, ('Oc1ccc(N)cc1',),), (.15, ('C(Nc1ccc(OC)cc1)(C)=O',)), (.2, ('ON=C(C)c1ccc(O)cc1',)), (.25, ('Oc1ccc(O)cc1',)), (.15, ('C(Nc1ccc(OC2OCCCC2)cc1)(C)=O',))), 1.), 'Oc1ccc(N)cc1': (((.3, ('O=N(=O)c1ccc(O)cc1',)), (.4, ('c1(N)ccc(OC)cc1',)), (.3, ('c1(O)ccc(F)cc1',))), 1.), 'O=N(=O)c1ccc(O)cc1': (((.35, ('Oc1ccccc1',)), (.35, ('N(c1ccc(N)cc1)(=O)=O',)), (.3, ('c1(N(=O)=O)cc(c(cc1)O)Br',))), 1.), 'Oc1ccccc1': ((), 1.), 'C(Nc1ccc(OC)cc1)(C)=O': ((), -1.), 'Oc1ccc(O)cc1': ((), 0.),
def test_invalid_ring_closure_ignoring():
    """With ``ignore=True`` the reader must still yield the expected molecules
    for records with invalid ring closures."""
    source = 'C1CC1C1CC1\nC=1CC1'
    expected = ('C1CC1C2CC2', 'C1=CC1')
    with StringIO(source) as stream, SMILESRead(stream, ignore=True, store_log=True) as reader:
        # zip_longest makes a missing or extra record show up as a failure.
        for parsed, reference in zip_longest(reader, expected):
            assert parsed == smiles(reference)
def load_nicklaus_tautomers(*, return_X_y=False, as_frame=False, as_regression=False):
    """Load and return Nicklaus's tautomers dataset (Regression and Classification).

    ==================   ==============
    Samples total        5960
    Samples Regression   2824
    Data                 molecules, type: MoleculeContainer
    Targets              real ratio 0.0 - 1.0, type: float (Regression)
    Classes              5
    ==================   ==============

    The data is read from ``data/tautomer_database_release_3a.xlsx`` shipped
    in ``CIMtools.datasets``. A row is dropped entirely as soon as one of its
    structures gives a truthy ``check_valence()`` result.

    Molecules have a .meta attribute which returns a dict with additional data:
        structure_id: row in original file (1-based)
        tautomer_id: id of structure of tautomer in row
        category: value of the ``Prevalence_Category_{n}`` column
            (classification target)
        ratio (optional): value of the ``Quantitative_ratio_{n}`` column
            (regression target)
        additive.{n}: solvent name. {n} started from 1 id of solvent.
            in mixtures will be presented more additive keys. e.g. additive.2, additive.3 ...
        amount.{n}: amount of additive, normalized so that amounts sum to 1.
        prevalence (optional): Qualitative category of tautomer reported in the
            publication (lower-cased).
        temperature (optional) in Kelvin
        pH (optional)

    For Regression:
        The numeric proportion of tautomer based on its quantitative ratio and
        qualitative prevalence.

    For Classification:
        Quantitative ratio of tautomer compared to other tautomers.

    Numeric classification of qualitative prevalence:
        0: Not observed
        1: Less favored, less stable, minor, observed
        2: Equally, favored, major, in equilibrium, preferred, similar spectra
        3: More favored, more stable, predominant, strongly favored
        4: Exclusively observed, only observed, only tautomer, identical tautomer

    Numeric classification of quantitative amount of tautomers:
        0: ratio = 0.0 - 0.0099
        1: ratio = 0.01 - 0.30
        2: ratio = 0.31 - 0.69
        3: ratio = 0.70 - 0.99
        4: ratio = 1

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

    as_regression : bool, default=False
        If True, returns regression subset (molecules that have a ``ratio``)
        instead of classes.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (n, )
            The data array.
        target : ndarray of shape (n, )
            The regression or classification target.
        feature_names: list
            The name of the dataset (``['Tautomers']``).
        target_names: list
            The name of target (``['ratio']`` or ``['category']``).
        frame: DataFrame of shape (n, 2)
            DataFrame with `data` and `target` when `as_frame=True`,
            otherwise None.

    (data, target) : tuple if ``return_X_y`` is True

    Raises
    ------
    ValueError
        If a solvent mixture lists a different number of solvents and
        proportions, or if its proportion is neither a string nor NaN.
    """
    with resource_stream('CIMtools.datasets', 'data/tautomer_database_release_3a.xlsx') as s:
        data = read_excel(s, na_values='nul', true_values=['yes'], false_values=['no'])

    parsed = []
    for record, mol in data.iterrows():
        # meta collects the conditions shared by all tautomers of this row.
        meta = {}
        # n counts solvents, m counts proportions; both are compared below.
        n = m = 0
        c = mol['Size']  # upper bound of the SMILES_{n} columns read below
        sol = mol['Solvent']
        if mol['Solvent_Mixture']:
            # Comma-separated solvent names, numbered from 1.
            for n, sol in enumerate(sol.split(','), 1):
                meta[f'additive.{n}'] = sol.strip()
            sol_prop = mol['Solvent_Proportion']
            if isinstance(sol_prop, str):
                # Colon-separated proportions, normalized to fractions of the total.
                sol_prop = [float(x) for x in sol_prop.split(':')]
                sum_prop = sum(sol_prop)
                for m, prop in enumerate(sol_prop, 1):
                    meta[f'amount.{m}'] = prop / sum_prop
                if n != m:
                    raise ValueError
            elif not isnan(sol_prop):
                # A non-string proportion is only accepted when it is NaN (missing).
                raise ValueError
        else:
            # Single solvent: it makes up the whole medium.
            meta['additive.1'] = sol
            meta['amount.1'] = 1.

        # Temperature and pH are stored only when numeric and not NaN.
        temp = mol['Temperature']
        if not isinstance(temp, str) and not isnan(temp):
            meta['temperature'] = temp
        ph = mol['pH']
        if not isinstance(ph, str) and not isnan(ph):
            meta['pH'] = ph

        tmp = []
        for n in range(1, c + 1):
            s = smiles(mol[f'SMILES_{n}'])
            s.kekule()
            s.standardize(fix_stereo=False)
            if s.check_valence():  # skip invalid records
                break
            s.implicify_hydrogens(fix_stereo=False)
            s.thiele()

            s.meta['structure_id'] = record + 1
            s.meta['tautomer_id'] = n
            s.meta['category'] = mol[f'Prevalence_Category_{n}']
            rat = mol[f'Quantitative_ratio_{n}']
            if not isinstance(rat, str) and not isnan(rat):
                s.meta['ratio'] = rat
            prev = mol[f'Qualitative_prevalence_{n}']
            if isinstance(prev, str):
                s.meta['prevalence'] = prev.lower()
            s.meta.update(meta)
            tmp.append(s)
        else:
            # Reached only when the loop did not break, i.e. every
            # structure of the row passed the valence check.
            parsed.extend(tmp)

    if as_regression:
        # Regression subset: only molecules with a quantitative ratio.
        data = array([x for x in parsed if 'ratio' in x.meta])
        target = array([x.meta['ratio'] for x in data])
        target_names = ['ratio']
    else:
        data = array(parsed)
        target = array([x.meta['category'] for x in data], dtype=int)
        target_names = ['category']

    feature_names = ['Tautomers']
    if as_frame:
        frame, data, target = _convert_data_dataframe(caller_name='load_nicklaus_tautomers', data=data, target=target,
                                                      feature_names=feature_names, target_names=target_names)
    else:
        frame = None

    if return_X_y:
        return data, target

    return Bunch(data=data, target=target, frame=frame, target_names=target_names, feature_names=feature_names)