def construct_buyable_trees(self):
    """Build buyable synthesis trees for ``self.smiles``.

    When ``self.celery`` is truthy the search is submitted as a
    ``get_buyable_paths`` task and polled once per second until it is ready;
    otherwise a local ``TreeBuilder`` is created and run directly.

    Returns:
        Whatever the underlying ``get_buyable_paths`` call returns.
    """
    if not self.celery:
        # Local execution: create the tree builder object and run it.
        builder = TreeBuilder(
            celery=self.celery,
            mincount=self.retro_mincount,
            mincount_chiral=self.retro_mincount_chiral,
            chiral=self.chiral)
        return builder.get_buyable_paths(
            self.smiles,
            template_prioritization=self.template_prioritization,
            precursor_prioritization=self.precursor_prioritization,
            nproc=self.nproc,
            max_depth=self.max_depth,
            max_branching=self.max_branching,
            max_ppg=self.max_ppg,
            mincount=self.retro_mincount,
            chiral=self.chiral,
            max_trees=self.max_trees,
            known_bad_reactions=self.known_bad_reactions,
            expansion_time=self.expansion_time,
            template_count=self.template_count,
            precursor_score_mode=self.precursor_score_mode,
            max_cum_template_prob=self.max_cum_template_prob,
            apply_fast_filter=self.apply_fast_filter,
            filter_threshold=self.filter_threshold)

    # Remote execution: hand the search to a celery worker.
    started = time.time()
    task_options = {
        'mincount': self.retro_mincount,
        'max_branching': self.max_branching,
        'max_depth': self.max_depth,
        'max_ppg': self.max_ppg,
        'max_time': self.expansion_time,
        'max_trees': self.max_trees,
        'known_bad_reactions': self.known_bad_reactions,
        'chiral': self.chiral,
        'template_count': self.template_count,
        'precursor_score_mode': self.precursor_score_mode,
        'max_cum_template_prob': self.max_cum_template_prob,
        'apply_fast_filter': self.apply_fast_filter,
        'filter_threshold': self.filter_threshold,
    }
    pending = get_buyable_paths.apply_async(
        args=(self.smiles, self.template_prioritization,
              self.precursor_prioritization),
        kwargs=task_options)

    while not pending.ready():
        # Emit a progress line whenever the elapsed whole seconds hit a
        # multiple of ten.
        elapsed_seconds = int(time.time() - started)
        if elapsed_seconds % 10 == 0:
            MyLogger.print_and_log('Building trees...', makeit_loc)
        time.sleep(1)
    return pending.get()
def get_reaction_input_from_smiles(reaction_smiles, r_fp=1024, c_f=1):
    """Build a (optionally compressed) difference fingerprint for a reaction.

    The reactant side (text before the first '>') and the product side (text
    after the last '>') are each converted to a radius-2 Morgan bit vector of
    ``r_fp`` bits. The element-wise difference ``product - reactant`` is then
    folded by summing ``c_f`` equally spaced positions into each output slot.

    Args:
        reaction_smiles (str): Reaction SMILES, or the literal 'NONE'.
        r_fp (int, optional): Fingerprint length in bits. (default: {1024})
        c_f (int, optional): Compression factor for the reaction fingerprint;
            must divide ``r_fp``. (default: {1})

    Returns:
        numpy.ndarray or None: Array of shape ``(1, r_fp // c_f)``. None when
        ``reaction_smiles`` is 'NONE', or when ``r_fp`` is not divisible by
        ``c_f`` (after the problem has been logged at level 3).
    """
    if reaction_smiles == 'NONE':
        return None

    parts = reaction_smiles.split('>')
    reactant_mol = Chem.MolFromSmiles(parts[0])
    product_mol = Chem.MolFromSmiles(parts[-1])
    reactant_fp = AllChem.GetMorganFingerprintAsBitVect(
        mol=reactant_mol, radius=2, nBits=r_fp)
    product_fp = AllChem.GetMorganFingerprintAsBitVect(
        mol=product_mol, radius=2, nBits=r_fp)
    reactionfp = [p_bit - r_bit for p_bit, r_bit in zip(product_fp, reactant_fp)]

    if r_fp % c_f != 0:
        MyLogger.print_and_log(
            'Redefine reaction fingerprint size or reaction compression ratio. Fingerprint size should be divisible by the compression factor.',
            fingerprinting_loc,
            level=3)
        # Previously this path fell through to ``return input`` with the local
        # never assigned, raising UnboundLocalError if the logger returned.
        return None

    compressed = r_fp // c_f
    # Slot i accumulates positions i, i + compressed, i + 2 * compressed, ...
    folded = [
        sum(reactionfp[i + compressed * j] for j in range(c_f))
        for i in range(compressed)
    ]
    return np.array(folded).reshape(1, compressed)
def dump_to_file(self, file_path=gc.historian_data, refs=False, compressed=False):
    """Writes the data from the online databases to a local file.

    The output path is ``file_path`` with '_no_refs' appended when ``refs``
    is false and '_compressed' appended when ``compressed`` is true. In this
    method ``compressed`` only affects the file name.

    Args:
        file_path (str, optional): Base path of the output file.
            (default: {gc.historian_data})
        refs (bool, optional): Whether to include the references or just the
            first two fields of each entry. (default: {False})
        compressed (bool, optional): Whether the data is compressed.
            (default: {False})
    """
    if refs:
        to_dump = dict(self.occurrences)
    else:
        file_path += '_no_refs'
        # Build a trimmed copy instead of overwriting self.occurrences, so
        # that writing a reference-free file does not discard the references
        # still held in memory.
        to_dump = {
            key: tuple(entry[0:2])
            for key, entry in self.occurrences.items()
        }
    if compressed:
        file_path += '_compressed'

    with open(file_path, 'wb') as file:
        pickle.dump(to_dump, file)
    MyLogger.print_and_log("Saved to {}".format(file_path), historian_loc, level=1)
def load_from_file(self, file_path=gc.historian_data, refs=False, compressed=False):
    """Loads the pickled occurrence data from a locally stored file.

    The path actually read is ``file_path`` with '_no_refs' appended when
    ``refs`` is false and '_compressed' appended when ``compressed`` is true.

    Args:
        file_path (str, optional): Base path of the input file.
            (default: {gc.historian_data})
        refs (bool, optional): Whether to load the file that includes the
            references. (default: {False})
        compressed (bool, optional): Whether the data is compressed.
            (default: {False})

    Raises:
        ValueError: If file does not exist.
    """
    MyLogger.print_and_log('Loading chemhistorian from file...', historian_loc)

    if not refs:
        file_path = file_path + '_no_refs'
    if compressed:
        file_path = file_path + '_compressed'

    if not os.path.isfile(file_path):
        raise ValueError('File does not exist!')

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at trusted data.
    with open(file_path, 'rb') as handle:
        self.occurrences = pickle.load(handle)
    self._loaded = True
    if compressed:
        self._compressed = True
def load_from_file(self, file_path=gc.CHEMICALS['file_name']):
    """Loads chemical occurrence data from a gzip-compressed JSON file.

    The file is expected to decode to an iterable of dicts that each carry a
    'smiles' key; that key is removed and used to index the rest of the
    record in ``self.occurrences``.

    Args:
        file_path (str, optional): Path to the input file.
            (default: {gc.CHEMICALS['file_name']})

    Raises:
        ValueError: If file does not exist.
    """
    if not os.path.isfile(file_path):
        raise ValueError('File does not exist!')

    MyLogger.print_and_log('Loading chemhistorian from file...', historian_loc)

    with gzip.open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    records = json.loads(raw_bytes.decode('utf-8'))

    for record in records:
        key = record.pop('smiles')
        self.occurrences[key] = record
def get_precursor_prioritizers(self, precursor_prioritizer):
    """Select (and cache) the precursor prioritizer to use.

    A prioritizer already present in ``self.precursor_prioritizers`` is
    reused. Otherwise one is instantiated for the requested method, its
    model is loaded, and it is stored in the cache. Unrecognized methods
    fall back to ``DefaultPrioritizer`` with a level-1 log message.

    Args:
        precursor_prioritizer: Identifier of the prioritization method,
            compared against the ``gc`` method constants.
    """
    if not precursor_prioritizer:
        MyLogger.print_and_log(
            'Cannot run the Transformer without a precursor prioritization method. Exiting...',
            transformer_loc,
            level=3)

    if precursor_prioritizer not in self.precursor_prioritizers:
        # Checked in order, first match wins.
        known_methods = (
            (gc.heuristic, HeuristicPrecursorPrioritizer),
            (gc.relevanceheuristic, RelevanceHeuristicPrecursorPrioritizer),
            (gc.scscore, SCScorePrecursorPrioritizer),
            (gc.mincost, MinCostPrecursorPrioritizer),
            (gc.natural, DefaultPrioritizer),
        )
        for method_name, prioritizer_cls in known_methods:
            if precursor_prioritizer == method_name:
                chosen = prioritizer_cls()
                break
        else:
            chosen = DefaultPrioritizer()
            MyLogger.print_and_log(
                'Prioritization method not recognized. Using natural prioritization.',
                transformer_loc,
                level=1)
        chosen.load_model()
        self.precursor_prioritizers[precursor_prioritizer] = chosen
    else:
        chosen = self.precursor_prioritizers[precursor_prioritizer]

    self.precursor_prioritizer = chosen
def parse_molecule_to_smiles(target):
    """Parses a molecular type (smiles, rdkit mol or mol file) to smiles format.

    Three interpretations of ``target`` are tried in order: SMILES string,
    RDKit molecule, path to a mol file. An attempt that raises or that
    yields nothing falls through to the next one. Previously the later
    attempts only ran when the earlier one raised, so a string that RDKit
    rejected by returning None (for example a mol file name) was never
    tried as a file.

    Args:
        target (str or Chem.Mol): SMILES string, filename, or Chem.Mol to
            parse.

    Returns:
        str or None: SMILES string of target, or None if parsing fails.
    """
    # Attempt 1: SMILES string. Round-trip through a Mol to canonicalize.
    try:
        mol = Chem.MolFromSmiles(target)
        if mol:
            return Chem.MolToSmiles(mol)
    except Exception:
        # Deliberate best-effort: target is not usable as SMILES text.
        pass

    # Attempt 2: an RDKit molecule object.
    try:
        smiles = Chem.MolToSmiles(target, isomericSmiles=gc.USE_STEREOCHEMISTRY)
        if smiles:
            return smiles
    except Exception:
        # Deliberate best-effort: target is not a molecule object.
        pass

    # Attempt 3: path to a mol file.
    try:
        mol = Chem.MolFromMolFile(target)
        if mol:
            return Chem.MolToSmiles(mol, isomericSmiles=gc.USE_STEREOCHEMISTRY)
    except Exception:
        # Deliberate best-effort: target is not a readable mol file.
        pass

    MyLogger.print_and_log('Unable to parse target molecule format. Parsing Only available for: Smiles, RDKIT molecule and mol files. Returning "None"', parsing_loc, level=1)
    return None
def find_synthesis():
    """Run the full MAKEIT pipeline from command-line arguments.

    Parses the arguments, builds buyable trees, evaluates them, then reports
    the feasible trees (highest score first) both through the logger and in
    an HTML file named '<case_dir>_trees.html' under the output root.
    """
    args = arg_parser.get_args()

    # Positional order expected by the MAKEIT constructor.
    makeit_fields = (
        'TARGET', 'expansion_time', 'max_depth', 'max_branching',
        'max_trees', 'retro_mincount', 'retro_mincount_chiral',
        'synth_mincount', 'rank_threshold', 'prob_threshold',
        'max_contexts', 'template_count', 'max_ppg', 'output', 'chiral',
        'nproc', 'celery', 'context_recommender', 'forward_scoring',
        'tree_scoring', 'context_prioritization', 'template_prioritization',
        'precursor_prioritization', 'parallel_tree', 'precursor_score_mode',
        'max_cum_template_prob', 'apply_fast_filter', 'filter_threshold',
    )
    makeit = MAKEIT(*[getattr(args, field) for field in makeit_fields])
    MyLogger.initialize_logFile(makeit.output_dir_root, makeit.case_dir)

    _tree_status, trees = makeit.construct_buyable_trees()
    generated_msg = 'MAKEIT generated {} buyable tree(s) that meet(s) all constraints.'.format(
        len(trees))
    MyLogger.print_and_log(generated_msg, makeit_loc)

    feasible_trees = makeit.evaluate_synthesis_trees(trees)
    feasible_msg = 'MAKEIT found {} tree(s) that are(is) likely to result in a successful synthesis.'.format(
        len(feasible_trees))
    MyLogger.print_and_log(feasible_msg, makeit_loc)

    def ranked_trees():
        # Best score first; re-sorted on every call, as the report loops do.
        return sorted(feasible_trees, key=lambda x: x['score'], reverse=True)

    def log_line(string):
        return MyLogger.print_and_log(string, makeit_loc)

    for rank, feasible_tree in enumerate(ranked_trees(), start=1):
        MyLogger.print_and_log('', makeit_loc)
        MyLogger.print_and_log(
            'Feasible tree {}, plausible = {}, overall score = {}'.format(
                rank, feasible_tree['plausible'], feasible_tree['score']),
            makeit_loc)
        print_at_depth(feasible_tree['tree'], writefunc=log_line)

    html_path = os.path.join(makeit.output_dir_root,
                             '{}_trees.html'.format(makeit.case_dir))
    with open(html_path, 'w') as fid:

        def html_line(string):
            return fid.write('{}<br>\n'.format(string))

        fid.write('<html><title>Results for {}</title>\n'.format(args.TARGET))
        fid.write('<body>\n')
        fid.write('<h1>{}</h1><br>\n'.format(args.TARGET))
        fid.write('<i><b>Settings: </b>{}</i><br>\n'.format(args.__dict__))
        for rank, feasible_tree in enumerate(ranked_trees(), start=1):
            html_line(
                '<h3>Feasible tree {}, plausible = {}, overall score = {}</h3><br>'
                .format(rank, feasible_tree['plausible'],
                        feasible_tree['score']))
            print_at_depth(feasible_tree['tree'],
                           writefunc=html_line,
                           delim=' ',
                           img=True)
        fid.write('</body>\n')
def clean_reactant_mapping(reactants):
    """Remaps atoms for reactants.

    Existing 'molAtomMapNumber' properties are cleared and every atom is
    then numbered 1..N in iteration order. The molecule is modified in place.

    Args:
        reactants (Chem.Mol): Reactants to remap.

    Returns:
        Chem.Mol: Reactants with remapped atoms.

    Raises:
        ValueError: If ``reactants`` is falsy.
    """
    if not reactants:
        MyLogger.print_and_log(
            'Could not parse reactants {}'.format(reactants), reactants_loc)
        raise ValueError('Could not parse reactants')

    if gc.DEBUG:
        print('Number of reactant atoms: {}'.format(len(reactants.GetAtoms())))

    # Strip any existing atom map numbers.
    for atom in reactants.GetAtoms():
        if atom.HasProp('molAtomMapNumber'):
            atom.ClearProp('molAtomMapNumber')

    # Report current reactant SMILES string
    if gc.DEBUG:
        print('Reactants w/o map: {}'.format(Chem.MolToSmiles(reactants)))

    # Add new atom map numbers
    for position, atom in enumerate(reactants.GetAtoms()):
        atom.SetProp('molAtomMapNumber', str(position + 1))

    # Report new reactant SMILES string
    if gc.DEBUG:
        print('Reactants w/ map: {}'.format(Chem.MolToSmiles(reactants)))

    return reactants
def load_from_database(self):
    """Read the template data from the database.

    Documents are fetched from ``self.TEMPLATE_DB`` sorted by 'index'
    ascending. When ``self.load_all`` is set, each document is converted
    with ``self.doc_to_template`` and kept unless the result is None;
    otherwise only the document '_id' is stored. ``self.num_templates`` is
    updated at the end.
    """
    if not self.use_db:
        MyLogger.print_and_log(
            'Error: Cannot load from database when use_db=False',
            transformer_loc,
            level=3)

    if not self.TEMPLATE_DB:
        self.load_databases()

    # Look for all templates in collection.
    # The projection previously listed 'idex', a typo for the 'index' field
    # that the cursor is sorted on.
    to_retrieve = [
        '_id', 'reaction_smarts', 'necessary_reagent', 'count', 'intra_only',
        'dimer_only', 'index', 'references'
    ]
    cursor = self.TEMPLATE_DB.find({}, to_retrieve).sort(
        'index', pymongo.ASCENDING)
    for document in cursor:
        if self.load_all:
            template = self.doc_to_template(document)
            if template is not None:
                self.templates.append(template)
        else:
            _id = document.get('_id')
            if _id:
                self.templates.append(_id)

    self.num_templates = len(self.templates)
def work(self, i):
    """Worker loop: expand queued reactants until the done flag is set.

    Each pass checks ``self.done`` and ``self.paused``, then tries to take
    one ``(reactants_smiles, start_at, end_at)`` item from
    ``self.expansion_queue``, runs the forward transformer on it and puts
    ``[result, start_at, end_at]`` on ``self.results_queue``.

    Args:
        i (int): Index of this worker; used for logging and for its slot in
            ``self.idle``.
    """
    while True:
        # If done, stop
        if self.done.value:
            MyLogger.print_and_log(
                'Worker {} saw done signal, terminating'.format(i),
                template_nn_scorer_loc)
            break

        # If paused, wait and check again
        if self.paused.value:
            time.sleep(1)
            continue

        # Grab something off the queue
        try:
            job = self.expansion_queue.get(timeout=0.5)  # short timeout
            (reactants_smiles, start_at, end_at) = job
            self.idle[i] = False
            (_, outcome) = self.forward_transformer.get_outcomes(
                reactants_smiles,
                self.mincount,
                self.template_prioritization,
                start_at=start_at,
                end_at=end_at,
                template_count=self.template_count)
            self.results_queue.put([outcome, start_at, end_at])
        except VanillaQueue.Empty:
            # Nothing to do right now; fall through and mark idle.
            pass
        except Exception as e:
            print(e)

        # Wait briefly to allow the results_queue to properly update
        time.sleep(0.5)
        self.idle[i] = True
def load_nn_model(self, model_path="", info_path=""):
    """Loads the nearest neighbor model and its reaction ids.

    The model is read with ``joblib.load`` into ``self.nnModel``. The info
    file is read line by line, its first line is skipped, and the remaining
    lines (with '\\n' removed) become ``self.rxn_ids``.

    Args:
        model_path (str, optional): Path to the stored model.
        info_path (str, optional): Path to the text file of reaction ids.
    """
    if not model_path:
        MyLogger.print_and_log(
            'Cannot load nearest neighbor context recommender without a specific path to the model. Exiting...',
            contextRecommender_loc,
            level=3)
    if not info_path:
        MyLogger.print_and_log(
            'Cannot load nearest neighbor context recommender without a specific path to the model info. Exiting...',
            contextRecommender_loc,
            level=3)

    # Load the nearest neighbor model
    with open(model_path, 'rb') as model_file:
        self.nnModel = joblib.load(model_file)

    # Load the rxn ids associated with the nearest neighbor model
    with open(info_path, 'r') as info_file:
        id_lines = info_file.readlines()[1:]  # skip the first line
    self.rxn_ids = [line.replace('\n', '') for line in id_lines]
def get_template_prioritizers(self, template_prioritizer):
    """Loads template prioritizer for the transformer to use.

    A prioritizer already present in ``self.template_prioritizers`` is
    reused. Otherwise one is instantiated, its model is loaded and it is
    cached. Unrecognized methods fall back to
    ``PopularityTemplatePrioritizer`` with a level-1 log message.

    Args:
        template_prioritizer (str): Specifies which prioritization method
            to use.
    """
    if not template_prioritizer:
        MyLogger.print_and_log(
            'Cannot run the Transformer without a template prioritization method. Exiting...',
            transformer_loc,
            level=3)

    if template_prioritizer not in self.template_prioritizers:
        # Checked in order, first match wins.
        known_methods = (
            (gc.popularity, PopularityTemplatePrioritizer),
            (gc.relevance, RelevanceTemplatePrioritizer),
        )
        for method_name, prioritizer_cls in known_methods:
            if template_prioritizer == method_name:
                chosen = prioritizer_cls()
                break
        else:
            chosen = PopularityTemplatePrioritizer()
            MyLogger.print_and_log(
                'Prioritization method not recognized. Using literature popularity prioritization.',
                transformer_loc,
                level=1)
        chosen.load_model()
        self.template_prioritizers[template_prioritizer] = chosen
    else:
        chosen = self.template_prioritizers[template_prioritizer]

    self.template_prioritizer = chosen
def parse_molecule_to_smiles(target):
    '''
    Parse a molecular type (smiles, rdkit mol or mol file) into smiles format.

    Three interpretations of target are tried in order: SMILES string, RDKit
    molecule, path to a mol file. An attempt that raises or yields nothing
    falls through to the next one. Previously the later attempts only ran
    when the earlier one raised, so a string that RDKit rejected by
    returning None (for example a mol file name) was never tried as a file.

    Returns the SMILES string, or None if every attempt fails.
    '''
    # Attempt 1: SMILES string. Round-trip through a Mol to canonicalize.
    try:
        mol = Chem.MolFromSmiles(target)
        if mol:
            return Chem.MolToSmiles(mol)
    except Exception:
        # Deliberate best-effort: target is not usable as SMILES text.
        pass

    # Attempt 2: an RDKit molecule object.
    try:
        smiles = Chem.MolToSmiles(target,
                                  isomericSmiles=gc.USE_STEREOCHEMISTRY)
        if smiles:
            return smiles
    except Exception:
        # Deliberate best-effort: target is not a molecule object.
        pass

    # Attempt 3: path to a mol file.
    try:
        mol = Chem.MolFromMolFile(target)
        if mol:
            return Chem.MolToSmiles(
                mol, isomericSmiles=gc.USE_STEREOCHEMISTRY)
    except Exception:
        # Deliberate best-effort: target is not a readable mol file.
        pass

    MyLogger.print_and_log(
        'Unable to parse target molecule format. Parsing Only available for: Smiles, RDKIT molecule and mol files. Returning "None"',
        parsing_loc,
        level=1)
    return None
def load(self,
         model_path=gc.NEURALNET_CONTEXT_REC['model_path'],
         info_path=gc.NEURALNET_CONTEXT_REC['info_path'],
         weights_path=gc.NEURALNET_CONTEXT_REC['weights_path']):
    """Load the neural network context recommender and log completion.

    Args:
        model_path (optional): Forwarded to ``load_nn_model``.
        info_path (optional): Forwarded to ``load_nn_model``; for the neural
            net model, info path points to the encoders.
        weights_path (optional): Forwarded to ``load_nn_model``.
    """
    self.load_nn_model(model_path, info_path, weights_path)
    loaded_message = 'Nerual network context recommender has been loaded.'
    MyLogger.print_and_log(loaded_message, contextRecommender_loc)
def load_model(self, FP_len=1024, model_tag='1024bool'):
    """Load an SCScore model and set up the matching fingerprint function.

    The pickled model variables are stored in ``self.vars``. A ``mol_to_fp``
    function is attached to the instance: a folded count fingerprint
    (uint8) when the model path contains 'uint8', otherwise a Morgan bit
    vector. A ``Pricer`` is also created and loaded.

    Args:
        FP_len (int, optional): Fingerprint length. (default: {1024})
        model_tag (str, optional): One of '1024bool', '1024uint8' or
            '2048bool'; any other value is logged at level 2 and replaced by
            '1024bool'. (default: {'1024bool'})
    """
    self.FP_len = FP_len
    if model_tag not in ('1024bool', '1024uint8', '2048bool'):
        MyLogger.print_and_log(
            'Non-existent SCScore model requested: {}. Using "1024bool" model'.format(model_tag),
            scscore_prioritizer_loc,
            level=2)
        model_tag = '1024bool'

    filename = 'trained_model_path_' + model_tag
    model_path = gc.SCScore_Prioritiaztion[filename]
    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # model path must point at trusted data.
    with open(model_path, 'rb') as fid:
        self.vars = pickle.load(fid)
    if gc.DEBUG:
        MyLogger.print_and_log(
            'Loaded synthetic complexity score prioritization model from {}'.format(model_path),
            scscore_prioritizer_loc)

    if 'uint8' in model_path:

        def mol_to_fp(mol):
            if mol is None:
                # Zero vector of length FP_len. The previous
                # np.array((FP_len,), ...) built a one-element array holding
                # FP_len itself, which does not even fit in uint8.
                return np.zeros((self.FP_len,), dtype=np.uint8)
            fp = AllChem.GetMorganFingerprint(
                mol, self.FP_rad, useChirality=True)  # uintsparsevect
            fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
            for k, v in fp.GetNonzeroElements().items():
                fp_folded[k % self.FP_len] += v
            return np.array(fp_folded)
    else:

        def mol_to_fp(mol):
            if mol is None:
                return np.zeros((self.FP_len,), dtype=np.float32)
            # The np.bool alias was removed from NumPy; builtin bool is the
            # same dtype.
            return np.array(
                AllChem.GetMorganFingerprintAsBitVect(
                    mol, self.FP_rad, nBits=self.FP_len, useChirality=True),
                dtype=bool)

    self.mol_to_fp = mol_to_fp
    self.pricer = Pricer()
    self.pricer.load()
    self._restored = True
    self._loaded = True
def load_model():
    """Load the pickled relevance template prioritization model.

    The file path is looked up in ``gc.Relevance_Prioritization`` under the
    key 'trained_model_path_<self.retro>' and its contents are stored in
    ``self.vars``.

    Returns:
        The ``self`` object the model was loaded into.
    """
    model_path = gc.Relevance_Prioritization[
        'trained_model_path_{}'.format(self.retro)]
    # NOTE(review): pickle.load executes arbitrary code from the file; the
    # model path must point at trusted data.
    with open(model_path, 'rb') as model_file:
        self.vars = pickle.load(model_file)
    if gc.DEBUG:
        MyLogger.print_and_log(
            'Loaded relevance based template prioritization model from {}'.format(model_path),
            relevance_template_prioritizer_loc)
    return self
def prepare():
    """Start ``self.nproc`` worker processes running ``self.work``.

    Each process receives its index as the only argument, is recorded in
    ``self.workers`` and is then started.
    """
    spin_up_message = 'Tree builder spinning off {} child processes'.format(
        self.nproc)
    MyLogger.print_and_log(spin_up_message, treebuilder_loc)
    for worker_index in range(self.nproc):
        worker = Process(target=self.work, args=(worker_index, ))
        self.workers.append(worker)
        worker.start()
def load_Pricer(chemical_database, buyable_database):
    '''
    Create a Pricer and load it from the chemicals database and the
    database of buyable chemicals, logging before and after.

    Returns the loaded Pricer.
    '''
    MyLogger.print_and_log('Loading pricing model...', model_loader_loc)
    pricer = Pricer()
    pricer.load(chemical_database, buyable_database)
    MyLogger.print_and_log('Pricer Loaded.', model_loader_loc)
    return pricer
def prepare():
    """Start ``self.nproc`` worker processes and mark the scorer as running.

    Each process runs ``self.work`` with its index as the only argument, is
    recorded in ``self.workers`` and is then started. The spin-up message is
    only logged when ``gc.DEBUG`` is set.
    """
    if gc.DEBUG:
        spin_up_message = 'Template based scorer spinning off {} child processes'.format(
            self.nproc)
        MyLogger.print_and_log(spin_up_message, template_nn_scorer_loc)
    for worker_index in range(self.nproc):
        worker = Process(target=self.work, args=(worker_index,))
        self.workers.append(worker)
        worker.start()
    self.running = True
def load(self, model_path=gc.FAST_FILTER_MODEL['model_path']):
    """Loads the fast filter model from a file.

    The result of ``load_model(model_path)`` is stored in ``self.model``;
    a message is logged before and after.

    Args:
        model_path (str): Path to file specifying model.
    """
    MyLogger.print_and_log('Starting to load fast filter', fast_filter_loc)
    loaded = load_model(model_path)
    self.model = loaded
    MyLogger.print_and_log('Done loading fast filter', fast_filter_loc)
def dump_to_file(self, retro, file_path, chiral=False):
    """Write the template database to a file.

    For chiral retro templates only a fixed set of plain fields is written,
    because the chiral rxn object can't be pickled. In every other case
    ``self.templates`` is pickled as is.

    Args:
        retro (bool): Whether in the retrosynthetic direction.
        file_path (str): Specifies where to save the database.
        chiral (bool, optional): Whether to care about chirality.
            (default: {False})

    Raises:
        ValueError: If no templates have been loaded.
    """
    if not self.templates:
        raise ValueError(
            'Cannot dump to file if templates have not been loaded')

    if retro and chiral:
        # Reconstruct template list, but without chiral rxn object.
        picklable_fields = (
            'name', 'reaction_smarts', 'incompatible_groups', 'references',
            'rxn_example', 'explicit_H', '_id', 'product_smiles',
            'necessary_reagent', 'efgs', 'intra_only', 'dimer_only',
            'chiral', 'count',
        )
        pickle_templates = [
            {field: template[field] for field in picklable_fields}
            for template in self.templates
        ]
    else:
        pickle_templates = self.templates

    # pickle emits bytes, so the file must be opened in binary mode; the
    # previous text mode 'w+' fails on Python 3.
    with open(file_path, 'wb') as file:
        pickle.dump(pickle_templates, file)

    MyLogger.print_and_log('Wrote templates to {}'.format(file_path),
                           transformer_loc)
def spin_up_workers(self, nproc_t):
    """Mark the evaluator as running and start ``nproc_t`` worker processes.

    For each worker a False entry is appended to ``self.idle``, and a
    process running ``self.work`` with the worker index is recorded in
    ``self.workers`` and started.

    Args:
        nproc_t (int): Number of worker processes to start.
    """
    self.running = True
    spin_up_message = 'Tree evaluator spinning off {} child processes'.format(
        nproc_t)
    MyLogger.print_and_log(spin_up_message, treeEvaluator_loc)
    for worker_index in range(nproc_t):
        self.idle.append(False)
        worker = Process(target=self.work, args=(worker_index, ))
        self.workers.append(worker)
        worker.start()
def path_condition(self, n, path):
    """Recommend reaction conditions for each step of a reaction path.

    For every reaction SMILES the atom map numbers are stripped, product
    and reactant fingerprints are built, and the top ``n`` condition
    combinations from ``predict_top_combos`` are collected.

    Args:
        n (int): Number of context options to keep per step.
        path (list): Reaction SMILES ('reactants>>products'), one per step.

    Returns:
        list: One entry (the first ``n`` predicted combos) per step that
        was processed successfully.

    NOTE(review): a step that fails is logged at level 2 and skipped, so
    the result can be shorter than ``path`` even though the log message
    says "Returning None" -- confirm callers tolerate this.
    """
    contexts = []
    for rxn in path:
        try:
            sides = rxn.split('>>')
            rsmi = sides[0]
            psmi = sides[1]
            rct_mol = Chem.MolFromSmiles(rsmi)
            prd_mol = Chem.MolFromSmiles(psmi)

            # Drop atom map numbers from reactants first, then products.
            for mol in (rct_mol, prd_mol):
                for atom in mol.GetAtoms():
                    if atom.HasProp('molAtomMapNumber'):
                        atom.ClearProp('molAtomMapNumber')

            rsmi = Chem.MolToSmiles(rct_mol, isomericSmiles=True)
            psmi = Chem.MolToSmiles(prd_mol, isomericSmiles=True)

            fingerprints = fp.create_rxn_Morgan2FP_separately(
                rsmi,
                psmi,
                rxnfpsize=self.fp_size,
                pfpsize=self.fp_size,
                useFeatures=False,
                calculate_rfp=True,
                useChirality=True)
            [pfp, rfp] = fingerprints
            pfp = pfp.reshape(1, self.fp_size)
            rfp = rfp.reshape(1, self.fp_size)
            rxnfp = pfp - rfp

            # The five condition inputs (c1, r1, r2, s1, s2) start empty.
            inputs = [pfp, rxnfp, [], [], [], [], []]
            top_combos = self.predict_top_combos(
                inputs=inputs,
                c1_rank_thres=1,
                s1_rank_thres=3,
                s2_rank_thres=1,
                r1_rank_thres=4,
                r2_rank_thres=1)
            contexts.append(top_combos[:n])
        except Exception as e:
            MyLogger.print_and_log(
                'Failed for reaction {} because {}. Returning None.'.format(
                    rxn, e),
                contextRecommender_loc,
                level=2)
    return contexts
def load_Retro_Transformer(mincount=25, mincount_chiral=10, chiral=True):
    '''
    Create a RetroTransformer with the given minimum counts, load it with
    the requested chirality setting, and log before and after.

    Returns the retro transformer, ready to run.
    '''
    MyLogger.print_and_log('Loading retro synthetic template database...',
                           model_loader_loc)
    transformer = RetroTransformer(mincount=mincount,
                                   mincount_chiral=mincount_chiral)
    transformer.load(chiral=chiral)
    MyLogger.print_and_log('Retro synthetic transformer loaded.',
                           model_loader_loc)
    return transformer
def load(self, model_path):
    """Load the fast filter model, registering its custom objects.

    The model is read with ``load_model`` (passing the custom layer and
    metric objects it needs), stored in ``self.model``, and its predict
    function is built immediately.

    Args:
        model_path (str): Path to file specifying model.
    """
    MyLogger.print_and_log('Starting to load fast filter', fast_filter_loc)
    custom_objects = {
        'Highway_self': Highway_self,
        'pos_ct': pos_ct,
        'true_pos': true_pos,
        'real_pos': real_pos
    }
    self.model = load_model(model_path, custom_objects=custom_objects)
    self.model._make_predict_function()
    MyLogger.print_and_log('Done loading fast filter', fast_filter_loc)
def load_Retro_Transformer():
    '''
    Create a RetroTransformer with its default settings, load it, and log
    before and after.

    Returns the retro transformer, ready to run.
    '''
    MyLogger.print_and_log('Loading retro synthetic template database...',
                           model_loader_loc)
    transformer = RetroTransformer()
    transformer.load()
    MyLogger.print_and_log('Retro synthetic transformer loaded.',
                           model_loader_loc)
    return transformer
def load(self, model_path="", info_path=""):
    """Load the databases and the nearest neighbor model.

    After loading, a message is logged and, if ``self.done`` is set, its
    ``value`` is set to 1 to signal completion to other processes.

    Args:
        model_path (str, optional): Forwarded to ``load_nn_model``.
        info_path (str, optional): Forwarded to ``load_nn_model``.
    """
    self.load_databases()
    self.load_nn_model(model_path, info_path)
    MyLogger.print_and_log('Context recommender has been loaded.',
                           contextRecommender_loc)
    # multiprocessing notify done
    # Identity test replaces the previous ``== None`` with an empty branch.
    if self.done is not None:
        self.done.value = 1
def load_from_file(self, file_name):
    '''
    Load buyables information from a gzip-compressed JSON file.

    Every record with a non-empty 'smiles' value contributes its 'ppg'
    value to self.prices under that SMILES; records without one are
    skipped.
    '''
    with gzip.open(file_name, 'rb') as handle:
        raw_bytes = handle.read()
    records = json.loads(raw_bytes.decode('utf-8'))

    for record in records:
        smiles = record.pop('smiles', '')
        if not smiles:
            continue
        self.prices[smiles] = record.pop('ppg')

    MyLogger.print_and_log('Loaded prices from flat file', pricer_loc)
def get_context_prioritizer(self, context_method):
    """Instantiate the context prioritizer for ``context_method`` and load it.

    ``gc.probability`` and ``gc.rank`` select their dedicated prioritizers;
    any other value is logged at level 1 and ``DefaultPrioritizer`` is used.
    The chosen prioritizer is stored in ``self.context_prioritizer`` and its
    model is loaded.

    Args:
        context_method: Identifier of the context prioritization method.
    """
    if context_method == gc.probability:
        prioritizer_cls = ProbabilityContextPrioritizer
    elif context_method == gc.rank:
        prioritizer_cls = RankContextPrioritizer
    else:
        MyLogger.print_and_log(
            'Specified prioritization method does not exist. Using default method.',
            treeEvaluator_loc,
            level=1)
        prioritizer_cls = DefaultPrioritizer

    self.context_prioritizer = prioritizer_cls()
    self.context_prioritizer.load_model()