Example #1
    def load_from_file(self,
                       file_path=gc.reactionhistorian_data,
                       testing=False):
        '''
        Load the data for the reaction historian from a locally stored file instead of from the online database.
        '''

        if testing:
            self.occurrences = defaultdict(
                lambda: [0, []], {
                    'CCO>>CCBr': [2, ['rxn1', 'rxn2']],
                    'CCCCC>>CCC=CC': [1, ['rxn3']],
                })
            self.occurrences_flat = defaultdict(
                lambda: [0, []], {
                    'CCO>>CCBr': [2, ['rxn1', 'rxn2']],
                    'CCCCC>>CCC=CC': [1, ['rxn3']],
                })
            return

        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.occurrences = defaultdict(lambda: [0, []],
                                               pickle.load(file))
                self.occurrences_flat = defaultdict(lambda: [0, []],
                                                    pickle.load(file))
        else:
            self.load_databases()
            self.load()
            self.dump_to_file()
Example #2
    def load_from_file(self,
                       file_path=gc.reactionhistorian_data,
                       testing=False):
        """Loads the data for the reaction historian from a locally stored file.

        Args:
            file_path (str, optional): Path to the input file.
                (default: {gc.reactionhistorian_data})
            testing (bool, optional): Whether to only run a test.
                (default: {False})
        """

        if testing:
            self.occurrences = defaultdict(
                lambda: [0, []], {
                    'CCO>>CCBr': [2, ['rxn1', 'rxn2']],
                    'CCCCC>>CCC=CC': [1, ['rxn3']],
                })
            self.occurrences_flat = defaultdict(
                lambda: [0, []], {
                    'CCO>>CCBr': [2, ['rxn1', 'rxn2']],
                    'CCCCC>>CCC=CC': [1, ['rxn3']],
                })
            return

        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.occurrences = defaultdict(lambda: [0, []],
                                               pickle.load(file))
                self.occurrences_flat = defaultdict(lambda: [0, []],
                                                    pickle.load(file))
        else:
            self.load_databases()
            self.load()
            self.dump_to_file()
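
A minimal usage sketch of the testing path above (the `ReactionHistorian` class name is an assumption; only `load_from_file` and the fixture data come from the example):

    historian = ReactionHistorian()  # hypothetical owner class of this method
    historian.load_from_file(testing=True)
    print(historian.occurrences['CCO>>CCBr'])  # [2, ['rxn1', 'rxn2']]
    print(historian.occurrences['CC>>CC'])     # unseen key -> defaultdict default [0, []]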
Example #3
def get_data(max_N_c=None, shuffle=False):
    '''Creates a dictionary defining data generators for
    training and validation given pickled data/label files.

    max_N_c and shuffle refer only to the training data.'''

    with open(DATA_FPATH, 'rb') as fid:
        legend_data = pickle.load(fid)
    with open(LABELS_FPATH, 'rb') as fid:
        legend_labels = pickle.load(fid)

    N_samples = legend_data['N_examples']
    N_train = int(N_samples * split_ratio[0])
    N_val = int(N_samples * split_ratio[1])
    N_test = N_samples - N_train - N_val
    print('Total number of samples: {}'.format(N_samples))
    print('Training   on {}% - {}'.format(split_ratio[0] * 100, N_train))
    print('Validating on {}% - {}'.format(split_ratio[1] * 100, N_val))
    print('Testing    on {}% - {}'.format(
        (1 - split_ratio[1] - split_ratio[0]) * 100, N_test))

    return {
        'N_samples': N_samples,
        'N_train': N_train,
        # training
        'train_generator': data_generator(0, N_train, batch_size,
                                          max_N_c=max_N_c, shuffle=shuffle),
        'train_label_generator': label_generator(0, N_train, batch_size),
        'train_nb_samples': N_train,
        # validation
        'val_generator': data_generator(N_train, N_train + N_val, batch_size),
        'val_label_generator': label_generator(N_train, N_train + N_val, batch_size),
        'val_nb_samples': N_val,
        # testing
        'test_generator': data_generator(N_train + N_val, N_samples, batch_size),
        'test_label_generator': label_generator(N_train + N_val, N_samples, batch_size),
        'test_nb_samples': N_test,
        'batch_size': batch_size,
    }
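
The generators in the returned dictionary loop forever, so callers pull batches with `next()`. A brief sketch (assuming the module-level `DATA_FPATH`, `LABELS_FPATH`, `split_ratio`, and `batch_size` globals are configured):

    data = get_data(max_N_c=500, shuffle=True)
    x_batch, y_batch = next(data['train_generator'])   # one padded training batch
    labels = next(data['train_label_generator'])       # matching label metadata
    steps_per_epoch = data['train_nb_samples'] // data['batch_size']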
Example #4
    def load_model(self, FP_len=1024, model_tag='1024bool'):
        self.FP_len = FP_len
        if model_tag not in ('1024bool', '1024uint8', '2048bool'):
            MyLogger.print_and_log(
                'Non-existent SCScore model requested: {}. Using "1024bool" model'.format(model_tag), scscore_prioritizer_loc, level=2)
            model_tag = '1024bool'
        filename = 'trained_model_path_' + model_tag
        with open(gc.SCScore_Prioritiaztion[filename], 'rb') as fid:
            self.vars = pickle.load(fid)
        if gc.DEBUG:
            MyLogger.print_and_log('Loaded synthetic complexity score prioritization model from {}'.format(
                gc.SCScore_Prioritiaztion[filename]), scscore_prioritizer_loc)

        if 'uint8' in gc.SCScore_Prioritiaztion[filename]:
            def mol_to_fp(mol):
                if mol is None:
                    return np.zeros((self.FP_len,), dtype=np.uint8)
                fp = AllChem.GetMorganFingerprint(
                    mol, self.FP_rad, useChirality=True)  # returns a UIntSparseIntVect
                fp_folded = np.zeros((self.FP_len,), dtype=np.uint8)
                for k, v in fp.GetNonzeroElements().items():
                    fp_folded[k % self.FP_len] += v
                return np.array(fp_folded)
        else:
            def mol_to_fp(mol):
                if mol is None:
                    return np.zeros((self.FP_len,), dtype=np.float32)
                return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, self.FP_rad, nBits=self.FP_len,
                                                                      useChirality=True), dtype=bool)
        self.mol_to_fp = mol_to_fp

        self.pricer = Pricer()
        self.pricer.load()
        self._restored = True
        self._loaded = True
Example #5
    def load_model(self):
        with open(gc.Relevance_Prioritization['trained_model_path_{}'.format(self.retro)], 'rb') as fid:
            self.vars = pickle.load(fid)
        if gc.DEBUG:
            MyLogger.print_and_log('Loaded relevance based template prioritization model from {}'.format(
                gc.Relevance_Prioritization['trained_model_path_{}'.format(self.retro)]), relevance_template_prioritizer_loc)
        return self
Example #6
            def load_model(depth=5, hidden_size=300, output_size=gc.Relevance_Prioritization['output_size']):
                config = tf.ConfigProto()
                config.gpu_options.allow_growth = True
                self.session = tf.Session(config=config)
                self.input_mol = tf.placeholder(tf.float32, [self.batch_size, self.FP_len])
                self.mol_hiddens = tf.nn.relu(linearND(self.input_mol, hidden_size, scope="encoder0", reuse=tf.AUTO_REUSE))
                for d in range(1, depth):
                    self.mol_hiddens = tf.nn.relu(linearND(self.mol_hiddens, hidden_size, scope="encoder%i"%d, reuse=tf.AUTO_REUSE))

                self.score = linearND(self.mol_hiddens, output_size, scope="output", reuse=tf.AUTO_REUSE)
                _, self.topk = tf.nn.top_k(self.score, k=self.NK)

                tf.global_variables_initializer().run(session=self.session)
                from functools import reduce
                size_func = lambda v: reduce(lambda x, y: x*y, v.get_shape().as_list())
                n = sum(size_func(v) for v in tf.trainable_variables())
                print('Model size: %dK' % (n // 1000))

                self.coord = tf.train.Coordinator()
                with open(gc.Relevance_Prioritization['trained_model_path_{}'.format(self.retro)], 'rb') as fid:
                    variables = pickle.load(fid)
                for i, v in enumerate(tf.trainable_variables()):
                    assign_op = tf.assign(v, variables[i])
                    self.session.run(assign_op)
                    del assign_op
                print('Loaded tf model from numpy arrays')
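
`linearND` is not a TensorFlow builtin; it comes from the project's own utilities. A minimal sketch of such a helper, assuming it is a plain fully connected layer applied over the last axis (an illustration, not the project's actual implementation):

    def linearND(x, output_size, scope, reuse=None):
        # Dense layer over the last dimension of an N-D tensor (TF1-style).
        with tf.variable_scope(scope, reuse=reuse):
            input_size = x.get_shape().as_list()[-1]
            W = tf.get_variable('W', [input_size, output_size],
                                initializer=tf.glorot_uniform_initializer())
            b = tf.get_variable('b', [output_size],
                                initializer=tf.zeros_initializer())
            return tf.tensordot(x, W, axes=1) + b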
Example #7
    def load_from_file(self,
                       file_path=gc.historian_data,
                       refs=False,
                       compressed=False):
        """Loads the data for the chemhistorian from a locally stored file.

        Args:
            file_path (str, optional): Path to the input file.
                (default: {gc.historian_data})
            refs (bool, optional): Whether to include the references or just
                the counts. (default: {False})
            compressed (bool, optional): Whether the data is compressed.
                (default: {False})

        Raises:
            ValueError: If file does not exist.
        """

        MyLogger.print_and_log('Loading chemhistorian from file...',
                               historian_loc)

        if not refs:
            file_path += '_no_refs'
        if compressed:
            file_path += '_compressed'

        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.occurrences = pickle.load(file)
                self._loaded = True
                if compressed:
                    self._compressed = True
        else:
            raise ValueError('File does not exist!')
Example #8
    def load(self):
        '''
        Try to load the data for the pricer from a mongo database. If the server cannot be found, load from a locally stored file instead.
        '''
        from makeit.utilities.io.files import get_pricer_path
        file_path = get_pricer_path(
            gc.CHEMICALS['database'],
            gc.CHEMICALS['collection'],
            gc.BUYABLES['database'],
            gc.BUYABLES['collection'],
        )
        self.load_databases()
        if not self.BUYABLE_DB and os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.prices = defaultdict(float, pickle.load(file))
                self.prices_flat = defaultdict(float, pickle.load(file))
                self.prices_by_xrn = defaultdict(float, pickle.load(file))
            MyLogger.print_and_log('Loaded prices from flat file', pricer_loc)
Example #9
    def load(self):
        '''
        Load the data for the pricer from a locally stored file instead of from the online database.
        '''
        from makeit.utilities.io.files import get_pricer_path
        file_path = get_pricer_path(
            gc.CHEMICALS['database'],
            gc.CHEMICALS['collection'],
            gc.BUYABLES['database'],
            gc.BUYABLES['collection'],
        )
        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.prices = defaultdict(float, pickle.load(file))
                self.prices_flat = defaultdict(float, pickle.load(file))
        else:
            self.load_databases()
            self.load_from_database()
            self.dump_to_file(file_path)
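
The back-to-back `pickle.load(file)` calls only work because the dump writes the same objects to one file in sequence; a sketch of what the matching `dump_to_file` presumably looks like (an assumption based on the load order above):

    def dump_to_file(self, file_path):
        # Write the price dicts in the same order load() reads them back.
        with open(file_path, 'wb') as file:
            pickle.dump(dict(self.prices), file)
            pickle.dump(dict(self.prices_flat), file)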
Example #10
def label_generator(start_at, end_at, batch_size):
    '''This function generates labels to match the data generated
    by data_generator'''

    filePos_start_at = -1

    # Keep returning forever and ever
    with open(LABELS_FPATH, 'rb') as fid:
        while True:
            # Is this the first iteration?
            if filePos_start_at == -1:

                # Remember where data starts
                legend_labels = pickle.load(fid)  # first doc is legend
                CANDIDATE_SMILES = legend_labels['candidate_smiles']
                CANDIDATE_EDITS = legend_labels['candidate_edits_compact']
                REACTION_TRUE = legend_labels['reaction_true']
                RXDID = legend_labels['rxdid']

                for i in range(start_at):
                    pickle.load(fid)  # throw away first ___ entries
                filePos_start_at = fid.tell()

            else:
                fid.seek(filePos_start_at)

            for startIndex in range(start_at, end_at, batch_size):
                endIndex = min(startIndex + batch_size, end_at)

                docs = [pickle.load(fid) for j in range(startIndex, endIndex)]
                yield {
                    'candidate_smiles': [doc[CANDIDATE_SMILES] for doc in docs],
                    'candidate_edits': [doc[CANDIDATE_EDITS] for doc in docs],
                    'reaction_true': [doc[REACTION_TRUE] for doc in docs],
                    'rxdid': [doc[RXDID] for doc in docs]
                }

            filePos_start_at = -1
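
Because the generator rewinds to `filePos_start_at` instead of raising StopIteration, it is consumed with `next()`; for example (assuming `LABELS_FPATH` points at the pickled label file):

    gen = label_generator(0, 1000, batch_size=32)
    batch = next(gen)                    # dict of four parallel lists
    print(batch['reaction_true'][:3])    # first few true-reaction strings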
Example #11
    def load_from_file(self, file_path=gc.historian_data, refs=False, compressed=False):
        '''
        Load the data for the chemhistorian from a locally stored file instead of from the online database.
        '''

        MyLogger.print_and_log('Loading chemhistorian from file...', historian_loc)

        if not refs:
            file_path += '_no_refs'
        if compressed:
            file_path += '_compressed'

        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                self.occurrences = pickle.load(file)
                self._loaded = True
                if compressed:
                    self._compressed = True
        else:
            raise ValueError('File does not exist!')
Example #12
# chemhistorian.load_from_file()
chemhistorian = None

from makeit.prioritization.precursors.scscore import SCScorePrecursorPrioritizer
scscorer = SCScorePrecursorPrioritizer()
scscorer.load_model(model_tag='1024bool')
print('Loaded SCScorer on website')
print(scscorer.get_score_from_smiles('CCCC', noprice=True))

# Solvent choices - the save file is created by the template-based forward predictor
solvent_choices = []
from makeit.utilities.io.files import get_abraham_solvents_path
file_path = get_abraham_solvents_path()
if os.path.isfile(file_path):
    with open(file_path, 'rb') as fid:
        solvent_name_to_smiles = pickle.load(fid)
    solvent_choices = [{
        'smiles': v,
        'name': k
    } for (k, v) in solvent_name_to_smiles.items()]
else:
    db_client = MongoClient(gc.MONGO['path'],
                            gc.MONGO['id'],
                            connect=gc.MONGO['connect'])
    db = db_client[gc.SOLVENTS['database']]
    SOLVENT_DB = db[gc.SOLVENTS['collection']]
    for doc in SOLVENT_DB.find({'_id': {'$ne': 'default'}}):
        solvent_choices.append({
            'smiles': doc['smiles'],
            'name': doc['name'],
        })
Example #13
    def load(self, folder="", worker_no=0):
        '''Load a neural network scoring model'''
        if worker_no == 0:
            MyLogger.print_and_log('Starting to load scorer...', template_nn_scorer_loc)

        # First load neural network
        if not folder:
            MyLogger.print_and_log(
                'Cannot load neural network without the directory in which the parameters are saved. Exiting...', template_nn_scorer_loc, level=3)
        # Get model args
        ARGS_FPATH = os.path.join(folder, 'args.json')
        with open(ARGS_FPATH, 'r') as fid:
            args = json.load(fid)

        N_h1 = int(args['Nh1'])
        N_h2 = int(args['Nh2'])
        N_h3 = int(args['Nh3'])
        N_hf = int(args['Nhf'])
        l2v = float(args['l2'])
        lr = float(args['lr'])
        context_weight = float(args['context_weight'])
        enhancement_weight = float(args['enhancement_weight'])
        optimizer = args['optimizer']
        inner_act = args['inner_act']
        TARGET_YIELD = False

        self.model = build(F_atom=self.F_atom, F_bond=self.F_bond, N_h1=N_h1,
                           N_h2=N_h2, N_h3=N_h3, N_hf=N_hf, l2v=l2v, inner_act=inner_act,
                           context_weight=context_weight, enhancement_weight=enhancement_weight, TARGET_YIELD=TARGET_YIELD,
                           absolute_score=True)

        WEIGHTS_FPATH = os.path.join(folder, 'weights.h5')
        self.model.load_weights(WEIGHTS_FPATH, by_name=True)

        # Now load solvent information
        # Try to load from file first
        from makeit.utilities.io.files import get_abraham_solvents_path
        file_path = get_abraham_solvents_path()
        if os.path.isfile(file_path):
            with open(file_path, 'rb') as fid:
                self.solvent_name_to_smiles = pickle.load(fid)
                self.solvent_smiles_to_params = pickle.load(fid)
        else:
            db_client = MongoClient(gc.MONGO['path'], gc.MONGO['id'],
                                    connect=gc.MONGO['connect'])
            db = db_client[gc.SOLVENTS['database']]
            SOLVENT_DB = db[gc.SOLVENTS['collection']]
            for doc in SOLVENT_DB.find():
                try:
                    if doc['_id'] == 'default':
                        self.solvent_name_to_smiles['default'] = doc['_id']
                    else:
                        self.solvent_name_to_smiles[doc['name']] = doc['_id']

                    self.solvent_smiles_to_params[doc['_id']] = doc

                except KeyError:
                    MyLogger.print_and_log('Solvent doc {} missing a name'.format(
                        doc), template_nn_scorer_loc, level=1)

            with open(file_path, 'wb') as fid:
                pickle.dump(self.solvent_name_to_smiles, fid)
                pickle.dump(self.solvent_smiles_to_params, fid)

        if worker_no == 0:
            MyLogger.print_and_log('Scorer has been loaded.', template_nn_scorer_loc)
Example #14
    def load_from_file(self,
                       retro,
                       file_path,
                       chiral=False,
                       rxns=True,
                       refs=False,
                       efgs=False,
                       rxn_ex=False):
        """Read the template database from a previously saved file.

        Args:
            retro (bool): Whether in the retrosynthetic direction.
            file_path (str): Pickle file to read dumped templates from.
            chiral (bool, optional): Whether to handle chirality properly
                (only for retro for now). (default: {False})
            rxns (bool, optional): Whether to actually load the reaction objects
                (or just the info). (default: {True})
            refs (bool, optional): Whether to include references.
                (default: {False})
            efgs (bool, optional): Whether to include efg information.
                (default: {False})
            rxn_ex (bool, optional): Whether to include reaction examples.
                (default: {False})
        """

        MyLogger.print_and_log('Loading templates from {}'.format(file_path),
                               transformer_loc)

        if os.path.isfile(file_path):
            with open(file_path, 'rb') as file:
                if retro and chiral and rxns:  # cannot pickle rdchiralReactions, so need to reload from SMARTS
                    pickle_templates = pickle.load(file)
                    self.templates = []
                    for template in pickle_templates:
                        try:
                            template['rxn'] = rdchiralReaction(
                                str('(' + template['reaction_smarts'].replace(
                                    '>>', ')>>(') + ')'))
                        except Exception:
                            template['rxn'] = None
                        self.templates.append(template)
                else:
                    self.templates = pickle.load(file)
        else:
            MyLogger.print_and_log("No file to read data from.",
                                   transformer_loc,
                                   level=1)
            raise IOError('File not found to load template_transformer from!')

        # Clear out unnecessary info
        if not refs:
            for template in self.templates:
                template.pop('references', None)
        elif 'references' not in self.templates[0]:
            raise IOError(
                'Save file does not contain references (which were requested!)')

        if not efgs:
            for template in self.templates:
                template.pop('efgs', None)
        elif 'efgs' not in self.templates[0]:
            raise IOError(
                'Save file does not contain efg info (which was requested!)')

        if not rxn_ex:
            for template in self.templates:
                template.pop('rxn_example', None)
        elif 'rxn_example' not in self.templates[0]:
            raise IOError(
                'Save file does not contain a reaction example (which was requested!)')

        self.num_templates = len(self.templates)
        MyLogger.print_and_log(
            'Loaded templates. Using {} templates'.format(self.num_templates),
            transformer_loc)
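
A hedged usage sketch (the `RetroTransformer` class name and the pickle path are assumptions; the keyword arguments are those documented above):

    transformer = RetroTransformer()  # hypothetical owner class of this method
    transformer.load_from_file(retro=True, file_path='retro_templates.pkl',
                               chiral=True, rxns=True, refs=False)
    print(transformer.num_templates)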
Example #15
def data_generator(start_at, end_at, batch_size, max_N_c=None, shuffle=False):
    '''This function generates batches of data from the
    pickle file, since all the data can't fit in memory.

    The starting and ending indices are specified explicitly so the
    same function can be used for validation data as well.

    Input tensors are generated on-the-fly so there is less I/O.

    max_N_c is the maximum number of candidates to consider. This should ONLY
    be used for training, not for validation or testing.'''
    def bond_string_to_tuple(string):
        split = string.split('-')
        return (split[0], split[1], float(split[2]))

    fileInfo = [() for j in range(start_at, end_at, batch_size)]  # (filePos, startIndex, endIndex)
    batchDims = [() for j in range(start_at, end_at, batch_size)]  # dimensions of each batch
    batchNums = np.array([i for (i, j) in enumerate(range(start_at, end_at, batch_size))])  # batch order, shuffled later

    # Keep returning forever and ever
    with open(DATA_FPATH, 'rb') as fid:

        # Do a first pass through the data
        legend_data = pickle.load(fid)  # first doc is legend

        # Pre-load indices
        CANDIDATE_EDITS_COMPACT = legend_data['candidate_edits_compact']
        ATOM_DESC_DICT = legend_data['atom_desc_dict']
        T = legend_data['T']
        SOLVENT = legend_data['solvent']
        REAGENT = legend_data['reagent']
        YIELD = legend_data['yield']
        REACTION_TRUE_ONEHOT = legend_data['reaction_true_onehot']

        for i in range(start_at):
            pickle.load(fid)  # throw away first ___ entries

        for k, startIndex in enumerate(range(start_at, end_at, batch_size)):
            endIndex = min(startIndex + batch_size, end_at)

            # Remember this starting position
            fileInfo[k] = (fid.tell(), startIndex, endIndex)

            N = endIndex - startIndex  # number of samples this batch
            # print('Serving up examples {} through {}'.format(startIndex, endIndex))

            docs = [pickle.load(fid) for j in range(startIndex, endIndex)]

            # Need to figure out the size of the padded batch
            N_c = max([len(doc[REACTION_TRUE_ONEHOT]) for doc in docs])
            if max_N_c is not None:  # allow truncation during training
                N_c = min(N_c, max_N_c)
            N_e1 = 1
            N_e2 = 1
            N_e3 = 1
            N_e4 = 1
            for i, doc in enumerate(docs):
                for c, edit_string in enumerate(doc[CANDIDATE_EDITS_COMPACT]):
                    if c >= N_c:
                        break
                    edit_string_split = edit_string.split(';')
                    N_e1 = max(N_e1, edit_string_split[0].count(',') + 1)
                    N_e2 = max(N_e2, edit_string_split[1].count(',') + 1)
                    N_e3 = max(N_e3, edit_string_split[2].count(',') + 1)
                    N_e4 = max(N_e4, edit_string_split[3].count(',') + 1)

            # Remember sizes of x_h_lost, x_h_gain, x_bond_lost, x_bond_gain, reaction_true_onehot
            batchDim = (N, N_c, N_e1, N_e2, N_e3, N_e4)

            # print('The padded sizes of this batch will be: N, N_c, N_e1, N_e2, N_e3, N_e4')
            # print(batchDim)
            batchDims[k] = batchDim

        while True:

            if shuffle:
                np.random.shuffle(batchNums)

            for batchNum in batchNums:
                (filePos, startIndex, endIndex) = fileInfo[batchNum]
                (N, N_c, N_e1, N_e2, N_e3, N_e4) = batchDims[batchNum]
                fid.seek(filePos)

                N = endIndex - startIndex  # number of samples this batch
                # print('Serving up examples {} through {}'.format(startIndex, endIndex))

                docs = [pickle.load(fid) for j in range(startIndex, endIndex)]

                # Initialize numpy arrays for x_h_lost, etc.
                x_h_lost = np.zeros((N, N_c, N_e1, F_atom), dtype=np.float32)
                x_h_gain = np.zeros((N, N_c, N_e2, F_atom), dtype=np.float32)
                x_bond_lost = np.zeros((N, N_c, N_e3, F_bond), dtype=np.float32)
                x_bond_gain = np.zeros((N, N_c, N_e4, F_bond), dtype=np.float32)
                reaction_true_onehot = np.zeros((N, N_c), dtype=np.float32)
                yields = np.zeros((N, 1), dtype=np.float32)

                for i, doc in enumerate(docs):

                    for c, edit_string in enumerate(doc[CANDIDATE_EDITS_COMPACT]):
                        if c >= N_c:
                            break

                        edit_string_split = edit_string.split(';')
                        edits = [
                            [atom_string for atom_string in edit_string_split[0].split(',') if atom_string],
                            [atom_string for atom_string in edit_string_split[1].split(',') if atom_string],
                            [bond_string_to_tuple(bond_string) for bond_string in edit_string_split[2].split(',') if bond_string],
                            [bond_string_to_tuple(bond_string) for bond_string in edit_string_split[3].split(',') if bond_string],
                        ]

                        try:
                            edit_h_lost_vec, edit_h_gain_vec, \
                                edit_bond_lost_vec, edit_bond_gain_vec = edits_to_vectors(
                                    edits, None, atom_desc_dict=doc[ATOM_DESC_DICT])
                        except KeyError:  # sometimes molAtomMapNumber not found if hydrogens were explicit
                            continue

                        for e, edit_h_lost in enumerate(edit_h_lost_vec):
                            if e >= N_e1:
                                raise ValueError('N_e1 not large enough!')
                            x_h_lost[i, c, e, :] = edit_h_lost
                        for e, edit_h_gain in enumerate(edit_h_gain_vec):
                            if e >= N_e2:
                                raise ValueError('N_e2 not large enough!')
                            x_h_gain[i, c, e, :] = edit_h_gain
                        for e, edit_bond_lost in enumerate(edit_bond_lost_vec):
                            if e >= N_e3:
                                raise ValueError('N_e3 not large enough!')
                            x_bond_lost[i, c, e, :] = edit_bond_lost
                        for e, edit_bond_gain in enumerate(edit_bond_gain_vec):
                            if e >= N_e4:
                                raise ValueError('N_e4 not large enough!')
                            x_bond_gain[i, c, e, :] = edit_bond_gain

                    # Add truncated reaction true (eventually will not truncate)
                    if max_N_c is None:
                        reaction_true_onehot[i, :len(doc[REACTION_TRUE_ONEHOT])] = doc[REACTION_TRUE_ONEHOT]
                    else:
                        n_true = min(len(doc[REACTION_TRUE_ONEHOT]), max_N_c)
                        reaction_true_onehot[i, :n_true] = doc[REACTION_TRUE_ONEHOT][:max_N_c]
                    yields[i, 0] = doc[YIELD] / 100.0

                # Get rid of NaNs
                x_h_lost[np.isnan(x_h_lost)] = 0.0
                x_h_gain[np.isnan(x_h_gain)] = 0.0
                x_bond_lost[np.isnan(x_bond_lost)] = 0.0
                x_bond_gain[np.isnan(x_bond_gain)] = 0.0
                x_h_lost[np.isinf(x_h_lost)] = 0.0
                x_h_gain[np.isinf(x_h_gain)] = 0.0
                x_bond_lost[np.isinf(x_bond_lost)] = 0.0
                x_bond_gain[np.isinf(x_bond_gain)] = 0.0

                # print('Batch {} to {}'.format(startIndex, endIndex))
                # yield (x, y) as tuple, but each one is a list

                if TARGET_YIELD:
                    y = yields
                else:
                    y = reaction_true_onehot

                yield (
                    [
                        x_h_lost,
                        x_h_gain,
                        x_bond_lost,
                        x_bond_gain,
                        np.array([doc[REAGENT] for doc in docs],
                                 dtype=np.float32),  # reagent
                        np.array([doc[SOLVENT] for doc in docs],
                                 dtype=np.float32),  # solvent
                        np.array([doc[T] for doc in docs],
                                 dtype=np.float32),  # temperature
                    ],
                    [
                        y,
                    ],
                )
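
This generator is typically handed to something like Keras's `fit_generator`, but any caller can pull one padded batch with `next()`. A brief sketch (assuming the module-level `DATA_FPATH`, `F_atom`, `F_bond`, and `TARGET_YIELD` globals are configured):

    gen = data_generator(0, 1000, batch_size=32, max_N_c=500, shuffle=True)
    x_list, y_list = next(gen)
    x_h_lost = x_list[0]
    print(x_h_lost.shape)   # (N, N_c, N_e1, F_atom)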
Example #16
    def load_model(self, FP_len=1024, model_tag='1024bool'):
        """Loads model from given tag.

        Args:
            FP_len (int, optional): Fingerprint length. (default: {1024})
            model_tag (str, optional): Tag of model to load.
                (default: {'1024bool'})
        """
        self.FP_len = FP_len
        if model_tag not in ('1024bool', '1024uint8', '2048bool'):
            MyLogger.print_and_log(
                'Non-existent SCScore model requested: {}. Using "1024bool" model'
                .format(model_tag),
                scscore_prioritizer_loc,
                level=2)
            model_tag = '1024bool'
        filename = 'trained_model_path_' + model_tag
        with open(gc.SCScore_Prioritiaztion[filename], 'rb') as fid:
            self.vars = pickle.load(fid)
        if gc.DEBUG:
            MyLogger.print_and_log(
                'Loaded synthetic complexity score prioritization model from {}'
                .format(gc.SCScore_Prioritiaztion[filename]),
                scscore_prioritizer_loc)

        if 'uint8' in gc.SCScore_Prioritiaztion[filename]:

            def mol_to_fp(mol):
                """Returns fingerprint of molecule for uint8 model.

                Args:
                    mol (Chem.rdchem.Mol or None): Molecule to get fingerprint
                        of.

                Returns:
                    np.ndarray of np.uint8: Fingerprint of given molecule.
                """
                if mol is None:
                    return np.zeros((self.FP_len, ), dtype=np.uint8)
                fp = AllChem.GetMorganFingerprint(
                    mol, self.FP_rad, useChirality=True)  # returns a UIntSparseIntVect
                fp_folded = np.zeros((self.FP_len, ), dtype=np.uint8)
                for k, v in fp.GetNonzeroElements().items():
                    fp_folded[k % self.FP_len] += v
                return np.array(fp_folded)
        else:

            def mol_to_fp(mol):
                """Returns fingerprint of molecule for bool model.

                Args:
                    mol (Chem.rdchem.Mol or None): Molecule to get fingerprint
                        of.

                Returns:
                    np.ndarray of bool or np.float32: Fingerprint of given
                        molecule.
                """
                if mol is None:
                    return np.zeros((self.FP_len, ), dtype=np.float32)
                return np.array(AllChem.GetMorganFingerprintAsBitVect(
                    mol, self.FP_rad, nBits=self.FP_len, useChirality=True),
                                dtype=bool)

        self.mol_to_fp = mol_to_fp

        self.pricer = Pricer()
        self.pricer.load()
        self._restored = True
        self._loaded = True