Ejemplo n.º 1
0
 def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
     # Import (if needed)
     if itype == 'smiles':
         rdmol = MolFromSmiles(idepic, sanitize=True)
     elif itype == 'inchi':
         rdmol = MolFromInchi(idepic, sanitize=True)
     else:
         raise NotImplementedError(
             '"{}" is not a valid input type'.format(itype))
     if rdmol is None:  # Check imprt
         raise self.DepictionError(
             'Import error from depiction "{}" of type "{}"'.format(
                 idepic, itype))
     # Export
     odepic = dict()
     for item in otype:
         if item == 'smiles':
             odepic[item] = MolToSmiles(
                 rdmol
             )  # MolToSmiles is tricky, one mays want to check the possible options..
         elif item == 'inchi':
             odepic[item] = MolToInchi(rdmol)
         elif item == 'inchikey':
             odepic[item] = MolToInchiKey(rdmol)
         else:
             raise NotImplementedError(
                 '"{}" is not a valid output type'.format(otype))
     return odepic
Ejemplo n.º 2
0
 def load_data(self, preprocess=False, stereochem=1., augment=1):
     """Read the SMILES dataset and derive everything training needs.

     Sets ``molecules``, ``smiles``, ``inchi``, ``maxlen``, ``padded``,
     ``n_mols``, ``val_mols`` and ``train_mols`` on the instance and then
     calls ``self.build_tokenizer()``.

     :param preprocess: when true, run the SMILES through
         ``preprocess_smiles`` first
     :param stereochem: forwarded to ``preprocess_smiles``
     :param augment: when greater than 1, ``self.molecules`` is replaced by
         the output of ``randomize_smileslist(..., num=augment)`` while
         ``self.smiles`` keeps the un-augmented strings
     """
     loaded = read_smiles_file(self.dataset)
     if preprocess:
         loaded = preprocess_smiles(loaded, stereochem)
     self.molecules = loaded
     self.smiles = loaded
     self.inchi = [MolToInchiKey(MolFromSmiles(smi)) for smi in loaded]
     print("%i molecules loaded from %s..." % (len(loaded), self.dataset))

     # maxlen is fixed from the un-augmented strings; the two extra
     # positions are presumably for the '^' / '$' markers added below.
     longest = max(len(smi) for smi in loaded)
     self.maxlen = longest + 2
     print("Maximal sequence length: %i" % longest)

     if augment > 1:
         print("augmenting SMILES %i-fold..." % augment)
         originals = self.molecules
         randomized = randomize_smileslist(originals, num=augment)
         print("%i SMILES strings generated for %i molecules" %
               (len(randomized), len(originals)))
         self.smiles = originals
         self.molecules = randomized

     wrapped = ["^%s$" % m for m in self.molecules]
     self.padded = pad_seqs(wrapped, ' ', given_len=self.maxlen)
     self.n_mols = len(self.molecules)

     # Random permutation of all indices, cut once: the first
     # int(validation * n_mols) go to validation, the rest to training.
     shuffled = np.random.choice(range(self.n_mols),
                                 self.n_mols,
                                 replace=False)
     n_val = int(self.validation * self.n_mols)
     self.val_mols, self.train_mols = np.split(shuffled, [n_val])
     print("Using %i examples for training and %i for valdiation" %
           (len(self.train_mols), len(self.val_mols)))
     self.build_tokenizer()
Ejemplo n.º 3
0
def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
    """Convert a chemical depiction into one or more other depictions.

    Both ``itype`` and every requested output type are validated before
    the depiction is parsed, so an unsupported request fails without any
    conversion work being done.

    :param  idepic: string depiction to be converted, str
    :param   itype: type of depiction provided as input, 'smiles' or 'inchi'
    :param   otype: iterable of depiction types to generate, each one of
                    'smiles', 'inchi', 'inchikey' (the default set is only
                    read, never mutated)
    :return odepic: generated depictions, {"otype1": "odepic1", ..}
    :raises NotImplementedError: unsupported ``itype`` or output type
    :raises Exception: the parser returned None for ``idepic``

    Usage example:
    - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
    - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
    """
    if itype not in ('smiles', 'inchi'):
        raise NotImplementedError('"{}" is not a valid input type'.format(itype))
    # Materialise once (otype may be a one-shot iterable) and report the
    # offending item itself rather than the whole collection.
    requested = list(otype)
    for item in requested:
        if item not in ('smiles', 'inchi', 'inchikey'):
            raise NotImplementedError('"{}" is not a valid output type'.format(item))

    # Import
    if itype == 'smiles':
        rdmol = MolFromSmiles(idepic, sanitize=True)
    else:  # 'inchi'
        rdmol = MolFromInchi(idepic, sanitize=True)
    if rdmol is None:  # the parser signals failure by returning None
        raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))

    # Export
    odepic = dict()
    for item in requested:
        if item == 'smiles':
            # MolToSmiles is called with its default options.
            odepic[item] = MolToSmiles(rdmol)
        elif item == 'inchi':
            odepic[item] = MolToInchi(rdmol)
        else:  # 'inchikey' (guaranteed by the validation above)
            odepic[item] = MolToInchiKey(rdmol)

    return odepic
Ejemplo n.º 4
0
    def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
        """Build a table of RDKit properties and model scores for ``smiles``.

        :param smiles: SMILES strings to analyse
        :param only_drugs: when true, keep only rows that pass the
            Lipinski-style thresholds and have safety and feasibility
            scores above 0.75, then renumber the index from 0
        :return: DataFrame with columns key, smiles, weight, logp, hdonors,
            hacceptors, safety, feasibility and bbbp
        :raises ValueError: if any SMILES string cannot be parsed
        """
        features = self.preprocessor.transform(smiles)

        # One tuple of RDKit properties per molecule, in input order.
        rows = []
        for text in smiles:
            mol = MolFromSmiles(text)
            if not mol:
                raise ValueError("Malformed molecule passed in to analyze")
            rows.append((
                MolToInchiKey(mol),
                ExactMolWt(mol),
                MolLogP(mol),
                NumHDonors(mol),
                NumHAcceptors(mol),
            ))
        if rows:
            inchikey, weight, logp, hdonors, hacceptors = map(list, zip(*rows))
        else:
            inchikey, weight, logp, hdonors, hacceptors = [], [], [], [], []

        # Model scores
        safety_scores = self.safety.predict(features)
        feasibility_scores = self.feasibility.predict(features)
        bbbp_proba = self.bbbp.predict_proba(features)

        columns = {
            "key": inchikey,
            "smiles": smiles,
            "weight": weight,
            "logp": logp,
            "hdonors": hdonors,
            "hacceptors": hacceptors,
            "safety": safety_scores,
            "feasibility": feasibility_scores,
            # second entry of every predict_proba row
            # (presumably the positive-class probability -- confirm)
            "bbbp": (row[1] for row in bbbp_proba),
        }
        dataframe = pd.DataFrame(columns)

        if not only_drugs:
            return dataframe

        # Lipinski's rules
        dataframe = dataframe[dataframe["weight"] < 500]
        dataframe = dataframe[dataframe["hdonors"] <= 5]
        dataframe = dataframe[dataframe["hacceptors"] <= 10]
        dataframe = dataframe[dataframe["logp"] <= 5]

        # Drop compounds scored as too toxic or infeasible
        dataframe = dataframe[dataframe["safety"] > 0.75]
        dataframe = dataframe[dataframe["feasibility"] > 0.75]

        return dataframe.reset_index(drop=True)
Ejemplo n.º 5
0
def standardize_chemical(rdmol, add_hs=True, rm_stereo=True, heavy=False):
    """Run ``Standardizer`` with the 'sequence_tunable' sequence on a molecule.

    :param      rdmol:      RDKit mol object
    :param      add_hs:     value passed as 'OP_ADD_HYDROGEN' (default: True)
    :param      rm_stereo:  value passed as 'OP_REMOVE_STEREO' (default: True)
    :param      heavy:      when true, also switch on 'OP_REMOVE_ISOTOPE',
                            'OP_NEUTRALISE_CHARGE' and 'OP_KEEP_BIGGEST'
                            (default: False)
    :returns    rdmol:      the object returned by ``Standardizer.compute``
    :raises     Exception:  any error from the standardizer is logged as a
                            warning and re-raised unchanged
    """
    # NOTE: a former guard that rejected rm_stereo=False is disabled, so
    # the flag is simply forwarded.
    try:
        # The simple and heavy parameter sets differ only in three switches.
        in_depth = bool(heavy)
        params = {
            'OP_REMOVE_ISOTOPE': in_depth,
            'OP_NEUTRALISE_CHARGE': in_depth,
            'OP_REMOVE_STEREO': rm_stereo,
            'OP_COMMUTE_INCHI': True,
            'OP_KEEP_BIGGEST': in_depth,
            'OP_ADD_HYDROGEN': add_hs,
            'OP_KEKULIZE': False,
            'OP_NEUTRALISE_CHARGE_LATE': True
        }
        standardizer = Standardizer(sequence_fun='sequence_tunable',
                                    params=params)
        rdmol = standardizer.compute(rdmol)
        if in_depth:
            logging.debug(
                "Performing heavy standardisation for compound {}".format(
                    MolToInchiKey(rdmol)))
        return rdmol
    except Exception as err:
        logging.warning(err)
        raise err
Ejemplo n.º 6
0
    def _convert_depiction(self, idepic, itype='smiles', otype={'inchikey'}):
        """Convert chemical depiction to others type of depictions

        Usage example:
         - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
         - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})

        :param idepic: Input string
        :param itype: The type of input
        :param otype: Type of output. Valid options: inchi, smiles, inchikey

        :type idepic: str 
        :type itype: str
        :type otype: dict

        :rtype: dict
        :return: Dictionnary of results
        """
        # Import (if needed)
        if itype == 'smiles':
            rdmol = MolFromSmiles(idepic, sanitize=True)
        elif itype == 'inchi':
            rdmol = MolFromInchi(idepic, sanitize=True)
        else:
            raise NotImplementedError('"{}" is not a valid input type'.format(itype))
        if rdmol is None:  # Check imprt
            raise self.DepictionError('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
        # Export
        odepic = dict()
        for item in otype:
            if item == 'smiles':
                odepic[item] = MolToSmiles(rdmol)  # MolToSmiles is tricky, one mays want to check the possible options..
            elif item == 'inchi':
                odepic[item] = MolToInchi(rdmol)
            elif item == 'inchikey':
                odepic[item] = MolToInchiKey(rdmol)
            else:
                raise NotImplementedError('"{}" is not a valid output type'.format(otype))
        return odepic
Ejemplo n.º 7
0
 def test4MolToInchiKey(self):
     """MolToInchiKey must agree with InchiToInchiKey applied to MolToInchi."""
     mol = MolFromSmiles("CC=C(N)C")
     key_via_inchi = InchiToInchiKey(MolToInchi(mol))
     key_direct = MolToInchiKey(mol)
     self.assertEqual(key_via_inchi, key_direct)
Ejemplo n.º 8
0
    def train_model(self, n_sample=100):
        """Train the model one epoch at a time, logging and sampling as it goes.

        For every epoch ``i`` below ``self.num_epochs`` this method:

        * calls ``self.set_lr(i)`` and fits the model for a single epoch,
          with a ``ModelCheckpoint`` callback writing
          ``model_epoch_<i>.hdf5`` under ``self.checkpoint_dir``;
        * writes ``loss`` and ``lr`` (plus ``val_loss`` when
          ``self.validation`` is truthy) to a TensorBoard log under
          ``logs/<timestamp>``;
        * every ``self.sample_after`` epochs, samples molecules with
          ``self.sample_points(n_sample, self.temp)``, compares their
          InChIKeys against ``self.inchi``, appends the unique valid
          SMILES to ``./generated/<run_name>_generated.csv`` and logs
          ``valid``, ``novel`` and ``unique_valid``;
        * when ``self.reinforce`` is set and more than ``n_sample / 5``
          novel molecules were found, appends randomized variants of the
          3 novel molecules closest to the reference/training pool to
          ``self.padded``.

        :param n_sample: number of molecules requested from
            ``self.sample_points`` at each sampling step
        """
        print("Training model...")
        log_dir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
        writer = tf.summary.create_file_writer(log_dir)
        # The generated-molecule log stays open for the whole run; the
        # context manager guarantees it is flushed and closed even if
        # training raises part-way through.
        with open("./generated/" + self.run_name + "_generated.csv",
                  'a') as mol_file:
            i = 0
            while i < self.num_epochs:
                print("\n------ ITERATION %i ------" % i)
                self.set_lr(i)
                print("\nCurrent learning rate: %.5f" %
                      tf.keras.backend.get_value(self.model.optimizer.lr))
                chkpntr = tf.keras.callbacks.ModelCheckpoint(
                    filepath=self.checkpoint_dir +
                    'model_epoch_{:02d}.hdf5'.format(i),
                    verbose=1)
                if self.validation:
                    generator_train = DataGenerator(
                        self.padded, self.train_mols, self.maxlen - 1,
                        self.token_indices, self.step, self.batch_size)
                    generator_val = DataGenerator(
                        self.padded, self.val_mols, self.maxlen - 1,
                        self.token_indices, self.step, self.batch_size)
                    history = self.model.fit(generator_train,
                                             epochs=1,
                                             validation_data=generator_val,
                                             use_multiprocessing=self.multi,
                                             workers=self.workers,
                                             callbacks=[chkpntr])
                    with writer.as_default():
                        tf.summary.scalar('val_loss',
                                          history.history['val_loss'][-1],
                                          step=i)

                else:
                    generator = DataGenerator(
                        self.padded, range(self.n_mols), self.maxlen - 1,
                        self.token_indices, self.step, self.batch_size)
                    history = self.model.fit(generator,
                                             epochs=1,
                                             use_multiprocessing=self.multi,
                                             workers=self.workers,
                                             callbacks=[chkpntr])
                # write losses to tensorboard log
                with writer.as_default():
                    tf.summary.scalar('loss',
                                      history.history['loss'][-1],
                                      step=i)
                    tf.summary.scalar('lr',
                                      tf.keras.backend.get_value(
                                          self.model.optimizer.lr),
                                      step=i)

                if (i + 1) % self.sample_after == 0:
                    valid_mols = self.sample_points(n_sample, self.temp)
                    n_valid = len(valid_mols)
                    if n_valid:
                        print("Comparing novelty...")
                        inchi_valid = np.array([
                            MolToInchiKey(MolFromSmiles(s))
                            for s in valid_mols
                        ])
                        novel = np.array(
                            compare_mollists(inchi_valid,
                                             np.array(self.inchi), False))
                        # fraction of distinct novel entries among the
                        # valid samples
                        n_novel = float(len(set(novel))) / n_valid
                        mol_file.write("\n----- epoch %i -----\n" % i)
                        mol_file.write("\n".join(set(valid_mols)))
                    else:
                        novel = []
                        n_novel = 0
                    # write generated compound summary to tensorboard log
                    with writer.as_default():
                        tf.summary.scalar('valid',
                                          (float(n_valid) / n_sample),
                                          step=i)
                        tf.summary.scalar('novel', n_novel, step=i)
                        tf.summary.scalar('unique_valid',
                                          len(set(valid_mols)),
                                          step=i)
                    print("\nValid:\t{}/{}".format(n_valid, n_sample))
                    print("Unique:\t{}".format(len(set(valid_mols))))
                    print("Novel:\t{}\n".format(len(novel)))

                    # reinforce = add most similar generated compounds to
                    # the training pool
                    if self.reinforce:
                        if len(novel) > (n_sample / 5):
                            if self.mw_filter:
                                # only consider molecules in given MW
                                # range; unparsable SMILES get weight 0
                                mw = np.array([
                                    Descriptors.MolWt(MolFromSmiles(s))
                                    if MolFromSmiles(s) else 0
                                    for s in novel
                                ])
                                mw_idx = np.where(
                                    (int(self.mw_filter[0]) < mw) &
                                    (mw < int(self.mw_filter[1])))[0]
                                novel = np.array(novel)[mw_idx]

                            print(
                                "Calculating CATS similarities of novel generated molecules to SMILES pool..."
                            )
                            fp_novel = cats_descriptor(
                                [MolFromSmiles(s) for s in novel])
                            if self.reference:
                                # a reference mol is given: measure the
                                # distance to that one only
                                fp_train = cats_descriptor(
                                    [MolFromSmiles(self.reference)])
                            else:
                                # otherwise measure the distance to all
                                # training mols
                                fp_train = cats_descriptor(
                                    [MolFromSmiles(s) for s in self.smiles])
                            sims = parallel_pairwise_similarities(
                                fp_novel, fp_train, metric='euclidean')
                            # NOTE(review): the three-index `[:, 0, 0]`
                            # implies argsort yields a 3-D array here --
                            # confirm the shape of `sims`.
                            top = sims[range(len(novel)),
                                       np.argsort(sims, axis=1)[:, 0,
                                                                0]].flatten()
                            # take the 3 novel mols with the smallest
                            # `top` value and add randomized variants of
                            # them to self.padded
                            print(
                                "Adding top 3 most similar but novel molecules to SMILES pool"
                            )
                            add = randomize_smileslist(
                                novel[np.argsort(top)[:3]], num=3)
                            padd_add = pad_seqs(["^%s$" % m for m in add],
                                                ' ',
                                                given_len=self.maxlen)
                            self.padded = np.hstack((self.padded, padd_add))
                            # NOTE(review): self.n_mols, self.train_mols
                            # and self.val_mols are not updated in this
                            # method after the pool grows and is
                            # reshuffled -- confirm the index sets still
                            # mean what the caller expects.
                            self.padded = np.random.choice(
                                self.padded, len(self.padded),
                                False)  # shuffle

                i += 1  # next epoch