Example #1
0
# Apply seaborn's default plot styling.
sns.set()

# Silence RDKit's 'rdApp.error' log channel.
rdBase.DisableLog('rdApp.error')

# Seed NumPy's global random number generator with a fixed value (0).
np.random.seed(0)

# convert rdkit fingerprint to numpy array
def fp2arr(fp):
    """Return the contents of the RDKit fingerprint `fp` as a NumPy array."""
    # Placeholder buffer; RDKit's converter fills it from `fp`.
    vector = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, vector)
    return vector
# Load the ZINC molecules: one SMILES per line. SMILES that RDKit cannot
# parse are printed instead of being stored.
mol_zinc = []
with open('zinc10000.txt', 'r') as f:
    for line in f:
        smiles = line.rstrip()
        if not smiles:
            # Blank line: an empty SMILES does not describe a molecule.
            continue
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            mol_zinc.append(mol)
        else:
            print(smiles)

# Load the active molecules: the SMILES is the first whitespace-separated
# field of each line. SMILES that RDKit cannot parse are printed.
mol_active = []
with open('actives_final.ism', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line: indexing fields[0] would raise IndexError.
            continue
        smiles = fields[0]
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            mol_active.append(mol)
        else:
            print(smiles)
Example #2
0
def canonicalize(smiles):
    """Return the RDKit SMILES for `smiles`, or `smiles` itself when unusable.

    The input is returned unchanged when it is the empty string, when RDKit
    cannot parse it, or when the parsed molecule has at most one atom.
    """
    mol = Chem.MolFromSmiles(smiles)
    usable = smiles != '' and mol is not None and mol.GetNumAtoms() > 1
    if not usable:
        return smiles
    return Chem.MolToSmiles(mol)
Example #3
0
    def test1(self):
        # End-to-end check of subshape building, alignment and alignment
        # clustering on the four ligands stored in test_data/5ht3ligs.sdf.
        #
        # computeCanonicalTransform returns more approximate eigenvalues/eigenvectors
        # when built against the native RDKit PowerEigenSolver, so unit test results
        # differ slightly
        builtAgainstEigen3 = hasattr(AllChem, 'ComputePrincipalAxesAndMomentsFromGyrationMatrix')
        if builtAgainstEigen3:
            expectedSkelPts = 15
            expectedAlgs = [0, 5, 21, 0]
            prunedAlgs = [0, 4, 11, 0]
        else:
            expectedSkelPts = 16
            expectedAlgs = [0, 5, 28, 0]
            prunedAlgs = [0, 4, 12, 0]
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'test_data', '5ht3ligs.sdf')
        suppl = Chem.SDMolSupplier(filename)
        # Builder settings shared by every shape generated below.
        builder = SubshapeBuilder.SubshapeBuilder()
        builder.gridDims = (20., 20., 10)
        builder.gridSpacing = 0.5
        builder.winRad = 4.

        # Add hydrogens, canonicalize the conformer and build a
        # terminal-points-only shape for every molecule in the file.
        ms = []
        shapes = []
        for m in suppl:
            m = Chem.AddHs(m, addCoords=True)
            AllChem.CanonicalizeConformer(m.GetConformer())
            ms.append(m)
            shape = builder(m, terminalPtsOnly=True)
            shapes.append(shape)

        self.assertEqual(len(ms), 4)
        self.assertEqual(len(shapes), 4)
        self.assertEqual([len(x.skelPts) for x in shapes], [5, 5, 5, 5])

        # The first molecule is the alignment reference.
        refShape = builder.GenerateSubshapeShape(ms[0])

        self.assertEqual(len(refShape.skelPts), expectedSkelPts)

        aligner = SubshapeAligner.SubshapeAligner()
        aligner.shapeDistTol = .30

        # Pass 1: collect alignments through GetSubshapeAlignments(). Index 0
        # (the reference itself) and falsy shapes get an empty placeholder.
        algStore = []
        for i, s1 in enumerate(shapes):
            if not i or not s1:
                algStore.append([])
                continue
            m1 = ms[i]
            alignments = aligner.GetSubshapeAlignments(ms[0], refShape, m1, s1, builder)
            algStore.append(alignments)
        self.assertEqual([len(x) for x in algStore], expectedAlgs)

        # Pass 2: same inputs, but calling the aligner object directly and
        # materializing its result with list(); the counts must match pass 1.
        algStore = []
        for i, s1 in enumerate(shapes):
            if not i or not s1:
                algStore.append([])
                continue
            m1 = ms[i]
            alignments = list(aligner(ms[0], refShape, m1, s1, builder))
            algStore.append(alignments)
        self.assertEqual([len(x) for x in algStore], expectedAlgs)

        # Cluster each molecule's alignments and compare the surviving counts.
        pruned = []
        for i, mi in enumerate(ms):
            alignments = algStore[i]
            pruned.append(SubshapeAligner.ClusterAlignments(
                mi, alignments, builder, neighborTol=0.15))
        self.assertEqual([len(x) for x in pruned], prunedAlgs)
Example #4
0
            print('successfully loaded editor model from %s' % path)
        if config['proposal'] == 'random': proposal = Proposal_Random(config)
        elif config['proposal'] == 'editor':
            proposal = Proposal_Editor(config, editor)
        elif config['proposal'] == 'mix':
            proposal = Proposal_Mix(config, editor)
        else:
            raise NotImplementedError

        ### sampler
        if config['sampler'] == 're':
            sampler = Sampler_Recursive(config, proposal, estimator)
        elif config['sampler'] == 'sa':
            sampler = Sampler_SA(config, proposal, estimator)
        elif config['sampler'] == 'mh':
            sampler = Sampler_MH(config, proposal, estimator)
        else:
            raise NotImplementedError

        ### sampling
        if config['mols_init']:
            mols = load_mols(config['data_dir'], config['mols_init'])
            mols = random.choices(mols, k=config['num_mols'])
            mols_init = mols[:config['num_mols']]
        else:
            mols_init = [
                Chem.MolFromSmiles('CC') for _ in range(config['num_mols'])
            ]

        sampler.sample(run_dir, mols_init)

def mols_to_pngs(mols, basename="test"):
    """Draw every RDKit mol to its own PNG and return the file names in order.

    Files are named ``MUV_<basename><index>.png``, where ``index`` is the
    zero-based position of the molecule in `mols`.
    """
    written = []
    for position, molecule in enumerate(mols):
        png_name = "MUV_%s%d.png" % (basename, position)
        Draw.MolToFile(molecule, png_name)
        written.append(png_name)
    return written


# Render the first few molecules of the dataset and show the images.
num_to_display = 12
molecules = [
    Chem.MolFromSmiles(data["smiles"])
    for _, data in islice(dataset.iterrows(), num_to_display)
]
display_images(mols_to_pngs(molecules))

# Task (label column) names passed to the CSV loader.
MUV_tasks = [
    'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
    'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
    'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'
]

# Featurize the "smiles" column with 1024-bit circular fingerprints.
featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(tasks=MUV_tasks,
                           smiles_field="smiles",
                           featurizer=featurizer)
dataset = loader.featurize(dataset_file)

# The splitter is configured without a dataset; the data to split is handed
# to its split methods later. Passing the CSV path to the constructor (as the
# previous code did) is not a dataset argument.
splitter = dc.splits.RandomSplitter()
Example #6
0
 def test2Issue217(self) :
   # Benzene with a single conformer attached by addConf() must still be
   # writable as a mol block.
   smi = 'c1ccccc1'
   m = Chem.MolFromSmiles(smi)
   addConf(m)
   self.assertTrue(m.GetNumConformers() == 1)
   mb2 = Chem.MolToMolBlock(m)
Example #7
0
    def predict(self,
                react,
                top_cand_bonds,
                top_cand_scores=[],
                scores=True,
                top_n=100,
                atommap=False):
        '''Rank candidate outcomes for the given reactants.

        react: atom mapped reactant smiles
        top_cand_bonds: list of strings "ai-aj-bo"
        top_cand_scores: one score per entry of top_cand_bonds; when empty,
            every candidate bond gets a score of 0.0. (The list default is
            never mutated, it is only rebound locally.)
        scores: when true, each outcome also carries 'score' and 'prob'
        top_n: maximum number of outcomes to return
        atommap: forwarded to edit_mol

        Returns a list of dicts with keys 'rank' (1-based), 'smiles' and,
        when `scores` is true, 'score' and 'prob'.
        '''

        if not top_cand_scores:
            top_cand_scores = [0.0] * len(top_cand_bonds)

        # Parse "ai-aj-bo" into (0-based atom i, 0-based atom j, bond order, score).
        cand_bonds = []
        for i, b in enumerate(top_cand_bonds):
            x, y, t = b.split('-')
            x, y, t = int(float(x)) - 1, int(float(y)) - 1, float(t)
            cand_bonds.append((x, y, t, float(top_cand_scores[i])))

        # Shrink the core size until the number of enumerated candidates fits
        # into MAX_NCAND. `ncore` has to start from the configured core size:
        # it used to be decremented without ever being initialised (an
        # UnboundLocalError on the first retry) and was never passed on.
        ncore = core_size
        while True:
            src_tuple, conf = smiles2graph(react,
                                           None,
                                           cand_bonds,
                                           None,
                                           core_size=ncore,
                                           cutoff=MAX_NCAND,
                                           testing=True)
            if len(conf) <= MAX_NCAND:
                break
            ncore -= 1

        feed_map = dict(zip(self.src_holder, src_tuple))
        cur_scores, cur_probs, candidates = self.session.run(
            self.predict_vars, feed_dict=feed_map)

        bond_types = [
            Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
            Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC
        ]
        bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

        # Don't waste predictions on bond changes that aren't actually changes:
        # record the bonds already present, keyed by sorted atom-map numbers.
        rmol = Chem.MolFromSmiles(react)
        rbonds = {}
        for bond in rmol.GetBonds():
            a1 = bond.GetBeginAtom().GetAtomMapNum()
            a2 = bond.GetEndAtom().GetAtomMapNum()
            t = bond_types.index(bond.GetBondType()) + 1
            rbonds[(min(a1, a2), max(a1, a2))] = t

        cand_smiles = []
        cand_scores = []
        cand_probs = []
        for idx in candidates:
            cbonds = []
            # Define edits from prediction
            for x, y, t, v in conf[idx]:
                x, y = x + 1, y + 1  # back to 1-based atom-map numbers
                if ((x, y) not in rbonds and t > 0) or (
                    (x, y) in rbonds and rbonds[(x, y)] != t):
                    cbonds.append((x, y, bond_types_as_double[t]))

            pred_smiles = edit_mol(rmol, cbonds, atommap=atommap)
            cand_smiles.append(pred_smiles)
            cand_scores.append(cur_scores[idx])
            cand_probs.append(cur_probs[idx])

        outcomes = []
        for i in range(min(len(cand_smiles), top_n)):
            outcome = {
                'rank': i + 1,
                'smiles': '.'.join(cand_smiles[i]),
            }
            if scores:
                outcome['score'] = cand_scores[i]
                outcome['prob'] = cand_probs[i]
            outcomes.append(outcome)

        return outcomes
Example #8
0
def _read_smi(file_name):
    """Yield one RDKit molecule per line read from the open file `file_name`.

    Only the text before the first tab of each line is parsed as SMILES;
    iteration stops at the first empty read.
    """
    record = file_name.readline()
    while record:
        yield Chem.MolFromSmiles(record.split('\t')[0])
        record = file_name.readline()
Example #9
0
def generate_corpus(in_file, out_file, r, sentence_type='alt', n_jobs=1):
    """Generate corpus file(s) from a molecule file.

    Parameters
    ----------
    in_file : str
        Input file: sdf, smi or ism, optionally gzipped (.gz)
    out_file : str
        Output file name. The alternating-sentence corpus is written to
        exactly this path; per-radius corpora are written to
        ``out_file + '_r0.corpus'`` and ``out_file + '_r1.corpus'``.
    r : int
        Radius of morgan fingerprint
    sentence_type : str
        Options:    'all' - generates all corpus files for all types of sentences,
                    'alt' - generates a corpus file with only combined alternating sentence,
                    'individual' - generates corpus files for each radius
    n_jobs : int
        Number of cores to use (only 'alt' sentence type is parallelized)

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the extension of `in_file` is not supported.
    """
    unsupported = 'File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)'
    plain_types = ('sdf', 'smi', 'ism')

    # File type detection
    in_split = in_file.split('.')
    gzipped = in_split[-1].lower() == 'gz'
    if gzipped:
        # The molecule format sits in front of '.gz'; a bare "gz" name has none.
        mol_type = in_split[-2].lower() if len(in_split) > 1 else ''
    else:
        mol_type = in_split[-1].lower()
    if mol_type not in plain_types:
        raise ValueError(unsupported)

    # Every handle opened below (outputs and input) is closed in `finally`.
    file_handles = []
    try:
        # write only files which contain corpus
        if sentence_type in ('individual', 'all'):
            f1 = open(out_file + '_r0.corpus', "w")
            file_handles.append(f1)
            f2 = open(out_file + '_r1.corpus', "w")
            file_handles.append(f2)

        if sentence_type in ('alt', 'all'):
            f3 = open(out_file, "w")
            file_handles.append(f3)

        if gzipped:
            import gzip
            if mol_type == 'sdf':
                mols_file = gzip.open(in_file, mode='r')
                file_handles.append(mols_file)
                suppl = Chem.ForwardSDMolSupplier(mols_file)
            else:
                mols_file = gzip.open(in_file, mode='rt')
                file_handles.append(mols_file)
                suppl = _read_smi(mols_file)
        elif mol_type == 'sdf':
            suppl = Chem.ForwardSDMolSupplier(in_file)
        else:
            mols_file = open(in_file, mode='rt')
            file_handles.append(mols_file)
            suppl = _read_smi(mols_file)

        if sentence_type == 'alt':  # This can run parallelized
            result = Parallel(n_jobs=n_jobs, verbose=1)(
                delayed(_parallel_job)(mol, r) for mol in suppl)
            for line in result:
                f3.write(str(line) + '\n')
            # The message used to print a literal '%' without any count.
            print('%d molecules successfully processed.' % len(result))

        else:
            for mol in suppl:
                if mol is None:
                    continue
                # Round-trip through SMILES before building the sentences.
                smiles = Chem.MolToSmiles(mol)
                mol = Chem.MolFromSmiles(smiles)
                identifier_sentences, alternating_sentence = mol2sentence(mol, r)

                identifier_sentence_r0 = " ".join(identifier_sentences[0])
                identifier_sentence_r1 = " ".join(identifier_sentences[1])
                alternating_sentence_r0r1 = " ".join(alternating_sentence)

                if len(smiles) != 0:
                    if sentence_type in ('individual', 'all'):
                        f1.write(str(identifier_sentence_r0) + '\n')
                        f2.write(str(identifier_sentence_r1) + '\n')

                    if sentence_type in ('alt', 'all'):
                        f3.write(str(alternating_sentence_r0r1) + '\n')
    finally:
        for fh in file_handles:
            fh.close()
Example #10
0
mymols = make_molecules(cno)

# Sum-over-bonds descriptor
bond_types, bonds_in_molecule = sum_over_bonds(mymols)
np.savetxt("sum_over_bonds.out", bonds_in_molecule, delimiter=" ")

# ----------------------- Estate indices -----------------------
# There are 79 possible Estate descriptors, but only a subset is non-zero
# for the Huang-Massa/Mathieu dataset, so the all-zero columns are dropped
# with scrub_null_columns() before saving.
num_smiles = len(smi)
estate_fingers = np.zeros((num_smiles, 79))  # one row per SMILES, 79 descriptors
for icount in range(num_smiles):
    m = Chem.MolFromSmiles(smi[icount])
    counts, sums = FingerprintMol(m)
    # `sums` could be used as the descriptor instead of `counts`.
    estate_fingers[icount, :] = np.transpose(counts)
nz_estate = scrub_null_columns(estate_fingers)
np.savetxt("nz_estate.out", nz_estate, delimiter=" ")
# -------------------- end of Estate indices --------------------

# Morgan fingerprints using Dan's code
dan_prints = make_fingerprints(mymols)
morgan_prints = np.asarray(dan_prints[2].x)
np.savetxt("morgan_prints.out", morgan_prints, delimiter=" ")
Example #11
0
def cal_feature_IG(sess,
                   all_data,
                   placeholders,
                   info,
                   config,
                   prediction,
                   ig_modal_target,
                   ig_label_target,
                   *,
                   model=None,
                   logger=None,
                   args=None):
    """ calculate integrated gradients
    Args:
        sess: session object
        all_data: dataset; `num`, `labels` and the 'sequences' item are read
        placeholders: forwarded to construct_feed and CompoundVisualizer
        info: dataset information; `mol_info["obj_list"]` is used when present
        config: configuration dict; `config["visualize_path"]` is the output
            directory (created when missing)
        prediction: prediction score(output of the network)
        ig_modal_target: forwarded to CompoundVisualizer (also as
            perturbation_target) and used in the dump file name
        ig_label_target: which output to explain: "max", "all", "correct",
            "uncorrect", "label", or anything int() accepts as a class index
        model: optional; when it has an `embedding` attribute and sequence
            data exists, the embedded sequences are fed to the network
        logger: logger; its `info` method is called unconditionally, so a
            real logger must be supplied despite the None default
        args: command-line namespace; `visualize_resample_num`, `verbose` and
            `visualize_method` are read unconditionally, so it must be
            supplied despite the None default
    """
    divide_number = 100
    header = "mol"
    if args is not None and args.visualization_header is not None:
        header = args.visualization_header
    outdir = config["visualize_path"]
    os.makedirs(outdir, exist_ok=True)
    mol_obj_list = info.mol_info["obj_list"] if "mol_info" in info else None
    tf_grads = None

    all_count = 0
    correct_count = 0
    visualize_ids = range(all_data.num)
    if args.visualize_resample_num:
        # Visualize only a random subset, drawn without replacement.
        visualize_ids = np.random.choice(visualize_ids,
                                         args.visualize_resample_num,
                                         replace=False)
    for compound_id in visualize_ids:
        s = time.time()
        batch_idx = [compound_id]
        if all_data['sequences'] is not None and hasattr(model, "embedding"):
            _data = all_data['sequences']
            _data = np.expand_dims(_data[compound_id, ...], axis=0)
            _data = model.embedding(sess, _data)
            feed_dict = construct_feed(batch_idx,
                                       placeholders,
                                       all_data,
                                       batch_size=1,
                                       info=info,
                                       embedded_layer=_data)
        else:
            feed_dict = construct_feed(batch_idx,
                                       placeholders,
                                       all_data,
                                       batch_size=1,
                                       info=info)

        out_prediction = sess.run(prediction, feed_dict=feed_dict)
        # to give consistency with multitask: bring every output to
        # out_prediction: #data x # task x #class
        # labels: data x #task/#label
        multitask = False
        n_dims = len(out_prediction.shape)
        if n_dims == 1:
            out_prediction = out_prediction[:, np.newaxis, np.newaxis]
        elif n_dims == 2:
            out_prediction = np.expand_dims(out_prediction, axis=1)
        elif n_dims == 3:
            if out_prediction.shape[1] > 1:
                multitask = True

        for idx in range(out_prediction.shape[1]):
            _out_prediction = out_prediction[0, idx, :]
            if multitask:
                true_label = all_data.labels[compound_id, idx]
            else:
                true_label = np.argmax(all_data.labels[compound_id])
            # A rank-3 prediction tensor (multitask) is sliced per task.
            if len(prediction.shape) == 3:
                _prediction = prediction[:, idx, :]
            else:
                _prediction = prediction

            # Select the output that is explained.
            if ig_label_target == "all":
                target_prediction = _prediction
                target_index = "all"
                target_score = np.sum(_out_prediction)
            else:
                if ig_label_target in ("max", "correct", "uncorrect"):
                    target_index = np.argmax(_out_prediction)
                    is_correct = target_index == true_label
                    # "correct"/"uncorrect" keep only right/wrong predictions.
                    if ig_label_target == "correct" and not is_correct:
                        continue
                    if ig_label_target == "uncorrect" and is_correct:
                        continue
                elif ig_label_target == "label":
                    target_index = true_label
                else:
                    target_index = int(ig_label_target)
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]

            # convert a assay string according to a prediction score
            if len(_out_prediction) > 2:  # softmax output
                assay_str = f"class{target_index}"
            elif len(_out_prediction) == 2:  # softmax output
                assay_str = "active" if _out_prediction[1] > 0.5 else "inactive"
            else:
                assay_str = "active" if _out_prediction > 0.5 else "inactive"

            # Molecule information is best effort (mol_obj_list may be None or
            # hold an entry that cannot be converted); interpreter-level
            # signals such as KeyboardInterrupt are no longer swallowed.
            try:
                mol_name = Chem.MolToSmiles(mol_obj_list[compound_id])
                mol_obj = mol_obj_list[compound_id]
            except Exception:
                mol_name = None
                mol_obj = None
            if args.verbose:
                print(
                    f"No.{compound_id}, task={idx}: \"{mol_name}\": {assay_str} (score= {_out_prediction}, "
                    f"true_label= {true_label}, target_label= {target_index}, target_score= {target_score})"
                )
            else:
                print(
                    f"No.{compound_id}, task={idx}: \"{mol_name}\": {assay_str}"
                )
            visualizer = CompoundVisualizer(
                sess,
                outdir,
                compound_id,
                info,
                config,
                batch_idx,
                placeholders,
                all_data,
                target_prediction,
                logger=logger,
                model=model,
                ig_modal_target=ig_modal_target,
                perturbation_target=ig_modal_target,
                grads=tf_grads)
            # Keep the first visualizer's grads and hand them to later ones.
            tf_grads = visualizer.grads if tf_grads is None else tf_grads
            visualizer.cal_integrated_gradients(sess,
                                                divide_number,
                                                method=args.visualize_method)
            visualizer.check_IG(sess, target_prediction)
            visualizer.dump(
                f"{header}_{compound_id:04d}_task_{idx}_{assay_str}_{ig_modal_target}_scaling.jbl",
                additional_data={
                    "mol": mol_obj,
                    "prediction_score": target_score,
                    "target_label": target_index,
                    "true_label": true_label,
                })
            logger.info(
                f"prediction score: {target_score}\n"
                f"check score: {visualizer.end_score - visualizer.start_score}\n"
                f"sum of IG: {visualizer.sum_of_ig}\n"
                f"time : {time.time() - s}\n")
            all_count += 1
            if np.argmax(_out_prediction) == int(true_label):
                correct_count += 1
    if all_count:
        logger.info(f"accuracy(visualized_data) = {correct_count/all_count}")
    else:
        # Every compound was skipped (e.g. "correct" with no correct
        # prediction); there is nothing to divide by.
        logger.info("accuracy(visualized_data) = n/a (no data was visualized)")
Example #12
0
def sample(mdl, scaffold_smi, num_samples):
    """Generate `num_samples` samples from the model `mdl` based on a given scaffold with SMILES `scaffold_smi`.

    Args:
        mdl (DeepScaffold): The scaffold-based molecule generative model
        scaffold_smi (str): The SMILES string of the given scaffold
        num_samples (int): The number of samples to generate

    Returns:
        t.Tuple[t.List[t.Union[str, None]], float, float]:
            The generated molecules as SMILES (molecules that do not satisfy
            the validity requirements are returned as `None`), the fraction
            of valid samples, and the fraction of valid samples that are
            unique. Both fractions are 0.0 when there is nothing to divide by.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    # Convert SMILES to molecule
    scaffold = Chem.MolFromSmiles(scaffold_smi)

    # Convert molecule to numpy array
    # shape: 1, ..., 5
    scaffold_array, _ = get_array_from_mol(mol=scaffold,
                                           scaffold_nodes=range(
                                               scaffold.GetNumHeavyAtoms()),
                                           nh_nodes=[],
                                           np_nodes=[],
                                           k=1,
                                           p=1.0)

    # Convert numpy array to torch tensor
    # shape: 1, ..., 5
    scaffold_tensor = torch.from_numpy(scaffold_array).long().cuda()

    # Generate
    with torch.no_grad():
        # Expand the first dimension
        # shape: num_samples, ..., 5
        scaffold_tensor = scaffold_tensor.expand(num_samples, -1, -1)
        # Generate samples
        # shape: [num_samples, -1, 5]
        mol_array = mdl.generate(scaffold_tensor)

    # Move to CPU
    mol_array = mol_array.detach().cpu().numpy()

    # Convert numpy array to Chem.Mol object
    mol_list = get_mol_from_array(mol_array, sanitize=True)

    # Convert Chem.Mol object to SMILES
    def _to_smiles(_mol):
        if _mol is None:
            return None
        try:
            _smiles = Chem.MolToSmiles(_mol)
        except ValueError:
            # If the molecule can not be converted to SMILES, return None
            return None

        # If the output SMILES is None, return None
        if _smiles is None:
            return None

        # Make sure that the SMILES can be convert back to molecule
        try:
            _mol = Chem.MolFromSmiles(_smiles)
        except ValueError:
            # If there are any error encountered during the process,
            # return None
            return None

        # If the output molecule object is None, return None
        if _mol is None:
            return None
        return _smiles

    smiles_list = list(map(_to_smiles, mol_list))
    valid_smiles = [_smi for _smi in smiles_list if _smi is not None]

    # Get the validity statistics
    num_valid = len(valid_smiles)
    percent_valid = float(num_valid) / len(smiles_list) if smiles_list else 0.0

    # Get the uniqueness statistics. Only valid SMILES are counted: the old
    # `len(set(smiles_list)) - 1` assumed a `None` entry was always present
    # and undercounted by one when every sample was valid.
    num_unique = len(set(valid_smiles))
    percent_unique = float(num_unique) / num_valid if num_valid else 0.0

    return smiles_list, percent_valid, percent_unique
Example #13
0
    def perceive_smiles(self, atommap=True):
        """
        Perceive a SMILES with bond orders from the geometry, store it in
        self.smiles and return it.

        Pipeline: Open Babel infers the connectivity from the 3D
        coordinates; the structure is written as InChI to saturate
        unphysical multi-radical structures; RDKit reads the InChI back.
        With atommap=True the RDKit atoms are then matched to the atoms of
        self so that the SMILES carries the atom map numbers of the
        molecule returned by self.to_rdkit_mol().

        Raises SanitizationError when RDKit cannot read the InChI or when
        the element counts change along the way.

        This method requires Open Babel version >=2.4.1
        """

        # Element census of self, used further down as a sanity check.
        expected_counts = {}
        for atom in self:
            number = atom.get_atomicnum()
            expected_counts[number] = expected_counts.get(number, 0) + 1

        # RDKit offers no particularly simple way to read 3D structures, and
        # RMG doesn't recognize some single bonds, so Open Babel does this
        # part. Calling to_pybel_mol again (it was probably called earlier to
        # set connections) should not be too expensive.
        pybel_mol = self.to_pybel_mol()

        # Open Babel often produces single bonds plus multiple radicals where
        # double bonds are meant. InChI does not consider bond orders, so a
        # round trip through it (with the fixed H layer) gets around that.
        inchi = pybel_mol.write('inchi', opt={'F': None}).strip()

        # RDKit doesn't like some hypervalent atoms
        mol_sanitized = Chem.MolFromInchi(inchi)
        if mol_sanitized is None:
            raise SanitizationError(
                f'Could not convert \n{self.to_xyz()}\nto Smiles. Unsanitized Smiles: {pybel_mol.write("smi").strip()}'
            )

        # RDKit adds unnecessary hydrogens in some cases; if the element
        # census no longer agrees with self, give up with an error.
        mol_sanitized = Chem.AddHs(mol_sanitized)
        found_counts = {}
        for rd_atom in mol_sanitized.GetAtoms():
            number = rd_atom.GetAtomicNum()
            found_counts[number] = found_counts.get(number, 0) + 1
        if found_counts != expected_counts:
            raise SanitizationError(
                f'Could not convert \n{self.to_xyz()}\nto Smiles. Wrong Smiles: {Chem.MolToSmiles(mol_sanitized)}'
            )

        if atommap:
            # The InChI round trip dropped the atom mapping. Restore it by
            # matching the original molecule (single bonds only) against a
            # single-bond copy of the sanitized one; there should only be
            # one unique map.
            mol_with_map = self.to_rdkit_mol()
            mol_sani_sb = Chem.Mol(mol_sanitized)
            for bond in mol_sani_sb.GetBonds():
                bond.SetBondType(Chem.rdchem.BondType.SINGLE)
            match = mol_sani_sb.GetSubstructMatch(mol_with_map)  # Isomorphism mapping
            assert mol_with_map.GetNumAtoms() == len(match)  # Make sure we match all atoms
            for mapped_atom in mol_with_map.GetAtoms():
                target_idx = match[mapped_atom.GetIdx()]
                mol_sanitized.GetAtomWithIdx(target_idx).SetAtomMapNum(
                    mapped_atom.GetAtomMapNum())

        # With atommap this is hopefully a sensible Smiles string with atom
        # mappings for all atoms; without it, the plain Smiles.
        self.smiles = Chem.MolToSmiles(mol_sanitized)
        return self.smiles
Example #14
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os

import numpy as np
import pybel
from rdkit import Chem
from rdkit.Chem import AllChem, GetPeriodicTable

_rdkit_periodic_table = GetPeriodicTable()
RDKIT_SMILES_PARSER_PARAMS = Chem.SmilesParserParams()


def smiles_to_rdkit(smi, gen_3d=True, nconf=100):
    """
    Convert smiles to RDKit molecule.
    Tries to generate the lowest-energy conformer.
    """
    mol = Chem.MolFromSmiles(smi)
    mol = Chem.AddHs(mol)

    if gen_3d:
        cids = AllChem.EmbedMultipleConfs(mol, nconf, AllChem.ETKDG())

        AllChem.MMFFSanitizeMolecule(mol)
        mmff_props = AllChem.MMFFGetMoleculeProperties(mol)

        energies = []
        for cid in cids:
            ff = AllChem.MMFFGetMoleculeForceField(mol, mmff_props, confId=cid)
Example #15
0
def transform_command(parser, args):
    """Run the transform command: fragment --smiles, apply the dataset's
    rules, write the products to --output and optionally report timings.

    `parser` is used for error reporting (parser.error); `args` carries the
    parsed command-line options.
    """
    # ---- option unpacking and validation
    min_radius = args.min_radius
    assert min_radius in list("012345"), min_radius
    min_radius = int(min_radius)
    min_pairs = int(args.min_pairs)
    min_variable_size = args.min_variable_size
    max_variable_size = args.max_variable_size
    assert max_variable_size > min_variable_size, "max-variable-size must be greater than min-variable-size"
    min_constant_size = args.min_constant_size

    explain = command_support.get_explain(args.explain)

    # ---- open the database
    start_time = time.time()
    dataset = dbutils.open_dataset_from_args_or_exit(args)
    open_time = time.time()

    property_names = command_support.get_property_names_or_error(
        parser, args, dataset)
    # Without any property names, empty results are included.
    # (should there be a --show-all option to enable this otherwise?)
    include_empty = not property_names

    substructure_pat = None
    if args.substructure:
        substructure_pat = Chem.MolFromSmarts(args.substructure)
        if substructure_pat is None:
            parser.error("Cannot parse --substructure %r" %
                         (args.substructure, ))

    # evaluate --where, --score, and --rule-selection-cutoffs.
    rule_selection_function = analysis_algorithms.get_rule_selection_function_from_args(
        parser, args)

    transform_tool = analysis_algorithms.get_transform_tool(
        dataset, rule_selection_function)
    transform_record = transform_tool.fragment_transform_smiles(args.smiles)
    if transform_record.errmsg:
        parser.error("Unable to fragment --smiles %r: %s" %
                     (args.smiles, transform_record.errmsg))

    # Make sure I can open the output file before I start doing heavy work.
    try:
        outfile = fileio.open_output(args.output, args.output)
    except IOError as err:
        parser.error("Cannot open --output file: %s" % (err, ))

    query_prep_time = time.time()

    # ---- the transformation itself (with a worker pool when --jobs > 1)
    pool = multiprocessing.Pool(processes=args.jobs) if args.jobs > 1 else None
    try:
        result = transform_tool.transform(
            transform_record.fragments,
            property_names,
            min_radius=min_radius,
            min_pairs=min_pairs,
            min_variable_size=min_variable_size,
            max_variable_size=max_variable_size,
            min_constant_size=min_constant_size,
            substructure_pat=substructure_pat,
            pool=pool,
            explain=explain,
        )
    except analysis_algorithms.EvalError as err:
        sys.stderr.write("ERROR: %s\nExiting.\n" % (err, ))
        raise SystemExit(1)

    transform_time = time.time()

    # ---- output
    # ("rule_environment_statistics_id" is deliberately not among the fields.)
    product_fields = (
        "from_smiles",
        "to_smiles",
        "radius",
        "fingerprint",
        "rule_environment_id",
        "count",
        "avg",
        "std",
        "kurtosis",
        "skewness",
        "min",
        "q1",
        "median",
        "q3",
        "max",
        "paired_t",
        "p_value",
    )
    with outfile:
        # A column_aliases mapping such as {"from_smiles": "FROM"} could be
        # passed here to change the column name for a field.
        result.write_products(
            outfile,
            field_names=product_fields,
            include_empty=include_empty)

    output_time = time.time()

    # ---- optional timing report
    if args.times:
        sys.stderr.write("Elapsed time (in seconds):\n")
        format_dt = get_time_delta_formatter(output_time - start_time)
        report = (
            ("  open database: %s\n", open_time - start_time),
            ("  prepare query: %s\n", query_prep_time - open_time),
            ("      transform: %s\n", transform_time - query_prep_time),
            ("   write output: %s\n", output_time - transform_time),
            ("         TOTAL = %s\n", output_time - start_time),
        )
        for template, elapsed in report:
            sys.stderr.write(template % format_dt(elapsed))
Example #16
0
 def setUp(self):
     # Fixtures: two six-membered aromatic rings, one all-carbon and one
     # containing a nitrogen.
     first_smiles, second_smiles = 'c1ccccc1', 'c1ccncc1'
     self.mol1 = Chem.MolFromSmiles(first_smiles)
     self.mol2 = Chem.MolFromSmiles(second_smiles)
Example #17
0
def randomize_smi(smi):
    """Return the molecule parsed from a randomized SMILES of ``smi``.

    ``smi`` is handed directly to ``Chem.MolToSmiles`` with ``doRandom=True``
    and the resulting string is parsed back with ``Chem.MolFromSmiles``; the
    object returned by that parse is the return value.

    NOTE(review): despite the parameter name, ``smi`` is passed to
    ``Chem.MolToSmiles``, so it is presumably an RDKit molecule rather than a
    SMILES string -- confirm against callers.
    """
    shuffled_smiles = Chem.MolToSmiles(smi, doRandom=True)
    return Chem.MolFromSmiles(shuffled_smiles)
Example #18
0
def vectorize_rdkit(smiles, mol=None):
    """Evaluate every descriptor named in ``keys`` on one molecule.

    Args:
        smiles: SMILES string; parsed with ``Chem.MolFromSmiles`` only when
            ``mol`` is not supplied.
        mol: optional already-built molecule; used as-is when given.

    Returns:
        A list with one entry per element of the module-level ``keys``, each
        being ``desc[key](molecule)``, in the order of ``keys``.
    """
    molecule = Chem.MolFromSmiles(smiles) if mol is None else mol
    features = []
    for descriptor_name in keys:
        features.append(desc[descriptor_name](molecule))
    return features
Example #19
0
 def test3Exceptions(self) :
   """Requesting conformer id 2 on a one-conformer molecule raises ValueError."""
   mol = Chem.MolFromSmiles('c1ccccc1')
   addConf(mol)
   conformer_count = mol.GetNumConformers()
   self.assertTrue(conformer_count == 1)
   with self.assertRaises(ValueError):
     mol.GetConformer(2)
Example #20
0
def smiles2sentence(smiles):
    """Parse ``smiles`` and return ``mol2alt_sentence`` of the molecule.

    The second argument passed to ``mol2alt_sentence`` is fixed at 1.
    """
    return mol2alt_sentence(Chem.MolFromSmiles(smiles), 1)
Example #21
0
        Input: mol is a molecule object
        
        Output: result is a dict form 
    #################################################################
    """
    result = {}
    result.update(CalculateLabuteASA(mol))
    result.update(CalculateTPSA(mol))
    result.update(CalculateSLOGPVSA(mol, bins=None))
    result.update(CalculateSMRVSA(mol, bins=None))
    result.update(CalculatePEOEVSA(mol, bins=None))
    result.update(CalculateEstateVSA(mol, bins=None))
    result.update(CalculateVSAEstate(mol, bins=None))
    return result


#########################################################################

if __name__ == "__main__":
    # Smoke test: for a handful of small molecules print a running number,
    # the SMILES, the GetMOE result and how many entries that result has.
    smis = ['CCCC', 'CCCCC', 'CCCCCC', 'CC(N)C(=O)O', 'CC(N)C(=O)[O-].[Na+]']
    for index, smi in enumerate(smis, start=1):
        m = Chem.MolFromSmiles(smi)
        # Compute the descriptors once and reuse them for both printouts.
        moe = GetMOE(m)
        print(index)
        print(smi)
        print('\t', moe)
        print('\t', len(moe))
Example #22
0
percentage_cutoff = 0.1  # compute intdiv on top 10% molecules in the samples


### Get the average pairwise fingerprint distance at each step of CbAS

# One mean distance per step: over all valid samples, and over the first N
# rows (after sorting by score) only.
tanim_dist_all, tanim_dist_top = [], []

for step in np.arange(1, steps + 1):

    samples = pd.read_csv(f'../cbas/slurm/results/{name}/docking_results/{step}.csv')
    # Ascending sort by score: the first rows are the ones used as "top" below.
    samples = samples.sort_values('score')
    # N is based on the full sample count, including unparsable SMILES.
    N = int(samples.shape[0] * percentage_cutoff)

    # Parse every SMILES exactly once and drop the ones RDKit cannot parse.
    parsed = [(s, Chem.MolFromSmiles(s)) for s in samples.smile]
    smiles = [s for s, m in parsed if m is not None]
    mols = [m for _, m in parsed if m is not None]
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mols]

    fps = np.array(fps)

    D = pairwise_distances(fps, metric='jaccard')
    D_top = D[:N, :N]

    tanim_dist_all.append(np.mean(D))
    tanim_dist_top.append(np.mean(D_top))

# Use the same range the loop iterated over instead of the leaked loop variable.
plot_steps = np.arange(1, steps + 1)
sns.lineplot(x=plot_steps, y=tanim_dist_all, color='b', label='all samples')
sns.lineplot(x=plot_steps, y=tanim_dist_top, color='r', label=f'top {percentage_cutoff*100:.0f}%')
plt.ylim(0, 1)
plt.ylabel('Average fingerprint pairwise distance')
Example #23
0
# -*- encoding: utf-8 -*-

from rdkit import Chem
# Parse a SMILES string into a molecule object.
m = Chem.MolFromSmiles('Cc1ccccc1')

# Smiles:
# Mol: mole (amount of substance)
# Print the built-in help text for the object returned above.
help(m)
print()
Example #24
0
def process_hmdb(args):
    """Build the HMDB_MAGMa.db SQLite database from the HMDB structures SD file.

    Creates the ``molecules`` table in ``args.database_dir + '/HMDB_MAGMa.db'``
    (and exits if that fails), reads ``structures.sdf`` out of
    ``structures.zip`` -- downloaded from hmdb.ca when ``args.data_dir`` is
    None, otherwise opened from ``args.data_dir + 'structures.zip'`` -- and
    stores one row per accepted record.  Each SD record is rewritten without
    hydrogens and without bond stereo flags before it is parsed.  Records are
    skipped when they contain an element outside C, N, O, P, S, F, Cl, Br, I,
    carry one of the flagged atom-line columns, are isotopically labelled,
    consist of more than one fragment, have a charge sign anywhere but at the
    end of the molecular formula, exceed an exact mass of 1200, or when the
    first 14 characters of the computed InChIKey differ from the given one.
    Records sharing those 14 characters are merged into a single row whose
    ``reference`` column lists all HMDB ids.  Finally the ``idx_cover`` index
    is created.

    NOTE(review): this is Python 2 code (print statements, urllib2, StringIO,
    unicode).
    """
    conn = sqlite3.connect(args.database_dir + '/HMDB_MAGMa.db')
    c = conn.cursor()
    try:
        c.execute("""CREATE TABLE molecules (id TEXT PRIMARY KEY,
                                             mim INTEGER NOT NULL,
                                             charge INTEGER NOT NULL,
                                             natoms INTEGER NOT NULL,
                                             molblock TEXT,
                                             inchikey TEXT,
                                             smiles TEXT,
                                             molform TEXT,
                                             name TEXT,
                                             reference TEXT,
                                             logp INT)""")
        conn.commit()
        print("HMDB_MAGMa.db created")
    # NOTE(review): the bare except treats every failure as "already exists".
    except:
        print("HMDB_MAGMa.db already exists (or error creating it)")
        exit()

    if args.data_dir == None:
        zf = urllib2.urlopen(
            'http://www.hmdb.ca/system/downloads/current/structures.zip')
    else:
        zf = open(args.data_dir + 'structures.zip')
    # The whole zip archive is read into memory before structures.sdf is opened.
    sdfile = zipfile.ZipFile(StringIO.StringIO(
        zf.read())).open('structures.sdf')

    # Maps the 14-character inchikey prefix to (row id, reference, ionized).
    memstore = {}
    line = '$$$$'
    # One iteration per SD record; an empty line from readline() ends the loop.
    while line != "":
        record = []
        # Original 1-based atom number -> new number after dropping hydrogens.
        amap = {}
        skip = False
        ionized = 0
        # read heading:
        for x in range(4):
            line = sdfile.readline()
            record.append(line)
        if line == "":
            continue
        # The fourth header line holds the atom and bond counts in columns 0-2 and 3-5.
        natoms = int(record[-1][:3])
        nbonds = int(record[-1][3:6])
        bonds = 0
        y = 0
        for x in range(natoms):
            line = sdfile.readline()
            if line[31:33] == 'H ':
                # skip hydrogens
                continue
            y += 1
            amap[x + 1] = y
            if line[31:33] not in [
                    'C ', 'N ', 'O ', 'P ', 'S ', 'F ', 'Cl', 'Br', 'I '
            ]:
                # filter non-organic compounds
                skip = True
            elif line[50:51] != '0':
                # this flag has something to do with polymeric structures
                # and resulted in deviation between calculated and given inchikeys, skip
                skip = True
            elif line[38:39] == '4':
                # radical, resulted in deviation between calculated and given inchikeys
                skip = True
            # Only the first 42 characters of the atom line are kept.
            record.append(line[:42] + '\n')
        for x in range(nbonds):
            line = sdfile.readline()
            a1 = int(line[:3])
            a2 = int(line[3:6])
            # skip bonds involving hydrogens
            if a1 in amap and a2 in amap:
                bonds += 1
                # use bonds with stereoflags set to zero
                record.append('%3i%3i%s  0\n' %
                              (amap[a1], amap[a2], line[6:9]))
        # Copy the remaining lines up to and including 'M  END'.
        while line != 'M  END\n' and line != '':
            line = sdfile.readline()
            record.append(line)
            if line[:6] == 'M  ISO':
                skip = True
                print 'Skipped isotopically labeled:', record[0][:-1]
        # Scan the data fields up to the '$$$$' record terminator.
        while line != "$$$$\n" and line != "":
            line = sdfile.readline()
            if line == "> <HMDB_ID>\n":
                hmdb_id = str(sdfile.readline()[:-1])
            elif line == "> <GENERIC_NAME>\n":
                molname = str(sdfile.readline()[:-1])
            elif line == "> <INCHI_KEY>\n":
                inchi_key = sdfile.readline()[:-1]
        if line != "" and skip == False:
            # Rewrite the counts line with the number of kept atoms and bonds.
            record[3] = repr(y).rjust(3) + repr(bonds).rjust(3) + record[3][6:]
            molblock = ''.join(record)
            mol = Chem.MolFromMolBlock(molblock)
            if mol == None or mol.GetNumAtoms() == 0:
                continue
            smiles = Chem.MolToSmiles(mol)
            if len(Chem.GetMolFrags(mol)) > 1:
                print 'complex:', hmdb_id, smiles
                continue
            # NOTE(review): conf is not used anywhere below in this function.
            conf = mol.GetConformer(0)
            # The stored molblock is zlib-compressed and then base64-encoded.
            molblock = base64.encodestring(zlib.compress(''.join(record)))
            molform = Chem.rdMolDescriptors.CalcMolFormula(mol)
            mim = Chem.rdMolDescriptors.CalcExactMolWt(mol)
            # Charge is taken from a trailing sign in the formula; a sign
            # anywhere else causes the record to be skipped.
            charge = 0
            if '-' in molform:
                if molform[-1] == '-':
                    charge = -1
                else:
                    continue
            elif '+' in molform:
                if molform[-1] == '+':
                    charge = 1
                else:
                    continue
            if mim > 1200.0:
                print 'molecule to heavy:', hmdb_id, smiles
                continue
            natoms = mol.GetNumHeavyAtoms()
            logp = Chem.Crippen.MolLogP(mol)
            # Only the first 14 characters of the InChIKey are kept and compared.
            inchikey = Chem.AllChem.InchiToInchiKey(
                AllChem.MolToInchi(mol))[:14]
            if inchikey != inchi_key[:14]:
                print 'given inchikey does not match calculated inchikey, skipped:', hmdb_id, smiles
                continue
            # Flag the record as ionized when its SMILES contains any of these substrings.
            ionized = 0
            for x in ['C(=O)[O-]', '[NH+]', '[NH2+]', '[NH3+]', '[NH4+]']:
                if smiles.find(x) >= 0:
                    ionized = 1
            if inchikey in memstore:
                dbid, reference, dbionized = memstore[inchikey]
                reference = reference + ',' + hmdb_id
                print 'Duplicates:', reference, molname
                if dbionized > ionized:  # prefer non-ionized CID's
                    # mim is stored as int(mim * 1e6) and logp as int(logp * 10).
                    c.execute(
                        '''UPDATE molecules SET id=?, mim=?, charge=?, molblock=?, smiles=?,
                                 molform=?, name=?, reference=?, logp=? WHERE id == ?''',
                        (hmdb_id, int(mim * 1e6), charge, unicode(molblock),
                         unicode(smiles), unicode(molform),
                         unicode(molname, 'utf-8', 'xmlcharrefreplace'),
                         unicode(reference), int(logp * 10), dbid))
                    memstore[inchikey] = (hmdb_id, reference, ionized)
                else:
                    # Keep the stored row and only extend its reference list.
                    c.execute('UPDATE molecules SET reference=? WHERE id == ?',
                              (unicode(reference), dbid))
                    memstore[inchikey] = (dbid, reference, dbionized)
            else:
                c.execute(
                    '''INSERT INTO molecules (id, mim, charge, natoms, molblock, inchikey,
                             smiles,molform,name,reference,logp) VALUES (?,?,?,?,?,?,?,?,?,?,?)''',
                    (hmdb_id, int(
                        mim * 1e6), charge, int(natoms), unicode(molblock),
                     unicode(inchikey), unicode(smiles), unicode(molform),
                     unicode(molname, 'utf-8', 'xmlcharrefreplace'),
                     unicode(hmdb_id), int(logp * 10)))
                memstore[inchikey] = (hmdb_id, hmdb_id, ionized)
    conn.commit()

    print "Creating index ..."
    c.execute('PRAGMA temp_store = 2')
    c.execute(
        'CREATE INDEX idx_cover ON molecules (charge,mim,natoms,reference,molform,inchikey,smiles,name,molblock,logp)'
    )
    conn.commit()
def bond_topologies_from_geom(molecule, bond_lengths, matching_parameters):
  """Enumerate the BondTopology candidates compatible with a geometry.

  Starting from the first bond topology of `molecule` and its optimized
  geometry, every hydrogen is attached to its nearest heavy atom, bond types
  are scored for each heavy-atom pair no further apart than THRESHOLD, and
  every combination of those bonds that can be placed is collected. The
  collected topologies are sorted by descending score, and each score is
  replaced by the log of its share of the summed score.

  Args:
    molecule: provides `bond_topologies`, `molecule_id`,
      `properties.errors.fate` and `optimized_geometry`.
    bond_lengths: provides `probability_of_bond_types`; also handed to
      `hydrogen_to_nearest_atom` (when hydrogen distances are checked) and to
      `geometry_score`.
    matching_parameters: options; the fields read here are
      `check_hydrogen_dists`, `consider_not_bonded`,
      `ring_atom_count_cannot_decrease` and `smiles_with_h`.

  Returns:
    A TopologyMatches message. Its `bond_topology` list is empty when the
    molecule has a single atom, the geometry size does not match the atom
    count, the hydrogens cannot be attached, or no heavy-atom pair has a
    scored bond type.
  """
  start_bt = molecule.bond_topologies[0]

  matches = dataset_pb2.TopologyMatches()
  matches.starting_smiles = start_bt.smiles
  matches.molecule_id = molecule.molecule_id
  matches.fate = molecule.properties.errors.fate

  natoms = len(start_bt.atoms)
  if (natoms == 1 or
      len(molecule.optimized_geometry.atom_positions) != natoms):
    return matches  # Nothing to match: returned without any topologies.

  distances = utilities.distances(molecule.optimized_geometry)

  # Attach every hydrogen to its nearest heavy atom; the bond length data is
  # only supplied when hydrogen distances are to be checked.
  hydrogen_bond_lengths = (
      bond_lengths if matching_parameters.check_hydrogen_dists else None)
  minimal_bond_topology = hydrogen_to_nearest_atom(start_bt, distances,
                                                   hydrogen_bond_lengths)
  if minimal_bond_topology is None:
    return matches

  heavy_atom_indices = [
      idx for idx, atom_type in enumerate(start_bt.atoms)
      if atom_type != dataset_pb2.BondTopology.AtomType.ATOM_H
  ]

  # Key: pair of heavy atom indices. Value: array of scores indexed by bond
  # type, which relies on the bond type keys being usable as indices 0..3.
  bonds_to_scores: Dict[Tuple[int, int], np.ndarray] = {}
  for pair in itertools.combinations(heavy_atom_indices, 2):
    first, second = pair
    separation = distances[first, second]
    if separation > THRESHOLD:
      continue
    try:
      possible_bonds = bond_lengths.probability_of_bond_types(
          start_bt.atoms[first], start_bt.atoms[second], separation)
    except KeyError:  # No data for this pair of atom types.
      continue
    if not possible_bonds:
      continue
    scores = np.zeros(4, np.float32)
    for bond_type, probability in possible_bonds.items():
      scores[bond_type] = probability
    bonds_to_scores[pair] = scores

  if not bonds_to_scores:
    return matches

  initial_ring_atom_count = utilities.ring_atom_count_mol(
      smu_utils_lib.bond_topology_to_rdkit_molecule(start_bt))

  topo_mol = topology_molecule.TopologyMolecule(minimal_bond_topology,
                                                bonds_to_scores,
                                                matching_parameters)

  for state in itertools.product(*topo_mol.generate_search_state()):
    bt = topo_mol.place_bonds(list(state), matching_parameters)
    if not bt:
      continue

    rdkit_mol = smu_utils_lib.bond_topology_to_rdkit_molecule(bt)
    if (matching_parameters.consider_not_bonded and
        len(Chem.GetMolFrags(rdkit_mol)) > 1):
      continue

    utilities.canonicalize_bond_topology(bt)

    if matching_parameters.ring_atom_count_cannot_decrease:
      ring_atoms = utilities.ring_atom_count_mol(rdkit_mol)
      if ring_atoms < initial_ring_atom_count:
        continue
      bt.ring_atom_count = ring_atoms

    bt.smiles = smu_utils_lib.compute_smiles_for_rdkit_molecule(
        rdkit_mol, include_hs=matching_parameters.smiles_with_h)
    bt.geometry_score = geometry_score(bt, distances, bond_lengths)
    matches.bond_topology.append(bt)

  if len(matches.bond_topology) > 1:
    matches.bond_topology.sort(key=lambda bt: bt.score, reverse=True)

  # Replace each raw score by the log of its fraction of the total.
  score_sum = np.sum([bt.score for bt in matches.bond_topology])
  for bt in matches.bond_topology:
    bt.topology_score = np.log(bt.score / score_sum)
    bt.ClearField("score")

  return matches
Example #26
0
 def SMILESFromGraph(node_list, adjacency_matrix):
     """Return the SMILES string of the molecule built from a graph.

     Args:
         node_list: node description forwarded to ``MolFromGraphs``.
         adjacency_matrix: adjacency information forwarded to
             ``MolFromGraphs``.

     Returns:
         The result of ``Chem.MolToSmiles`` on the molecule produced by
         ``MolFromGraphs(node_list, adjacency_matrix)``.
     """
     # Use the function's own arguments; the previous body referred to the
     # names ``nodes`` and ``a``, which are not parameters of this function.
     return Chem.MolToSmiles(MolFromGraphs(node_list, adjacency_matrix))
Example #27
0
# Collect molecules from KNApSAcK KCF files whose number exceeds `num`.
# NOTE(review): `hanni`, `i`, `z`, `counter` and `kcfco` are not defined in
# this fragment; presumably they are set up earlier in the script -- confirm.
num = 16546
mol_list = []
nCnumber = []
for p, k in enumerate(hanni[1:]):
    # Everything after the first character of the entry is read as an integer.
    k2 = int(k[1:])
    if k2 > num:
        # The file suffix is the 1-based position of the entry within hanni[1:].
        k3 = str(p+1)
        with open("../../../database/knapsack-kcf/KNApSAck" + k3 + ".kcf")as f2:
            # Records in the file are separated by "///" lines.
            Clist = f2.read().split("///\n")
            try:
                for C in Clist:
                    # Look for the record whose second whitespace-separated token equals i.
                    if i == C.split()[1]:
                        molblock = kcfco.kcf_to_molblock(C)
                        # print("OK", i)
                        # print(molblock[1])
                        mol = Chem.MolFromMolBlock(molblock[1])
                        if mol is None:
                            # Parsing failed: report it, count it and stop searching this file.
                            print("None", i, z, k3)
                            if "#+" in C or "#-" in C:
                                print("Charge in\n")
                            counter += 1
                            break
                        # rdDepictor.Compute2DCoords(mol)
                        mol_list.append(mol)
                        nCnumber.append(i)
                        if "#+" in C or "#-" in C:
                            print(i, z, k3, "Charge in\n")
                        break
            # An IndexError here aborts the scan of the current file and is counted as a failure.
            except IndexError:
                counter += 1
                print("DAME", i, z)
 def _reward(self):
   """Return the penalized logP of the current state, or 0.0 if it does not parse.

   ``self._state`` is parsed with ``Chem.MolFromSmiles``; when that yields
   ``None`` the reward is 0.0, otherwise it is
   ``molecules.penalized_logp`` of the parsed molecule.
   """
   molecule = Chem.MolFromSmiles(self._state)
   return 0.0 if molecule is None else molecules.penalized_logp(molecule)
Example #29
0
def process_DB(DB):
    """Extract IDs, SMILES and labels from the MUV CSV and dump them to disk.

    Reads ``data/MUV/muv.csv`` (the first row is skipped as a header), keeps
    the rows whose SMILES (column 18) is non-empty and parses with RDKit, and
    writes the collected ID list, SMILES list, label list and ID->SMILES
    dict to ``data/<DB>/<DB>_*.data`` (pickle) and ``data/<DB>/<DB>_*.tsv``.
    A short summary is printed at the end.

    Args:
        DB: ``'MUV'`` to build multi-label targets from the first 17 columns
            via ``get_multi_label``; or a name containing ``'MUV'`` whose
            second ``'_'``-separated token is the integer index of the single
            label column to use -- rows with an empty value in that column
            are dropped. For any other value no rows are collected.

    Every file is opened in a ``with`` block so that its handle is closed
    (and its content flushed) as soon as it has been read or written.
    """
    list_ID, list_SMILES, list_y, dict_id2smile = [], [], [], {}
    if DB == 'MUV':
        n_None = [0 for _ in range(17)]
    out_prefix = 'data/' + DB + '/' + DB

    with open('data/MUV/muv.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        for row_number, row in enumerate(reader):
            if row_number == 0:
                continue  # header row
            smile = row[18]
            m = Chem.MolFromSmiles(smile)
            if m is None or smile == '':
                continue
            if DB == 'MUV':
                list_ID.append(row[17])
                list_SMILES.append(row[18])
                y_temp, n_None = get_multi_label(row[:17], n_None)
                list_y.append(y_temp)
                dict_id2smile[row[17]] = row[18]
            elif 'MUV' in DB:
                label = row[int(DB.split('_')[1])]
                if label != '':
                    list_ID.append(row[17])
                    list_SMILES.append(row[18])
                    dict_id2smile[row[17]] = row[18]
                    list_y.append(int(label))

    pickled_outputs = (
        ('_dict_ID2SMILES.data', dict_id2smile),
        ('_list_SMILES.data', list_SMILES),
        ('_list_y.data', list_y),
        ('_list_ID.data', list_ID),
    )
    for suffix, payload in pickled_outputs:
        with open(out_prefix + suffix, 'wb') as f:
            pickle.dump(payload, f)

    with open(out_prefix + '_dict_ID2SMILES.tsv', 'w') as f:
        for cle, valeur in dict_id2smile.items():
            f.write(cle + '\t' + valeur + '\n')
    with open(out_prefix + '_list_SMILES.tsv', 'w') as f:
        for s in list_SMILES:
            f.write(s + '\n')
    with open(out_prefix + '_list_y.tsv', 'w') as f:
        for s in list_y:
            if isinstance(s, list):
                # Multi-label row: tab-terminated values followed by a newline.
                f.write(''.join(str(ll) + '\t' for ll in s))
                f.write('\n')
            else:
                f.write(str(s) + '\n')
    with open(out_prefix + '_list_ID.tsv', 'w') as f:
        for s in list_ID:
            f.write(s + '\n')

    print(len(list_SMILES))
    if DB == 'MUV':
        print([len(list_SMILES) - n for n in n_None])
    elif 'MUV' in DB:
        print(collections.Counter(list_y))
Example #30
0
import pandas as pd
from rdkit import Chem

# Load the tab-separated table, print what RDKit makes of each SMILES, then
# drop the row whose 'index' column is 2708 and write the table back in place.
df = pd.read_csv("drug_class_test.txt", sep="\t")

for index, smile in df['Canonical_Smiles'].items():
    # index 2708 produces none
    print(index, Chem.MolFromSmiles(smile))

# The dropped row is:
# 2708	antiinfective/1169	antiinfective	F[As-](F)(F)(F)(F)F.c1ccc([I+]c2ccccc2)cc1	6
keep_mask = df['index'] != 2708
df = df[keep_mask]

df.to_csv("drug_class_test.txt", sep="\t")