Example #1
0
    def grad(self, mol):
        """ Calculate the pseudo gradient with respect to the atoms.

        The pseudo gradient is the number of times the atom set that particular
        bit.  Every atom environment reported by RDKit for a bit (a centre
        atom and a radius) adds one to each atom whose distance-matrix entry
        to the centre atom is at most that radius.

        Args:
            mol (skchem.Mol):
                The molecule for which to calculate the pseudo gradient.

        Returns:
            pandas.DataFrame:
                Dataframe of pseudogradients, with columns corresponding to
                atoms, and rows corresponding to features of the fingerprint.
                Values are integers; when `as_bits` is set they are 0 or 1.
        """

        cols = pd.Index(list(range(len(mol.atoms))), name='atom_idx')
        dist = GetDistanceMatrix(mol)

        # Filled in by RDKit: maps each set bit to (centre atom, radius) pairs.
        info = {}

        # A non-positive `n_feats` selects the unfolded fingerprint, the same
        # convention `_transform_mol` uses (`n_feats <= 0`).  Folding to zero
        # bits is meaningless, so 0 must not reach the hashed branch.
        if self.n_feats <= 0:

            res = GetMorganFingerprint(mol,
                                       self.radius,
                                       useFeatures=self.use_features,
                                       useBondTypes=self.use_bond_types,
                                       useChirality=self.use_chirality,
                                       bitInfo=info).GetNonzeroElements()
            idx_list = list(res.keys())
            idx = pd.Index(idx_list, name='features')
            # Row lookup table: avoids a linear `list.index` scan per bit.
            row_of = {bit: row for row, bit in enumerate(idx_list)}
            grad = np.zeros((len(idx), len(cols)))
            for bit, environments in info.items():
                row = row_of[bit]
                for atom_idx, radius in environments:
                    # Only the centre atom's row of the distance matrix is
                    # needed, so compare that row alone.
                    grad[row] += dist[atom_idx] <= radius

        else:

            # Called only to populate `info`; the fingerprint is not needed.
            GetHashedMorganFingerprint(mol,
                                       self.radius,
                                       nBits=self.n_feats,
                                       useFeatures=self.use_features,
                                       useBondTypes=self.use_bond_types,
                                       useChirality=self.use_chirality,
                                       bitInfo=info)

            idx = pd.Index(range(self.n_feats), name='features')
            grad = np.zeros((len(idx), len(cols)))

            # Folded bits are already valid row positions.
            for bit, environments in info.items():
                for atom_idx, radius in environments:
                    grad[bit] += dist[atom_idx] <= radius

        grad = pd.DataFrame(grad, index=idx, columns=cols)

        if self.as_bits:
            grad = (grad > 0)

        return grad.astype(int)
Example #2
0
def get_sum_stats(smi_list, smiles):
    """Summarise Tanimoto similarity between one molecule and a list of others.

    Radius-2 Morgan fingerprints are computed for `smiles` and for every entry
    of `smi_list`; each entry is then compared with `smiles`.

    Args:
        smi_list: Iterable of SMILES strings to compare against `smiles`.
        smiles: SMILES string of the reference molecule.

    Returns:
        tuple: ``(count, minimum, maximum, mean)`` of the similarities.
    """
    reference_fp = GetMorganFingerprint(Chem.MolFromSmiles(smiles), 2)
    candidate_fps = [
        GetMorganFingerprint(Chem.MolFromSmiles(candidate), 2)
        for candidate in smi_list
    ]
    similarities = [
        DataStructs.TanimotoSimilarity(candidate_fp, reference_fp)
        for candidate_fp in candidate_fps
    ]
    count = len(similarities)
    lowest = min(similarities)
    highest = max(similarities)
    mean = sum(similarities) / float(count)
    return count, lowest, highest, mean
Example #3
0
    def _transform_mol(self, mol):
        """Compute the Morgan fingerprint of a single skchem molecule.

        This is the per-molecule worker; `transform` is the public entry
        point that accepts iterables of molecules.

        Three representations are produced depending on the configuration:
        a folded bit vector (``as_bits`` with a positive ``n_feats``), an
        unfolded sparse dict (non-positive ``n_feats``), or a folded count
        vector (positive ``n_feats`` without ``as_bits``).

        Args:
            mol (skchem.Mol): Molecule to calculate fingerprint for.

        Returns:
            np.array or dict:
                Fingerprint as an array (or a dict if sparse).
        """

        # Options shared by every RDKit Morgan call below.
        shared = dict(useFeatures=self.use_features,
                      useBondTypes=self.use_bond_types,
                      useChirality=self.use_chirality)

        if self.as_bits and self.n_feats > 0:
            bit_vect = GetMorganFingerprintAsBitVect(mol,
                                                     self.radius,
                                                     nBits=self.n_feats,
                                                     **shared)
            # ConvertToNumpyArray fills the array it is handed in place.
            bits = np.array(0)
            ConvertToNumpyArray(bit_vect, bits)
            return bits.astype(np.uint8)

        if self.n_feats <= 0:
            sparse = GetMorganFingerprint(mol, self.radius, **shared)
            counts = sparse.GetNonzeroElements()
            if self.as_bits:
                # Collapse counts to presence flags.
                return {key: int(count > 0) for key, count in counts.items()}
            return counts

        hashed = GetHashedMorganFingerprint(mol,
                                            self.radius,
                                            nBits=self.n_feats,
                                            **shared)
        return np.array(list(hashed))
Example #4
0
def GetUnfoldedCircularFragment(mol, minRadius=1, maxRadius=2,
                                maxFragment=True, disposed=True):
    """Collect the circular (Morgan) fragment information of a molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be calculated
    minRadius : int, optional
        The probable minimum radius of circular fragment, by default 1.
        Only used when ``disposed`` is true.
    maxRadius : int, optional
        The probable maximum radius of circular fragment, by default 2
    maxFragment : bool, optional
        Whether only return the maximum fragment at a center atom, by
        default True. Only used when ``disposed`` is true.
    disposed : bool, optional
        Whether dispose the original bitinfo, by default True

    Returns
    -------
    fragments
        The result of ``_DisposeCircularBitInfo`` when ``disposed`` is true,
        otherwise the raw ``bitInfo`` dict filled in by RDKit.
    """
    raw_bit_info = {}
    # Called only to populate `raw_bit_info`; the fingerprint is not used.
    GetMorganFingerprint(mol, radius=maxRadius, bitInfo=raw_bit_info)

    if not disposed:
        return raw_bit_info
    return _DisposeCircularBitInfo(raw_bit_info, minRadius, maxFragment)
Example #5
0
 def _morgan(self, molecules):
     if self.vector == 'int':
         from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
         self.fps_ = [
             GetMorganFingerprint(self._sanitary(mol), self.radius,
                                  **self.kwargs) for mol in molecules
         ]
         # get nonzero elements as a dictionary for each molecule
         dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps_]
         # pairScores = []
         # for fp in dict_nonzero:
         #     pairScores += list(fp)
         data = pd.DataFrame(
             dict_nonzero)  #, columns=list(set(pairScores)))
         data.fillna(0, inplace=True)
         return data
     elif self.vector == 'bit':
         from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
         self.fps_ = [
             GetMorganFingerprintAsBitVect(self._sanitary(mol),
                                           self.radius,
                                           nBits=self.n_bits,
                                           **self.kwargs)
             for mol in molecules
         ]
         data = np.array(self.fps_)
         data = pd.DataFrame(data)
         return data
Example #6
0
    def test__string_output_format(self) -> None:
        """The 'sparse_string' output must decode to the RDKit reference counts."""
        featurizer = CircularFPFeaturizer(output_format="sparse_string")

        encoded_fps = featurizer.fit_transform(self.smis)  # using SMILES

        # One encoded fingerprint per molecule
        self.assertEqual(self.n_mols, len(encoded_fps))

        # Each string must hold the reference count for every reference hash
        for idx, mol in enumerate(self.mols):
            reference = GetMorganFingerprint(
                mol,
                radius=featurizer.radius,
                useFeatures=featurizer.use_features_,
                useChirality=featurizer.use_chirality,
                useCounts=featurizer.use_counts_)

            # NOTE: eval is applied to the featurizer's own output only;
            # never reuse this pattern on untrusted strings.
            decoded = eval("{" + encoded_fps[idx] + "}")

            reference_counts = reference.GetNonzeroElements()
            for hash_key, count in reference_counts.items():
                self.assertEqual(decoded[hash_key], count)
Example #7
0
    def transform_mol(self,
                      molecule: Chem.rdchem.Mol) -> Tuple[np.ndarray, bool]:
        """Convert one molecule into its Morgan fingerprint array.

        Returns:
            The array produced by ``rdkit_sparse_array_to_np`` from the
            fingerprint's non-zero elements, paired with a flag that is
            always ``True``.
        """
        # Looked up through the instance dict so that an instance without
        # the attribute falls back to False instead of raising.
        chirality_flag = self.__dict__.get('use_chirality', False)

        sparse_fp = GetMorganFingerprint(
            molecule,
            radius=self.radius,
            useFeatures=self.use_features,
            useCounts=self.use_counts,
            useChirality=chirality_flag,
            **self.fingerprint_extra_args,
        )
        nonzero_items = sparse_fp.GetNonzeroElements().items()
        dense_fp = rdkit_sparse_array_to_np(nonzero_items,
                                            use_counts=self.use_counts,
                                            fp_size=self.fp_size)

        return dense_fp, True
Example #8
0
def diverse_mols_indexes(mol_list, n_pick, radius=4, seed=42):
    """Pick ``n_pick`` mutually dissimilar molecules with a MaxMin picker.

    Dissimilarity is ``1 - Dice similarity`` of the Morgan fingerprints
    computed at the given ``radius``.

    Returns:
        The indexes into ``mol_list`` returned by ``MaxMinPicker.LazyPick``.
    """
    fingerprints = [GetMorganFingerprint(mol, radius) for mol in mol_list]

    def dice_distance(first, second):
        similarity = DataStructs.DiceSimilarity(fingerprints[first],
                                                fingerprints[second])
        return 1 - similarity

    return MaxMinPicker().LazyPick(dice_distance, len(fingerprints), n_pick,
                                   seed=seed)
Example #9
0
    def test__hashed_counting_fingerprints__fcfp(self) -> None:
        """FCFP count matrices must match RDKit for SMILES and Mol inputs."""
        featurizer = CircularFPFeaturizer(fp_type="FCFP")

        from_smiles = featurizer.fit_transform(self.smis)  # using SMILES
        from_mols = featurizer.fit_transform(self.mols)  # using Mol objects
        matrices = (from_smiles, from_mols)

        # Output shape: one row per molecule, one column per hash value
        for matrix in matrices:
            self.assertEqual(matrix.shape[0], self.n_mols)
            self.assertEqual(matrix.shape[1], featurizer.max_hash_value_)

        # Every reference count must sit at its hash column
        for row, mol in enumerate(self.mols):
            reference = GetMorganFingerprint(
                mol,
                radius=featurizer.radius,
                useFeatures=featurizer.use_features_,
                useChirality=featurizer.use_chirality,
                useCounts=featurizer.use_counts_)
            for hash_key, count in reference.GetNonzeroElements().items():
                for matrix in matrices:
                    self.assertEqual(matrix[row, hash_key], count)
Example #10
0
    def test__hashed_binary_fingerprints__ecfp(self) -> None:
        """Binary ECFP matrices must set exactly the RDKit reference bits."""
        featurizer = CircularFPFeaturizer(fp_mode="binary")

        from_smiles = featurizer.fit_transform(self.smis)  # using SMILES
        from_mols = featurizer.fit_transform(self.mols)  # using Mol objects
        matrices = (from_smiles, from_mols)

        # Output shape: one row per molecule, one column per hash value
        for matrix in matrices:
            self.assertEqual(matrix.shape[0], self.n_mols)
            self.assertEqual(matrix.shape[1], featurizer.max_hash_value_)

        for row, mol in enumerate(self.mols):
            reference = GetMorganFingerprint(
                mol,
                radius=featurizer.radius,
                useFeatures=featurizer.use_features_,
                useChirality=featurizer.use_chirality,
                useCounts=featurizer.use_counts_)

            # Every reference hash is switched on
            for hash_key in reference.GetNonzeroElements():
                for matrix in matrices:
                    self.assertTrue(matrix[row, hash_key])

            # ... and nothing else: the row total equals the number of hashes
            for matrix in matrices:
                self.assertEqual(np.sum(matrix[row, :].data),
                                 len(reference.GetNonzeroElements()))
Example #11
0
def pick_subset(mols, num=5, radius=3, seed=-1):
    """
    Choose a diverse subset of molecules from their Morgan fingerprints.

    Molecules are compared by ``1 - Dice similarity`` and selected with
    RDKit's MaxMin picker.
    https://towardsdatascience.com/a-practical-introduction-to-the-use-of-molecular-fingerprints-in-drug-discovery-7f15021be2b1

    :param mols: an iterable of molecules
    :param num: number of molecules to pick
    :param radius: radius passed to the Morgan fingerprint
    :param seed: seed forwarded to the picker
    :return: list of integer locations of the subset of molecules
    """
    fingerprints = [GetMorganFingerprint(mol, radius) for mol in mols]

    def dice_distance(first, second):
        similarity = DataStructs.DiceSimilarity(fingerprints[first],
                                                fingerprints[second])
        return 1 - similarity

    picker = MaxMinPicker()
    picked = picker.LazyPick(dice_distance, len(fingerprints), num, seed=seed)
    return list(picked)
Example #12
0
def main(prior_name, name, max_samples, diversity_picker, oracle, w_min):
    """Draw samples from the search model and dump them for later stages.

    Two models are built from ``prior_name``; the second has the weights
    stored under ``results/<name>/weights.pth`` loaded into it.  Samples and
    their weights come from ``get_samples``, are optionally thinned with a
    MaxMin diversity picker, and are pickled into ``results/<name>/``:
    ``docker_samples.p`` for the docking step and ``samples.p`` (samples
    plus weights) for the trainer.

    Args:
        prior_name: Identifier passed to ``model_from_json`` for both models.
        name: Name of the results sub-directory.
        max_samples: Passed to ``get_samples`` as ``max``; also the pool
            size handed to the diversity picker.
        diversity_picker: Number of samples to keep; picking only happens
            when ``0 < diversity_picker < max_samples``.
        oracle: Oracle name. Currently has no effect, see the note below.
        w_min: Passed through to ``get_samples``.
    """
    prior_model = model_from_json(prior_name)

    # We start by creating another prior instance, then replace it with the
    # actual weights
    search_model = model_from_json(prior_name)
    results_dir = os.path.join(script_dir, 'results', name)
    search_model.load(os.path.join(results_dir, 'weights.pth'))

    samples, weights = get_samples(prior_model,
                                   search_model,
                                   max=max_samples,
                                   w_min=w_min)

    # if diversity picker < max_samples, we subsample with rdkit picker :
    if 0 < diversity_picker < max_samples:
        mols = [Chem.MolFromSmiles(s) for s in samples]
        fps = [GetMorganFingerprint(x, 3) for x in mols]
        picker = MaxMinPicker()

        def distij(i, j, fps=fps):
            return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

        # NOTE(review): the pool size is max_samples, not len(fps); this
        # assumes get_samples returned exactly max_samples items.
        pickIndices = picker.LazyPick(distij, max_samples, diversity_picker)
        idces = list(pickIndices)
        samples = [samples[i] for i in idces]
        weights = [weights[i] for i in idces]

    docker_path = os.path.join(results_dir, 'docker_samples.p')

    # Since we don't maintain a dict for qed, we just give everything to the
    # docker.
    # NOTE(review): `or True` makes this branch unconditional, so the
    # memoization branch below is currently unreachable.
    if oracle != 'docking' or True:
        with open(docker_path, 'wb') as handle:
            pickle.dump(samples, handle)

    else:
        # Memoization, we split the list into already docked ones and dump a
        # simili-docking csv
        whole_path = os.path.join(script_dir, '..', 'data',
                                  'drd3_scores.pickle')
        with open(whole_path, 'rb') as handle:
            docking_whole_results = pickle.load(handle)

        filtered_smiles = []
        already_smiles = []
        already_scores = []
        for smile in samples:
            if smile in docking_whole_results:
                already_smiles.append(smile)
                already_scores.append(docking_whole_results[smile])
            else:
                filtered_smiles.append(smile)

        # Dump simili-docking
        dump_path = os.path.join(results_dir, 'docking_small_results',
                                 'simili.csv')
        df = pd.DataFrame.from_dict({
            'smile': already_smiles,
            'score': already_scores
        })
        df.to_csv(dump_path)

        # Dump for the docker
        with open(docker_path, 'wb') as handle:
            pickle.dump(filtered_smiles, handle)

    # Dump for the trainer (written last in both branches)
    with open(os.path.join(results_dir, 'samples.p'), 'wb') as handle:
        pickle.dump((samples, weights), handle)
Example #13
0
            #s=Chem.MolToSmiles(m, kekuleSmiles=True)
            if s not in training_set:
                novel += 1

    print(novel / len(smiles_list))

    ## Diversity sampling

    from rdkit import Chem
    from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
    from rdkit import DataStructs
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

    ms = [Chem.MolFromSmiles(s) for s in smiles_list]
    start = time()
    fps = [GetMorganFingerprint(x, 3) for x in ms]
    nfps = len(fps)
    end = time()
    print(f'Time for {nfps} fingerprints: ', end - start)

    def distij(i, j, fps=fps):
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    start = time()
    pickIndices = picker.LazyPick(distij, nfps, 1000, seed=23)
    end = time()
    idces = list(pickIndices)
    print('Time for picker: ', end - start)

    m_selected = [ms[i] for i in idces]
Example #14
0
# Either rebuild the molecule and fingerprint caches from the structure table,
# or load precomputed fingerprints from 'fp.npz'.
if RECOMPUTE:
    radius = 5
    mol = {}
    fp = []
    fpNames = []
    cstr = getStructs(path.join('data', 'chem_prop.tsv'))
    for c in cstr:
        try:
            mol[c] = Chem.MolFromSmiles(cstr[c])
        except Exception:
            # Skip structures that cannot be parsed at all.
            continue
    # Pickle writes bytes, so the file must be opened in binary mode; the
    # context manager guarantees it is flushed and closed.
    with open(path.join('data', 'mnxMol.pk'), 'wb') as mol_file:
        pickle.dump(mol, mol_file)

    for c in mol:
        try:
            fp.append(GetMorganFingerprint(mol[c], radius))
        except Exception:
            # No fingerprint for this entry: leave it out of both lists so
            # fp and fpNames stay aligned.
            continue
        fpNames.append(c)
    # Both objects go into the same file, fingerprints first, then names.
    with open(path.join('data', 'mnxFp.pk'), 'wb') as f:
        pickle.dump(fp, f)
        pickle.dump(fpNames, f)

else:
    print('Reading fingerprints...')

    data = np.load('fp.npz', allow_pickle=True)

    fp = data['x']
    fpNames = data['y']
Example #15
0
def score_and_append_diversity_scores(molecules_list):
    """
    Append a diversity score to every molecule entry of a population.

    For each entry the SMILES (first element) is converted to an RDKit
    molecule, sanitized and deprotonated through the MOH helpers, and a
    Morgan fingerprint (radius 10, feature-based) is computed.

    Every fingerprint is then compared with the fingerprint of every OTHER
    entry using DataStructs.DiceSimilarity, and the similarities are summed.
        Based on the approach provided on
            http://www.rdkit.org/docs/GettingStartedInPython.html section:
            "Picking Diverse Molecules Using Fingerprints"
        -Identical molecules contribute 1.0 per comparison and completely
            different molecules contribute 0.0, so with N entries the score
            ranges from 0.0 to N - 1 (N - 1 when all ligands are identical).
        -The smaller the score, the more diverse the molecule is relative to
            the rest of the population.

    The score is appended, as a string, to each entry in place.

    None entries are reported and left out of the result; they no longer
    shift the pairing between entries and scores.

    Inputs:
    :param list molecules_list: list of all molecules in the populations with
    the respective info (each a list whose first element is the SMILES), or
    None

    Returns:
    :returns: list molecules_list: the non-None entries of the input, in
        order, each with its diversity score appended

    Raises:
        AssertionError: if a molecule fails to be created, sanitized or
            deprotonated.
    """

    # Non-None entries and their prepared RDKit molecules, index-aligned.
    valid_entries = []
    prepared_mols = []

    for pair in molecules_list:
        if pair is None:
            print(
                "noneitem in molecules_list in score_and_append_diversity_scores"
            )
            continue

        smile = pair[0]
        try:
            mol = Chem.MolFromSmiles(smile, sanitize=False)
        except Exception:
            mol = None

        if mol is None:
            raise AssertionError(
                "mol in list failed to sanitize. Issue in Ranking.py \
                                    def score_and_append_diversity_scores")

        mol = MOH.check_sanitization(mol)
        if mol is None:
            raise AssertionError(
                "mol in list failed to sanitize. Issue in Ranking.py \
                                        def score_and_append_diversity_scores")

        mol = MOH.try_deprotanation(mol)
        if mol is None:
            raise AssertionError(
                "mol in list failed to sanitize. Issue in Ranking.py \
                                        def score_and_append_diversity_scores")

        valid_entries.append(pair)
        prepared_mols.append(mol)

    fingerprints = [
        GetMorganFingerprint(mol, 10, useFeatures=True)
        for mol in prepared_mols
    ]

    # Compute every score before touching the entries, so a failure while
    # scoring leaves the input unmodified.
    scores = []
    for i, fingerprint in enumerate(fingerprints):
        diversity_score = 0
        for j, other_fingerprint in enumerate(fingerprints):
            if i != j:
                # if DiceSimilarity=1.0 its a perfect match, the smaller the
                # number the more diverse it is.
                diversity_score = diversity_score + DataStructs.DiceSimilarity(
                    fingerprint, other_fingerprint)
        scores.append(str(diversity_score))

    # Entries and scores are aligned by construction, so they can be paired
    # directly even when the input contained None items.
    for entry, score in zip(valid_entries, scores):
        entry.append(score)

    return valid_entries
Example #16
0
def morgan(mol, **kwargs):
    """Return the keys of the non-zero Morgan fingerprint elements of ``mol``.

    Keyword arguments are forwarded unchanged to ``GetMorganFingerprint``.
    """
    nonzero_elements = GetMorganFingerprint(mol, **kwargs).GetNonzeroElements()
    return [identifier for identifier in nonzero_elements]
Example #17
0
    def Fingerprint(self):
        """Compute the configured fingerprint for every molecule.

        The fingerprint family is selected by ``self.FPtype`` and the
        encoding by ``self.vector`` (``'int'`` for sparse integer vectors,
        ``'bit'`` for bit vectors).  The RDKit fingerprint objects are stored
        on ``self.fps`` and a table with one row per molecule is returned.

        Accepted ``FPtype`` values: ``'Hashed_atom_pair'``/``'HAP'``,
        ``'Atom_pair'``/``'AP'``, ``'MACCS'`` (bit only), ``'Morgan'``,
        ``'Hashed_topological_torsion'``/``'HTT'`` and
        ``'Topological_torsion'``/``'TT'`` (int only).

        The ``'AP'`` + ``'bit'`` combination also prints the column count of
        the dense table and of the returned table.

        Returns:
            pandas.DataFrame: One row per entry of ``self.molecules``.

        Raises:
            ValueError: If ``self.FPtype`` is not recognised, or if
                ``self.vector`` is not supported for the chosen type.
        """
        if self.FPtype == 'Hashed_atom_pair' or self.FPtype == 'HAP':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
                self.fps = [
                    GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
                data = pd.DataFrame(dict_nonzero, columns=range(self.nBits))
                data.fillna(0, inplace=True)
                return data
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
                self.fps = [
                    GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                data = np.array(self.fps)
                data = pd.DataFrame(data)
                return data
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Atom_pair' or self.FPtype == 'AP':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
                self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
                dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
                pairScores = []
                for fp in dict_nonzero:
                    pairScores += [key for key in fp]
                data = pd.DataFrame(dict_nonzero,
                                    columns=list(set(pairScores)))
                data.fillna(0, inplace=True)
                return data
            elif self.vector == 'bit':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
                self.fps = [
                    GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
                ]
                # Dense table, built only to report the full vector width.
                data = np.array(self.fps)
                data = pd.DataFrame(data)
                print(len(data.columns))

                # Keep only the bits that are set in at least one molecule.
                # The former `data.drop(i, 1)` loop discarded its result (and
                # fails on current pandas), so all-zero columns are left out
                # here by building the table from the non-zero bits instead.
                dict_nonzero = []
                for fp in self.fps:
                    dict_nonzero.append(
                        {i: el
                         for i, el in enumerate(fp) if el != 0})
                pairScores = []
                for fp in dict_nonzero:
                    pairScores += [key for key in fp]
                data = pd.DataFrame(dict_nonzero,
                                    columns=list(set(pairScores)))
                data.fillna(0, inplace=True)
                # Column count once all-zero bits are excluded.
                print(len(data.columns))
                return data
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'MACCS':
            if self.vector == 'int':
                msg = "There is no RDKit function to encode int vectors for MACCS keys"
                raise ValueError(msg)
            elif self.vector == 'bit':
                from rdkit.Chem.MACCSkeys import GenMACCSKeys
                self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
                data = np.array(self.fps)
                data = pd.DataFrame(data)
                return data
            else:
                msg = "The vector argument can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Morgan':
            if self.vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
                self.fps = [
                    GetMorganFingerprint(mol, self.radius)
                    for mol in self.molecules
                ]
                dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
                pairScores = []
                for fp in dict_nonzero:
                    pairScores += list(fp)
                data = pd.DataFrame(dict_nonzero,
                                    columns=list(set(pairScores)))
                data.fillna(0, inplace=True)
                return data
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
                self.fps = [
                    GetMorganFingerprintAsBitVect(mol,
                                                  self.radius,
                                                  nBits=self.nBits)
                    for mol in self.molecules
                ]
                data = np.array(self.fps)
                data = pd.DataFrame(data)
                return data
            else:
                msg = "The argument vector can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Hashed_topological_torsion' or self.FPtype == 'HTT':
            if self.vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
                self.fps = [
                    GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
                data = pd.DataFrame(dict_nonzero, columns=range(self.nBits))
                data.fillna(0, inplace=True)
                return data
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
                self.fps = [
                    GetHashedTopologicalTorsionFingerprintAsBitVect(
                        m, nBits=self.nBits) for m in self.molecules
                ]
                data = np.array(self.fps)
                data = pd.DataFrame(data)
                return data
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Topological_torsion' or self.FPtype == 'TT':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
                self.fps = [
                    GetTopologicalTorsionFingerprintAsIntVect(mol)
                    for mol in self.molecules
                ]
                dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps]
                pairScores = []
                for fp in dict_nonzero:
                    pairScores += list(fp)
                data = pd.DataFrame(dict_nonzero,
                                    columns=list(set(pairScores)))
                data.fillna(0, inplace=True)
                return data
            elif self.vector == 'bit':
                msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
                raise ValueError(msg)
            else:
                msg = "The argument vector can only be 'int'"
                raise ValueError(msg)
        else:
            msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
            raise ValueError(msg)
 def execute(self):
     """
     Pick diverse molecules from the input SMILES file, one pool at a time.

     The input file is read in ``ZincPicker.num_iterations`` consecutive
     pools of ``ZincPicker.pool_size`` entries.  For every pool, Morgan
     fingerprints (radius 3) are stored on ``self.fingerprint_list``, a
     MaxMin picker selects ``ZincPicker.pick_size`` molecules using
     ``self.calculate_dice_similarity_distance``, and the picks are written
     to ``ZincPicker.output_file_path``.  Entries the supplier returns as
     None are skipped and counted.  Progress is printed throughout.
     """
     print()
     print("Loading input file with path: " + ZincPicker.input_file_path)
     zinc_for_sale_mol_supplier = Chem.SmilesMolSupplier(
         ZincPicker.input_file_path)
     num_none_mols = 0
     num_picked = 0
     print("Output file path: " + ZincPicker.output_file_path)
     writer = Chem.SmilesWriter(ZincPicker.output_file_path)
     print("Entering picking iterations...")
     print()
     try:
         for y in range(0, ZincPicker.num_iterations):
             lower_index = y * ZincPicker.pool_size
             upper_index = lower_index + ZincPicker.pool_size
             print("Number of iteration: ", y)
             print("Lower index: ", lower_index)
             print("Upper index: ", upper_index)
             print("Loading molecules now...")
             molecules = []
             for x in range(lower_index, upper_index):
                 mol = zinc_for_sale_mol_supplier[x]
                 if mol is None:
                     # Entry could not be read as a molecule; leave it out.
                     num_none_mols += 1
                     continue
                 molecules.append(mol)
             # radius 3
             print("Number of molecules loaded: ", len(molecules))
             print("Calculating fingerprints...")
             self.fingerprint_list = [
                 GetMorganFingerprint(x, 3) for x in molecules
             ]
             nfps = len(self.fingerprint_list)
             print("Number of fingerprints: ", nfps)
             print("Now min-max picking ", ZincPicker.pick_size,
                   " out of the finger print list...")
             picker = MaxMinPicker()
             pickIndices = picker.LazyPick(
                 self.calculate_dice_similarity_distance,
                 nfps,
                 ZincPicker.pick_size,
                 seed=23)
             print("Finished picking, writing to file...")
             for z in pickIndices:
                 writer.write(molecules[z])
                 num_picked += 1
             # clear memory held on the instance before the next pool
             self.fingerprint_list = []
             print("Finished this iteration, entering the next...")
             print()
     finally:
         # Flush and release the output file even if an iteration fails.
         writer.close()
     print("Execution successful.")
     print("Skipped ", num_none_mols, " unreadable molecules.")
     # Report the number actually written; unreadable entries shrink the
     # pool, not the number of picks.
     print("Picked ",
           num_picked,
           " out of ", ZincPicker.num_iterations * ZincPicker.pool_size,
           " molecules in ", ZincPicker.num_iterations,
           " iterations, while picking ", ZincPicker.pick_size,
           " in each iteration.")