Python PandasTools.AddMoleculeColumnToFrame Exemples, rdkit.Chem.PandasTools.AddMoleculeColumnToFrame Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : vs_smina.py Projet : dahuilangda/virtual_screening_smina_script

def loadDrugDatabase(filename):
    '''
    Load a compound database into a pandas DataFrame.

    The input file may be in SDF or CSV format (the extension check is
    case-insensitive). A CSV file must contain a ``smiles`` column, e.g.::

        smiles, id, name, ...
        CCC(CO)NC, 1, HD0001, ...

    The returned frame keeps the input SMILES in ``smiles_tmp``, stores the
    output of ``removeSalt`` in ``smiles`` and the molecules built from that
    column in ``mol``.

    Raises:
        Exception: if the file does not exist or is neither SDF nor CSV.
    '''
    if not os.path.exists(filename):
        raise Exception("数据库文件不存在!")

    extension = filename.split('.')[-1].lower()
    if extension not in ('csv', 'sdf'):
        raise Exception("数据库文件仅支持sdf和csv格式!")

    print('')
    print('Step 3')
    print('###############################')
    print('正在加在数据库!!!')

    if extension == 'sdf':
        records = []
        smiles = []
        for mol in Chem.SDMolSupplier(filename):
            # The supplier yields None for records it cannot parse; skip
            # them instead of failing on the attribute access below.
            if mol is None:
                continue
            records.append(mol.GetPropsAsDict())
            smiles.append(Chem.MolToSmiles(mol))
        # One row per molecule, one column per SD property. Building the
        # frame straight from the records also works for an empty file.
        df = pd.DataFrame(records)
        df['smiles_tmp'] = smiles
    else:
        df = pd.read_csv(filename)
        df['smiles_tmp'] = df['smiles']

    print('正在去除化合物中的盐离子!!!')
    df['smiles'] = [removeSalt(smi) for smi in df.smiles_tmp]
    print('正在产生结构文件!!!')
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')

    print('共加载%s个化合物!!!' %df.shape[0])
    print('###############################')
    return df

Exemple #2

0

Afficher le fichier

Fichier : dataset_curation.py Projet : phi-grib/Data_curation

    def add_mol_column_to_df(self, data: pd.DataFrame,
                             smiles_column: str) -> pd.DataFrame:
        """
            Builds an RDKit molecule column ('ROMol') from the SMILES column,
            removes the rows whose molecule could not be built and adds
            explicit hydrogens to the remaining molecules.

            The rows that were removed are handed to self.get_output_file
            (xlsx, 'Non_processed_molecules') when there are any.

            :param data: dataframe to be modified (it is modified in place)
            :param smiles_column: SMILES column in the dataframe to be processed

            :return data: the modified dataframe
        """

        PandasTools.AddMoleculeColumnToFrame(data, smiles_column)

        failed_rows = data[data['ROMol'].isna()]
        data.drop(failed_rows.index, axis=0, inplace=True)

        with_hydrogens = []
        for molecule in data['ROMol'].values.tolist():
            with_hydrogens.append(Chem.AddHs(molecule))
        data.loc[:, 'ROMol'] = with_hydrogens

        if not failed_rows.empty:
            self.get_output_file(outfile_type='xlsx',
                                 data=failed_rows,
                                 outfile_name='Non_processed_molecules')

        return data

Exemple #3

0

Afficher le fichier

def RDkitRead(in_file, removeHs=True, add_Hs=False):
    """Read molecules from an SDF or SMILES file into a DataFrame.

    The file type is decided by the presence of ``.sdf`` or ``.smi`` in the
    file name; the match is not anchored to the end, so names such as
    ``x.sdf.gz`` are accepted (this relies on ``file_handle`` to open them).

    Args:
        in_file: path of the input file.
        removeHs: forwarded to ``LoadSDF`` when reading an SDF file.
        add_Hs: when true, hydrogens are added to the molecules of an SDF
            file after the SMILES column has been generated.

    Returns:
        DataFrame with at least the columns ``ID``, ``smiles`` and ``mol``.

    Raises:
        ValueError: if the file name contains neither ``.sdf`` nor ``.smi``.
    """
    # The dots are escaped so that e.g. "mysmiles.sdf" is not also taken
    # for a SMILES file ("ysmi" used to match the pattern '.smi').
    if re.search(r'\.sdf', in_file):
        print(' # Reading SDF')
        df = rdpd.LoadSDF(file_handle(in_file),
                          removeHs=removeHs,
                          idName='ID',
                          molColName='mol')
        df['smiles'] = df.mol.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        if add_Hs:
            df['mol'] = df.mol.apply(Chem.AddHs)

    elif re.search(r'\.smi', in_file):
        print('# Reading SMI')
        # Peek at the first line to decide whether it is a header line.
        with file_handle(in_file) as fi:
            first_line = str(fi.readline())

        if re.search('smi', first_line, re.IGNORECASE):
            print('# Smiles input has Header #\n')
            df = pd.read_csv(in_file, sep=r'\s+', comment='#').dropna()
        else:
            print('# Smiles input has NO Header #\n')
            df = pd.read_csv(in_file, header=None, sep=r'\s+',
                             comment='#').dropna()
        df.columns = ['smiles', 'ID']

        rdpd.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='mol')
        df['smiles'] = df.mol.apply(Chem.MolToSmiles)

    else:
        # Previously this fell through to an UnboundLocalError on `df`.
        raise ValueError(
            'Unsupported input file (expected .sdf or .smi): {}'.format(
                in_file))

    print('## Number of MOL read from {}: {}\n'.format(in_file,
                                                       len(df.smiles)))
    return df

Exemple #4

0

Afficher le fichier

def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file and optionally attach a molecule column.

    Args:
        urlpath: Path to a file or a file-like object, handed to
            `pd.read_csv()` unchanged.
        smiles_column: Column holding the SMILES. When it is not None, a mol
            column is built from it. Avoid when loading a very large file.
        mol_column: Name given to the mol column that is built.
        kwargs: Extra arguments forwarded to `pd.read_csv()`.

    Returns:
        table: a `pandas.DataFrame`
    """

    table: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return table

    PandasTools.AddMoleculeColumnToFrame(table, smiles_column, mol_column)
    return table

Exemple #5

0

Afficher le fichier

Fichier : ligand.py Projet : volkamerlab/teachopencadd

    def __call__(self):
        """Return ``self.dataframe`` keyed by the molecule built from ``self.smiles``.

        A one-row frame is used to let PandasTools build the molecule; that
        molecule becomes the outer index level ("Structure") of the result.
        """
        single_row = pd.DataFrame(columns=["smiles"])
        single_row.loc[1] = self.smiles
        PandasTools.AddMoleculeColumnToFrame(single_row, smilesCol="smiles")

        keyed = {single_row.loc[1, "ROMol"]: self.dataframe}
        return pd.concat(keyed, names=["Structure"])

Exemple #6

0

Afficher le fichier

def regression_gc():
    """Predict properties for the molecule entered in the form and show them.

    Reads the name and SMILES from the module-level ``t_name`` / ``t_smiles``
    variables, writes them to ``tmp.csv`` in the working directory so the
    DeepChem ``CSVLoader`` can featurize them, runs the three module-level
    models and stores ``10 ** prediction`` (rounded to 3 decimals) in the
    matching output variables. Finally the ``sascorer`` score of the molecule
    is stored in ``t_sasc`` and printed.
    """
    smiles = t_smiles.get()
    df = pd.DataFrame({
        'name': [t_name.get()],
        'smiles': [smiles],
        'solubility': [0.00]
    })
    # The loader reads from disk, so the single row goes through a file.
    df.to_csv('tmp.csv')

    graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()

    loader_p = dc.data.data_loader.CSVLoader(tasks=['solubility'],
                                             smiles_field="smiles",
                                             id_field="name",
                                             featurizer=graph_featurizer)
    predictset = loader_p.featurize('tmp.csv')

    # Same post-processing for every model: un-log and round.
    # NOTE(review): 10** suggests the models predict log10 values - confirm.
    outputs = (
        (model_sol, t_sol),
        (model_lip, t_lip),
        (model_GWP100, t_GWP100),
    )
    for model, target in outputs:
        prediction = model.predict(predictset)
        target.set(round(10**prediction[0][0], 3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')

    sa_score = df.ROMol.map(sascorer.calculateScore)

    t_sasc.set(round(sa_score[0], 2))

    print(sa_score[0])

Exemple #7

0

Afficher le fichier

Fichier : hmdb_structure_parser.py Projet : DinosaurInSpace/colocalization-plot

def pandas_structure(active_df):
    """Add 'Smiles' and 'Molecule' columns derived from the 'inchi' column.

    Each row's 'inchi' value is converted with ``inchi_smiles``; the
    molecule objects are then built from the resulting 'Smiles' column.
    The frame is modified in place and also returned.
    """

    def _row_to_smiles(row):
        return inchi_smiles(row['inchi'])

    converted = active_df.apply(_row_to_smiles, axis=1)
    active_df['Smiles'] = converted
    PandasTools.AddMoleculeColumnToFrame(active_df, 'Smiles', 'Molecule')
    return active_df

Exemple #8

0

Afficher le fichier

Fichier : cluster.py Projet : lenselinkbart/APCluster

def parse_text_file(file):
    """
    Read a csv file and return its rows with a "Molecule" column
    built from the "SMILES" column.
    """
    table = pd.read_csv(file)
    PandasTools.AddMoleculeColumnToFrame(table,
                                         smilesCol="SMILES",
                                         molCol="Molecule")
    return table

Exemple #9

0

Afficher le fichier

def painthis(smidf, prop):
    """Plot a histogram of an RDKit descriptor over the molecules in *smidf*.

    Args:
        smidf: DataFrame with a "smiles" column. A 'pr' column holding the
            descriptor values is added to it; the temporary "ROMol" column is
            removed again.
        prop: name of a function in ``rdkit.Chem.Descriptors``, e.g. "MolWt".

    Raises:
        AttributeError: if ``Descriptors`` has no attribute named *prop*.
    """
    from rdkit.Chem import Descriptors

    # Series.map needs the callable itself; the previous code passed the
    # string 'Descriptors.<prop>', which is not a valid mapper.
    descriptor = getattr(Descriptors, prop)

    pt.AddMoleculeColumnToFrame(smidf, "smiles")
    smidf['pr'] = smidf['ROMol'].map(descriptor)
    del smidf["ROMol"]
    ax = smidf['pr'].hist(bins=50)
    ax.set_xlabel(prop)

Exemple #10

0

Afficher le fichier

Fichier : curate.py Projet : erik-overdahl/chemical_curation

def get_dataframe_from_file(filename, mol_col="ROMol", smiles_col="SMILES"):
    """Determine file type from filename extension and produce a Pandas dataframe
    accordingly.

    Supported filename extensions: .sdf, .csv, .tsv, .smi

    If the file is an SDF, the structures are loaded into the MOL_COL
    column. Otherwise, the SMILES strings in SMILES_COL are used to build the
    structures (a failed conversion yields None for that row).

    Parameters
    ----------

    filename: filepath string
        The name of the file from which to read the data; can be an absolute
        or relative path.

    mol_col: string containing column name
        For an SDF formatted file, the name of the column that receives the
        structures

    smiles_col: string containing column name
        In a non-SDF formatted file, the name of the column containing the
        SMILES strings

    Returns
    -------

    df: Pandas dataframe
        A Pandas dataframe containing the data from FILENAME.

    Raises
    ------

    ValueError
        If FILENAME has no extension or an unsupported one.

    """
    logging.info(f'Reading {filename}')

    file_ext = pathlib.Path(filename).suffix

    if file_ext == ".sdf":
        # Every other LoadSDF parameter is left at its default.
        return PandasTools.LoadSDF(filename, molColName=mol_col)

    if file_ext in (".csv", ".tsv", ".smi"):
        # Only .tsv is tab separated; .csv and .smi are read comma separated.
        sep = "\t" if file_ext == ".tsv" else ","
        df = pandas.read_csv(filename, sep=sep)
        # Generate structures from SMILES
        # Yields /None/ if conversion fails
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)
        return df

    # Fail explicitly; previously these cases hit an unbound `df` at return.
    if file_ext == '':
        raise ValueError(
            f'Cannot determine file type of {filename!r}: no extension')
    raise ValueError(
        f'File type {file_ext!r} is not supported '
        '(expected .sdf, .csv, .tsv or .smi)')

Exemple #11

0

Afficher le fichier

    def process_ligands(self, ligands):
        """Encode ligands according to ``self.drug_format``.

        * ``"labeled_smiles"``: every ligand is encoded with ``label_smiles``.
          *ligands* is either an ``OrderedDict`` (iterated by key) or an
          object with ``.shape`` that is indexed by position.
        * ``"mol2vec"``: SMILES are turned into molecules, then into mol2vec
          sentences and finally into vectors with the word2vec model stored
          at ``self.mol2vec_model_path``. SMILES that cannot be parsed are
          dropped, so the result can be shorter than *ligands*.

        Any other format returns an empty list.
        """
        XD = []

        if self.drug_format == "labeled_smiles":
            if isinstance(ligands, OrderedDict):
                keys = ligands.keys()
            else:
                keys = range(ligands.shape[0])

            XD = [
                label_smiles(ligands[key], self.SMILEN, self.charsmiset)
                for key in keys
            ]

        elif self.drug_format == "mol2vec":
            # Imported lazily so these packages are only needed for mol2vec.
            from gensim.models import word2vec
            from mol2vec.features import (MolSentence, mol2alt_sentence,
                                          sentences2vec)
            from rdkit.Chem import PandasTools

            word2vec_model = word2vec.Word2Vec.load(self.mol2vec_model_path)
            df_ligands = pd.DataFrame({"smiles": ligands})

            PandasTools.AddMoleculeColumnToFrame(df_ligands, "smiles", "ROMol")
            # Copy the filtered rows so the new column is written to an
            # independent frame rather than to a slice of df_ligands.
            dtc_train = df_ligands[df_ligands["ROMol"].notnull()].copy()
            dtc_train["mol-sentence"] = dtc_train["ROMol"].apply(
                lambda mol: MolSentence(
                    mol2alt_sentence(mol, self.mol2vec_radius)))
            XD = sentences2vec(dtc_train["mol-sentence"],
                               word2vec_model,
                               unseen="UNK")

        return XD

Exemple #12

0

Afficher le fichier

def get_molecules():
    """Load './data/smiles.csv' and add a 'molecule' column built from 'smiles'."""
    molecules = pd.read_csv('./data/smiles.csv')
    PandasTools.AddMoleculeColumnToFrame(molecules,
                                         smilesCol='smiles',
                                         molCol='molecule',
                                         includeFingerprints=False)
    return molecules

Exemple #13

0

Afficher le fichier

def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load an excel file and optionally attach a molecule column.

    Args:
        urlpath: Path to a file or a file-like object, handed to
            `pd.read_excel()` unchanged.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Column holding the SMILES. When it is not None, a mol
            column is built from it. Avoid when loading a very large file.
        mol_column: Name given to the mol column that is built.
        kwargs: Extra arguments forwarded to `pd.read_excel()`.

    Returns:
        table: a `pandas.DataFrame`
    """

    table = pd.read_excel(urlpath, sheet_name=sheet_name,
                          **kwargs)  # type: ignore

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return table

    PandasTools.AddMoleculeColumnToFrame(table, smiles_column, mol_column)
    return table

Exemple #14

0

Afficher le fichier

def grid_image(
        df,
        filename,
        molobj=True,
        smi='smiles'):
    """
    Draw the molecules of a dataframe as a grid and save it as a PNG file.

    The molecules are read from the column "Molecule"; each one is labelled
    with its 1-based position in the frame. The image is written to
    ``filename + '.png'`` (an existing file of that name is overwritten).
    Nothing is returned.
    _____________________________
    Keyword Arguments:
    molobj=True, the "Molecule" column already exists in df.
    smi='smiles', column used to build the "Molecule" column when
                  molobj=False.
    """

    # Build the molecule column first when the caller did not supply one.
    if not molobj:
        PandasTools.AddMoleculeColumnToFrame(df,
                                             smi,
                                             'Molecule',
                                             includeFingerprints=True)

    molecule_count = len(df['Molecule'])
    labels = [str(position) for position in range(1, molecule_count + 1)]

    grid = PandasTools.FrameToGridImage(df,
                                        column='Molecule',
                                        molsPerRow=3,
                                        subImgSize=(800, 400),
                                        legends=labels)
    target_path = filename + '.png'
    grid.save(target_path)

Exemple #15

0

Afficher le fichier

Fichier : data_loaders.py Projet : vorsilam/MI_ADM

def fetch_learning_data(datasets,
                        datasets_cols=(),
                        bioacitivities_cols=('value', ),
                        compute_descriptors=False,
                        create_rdkit_mols=False,
                        col_names_map=(),
                        duplicates_handler=None):
    """Fetch bioactivity data for the given datasets as a DataFrame.

    Args:
        datasets: unique ids of the datasets to fetch.
        datasets_cols: columns to select from the datasets table.
        bioacitivities_cols: columns to select from the bioactivities table.
        compute_descriptors: falsy for none; a container of RDKit descriptor
            names to compute only those; any other truthy value (e.g. True)
            to compute every descriptor in ``Descriptors.descList``.
        create_rdkit_mols: when true, add an 'rdmol' molecule column.
        col_names_map: mapping passed to ``DataFrame.rename(columns=...)``.
        duplicates_handler: optional callable ``(smiles, values) -> result``
            invoked for every SMILES that occurs more than once. All rows of
            that SMILES are removed; unless the result is a bool or compares
            equal to False, one row holding the SMILES and the result is
            added back (its other columns are empty).

    Returns:
        The resulting DataFrame. When duplicates were merged the index is
        renumbered.
    """
    DB_CONNECTION, TB_COMPOUNDS, TB_DATASETS, TB_BIOACTIVITIES = db.fetch_all()

    session = sessionmaker(bind=DB_CONNECTION)()
    try:
        cols = _gather_columns(TB_BIOACTIVITIES, bioacitivities_cols)
        cols.extend(_gather_columns(TB_DATASETS, datasets_cols))
        cols.append(TB_COMPOUNDS.c.smiles)
        query = session.query(
            *cols
        ).join(TB_COMPOUNDS).join(TB_DATASETS)\
            .filter(
            TB_DATASETS.c.unique_id.in_(datasets)
        )

        # make the DB query and export the data to pandas DataFrame object
        data = pandas.read_sql_query(query.selectable, DB_CONNECTION)
    finally:
        # The session is only needed to build the query.
        session.close()

    smiles_col_name = settings.COMPOUNDS_TABLE + '_smiles'
    ic50_col_name = settings.BIOACTIVITIES_TABLE + '_value'

    # remove duplicate values
    if duplicates_handler:
        duplicates = set(
            data[smiles_col_name][data[smiles_col_name].duplicated()])
        merged_rows = []
        for smiles in duplicates:
            duplicate_ic50s = data[data[smiles_col_name] ==
                                   smiles][ic50_col_name]
            ret = duplicates_handler(smiles, duplicate_ic50s)
            data = data[data[smiles_col_name] != smiles]
            if not isinstance(ret, bool) and ret != False:
                merged_rows.append({
                    smiles_col_name: smiles,
                    ic50_col_name: ret
                })
        if merged_rows:
            # DataFrame.update aligns on the index and never adds rows, so
            # the merged values must be appended as new rows instead.
            data = pandas.concat(
                [data, pandas.DataFrame(merged_rows)], ignore_index=True)

    if compute_descriptors:
        desc_list = Descriptors.descList
        try:
            desc_list = [x for x in desc_list if x[0] in compute_descriptors]
        except TypeError:
            # Not a container (e.g. True): keep the full descriptor list.
            pass
        # Parse every SMILES once and reuse the molecules for all descriptors.
        mols = [MolFromSmiles(smiles) for smiles in data[smiles_col_name]]
        for desc_name, function in desc_list:
            data[desc_name] = [function(mol) for mol in mols]

    if create_rdkit_mols:
        PandasTools.AddMoleculeColumnToFrame(data, smiles_col_name, 'rdmol')

    if col_names_map:
        data.rename(columns=col_names_map, inplace=True)

    return data

Exemple #16

0

Afficher le fichier

Fichier : QueryHandler.py Projet : SohanCSERU/PhytoChem

def query_to_df(queryset):
    """Turn a queryset into a DataFrame (without 'id') plus a 'ROMol' column.

    The molecule column is built from the 'Smiles' column with
    fingerprints enabled.
    """
    records = list(queryset.values())
    compounds_df = pd.DataFrame(records)
    compounds_df = compounds_df.drop('id', axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         smilesCol='Smiles',
                                         molCol='ROMol',
                                         includeFingerprints=True)
    return compounds_df

Exemple #17

0

Afficher le fichier

Fichier : illuminate.py Projet : jeriek/Argenomic

 def initial_population(self) -> None:
     """Seed the archive with molecules sampled from ``self.data_file``.

     ``self.initial_size`` molecules are sampled from the CSV's 'smiles'
     column, passed through ``self.unique_molecules`` and ``self.arbiter``,
     processed, and the results are added to ``self.archive``.
     """
     seed_frame = pd.read_csv(self.data_file)
     pdtl.AddMoleculeColumnToFrame(seed_frame, 'smiles', 'molecule')
     sampled = seed_frame['molecule'].sample(n=self.initial_size)
     candidates = self.arbiter(self.unique_molecules(sampled.tolist()))
     processed = self.process_molecules(candidates)
     molecules, descriptors, fitnesses = processed
     self.archive.add_to_archive(molecules, descriptors, fitnesses)

Exemple #18

0

Afficher le fichier

Fichier : scaffold_folding.py Projet : melloddy/MELLODDY-TUNER

    def sn_scaff_smiles(self, murcko_smiles):
        """Extract the preferred scaffold from the scaffold network of a Murcko scaffold.

        The network nodes are described by a set of ring descriptors and
        sorted by ``self.priority_cols`` / ``self.priority_asc``; the first
        node after sorting is returned.

        Args:
            murcko_smiles(str): valid smiles string of a Murcko scaffold

        Returns:
            str: smiles string of the preferred scaffold, or None when
            murcko_smiles is None

        Raises:
            ValueError: if murcko_smiles cannot be read by rdkit or the
            scaffold network cannot be calculated

        """

        if murcko_smiles is None:
            return None

        mol = Chem.MolFromSmiles(murcko_smiles)
        if mol is None:
            raise ValueError("murcko_smiles {} cannot be read by rdkit".format(
                murcko_smiles))

        # if the murcko scaffold has less or equal than the targeted number of rings, then the Murcko scaffold is already the sn_scaffold,
        # so no further decomposition is needed
        if Chem.rdMolDescriptors.CalcNumRings(mol) <= self.nrings_target:
            return murcko_smiles

        # otherwise start decomposition
        try:
            sn = rdScaffoldNetwork.CreateScaffoldNetwork([mol],
                                                         self.snparams)
        except Exception as err:
            # Catch Exception rather than everything so that interrupts
            # propagate, and keep the original error as the cause.
            raise ValueError(
                "failed to calculate scaffold network for {}".format(
                    murcko_smiles)) from err

        # create data frame with node smiles
        node_df = pd.DataFrame({"node_smiles": [str(n) for n in sn.nodes]})
        PandasTools.AddMoleculeColumnToFrame(node_df,
                                             "node_smiles",
                                             "mol",
                                             includeFingerprints=False)
        node_mols = node_df["mol"]
        node_df["num_rings"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumRings)
        node_df["num_rings_delta"] = (node_df["num_rings"] -
                                      self.nrings_target).abs()
        node_df["num_rbonds"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumRotatableBonds, strict=False)
        node_df["num_hrings"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumHeterocycles)
        node_df["num_arings"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumAromaticRings)
        node_df["num_bridge"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumBridgeheadAtoms)
        node_df["num_spiro"] = node_mols.apply(
            Chem.rdMolDescriptors.CalcNumSpiroAtoms)
        node_df["has_macrocyle"] = node_mols.apply(self.has_macrocycle)
        node_df["has_unusual_ring_size"] = node_mols.apply(
            self.has_unusual_ringsize)
        node_df.sort_values(self.priority_cols,
                            ascending=self.priority_asc,
                            inplace=True)
        return node_df.iloc[0]["node_smiles"]

Exemple #19

0

Afficher le fichier

Fichier : ace_workflow.py Projet : r-cloke/deepchem_ace

def make_input():
    """Combine the active and decoy SMILES files into ``dude_ace.csv``.

    Reads ``actives_final.ism`` (columns SMILES, ID, ChEMBL_ID) and
    ``decoys_final.ism`` (columns SMILES, ID) from the working directory;
    both are space separated and have no header line. Actives are flagged
    with ``is_active`` = 1 and decoys with 0, and the columns SMILES, ID and
    is_active of all rows (actives first) are written to ``dude_ace.csv``.
    """
    active_df = pd.read_csv("actives_final.ism", header=None, sep=" ")
    active_df.columns = ["SMILES", "ID", "ChEMBL_ID"]
    active_df["is_active"] = 1

    decoy_df = pd.read_csv("decoys_final.ism", header=None, sep=" ")
    decoy_df.columns = ["SMILES", "ID"]
    decoy_df["is_active"] = 0

    # The label and molecule columns built here before never reached the
    # output file, so they are no longer created.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    combined_df = pd.concat([active_df, decoy_df])[[
        "SMILES", "ID", "is_active"
    ]]

    combined_df.to_csv("dude_ace.csv", index=False)

Exemple #20

0

Afficher le fichier

 def test_AddMoleculeColumnToFrame(self):
     """The 'ROMol' column can be dropped and rebuilt from the SMILES column."""
     sdf_stream = getStreamIO(methane + peroxide)
     frame = PandasTools.LoadSDF(sdf_stream,
                                 isomericSmiles=True,
                                 smilesName='Smiles')
     PandasTools.ChangeMoleculeRendering(frame=frame, renderer='String')

     # Remove the molecule column and check that it is really gone ...
     del frame['ROMol']
     self.assertNotIn('ROMol', str(frame))

     # ... then rebuild it with the default column names.
     PandasTools.AddMoleculeColumnToFrame(frame, includeFingerprints=False)
     self.assertIn('ROMol', str(frame))

Exemple #21

0

Afficher le fichier

Fichier : rdkit_easy.py Projet : CBIIT/NCI-DOE-Collab-ATOM-Modeling-Pipeline-AMPL

def add_mol_column(df, smiles_col, molecule_col='mol'):
    """
    Build RDKit Mol objects from the SMILES strings in column 'smiles_col'
    and store them in column 'molecule_col' of data frame 'df'
    (fingerprints enabled). The frame is modified in place and returned.
    """
    PandasTools.AddMoleculeColumnToFrame(df,
                                         smilesCol=smiles_col,
                                         molCol=molecule_col,
                                         includeFingerprints=True)
    return df

Exemple #22

0

Afficher le fichier

Fichier : QueryHandler.py Projet : SohanCSERU/PhytoChem

def update_sdf():
    """Export every Compound record to 'media/all_data.sdf'.

    The bookkeeping columns 'id', 'created_at' and 'updated_at' are left
    out, a 'ROMol' column is built from 'Smiles', and the 'media' directory
    is created when it does not exist yet.
    """
    bookkeeping_columns = ['id', 'created_at', 'updated_at']
    records = list(Compound.objects.all().values())
    compounds_df = pd.DataFrame(records).drop(bookkeeping_columns, axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         smilesCol='Smiles',
                                         molCol='ROMol',
                                         includeFingerprints=True)
    media_dir = 'media'
    if not os.path.exists(media_dir):
        os.makedirs(media_dir)
    df_to_sdf(compounds_df, 'media/all_data.sdf')

Exemple #23

0

Afficher le fichier

Fichier : save_to_db.py Projet : ahadsheikh/PhytoChem

def update_sdf():
    """Export every Compound record to 'media/all_data.sdf'.

    The bookkeeping columns 'id', 'created_at' and 'updated_at' are left
    out, a 'ROMol' column is built from 'Smiles', and all remaining columns
    are written as SD properties with 'PID' as the record id. When there are
    no compounds, an empty file is written.
    """
    compounds_df = pd.DataFrame(list(Compound.objects.all().values()))

    if not os.path.exists('media'):
        os.makedirs('media')

    with open('media/all_data.sdf', 'w') as fi:
        # `.empty` is the emptiness test; the previous `.isnull` was an
        # uncalled method (always truthy), so the drop below never ran.
        if compounds_df.empty:
            # An empty frame has no 'Smiles' column to build molecules
            # from; an SDF without records is simply an empty file.
            return
        compounds_df = compounds_df.drop(['id', 'created_at', 'updated_at'], axis=1)
        PandasTools.AddMoleculeColumnToFrame(compounds_df, 'Smiles', 'ROMol', includeFingerprints=True)
        PandasTools.WriteSDF(compounds_df, fi, molColName='ROMol', idName='PID',
                             properties=list(compounds_df.columns))

Exemple #24

0

Afficher le fichier

    def add_mol_to_frame(self):
        """
        Adds a molecule column ("ROMol"), built from the "smiles" column, to
        the :py:class:`pandas.Dataframe` stored in ``self._data``.

        :return: None
        """
        PandasTools.AddMoleculeColumnToFrame(
            self._data, smilesCol="smiles", molCol="ROMol", includeFingerprints=False
        )
        # NOTE(review): the result of this apply is discarded, so it does not
        # change self._data. It indexes every molecule with [0]; confirm
        # whether the result was meant to be assigned back or whether the
        # line can be removed.
        self._data["ROMol"].apply(lambda x: x[0])

Exemple #25

0

Afficher le fichier

def molgrid_image(smiles, file_name, labels=None, molPerRow=5):
    """Draw the given SMILES as a grid and save it as ``file_name + '.svg'``.

    Args:
        smiles: SMILES strings to draw.
        file_name: output path without the '.svg' extension.
        labels: legend of each molecule; defaults to the 0-based position.
        molPerRow: number of molecules per grid row.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    if labels is None:
        labels = ['{:d}'.format(i) for i in df.index]
    # Honour the caller's row width; it used to be hard-coded to 5.
    svg = Draw.MolsToGridImage(df['mol'],
                               molsPerRow=molPerRow,
                               legends=labels,
                               useSVG=True)
    save_svg(svg, file_name + '.svg', dpi=150)
    return

Exemple #26

0

Afficher le fichier

    def compute_unique_smiles(self,
                              interp_df,
                              embeddings,
                              embedding_funct,
                              scaled_radius=0.5):
        """
        Identify duplicate and invalid SMILES and distort their embeddings.

        The input df must have columns 'SMILES' and 'Generated' at 0th and
        1st position. The 'Generated' column must contain booleans that
        classify SMILES into input SMILES (False) and generated SMILES (True).

        This function does not make any assumptions about order of embeddings.
        Instead it simply orders the df by SMILES to identify the duplicates.

        Both passes are attempted at most 5 times. `embeddings` is modified
        in place; the returned frame is `interp_df` without the temporary
        'ROMol' column.
        """
        # Jitter distance derived from the scaled radius.
        distance = self._compute_radius(scaled_radius)

        for i in range(5):
            # After sorting, equal SMILES are adjacent; collect the index
            # labels of both members of every equal neighbouring pair.
            smiles = interp_df['SMILES'].sort_values()
            duplicates = set()
            for idx in range(0, smiles.shape[0] - 1):
                if smiles.iat[idx] == smiles.iat[idx + 1]:
                    duplicates.add(smiles.index[idx])
                    duplicates.add(smiles.index[idx + 1])

            if len(duplicates) > 0:
                for dup_idx in duplicates:
                    # NOTE(review): dup_idx is an index label but is used
                    # positionally here and to index embeddings; this assumes
                    # a default 0..n-1 index - confirm.
                    if interp_df.iat[dup_idx, 1]:
                        # add jitter to generated molecules only
                        embeddings[dup_idx] = self.addjitter(
                            embeddings[dup_idx], distance, 1)
                # NOTE(review): the regenerated SMILES are only bound to the
                # local name and never written back to interp_df, so the next
                # pass inspects the same 'SMILES' column - confirm intent.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Ensure all generated molecules are valid.
        for i in range(5):
            PandasTools.AddMoleculeColumnToFrame(interp_df,'SMILES')
            # Rows whose SMILES could not be turned into a molecule.
            invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

            if not invalid_mol_df.empty:
                invalid_index = invalid_mol_df.index.to_list()
                for idx in invalid_index:
                    embeddings[idx] = self.addjitter(embeddings[idx],
                                                        distance,
                                                        cnt=1)
                # NOTE(review): as above, the result is not stored in
                # interp_df - confirm intent.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Cleanup
        if 'ROMol' in interp_df.columns:
            interp_df = interp_df.drop('ROMol', axis=1)

        return interp_df

Exemple #27

0

Afficher le fichier

def get_all(self):
    """Show the structure and predicted properties of ``molecule[0]``.

    Clears the output variables, stores the isomeric SMILES of the
    module-level ``molecule[0]`` in ``t2``, draws the structure to
    ``tmp.png`` and shows it on the canvas, then runs the two module-level
    models on the molecule (via ``tmp.csv``) and stores ``10 ** prediction``
    (rounded to 3 decimals) in ``t_sol`` / ``t_lip``. Finally the
    ``sascorer`` score is stored in ``t_sasc`` and printed.
    """
    t2.set('')
    t_sol.set('')
    t_lip.set('')
    t_sasc.set('')

    mol_isomeric_smiles = molecule[0].isomeric_smiles

    print('molecule')

    print(molecule[0])
    print('isomeric_smile',  mol_isomeric_smiles)
    t2.set(mol_isomeric_smiles)

    mol_ = Chem.MolFromSmiles(mol_isomeric_smiles)

    Draw.MolToFile(mol_, 'tmp.png')

    # The image object is stored in a module-level name.
    # NOTE(review): presumably so Tk keeps a live reference - confirm.
    global image_
    image_open = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(image_open, master=frame1)

    canvas.create_image(150,75, image=image_)

    df = pd.DataFrame({'name': [t1.get()], 'smiles' : [t2.get()], 'solubility': [0.00]})
    # The loader reads from disk, so the single row goes through a file.
    df.to_csv('tmp.csv')

    graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()

    loader_p = dc.data.data_loader.CSVLoader( tasks = ['solubility'], smiles_field = "smiles", id_field = "name", featurizer = graph_featurizer )
    predictset = loader_p.featurize( 'tmp.csv' )

    # Same post-processing for every model: un-log and round.
    # NOTE(review): 10** suggests the models predict log10 values - confirm.
    for model, target in ((model_sol, t_sol), (model_lip, t_lip)):
        prediction = model.predict(predictset)
        target.set(round(10**prediction[0][0],3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')

    sa_score = df.ROMol.map(sascorer.calculateScore)

    t_sasc.set(round(sa_score[0],2))

    print(sa_score[0])

Exemple #28

0

Afficher le fichier

def get_most_common_fragments(fragments, top_x=50):
    """
    Return the most frequent fragments, ties at the cut-off included.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment details; the `smiles` column is used for counting.
    top_x : int
        Number of most common fragments to select.

    Returns
    -------
    pandas.DataFrame
        Fragments sorted by descending count with the columns `smiles`,
        `fragment_count` and `ROMol`; the index is named `molecule_id`.
        All fragments are returned when there are fewer than `top_x`.
    """

    # Occurrences per SMILES, turned into a frame with a molecule column
    counts = fragments.smiles.value_counts()
    counts.name = "fragment_count"
    ranked = counts.reset_index()
    ranked = ranked.rename(columns={"index": "smiles"})
    PandasTools.AddMoleculeColumnToFrame(ranked, "smiles")

    # Most frequent first, renumbered from zero
    ranked.sort_values("fragment_count", ascending=False, inplace=True)
    ranked.reset_index(inplace=True, drop=True)
    ranked.index.name = "molecule_id"

    # Fewer fragments than requested: everything is selected
    if ranked.shape[0] < top_x:
        return ranked

    # Count of the last fragment inside the top X; every fragment with at
    # least that count is kept, so ties with it are not cut off
    cutoff_count = ranked.iloc[top_x - 1].fragment_count
    at_least_cutoff = ranked.fragment_count >= cutoff_count
    return ranked[at_least_cutoff]

Exemple #29

0

Afficher le fichier

def readProjectData(filename, FP, smilesCol):
    """Read the project data from a CSV file and calculate fingerprints.

    Args:
        filename: path of the CSV file.
        FP: fingerprint type; only 'Morgan2' is implemented.
        smilesCol: name of the column holding the SMILES.

    Returns:
        DataFrame of the rows whose SMILES could be parsed, with the extra
        columns 'Molecule' and 'FP'; None (after printing a message) when
        *FP* is not implemented.
    """
    df_proj = pd.read_csv(filename)
    PandasTools.AddMoleculeColumnToFrame(df_proj, smilesCol=smilesCol, molCol='Molecule')
    # Keep only parsable molecules. The copy makes the filtered frame
    # independent, so adding the 'FP' column below does not write to a slice.
    df_proj = df_proj.loc[df_proj['Molecule'].notnull()].copy()
    if FP == 'Morgan2':
        df_proj['FP'] = df_proj.Molecule.map(lambda x: AllChem.GetMorganFingerprint(x, 2))
    else:
        print(FP, ' fingerprint not implemented.')
        return None
    return df_proj

Exemple #30

0

Afficher le fichier

    def compile_filters(self):
        """Apply every filter in ``self.command`` and keep the compounds passing all.

        Each ``limit_*`` helper is called with ``self.df`` and its bounds
        from ``self.command``; the results are intersected and stored in
        ``self.filtered_inchi``. The matching rows of ``self.df`` (matched
        on 'InChI Keys') are stored in ``self.filtered_df`` together with a
        'Molecule picture' column built from 'smiles'.

        Returns:
            The result of ``self.frame_manage()``.
        """
        RS_inchi = self.limit_RS(self.df, self.command['RS_min'],
                                 self.command['RS_max'])
        MW_inchi = self.limit_MW(self.df, self.command['MW_min'],
                                 self.command['MW_max'])
        nRing_inchi = self.limit_nRing(self.df, self.command['nRing_min'],
                                       self.command['nRing_max'])
        Lipinski_inchi = self.limit_Lipinski(self.df, self.command['Lipinski'])
        nG12Ring_inchi = self.limit_nG12Ring(self.df,
                                             self.command['nG12Ring_min'],
                                             self.command['nG12Ring_max'])
        SlogP_inchi = self.limit_SlogP(self.df, self.command['SlogP_min'],
                                       self.command['SlogP_max'])
        # The upper bound was previously 'nSugars_min' as well.
        # NOTE(review): assumes self.command has an 'nSugars_max' key like
        # the other filters - confirm.
        Sugars_inchi = self.limit_nSugars(self.df, self.command['nSugars_min'],
                                          self.command['nSugars_max'])
        nFRing_inchi = self.limit_nFusedRing(self.df,
                                             self.command['nFRing_min'],
                                             self.command['nFRing_max'])
        core_ester_inchi = self.limit_core_ester(
            self.df, self.command['core_ester_min'],
            self.command['core_ester_max'])
        naRing_inchi = self.limit_naRing(self.df, self.command['naRing_min'],
                                         self.command['naRing_max'])
        activity_reported_inchi = self.limit_activity_reported(
            self.df, self.command['activity_reported'])

        sets = [
            RS_inchi, MW_inchi, nRing_inchi, Lipinski_inchi, nG12Ring_inchi,
            SlogP_inchi, Sugars_inchi, nFRing_inchi, core_ester_inchi,
            naRing_inchi, activity_reported_inchi
        ]
        self.filtered_inchi = list(set.intersection(*sets))
        # Copy the selection so the molecule column is added to an
        # independent frame rather than to a slice of self.df.
        self.filtered_df = self.df.loc[self.df['InChI Keys'].isin(
            self.filtered_inchi)].copy()
        PandasTools.AddMoleculeColumnToFrame(self.filtered_df, 'smiles',
                                             'Molecule picture')

        smiles_frame = self.frame_manage()

        return smiles_frame