Python PandasTools Examples, rdkit.Chem.PandasTools Python Examples

Example #1

0

Show file

 def test_all_numeric_with_numeric_columns(self):
     """Check that WriteSDF with allNumeric=True emits the numeric ``len`` column."""
     frame = self.df
     # Numeric column derived from the length of each ID string
     frame["len"] = frame["ID"].map(len)
     buffer = StringIO()
     PandasTools.WriteSDF(frame, buffer, allNumeric=True)
     written = buffer.getvalue()
     self.assertEqual(written.count("<len>"), 2)
     for expected_value in ("7\n\n", "8\n\n"):
         self.assertIn(expected_value, written)

Example #2

0

Show file

File: ace_workflow.py Project: r-cloke/deepchem_ace

def make_input():
    """Build ``dude_ace.csv`` from the active and decoy SMILES files.

    Reads ``actives_final.ism`` and ``decoys_final.ism`` (space separated, no
    header), labels the rows, adds a ``MOL`` molecule column to each frame and
    writes the combined ``SMILES`` / ``ID`` / ``is_active`` table to
    ``dude_ace.csv`` in the current working directory.
    """
    active_df = pd.read_csv("actives_final.ism", header=None, sep=" ")
    active_df.columns = ["SMILES", "ID", "ChEMBL_ID"]
    active_df["label"] = "Active"
    PandasTools.AddMoleculeColumnToFrame(active_df, "SMILES", "MOL")

    decoy_df = pd.read_csv("decoys_final.ism", header=None, sep=" ")
    decoy_df.columns = ["SMILES", "ID"]
    decoy_df["label"] = "Decoy"
    PandasTools.AddMoleculeColumnToFrame(decoy_df, "SMILES", "MOL")

    active_df["is_active"] = 1
    decoy_df["is_active"] = 0

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way (original indices kept, columns unioned).
    combined_df = pd.concat([active_df, decoy_df])[["SMILES", "ID", "is_active"]]

    combined_df.to_csv("dude_ace.csv", index=False)

Example #3

0

Show file

def get_mols_from_files(filenames, targets, verbose=True):
    """
    Read each file into its own Pandas dataframe. File type is based on the file
    extension. Currently supported filetypes are .sdf, .smi, .csv, and .tsv.

    For each file, extract the mols, stats, and the molecules that require review.

    Bring cleaned mols from all files into one list, /all_mols/, and all mols
    requiring review into one dict, /all_for_review/.

    :param filenames: iterable of input file paths.
    :param targets: activity field names passed to /get_activities/.
    :param verbose: currently unused; kept for interface compatibility.
    :return: tuple /(all_mols, all_for_review)/.
    :raises ValueError: if a file has an unsupported extension.
    """

    all_for_review = {}
    all_mols = []

    for filename in filenames:
        logging.info(filename)

        # Determine the type of the filename by the extension
        file_ext = pathlib.Path(filename).suffix
        ## Mol_field should probably be a passable argument, defaulting to "mol"?
        mol_field = "mol"

        # Read file depending on file extension
        if file_ext == ".sdf":
            df = PandasTools.LoadSDF(filename, molColName=mol_field)
        elif file_ext in (".csv", ".tsv", ".smi"):
            sep = "\t" if file_ext == ".tsv" else ","
            if file_ext == ".smi":
                mol_field = "smiles"
            df = pandas.read_csv(filename, sep=sep)
        else:
            # Fail loudly: falling through would leave /df/ unbound on the
            # first file, or silently reuse the previous file's dataframe.
            raise ValueError(
                f"Unsupported file extension '{file_ext}' for file '{filename}'; "
                "supported extensions are .sdf, .smi, .csv and .tsv")

        # Stats is never used?
        mols, stats, for_review = get_activities(df,
                                                 original_filename=filename,
                                                 activity_fields=targets,
                                                 mol_field=mol_field)

        # Report the number of mols with activity for each target
        for target in targets:
            # We iterate over all the mols A LOT. Can that be reduced at all?
            hits = sum(1 for mol in mols if mol.has_activity(target))
            logging.info(f"{filename} {target} hits: {hits}")

        # Add the mols from the file to the list of all mols
        all_mols.extend(mols)
        # Add the mols that require review from this file to the dict of all mols requiring review
        extend_dict(all_for_review, for_review)

    # Return the list of /all_mols/ that have at least one valid activity and the mols that need to be reviewed
    return all_mols, all_for_review

Example #4

0

Show file

 def test_all_numeric_with_no_numeric_columns(self):
     """Check that allNumeric=True writes no property tags for the unmodified frame."""
     buffer = StringIO()
     PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
     written = buffer.getvalue()
     self.assertFalse(">" in written, written)
     # double-check that the numeric tests don't pass by accident
     for numeric_value in ("7\n\n", "8\n\n"):
         self.assertNotIn(numeric_value, written)

Example #5

0

Show file

 def test_specify_numeric_column(self):
     """Check that a numeric column named in ``properties`` is written to the SDF."""
     frame = self.df
     # Numeric column derived from the length of each ID string
     frame["len2"] = frame["ID"].map(len)
     buffer = StringIO()
     PandasTools.WriteSDF(frame, buffer, properties=["len2"])
     written = buffer.getvalue()
     self.assertEqual(written.count("<len2>"), 2)
     for expected_value in ("7\n\n", "8\n\n"):
         self.assertIn(expected_value, written)

Example #6

0

Show file

File: cal_dERMSD.py Project: zhenglz/deltaVinaXGB

def num_structure_change(confs, native):
    """Count conformations satisfying the energy requirements (for entropy).

    :param confs: SDF file of conformations; each record must carry an
        ``energy_abs`` property.
    :param native: threshold compared against ``energy_abs``.
    :return: tuple ``(num_1, num_2)`` where ``num_1`` is the number of
        conformations with ``energy_abs`` below the lowest value plus 1.0 and
        ``num_2`` is the number with ``energy_abs`` below ``native``.
    """
    df_confs = PandasTools.LoadSDF(confs)
    energies = df_confs["energy_abs"].astype(float)

    # min() does not need the data sorted first
    lowest = energies.min()

    # Count matches directly from the boolean masks
    num_1 = int((energies < lowest + 1.0).sum())
    num_2 = int((energies < native).sum())

    return num_1, num_2

Example #7

0

Show file

def load_valid_atom_or_bond_features(path: str,
                                     smiles: List[str]) -> List[np.ndarray]:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas dataframe with smiles as index and numpy array of descriptors as columns
    * :code:`.sdf` containing all mol blocks with descriptors as entries

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows of an
                   :code:`.sdf` file; not used for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension or descriptor layout is not supported,
                        or if an :code:`.sdf` file has no data for one of the
                        requested SMILES.
    """

    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # NpzFile keeps the archive open; close it once the arrays are read.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        features_df = pd.read_pickle(path)
        # The dimensionality of the first cell decides how each row is combined
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(
                lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(
                lambda x: np.concatenate(x.tolist(), axis=1), axis=1).tolist()
        else:
            raise ValueError(
                f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(
            ['ID', 'ROMol'], axis=1).set_index('SMILES')

        # keep only the first record for each SMILES
        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns: string cells holding a
        # comma-separated list in the first row
        features_df = features_df.iloc[:, features_df.iloc[
            0, :].apply(lambda x: isinstance(x, str) and ',' in x).to_list()]
        # select and order rows by the requested SMILES; missing ones become NaN
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError(
                'Invalid custom atomic descriptors file, Nan found in data')

        # parse each comma-separated cell into a float array
        features_df = features_df.applymap(lambda x: np.array(
            x.replace('\r', '').replace('\n', '').split(',')).astype(float))

        features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1),
                                     axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features

Example #8

0

Show file

def get_chembl(
        terms_to_keep,
        sdf_file='/project/projectdirs/openmsi/projects/compound_data/chembl/chembl_21.sdf.gz'):
    """Load a ChEMBL SDF into a DataFrame tagged with its source database.

    :param terms_to_keep: column names that must exist in the result; any
        that the SDF does not provide are added as empty-string columns.
    :param sdf_file: path of the SDF file to load. Defaults to the previously
        hard-coded ChEMBL 21 location, so existing callers are unaffected.
    :return: the loaded DataFrame with a ``source_database`` column set to
        ``'chembl'``.
    """
    df = PandasTools.LoadSDF(sdf_file)
    df['source_database'] = 'chembl'
    for term in terms_to_keep:
        if term not in df.columns:
            df[term] = ''
    return df

Example #9

0

Show file

def main():
    """Convert a CSV of SMILES into an SDF of hydrogenated, embedded molecules.

    Reads ``args.infile`` (needs ``rdkit_smiles`` and ``compound_id`` columns),
    builds a molecule per row, adds hydrogens, embeds a conformer, records the
    formal charge as ``i_user_TOTAL_CHARGE`` and writes everything to
    ``args.outfile``.
    """
    args = getArgs()
    print(args.infile, args.outfile)
    smiles_df = pd.read_csv(args.infile)
    # Copy the column subset so the new columns land on an independent frame
    pp = smiles_df[['rdkit_smiles', 'compound_id']].copy()

    PandasTools.AddMoleculeColumnToFrame(pp, 'rdkit_smiles', 'Molecule')

    # Snapshot the (index, molecule) pairs: the frame is updated in the loop
    for index, mol in list(pp['Molecule'].items()):
        mol = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol)
        # Store the processed molecule in the frame itself; assigning to the
        # row yielded by iterrows() only changed a temporary copy, so the
        # written SDF never received the hydrogens or the embedded conformer.
        pp.at[index, 'Molecule'] = mol
        pp.at[index, 'i_user_TOTAL_CHARGE'] = Chem.rdmolops.GetFormalCharge(
            mol)

    PandasTools.WriteSDF(pp,
                         args.outfile,
                         molColName='Molecule',
                         idName='compound_id',
                         properties=list(pp.columns))

Example #10

0

Show file

File: rdkit_easy.py Project: CBIIT/NCI-DOE-Collab-ATOM-Modeling-Pipeline-AMPL

def add_mol_column(df, smiles_col, molecule_col='mol'):
    """
    Populate column 'molecule_col' of data frame 'df' with RDKit Mol objects
    built from the SMILES strings in column 'smiles_col' (fingerprints are
    requested as well), then hand the same data frame back to the caller.
    """
    PandasTools.AddMoleculeColumnToFrame(
        df,
        smilesCol=smiles_col,
        molCol=molecule_col,
        includeFingerprints=True,
    )
    return df

Example #11

0

Show file

def test_force_Kekulize():
    """Display all molecules from the CSV in grids of five per row.

    Atoms returned by ``find_custom_Kekulize_set`` are relabelled with atomic
    number 32 before the molecules are drawn.
    """
    df = pd.read_csv('All_Moles_Tested_Data.csv')

    marked_mols = []
    for smiles_string in df['smiles']:
        mol = MolFromSmiles(smiles_string)
        flagged_atoms = find_custom_Kekulize_set(smiles_string,
                                                 max_atoms=60,
                                                 max_degree=5,
                                                 printMe=False)
        for atom_idx in flagged_atoms:
            mol.GetAtomWithIdx(atom_idx).SetAtomicNum(32)
        marked_mols.append(mol)

    df['mol'] = pd.DataFrame({'mol': marked_mols})

    # Walk the frame in chunks of `unit`; the slice is clamped at the end of
    # the frame, so a trailing partial chunk is drawn too.
    unit = 5
    for start in range(0, len(df), unit):
        chunk = df.iloc[start:start + unit]
        display(PandasTools.FrameToGridImage(chunk,
                                             column='mol',
                                             legendsCol='',
                                             molsPerRow=unit))

Example #12

0

Show file

File: pipeline.py Project: leelasdSI/rdkit_ipynb_tools

def stop_df_from_stream(stream, summary=None, comp_id="stop_df_from_stream"):
    """Consume the data stream and return its records as a Pandas DataFrame.
    The stream must already contain the molecules,
    e.g. produced by `pipe_mol_from_smiles`.

    Raises ImportError when pandas is not available."""

    if PANDAS:
        PT.RenderImagesInAllDataFrames(images=True)
        stream_dict = stop_dict_from_stream(stream,
                                            summary=summary,
                                            comp_id=comp_id)
        return pandas.DataFrame.from_dict(stream_dict)

    raise ImportError("pandas is not available.")

Example #13

0

Show file

def main():
    """Rank docked molecules from an SDF and export them as SDF and text.

    Command-line options fall back to defaults when not given: ID field
    ``'ID'``, score field ``'Chemgauss4'``, docking program ``'fred'`` and no
    limit on the number of molecules. Each molecule's ``_Name`` is rewritten
    to ``<name>::<rank>::<score>::<dock>`` before the outputs are written.
    """
    args = UserInput()
    name = args.name or 'ID'
    score = args.score or 'Chemgauss4'
    dock = args.dock or 'fred'
    # None keeps every molecule. The previous default of -1 made the slice
    # below drop the last molecule instead of selecting all of them.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))
    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for idx, row in df.iterrows():
        # Rank is shown 1-based in the molecule title
        df.at[idx, 'ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')

Example #14

0

Show file

File: QueryHandler.py Project: SohanCSERU/PhytoChem

def update_sdf():
    """Export every ``Compound`` record to ``media/all_data.sdf``.

    The bookkeeping columns ``id``, ``created_at`` and ``updated_at`` are
    dropped, an ``ROMol`` column is built from ``Smiles``, and the frame is
    passed to ``df_to_sdf``.
    """
    compounds_df = pd.DataFrame(list(Compound.objects.all().values())).drop(
        ['id', 'created_at', 'updated_at'], axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                         'Smiles',
                                         'ROMol',
                                         includeFingerprints=True)
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs('media', exist_ok=True)
    df_to_sdf(compounds_df, 'media/all_data.sdf')

Example #15

0

Show file

    def add_mol_to_frame(self):
        """
        Adds an ``ROMol`` molecule column, built from the ``smiles`` column,
        to ``self._data`` (presumably a :py:class:`pandas.DataFrame`).
        Fingerprints are not requested.

        :return: None
        """
        PandasTools.AddMoleculeColumnToFrame(
            self._data, smilesCol="smiles", molCol="ROMol", includeFingerprints=False
        )
        # NOTE(review): the Series returned by this apply is discarded, so the
        # statement does not modify self._data — confirm whether assigning the
        # result back was intended, or whether the line can be removed.
        self._data["ROMol"].apply(lambda x: x[0])

Example #16

0

Show file

File: UnitTestPandasTools.py Project: sb123456789sb/rdkit

 def test_write_to_sdf_gz(self):
     """Check that WriteSDF produces a readable gzip file for a ``.gz`` filename."""
     dirname = tempfile.mkdtemp()
     try:
         filename = os.path.join(dirname, "test.sdf.gz")
         PandasTools.WriteSDF(self.df, filename)
         # Read in text mode so the content is str, matching the str patterns
         # below; the context manager closes the handle before the directory
         # is removed.
         with gzip.open(filename, "rt") as handle:
             s = handle.read()
         self.assertEqual(s.count("\n$$$$\n"), 2)
         self.assertEqual(s.split("\n", 1)[0], "Methane")
     finally:
         shutil.rmtree(dirname)

Example #17

0

Show file

    def test_FrameToGridImage(self):
        """Smoke test: FrameToGridImage must not raise for any legend setting."""
        # Flip to True to have the generated images displayed while debugging
        interactive = False
        self.assertTrue(True)
        df = self.df

        # No legend column, a named data column, and the index name
        call_variants = (
            {},
            {'legendsCol': 'PUBCHEM_IUPAC_INCHIKEY'},
            {'legendsCol': df.index.name},
        )
        for extra_kwargs in call_variants:
            result = PandasTools.FrameToGridImage(df, **extra_kwargs)
            if interactive:
                result.show()

Example #18

0

Show file

File: sdf_NeverSeeFilter_separator.py Project: Tmacme/Structure-Based_docking

def main():
    """Split the input molecules by the ``NeverSee_Groups`` flag and write each set.

    Rows flagged ``'Y'`` go to ``args.nsee_file`` and rows flagged ``'N'`` go
    to ``args.pass_file``. Each output is written as a SMILES table when its
    filename contains ``.smi`` (case-insensitive) and as an SDF otherwise.
    """
    args = UserInput()

    df = RDkitRead(args.infile, args.id, removeHs=False, add_Hs=False)

    nsee_df = df[df['NeverSee_Groups'] == 'Y']
    pass_df = df[df['NeverSee_Groups'] == 'N']

    print('\033[34m Passed NeverSee Filter: \033[32m{0}\033[0m'.format(
        len(pass_df)))
    print('\033[34m Failed NeverSee Filter: \033[31m{0}\033[0m'.format(
        len(nsee_df)))

    _write_filter_subset(nsee_df, args.nsee_file)
    _write_filter_subset(pass_df, args.pass_file)
    print('')


def _write_filter_subset(subset_df, out_file):
    """Write one filtered subset as a SMILES table or as an SDF file."""
    # The dot is escaped so only a literal ".smi" selects SMILES output
    if re.search(r'\.smi', out_file, re.IGNORECASE):
        # Work on a copy and create the column with item assignment:
        # attribute assignment (df.smiles = ...) does not add a new column.
        subset_df = subset_df.copy()
        subset_df['smiles'] = subset_df.MOL.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        subset_df.to_csv(out_file,
                         columns=['smiles', 'ID'],
                         sep=' ',
                         header=False,
                         index=False)
    else:
        rdpd.WriteSDF(subset_df,
                      out_file,
                      molColName='MOL',
                      properties=list(subset_df.columns))

Example #19

0

Show file

File: cluster.py Project: lenselinkbart/APCluster

def parse_sd_file(file, tgz=False):
    """
    parse a sd file and return molecules

    :param file: path of the SD file to load.
    :param tgz: when true, ``file`` is opened with gzip before loading.
    :return: DataFrame from ``PandasTools.LoadSDF`` with the molecules in
        column ``Molecule`` and their SMILES in column ``smiles``.
    """
    if tgz:
        # Close the gzip handle once the frame has been loaded
        with gzip.open(file) as handle:
            return PandasTools.LoadSDF(handle,
                                       molColName='Molecule',
                                       smilesName='smiles')
    return PandasTools.LoadSDF(file,
                               molColName='Molecule',
                               smilesName='smiles')

Example #20

0

Show file

    def compute_unique_smiles(self,
                              interp_df,
                              embeddings,
                              embedding_funct,
                              scaled_radius=0.5):
        """
        Identify duplicate SMILES and distorts the embedding. The input df
        must have columns 'SMILES' and 'Generated' at 0th and 1st position.
        'Generated' column must contain boolean to classify SMILES into input
        SMILES(False) and generated SMILES(True).

        This function does not make any assumptions about order of embeddings.
        Instead it simply orders the df by SMILES to identify the duplicates.

        Entries of ``embeddings`` are replaced in place with jittered versions.
        Returns ``interp_df`` without the temporary 'ROMol' column.
        """
        # Jitter magnitude derived from the scaled radius
        distance = self._compute_radius(scaled_radius)

        # At most five de-duplication passes
        for i in range(5):
            # After sorting, equal SMILES sit next to each other
            smiles = interp_df['SMILES'].sort_values()
            duplicates = set()
            for idx in range(0, smiles.shape[0] - 1):
                if smiles.iat[idx] == smiles.iat[idx + 1]:
                    # Record the index labels of both members of the pair
                    duplicates.add(smiles.index[idx])
                    duplicates.add(smiles.index[idx + 1])

            if len(duplicates) > 0:
                for dup_idx in duplicates:
                    # NOTE(review): index labels are used as positions here
                    # (.iat and embeddings[...]) — assumes a default 0..n-1
                    # index on interp_df; confirm against callers.
                    if interp_df.iat[dup_idx, 1]:
                        # add jitter to generated molecules only
                        embeddings[dup_idx] = self.addjitter(
                            embeddings[dup_idx], distance, 1)
                # NOTE(review): this result is overwritten at the top of the
                # next pass and is not written back to interp_df in this
                # block — confirm whether interp_df['SMILES'] should be updated.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Ensure all generated molecules are valid.
        for i in range(5):
            # Adds/refreshes the 'ROMol' column; null entries are treated as invalid
            PandasTools.AddMoleculeColumnToFrame(interp_df,'SMILES')
            invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

            if not invalid_mol_df.empty:
                invalid_index = invalid_mol_df.index.to_list()
                for idx in invalid_index:
                    embeddings[idx] = self.addjitter(embeddings[idx],
                                                        distance,
                                                        cnt=1)
                # NOTE(review): as above, the regenerated value is not stored
                # in interp_df here — verify.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Cleanup
        if 'ROMol' in interp_df.columns:
            interp_df = interp_df.drop('ROMol', axis=1)

        return interp_df

Example #21

0

Show file

def get_all(self):
    """Refresh the GUI for the current molecule.

    Shows the isomeric SMILES of ``molecule[0]``, draws its structure on the
    canvas, runs the solubility and lipophilicity models on it (results are
    displayed as 10**prediction) and displays its synthetic-accessibility
    score.
    """
    # Keep the PhotoImage in a module-level name so it outlives this call
    global image_

    # Clear every output field before recomputing
    for field in (t2, t_sol, t_lip, t_sasc):
        field.set('')

    print('molecule')
    print(molecule[0])
    print('isomeric_smile',  molecule[0].isomeric_smiles)
    canonical = molecule[0].canonical_smiles
    isomeric = molecule[0].isomeric_smiles
    t2.set(isomeric)

    # Render the structure to a temporary PNG and place it on the canvas
    rd_mol = Chem.MolFromSmiles(isomeric)
    Draw.MolToFile(rd_mol, 'tmp.png')
    png_image = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(png_image, master=frame1)
    canvas.create_image(150,75, image=image_)

    # One-row table for the featurizer; solubility is a placeholder value
    shown_smiles = t2.get()
    df = pd.DataFrame({
        'name': [t1.get()],
        'smiles': [shown_smiles],
        'solubility': [0.00],
    })
    df.to_csv('tmp.csv')

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    csv_loader = dc.data.data_loader.CSVLoader(tasks=['solubility'],
                                               smiles_field="smiles",
                                               id_field="name",
                                               featurizer=featurizer)
    dataset = csv_loader.featurize('tmp.csv')

    sol_pred = model_sol.predict(dataset)
    t_sol.set(round(10**sol_pred[0][0],3))

    lip_pred = model_lip.predict(dataset)
    t_lip.set(round(10**lip_pred[0][0],3))

    # Synthetic-accessibility score computed from the RDKit molecule column
    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
    sa_score = df.ROMol.map(sascorer.calculateScore)
    t_sasc.set(round(sa_score[0],2))

    print(sa_score[0])

Example #22

0

Show file

def get_most_common_fragments(fragments, top_x=50):
    """
    Return the most frequently occurring fragments.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment details for one subpocket; the ``smiles`` column is used to
        count occurrences.
    top_x : int
        Number of top-ranked fragments to select. Fragments tied with the
        last selected one are included as well.

    Returns
    -------
    pandas.DataFrame
        Most common fragments sorted by descending count, with the fragments'
        SMILES, ROMol, and ``fragment_count``; the index is named
        ``molecule_id``.
    """

    # Occurrences per fragment SMILES, as a two-column frame
    counts = fragments.smiles.value_counts()
    counts.name = "fragment_count"
    table = counts.reset_index().rename(columns={"index": "smiles"})

    # Add the ROMol column built from the SMILES
    PandasTools.AddMoleculeColumnToFrame(table, "smiles")

    # Rank by count (descending) and renumber the rows as molecule IDs
    table = table.sort_values("fragment_count", ascending=False)
    table = table.reset_index(drop=True)
    table.index.name = "molecule_id"

    # Fewer fragments than requested: everything is "most common"
    if table.shape[0] < top_x:
        return table

    # Keep every fragment whose count reaches that of the top_x-th fragment,
    # so ties with the last selected fragment are not cut off
    cutoff_count = table["fragment_count"].iloc[top_x - 1]
    return table[table.fragment_count >= cutoff_count]

Example #23

0

Show file

    def __init__(self, data, output_name):
        """Build a downloadable SDF attachment from compound records.

        :param data: object whose ``values()`` yields the compound records;
            they must provide ``id``, ``Smiles`` and ``PID`` fields.
        :param output_name: base name (without extension) of the attachment.
        """
        output = StringIO()
        compounds_df = pd.DataFrame(list(data.values())).drop('id', axis=1)
        PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                             'Smiles',
                                             'ROMol',
                                             includeFingerprints=True)
        PandasTools.WriteSDF(compounds_df,
                             output,
                             molColName='ROMol',
                             idName='PID',
                             properties=list(compounds_df.columns))

        mimetype = 'text/plain'
        file_ext = 'sdf'
        super(SDFResponse, self).__init__(content=output.getvalue(),
                                          content_type=mimetype)
        # Escape double quotes so they cannot end the quoted filename early.
        # The former '\"' literal is just '"', which made the replace a no-op.
        safe_name = output_name.replace('"', r'\"')
        self['Content-Disposition'] = 'attachment;filename="%s.%s"' % \
                                      (safe_name, file_ext)

Example #24

0

Show file

def molgrid_image(smiles, file_name, labels=None, molPerRow=5):
    """Draw the given SMILES as an SVG grid and save it to ``<file_name>.svg``.

    :param smiles: SMILES strings of the molecules to draw.
    :param file_name: output path without the ``.svg`` extension.
    :param labels: legend for each molecule; defaults to the row numbers.
    :param molPerRow: number of molecules per grid row.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    if labels is None:
        labels = ['{:d}'.format(i) for i in df.index]
    # Honour the caller's row width; it used to be hard-coded to 5
    svg = Draw.MolsToGridImage(df['mol'],
                               molsPerRow=molPerRow,
                               legends=labels,
                               useSVG=True)
    save_svg(svg, file_name + '.svg', dpi=150)

Example #25

0

Show file

def readProjectData(filename, FP, smilesCol):
    """Load project data from a CSV file and compute a fingerprint column.

    Molecules are built from column ``smilesCol`` into column ``Molecule``;
    rows whose molecule is None are dropped. Only ``FP == 'Morgan2'`` is
    implemented: for any other value a message is printed and None is
    returned. Otherwise the frame with an added ``FP`` column is returned.
    """
    project = pd.read_csv(filename)
    PandasTools.AddMoleculeColumnToFrame(project,
                                         smilesCol=smilesCol,
                                         molCol='Molecule')

    # Keep only the rows for which a molecule object exists
    has_molecule = project['Molecule'].map(lambda mol: mol is not None)
    project = project.loc[has_molecule]

    if FP != 'Morgan2':
        print(FP, ' fingerprint not implemented.')
        return None

    project['FP'] = project.Molecule.map(
        lambda mol: AllChem.GetMorganFingerprint(mol, 2))
    return project

Example #26

0

Show file

File: main.py Project: zinph/MacrolactoneDB

def return_files_sdf():
    """Write the pickled frame to ``temp.sdf`` and send it as an attachment.

    If sending fails, the exception text is returned instead.
    """
    frame = pd.read_pickle('temp.pickle')
    property_names = list(frame.columns)
    PandasTools.WriteSDF(frame,
                         'temp.sdf',
                         molColName='structures',
                         properties=property_names,
                         allNumeric=False)
    try:
        return send_file('temp.sdf', as_attachment=True)
    except Exception as exc:
        return str(exc)

Example #27

0

Show file

 def binding_affinity(self,
                      prot_in,
                      lig_in,
                      outpath="results/results_affinity_binding.csv"):
     """Predict binding affinity and binding probability for the given inputs.

     The frame from ``self.preprocessing`` supplies the ``smiles`` and
     ``UniProtID`` columns; every column from the third onwards is used as
     model input. Predictions are written to ``outpath`` (CSV) and to
     ``results/affinity_out.sdf``.

     :raises FileNotFoundError: if either model file is missing. Previously
         this was only logged and the method then failed with a NameError.
     """
     DF = self.preprocessing(prot_in, lig_in)
     X = DF.iloc[:, 2:]
     print(DF.columns)
     logger.info(X.shape)
     jl_filename = "models/gbdt_regression.joblib"
     cl_filename = "models/gbdt_model.joblib"

     # Both models are required; stop with a clear error when one is absent
     for model_path in (jl_filename, cl_filename):
         if not os.path.isfile(model_path):
             logger.info("no model available")
             raise FileNotFoundError(
                 "model file not found: %s" % model_path)

     with open(jl_filename, 'rb') as model_file:
         regressor = joblib.load(model_file)
     ya = pd.Series(regressor.predict(X)).rename("predicted_affinity")

     with open(cl_filename, 'rb') as model_file:
         classifier = joblib.load(model_file)
     # Second column of predict_proba is used as the probability
     yb = pd.Series(classifier.predict_proba(X)[:, 1])

     smiles = DF["smiles"]
     prot = DF["UniProtID"]
     final = pd.concat([smiles, prot, ya, yb], axis=1)
     final.columns = ["smiles", "Uniprot ID", "affinity", "probability"]
     # probability > 0.7 -> "high", < 0.4 -> "low", otherwise "medium"
     final["predicted_label"] = np.where(
         final.probability > 0.7, "high",
         np.where(final.probability < 0.4, "low", "medium"))
     logger.info(final.columns)
     logger.info(final[0:10])
     final.to_csv(outpath)
     pp_out = "results/affinity_out.sdf"
     PandasTools.AddMoleculeColumnToFrame(final, 'smiles', 'Molecule')
     PandasTools.WriteSDF(final,
                          pp_out,
                          molColName='Molecule',
                          properties=list(final.columns))

Example #28

0

Show file

File: sdf_to_dataset.py Project: Matrix-Groups/pharml

def split_sdf(sdf_file_name, outdir="data/"):
    """Split an SDF file into per-ligand files and protein/ligand lookups.

    Returns ``(uligs, prots_ligs)``: ``uligs`` maps a ligand number to its
    selected row (PDB IDs, SMILES, IC50, molecule) and ``prots_ligs`` maps
    each PDB ID to the list of ligand numbers associated with it. One
    ``lig<N>.lig`` file per ligand is written under ``<outdir>/lig/``.
    """
    print("Loading sdf.")
    # Only critical RDKit log messages are let through while parsing
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    df = PandasTools.LoadSDF(sdf_file_name,
                             smilesName='SMILES',
                             molColName='Molecule',
                             includeFingerprints=False)
    print("Raw cols = ", [str(col) for col in df.columns])

    # Keep the needed columns and merge the two PDB columns into one
    complex_col = 'PDB ID(s) for Ligand-Target Complex'
    chain_col = 'PDB ID(s) of Target Chain'
    payload_cols = ['SMILES', 'IC50 (nM)', 'Molecule']
    df_selected = df[[complex_col, chain_col] + payload_cols].copy()
    df_selected["PDB IDs"] = (df_selected[complex_col] + ',' +
                              df_selected[chain_col])
    print("Selected cols = ", [str(col) for col in df_selected.columns])
    df_selected = df_selected[["PDB IDs"] + payload_cols]

    # Empty strings and lone commas count as missing; drop those rows
    df_selected = df_selected.replace('', np.nan)
    df_selected = df_selected.replace(',', np.nan)
    df_selected = df_selected.dropna()

    r_rows = len(df.index)
    s_rows = len(df_selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    print("Keep pct = %.2f%s" %
          (((float(s_rows) / float(r_rows)) * 100.0), '%'))

    # Ligand number -> row, and PDB ID -> ligand numbers
    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(df_selected.values):
        for pdb in row[0].split(','):
            if pdb != '':
                prots_ligs.setdefault(pdb, []).append(lndx)
        uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))

    # One .lig file per ligand, written from the molecule entry of its row
    print("Writing per-ligand output files.")
    for key, lig in uligs.items():
        write_lig_file(lig[3], outdir + "/lig/lig%s.lig" % str(key))
    return uligs, prots_ligs

Example #29

0

Show file

    def compile_filters(self):
        """Apply every configured filter and keep compounds passing all of them.

        Each ``limit_*`` helper is called with ``self.df`` and its bounds from
        ``self.command``; the helpers' results are intersected as sets and
        stored in ``self.filtered_inchi``. The matching rows (by
        ``'InChI Keys'``) are stored in ``self.filtered_df`` with an added
        ``'Molecule picture'`` column.

        :return: the result of ``self.frame_manage()``.
        """
        RS_inchi = self.limit_RS(self.df, self.command['RS_min'],
                                 self.command['RS_max'])
        MW_inchi = self.limit_MW(self.df, self.command['MW_min'],
                                 self.command['MW_max'])
        nRing_inchi = self.limit_nRing(self.df, self.command['nRing_min'],
                                       self.command['nRing_max'])
        Lipinski_inchi = self.limit_Lipinski(self.df, self.command['Lipinski'])
        nG12Ring_inchi = self.limit_nG12Ring(self.df,
                                             self.command['nG12Ring_min'],
                                             self.command['nG12Ring_max'])
        SlogP_inchi = self.limit_SlogP(self.df, self.command['SlogP_min'],
                                       self.command['SlogP_max'])
        # Upper bound is nSugars_max: the minimum used to be passed twice
        Sugars_inchi = self.limit_nSugars(self.df, self.command['nSugars_min'],
                                          self.command['nSugars_max'])
        nFRing_inchi = self.limit_nFusedRing(self.df,
                                             self.command['nFRing_min'],
                                             self.command['nFRing_max'])
        core_ester_inchi = self.limit_core_ester(
            self.df, self.command['core_ester_min'],
            self.command['core_ester_max'])
        naRing_inchi = self.limit_naRing(self.df, self.command['naRing_min'],
                                         self.command['naRing_max'])
        activity_reported_inchi = self.limit_activity_reported(
            self.df, self.command['activity_reported'])

        sets = [
            RS_inchi, MW_inchi, nRing_inchi, Lipinski_inchi, nG12Ring_inchi,
            SlogP_inchi, Sugars_inchi, nFRing_inchi, core_ester_inchi,
            naRing_inchi, activity_reported_inchi
        ]
        # A compound must pass every filter to be kept
        self.filtered_inchi = list(set.intersection(*sets))
        # Copy so the molecule column is added to an independent frame rather
        # than to a slice of self.df
        self.filtered_df = self.df.loc[self.df['InChI Keys'].isin(
            self.filtered_inchi)].copy()
        # print(filtered_df.shape[0], ' compouds have been compiled based on your filters.')
        # smiles = filtered_df['smiles'].tolist()
        PandasTools.AddMoleculeColumnToFrame(self.filtered_df, 'smiles',
                                             'Molecule picture')

        # export csv file
        # self.filtered_df.to_csv('temp.csv', index=False)

        ## export sdf file
        # PandasTools.WriteSDF(self.filtered_df, 'temp.sdf', molColName='structures', properties=list(self.filtered_df.columns), allNumeric=False)

        # export smiles
        # self.smiles_writer()

        # self.filtered_df.to_sql(name='temp', con=db.engine, index=False)
        smiles_frame = self.frame_manage()

        return smiles_frame

Example #30

0

Show file

def mol_diversity(smiles, normalize=False):
    """Return the mean pairwise Tanimoto distance of the given molecules.

    Distances come from ``tanimoto_1d`` applied to Morgan bit-vector
    fingerprints (radius 4, 2048 bits).

    :param smiles: SMILES strings of the molecules.
    :param normalize: when true, rescale the mean distance between the
        reference "random" (0.91549) and "diverse" (0.94170) mean distances.
        The default (False) returns the raw mean, as this function always did;
        the rescaling code used to sit unreachable after the first ``return``.
    :return: the raw or normalised mean distance.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    fps = [
        Chem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048) for m in df['mol']
    ]
    dist_1d = tanimoto_1d(fps)
    mean_dist = np.mean(dist_1d)
    if not normalize:
        return mean_dist

    mean_rand = 0.91549  # mean random distance
    mean_diverse = 0.94170  # mean diverse distance
    return (mean_dist - mean_rand) / (mean_diverse - mean_rand)