def _urls_for_tranches_2d(self, col_list: List[str], row_list: List[str], fileformat: str = "smi") -> List[str]: """ Returns a list of urls to download files in smi format for the specified tranches. Parameters ---------- col_list : list of str List with the columns names. Columns are named with letters from A to K. They correspond to molecular weight. row_list : list of str List with the row names. Rows are named with letters from A to K. They correspond to LogP. Returns ------- url_list : list of str The urls. """ url_list = [] if fileformat != "smi" and fileformat != "txt": raise InvalidFileFormat(f"{fileformat} is not a valid file format. Valid formats are smi or txt") for col in col_list: for row in row_list: tranch = col + row # Each tranch is divided into various files from A to E tranch_subcategories = itertools.product("ABCE", "ABCD", repeat=1) url = self._tranches_2d_url + tranch + "/" + tranch for subtranch in tranch_subcategories: url_download = url + subtranch[0] + subtranch[1] + "." + fileformat url_list.append(url_download) return url_list
def _validate_filters(self, fileformat: str, availability: Optional[str] = None, bioactive: Optional[str] = None, biogenic: Optional[str] = None, reactivity: Optional[str] = None) -> None: """ Validate the filters passed to the download_substances and download_catalogs methods. Parameters ---------- availability : str, default is None The availability of the molecules. bioactive : str, default is None Subset of bioactivity and drugs. biogenic : str, default is None Subset of biogenic. reactivity: str, default is None The reactivity of the molecules. Raises ------ InvalidFileFormat InvalidAvailabilityError InvalidBioactiveError InvalidBiogenicError InvalidReactivityError """ if fileformat not in self.file_formats: raise InvalidFileFormat(f"{fileformat} is not a valid fileformat.") if availability: if availability not in self.filters["Availability"]: raise InvalidAvailabilityError(f"{availability} is not a valid availability.") if bioactive: if bioactive not in self.filters["BioactiveAndDrugs"]: raise InvalidBioactiveError(f"{bioactive} is not a valid bioactivity.") if biogenic: if biogenic not in self.filters["Biogenic"]: raise InvalidBiogenicError(f"{biogenic} is not a valid biogenic.") if reactivity: if reactivity not in self.filters["Reactivity"]: raise InvalidReactivityError(f"{reactivity} is not a valid reactivity.")
def from_ligand_file(cls, file_name: str, method: str, radius: float, feat_def: Callable, feat_list: Optional[List[str]] = None) -> "LigandBasedPharmacophore":
    """ Get a pharmacophore from a file of ligands.

        Accepted file formats: smi, mol2, sdf, pdb. The format is taken from
        the text after the last "." in file_name.

        Parameters
        ----------
        file_name : str
            Name or path of the file containing the ligands.

        method : str
            Name of method or algorithm to compute the ligand based pharmacophore.

        radius : float
            The radius in angstroms of the pharmacophoric points.

        feat_def
            Definitions of the pharmacophoric features. Passed unchanged to
            from_ligand_list and to the class constructor.

        feat_list : list of str, optional
            List of features that will be used to derive the pharmacophore.
            If None is passed the default features will be used: donors,
            acceptors, aromatic rings, hydrophobics, positive and negative
            charges.

        Returns
        -------
        LigandBasedPharmacophore
            Built from the pharmacophoric points and ligands computed by
            from_ligand_list.

        Raises
        ------
        InvalidFileFormat
            If the file extension is not one of the accepted formats.
        AssertionError
            If no ligand could be read from the file.
    """
    fextension = file_name.split(".")[-1]

    if fextension == "smi":
        ligands = Chem.SmilesMolSupplier(file_name, delimiter='\t', titleLine=False)
    elif fextension == "mol2":
        ligands = load_mol2_file(file_name)
    elif fextension == "sdf":
        ligands = Chem.SDMolSupplier(file_name)
    elif fextension == "pdb":
        # MolFromPDBFile returns a single molecule (None when parsing fails),
        # not a sequence, so it is wrapped to match the other branches.
        pdb_molecule = Chem.rdmolfiles.MolFromPDBFile(file_name)
        ligands = [] if pdb_molecule is None else [pdb_molecule]
    else:
        raise InvalidFileFormat(f"{fextension} is not a supported file format")

    # NOTE(review): this bare len() call is kept from the original code;
    # presumably it forces the RDKit suppliers to read the file before they
    # are converted to a list - confirm before removing.
    len(ligands)
    ligands = list(ligands)
    assert len(ligands) > 0, f"No ligands could be read from \"{file_name}\""

    tmp_pharmacophore = LigandBasedPharmacophore().from_ligand_list(
        ligands=ligands,
        method=method,
        radius=radius,
        feat_list=feat_list,
        feat_def=feat_def,
    )

    return cls(
        pharmacophoric_points=tmp_pharmacophore.pharmacophoric_points,
        ligands=tmp_pharmacophore.ligands,
        feat_def=feat_def,
    )
def _urls_for_tranches_3d(self, col_list: List[str], row_list: List[str], fileformat: str) -> List[str]: """ Get a list of urls to download files in a 3D format for the specified tranches. Parameters ---------- col_list : list of str List with the columns names. Columns are named with letters from A to K. They correspond to molecular weight. row_list : list of str List with the row names. Rows are named with letters from A to K. They correspond to LogP. fileformat : {"sdf", "mol2", "db2"} The format of the files. Returns ------- url_list : list of str The urls. """ formats_3d = ["sdf", "mol2", "db2"] if fileformat not in formats_3d: raise InvalidFileFormat(f"{fileformat} is not a valid 3D file format. Valid formats are: {formats_3d}") tranches = [] for column in col_list: for row in row_list: tranches.append(column + row) base_url = "http://files.docking.org/3D/" urls3d_dir = "./data/zinc/urls3d/" url_list = [] for tranch in tranches: file = pkg_resources.resource_filename("openpharmacophore", urls3d_dir + tranch + ".uri") with open(file, "r") as fh: for line in fh.readlines(): url = base_url + tranch + "/" + line.rstrip() + "." + fileformat + ".gz" url_list.append(url) return url_list
def from_file(cls, file_name: str, load_mol_sys: bool = True) -> "StructuredBasedPharmacophore":
    """ Load a structure based pharmacophore from a file.

        Currently supports only json format from pharmer.

        Parameters
        ---------
        file_name : str
            Name of the file containing the pharmacophore.

        load_mol_sys : bool, default=True
            Passed as second argument to from_pharmer.

        Returns
        -------
        StructuredBasedPharmacophore
            Built from the points, receptor and ligand returned by from_pharmer.

        Raises
        ------
        InvalidFileFormat
            If the file extension is not "json".
    """
    extension = file_name.rsplit(".", 1)[-1]
    if extension != "json":
        raise InvalidFileFormat(f"Invalid file type, \"{file_name}\" is not a supported file format")

    points, receptor, ligand = from_pharmer(file_name, load_mol_sys)
    return cls(points, receptor, ligand)
def download_predifined_subset(self, download_path: str, subset: str, fileformat: str, tree: bool = True, ignore_failures: bool = True) -> None:
    """ Download one of ZINC's predefined subsets.

        Predefined subsets can only be downloaded in the following formats:
        smi, txt, sdf, mol2 and db2.

        Parameters
        ----------
        download_path : str
            The path where files will be downloaded.

        subset : str
            Name of the subset.

        fileformat : str
            The format of the files that will be downloaded. "smi" and "txt"
            are downloaded as 2D molecules; "sdf", "mol2" and "db2" as 3D
            molecules.

        tree : bool
            Whether to use a tree directory structure or to download all the
            files to a single folder.

        ignore_failures : bool
            Forwarded to _download_batch_of_files; controls what happens when
            a file could not be downloaded.

        Raises
        ------
        InvalidFileFormat
            If fileformat is not one of the formats listed above.
    """
    col_list, row_list = self._predefined_subset_tranches(subset)

    if fileformat in ("smi", "txt"):
        dimension = "2D"
        url_list = self._urls_for_tranches_2d(col_list, row_list, fileformat)
    elif fileformat in ("sdf", "mol2", "db2"):
        dimension = "3D"
        url_list = self._urls_for_tranches_3d(col_list, row_list, fileformat)
    else:
        raise InvalidFileFormat(f"{fileformat} is not a valid fileformat")

    self._download_batch_of_files(url_list, download_path, fileformat, dimension, tree, ignore_failures)
def from_file(cls, file_name: str) -> "Pharmacophore":
    """ Load a pharmacophore from a file.

        The loader is chosen from the file extension: "json" files are read
        with from_pharmer, "ph4" files with from_moe and "pml" files with
        from_ligandscout.

        Parameters
        ---------
        file_name : str
            Name of the file containing the pharmacophore.

        Returns
        -------
        Pharmacophore
            Built from the pharmacophoric points read from the file.

        Raises
        ------
        InvalidFileFormat
            If the file extension is not json, ph4 or pml.
    """
    extension = file_name.rsplit(".", 1)[-1]

    if extension == "json":
        # Only the points are needed; from_pharmer is told not to load the
        # molecular system and its other two results are discarded.
        points, _receptor, _ligand = from_pharmer(file_name, False)
        return cls(pharmacophoric_points=points)

    if extension == "ph4":
        return cls(pharmacophoric_points=from_moe(file_name))

    if extension == "pml":
        return cls(pharmacophoric_points=from_ligandscout(file_name))

    raise InvalidFileFormat(f"Invalid file format, \"{file_name}\" is not a supported file format")
def download_custom_subset(self, download_path: str, fileformat: str, mw_range: Tuple[float, float], logp_range: Tuple[float, float], tree: bool = True, ignore_failures: bool = True, availability: Optional[str] = None, bioactive: Optional[str] = None, biogenic: Optional[str] = None, reactivity: Optional[str] = None) -> None:
    """ Download subset with a custom molecular weight range and logP range
        from ZINC.

        When at least one filter is given, or the format is one of xml, csv,
        js, json, db or solv, the urls are obtained from
        _tranche_with_filters_url_list. Otherwise smi and txt files are
        downloaded as 2D tranches and sdf, mol2 and db2 files as 3D tranches.

        Parameters
        ----------
        download_path : str
            The path where files will be downloaded.

        fileformat : str
            The format of the files that will be downloaded.

        mw_range : 2-tuple of float
            Range of molecular weight in daltons for the downloaded molecules.

        logp_range : 2-tuple of float
            Range of logP for the downloaded molecules.

        tree : bool
            Whether to use a tree directory structure or to download all the
            files to a single folder.

        ignore_failures : bool
            Forwarded to _download_batch_of_files; controls what happens when
            a file could not be downloaded.

        availability : str, default is None
            The availability of the molecules.

        bioactive : str, default is None
            Subset of bioactivity and drugs.

        biogenic : str, default is None
            Subset of biogenic.

        reactivity: str, default is None
            The reactivity of the molecules.

        Raises
        ------
        InvalidFileFormat
            If no filter is given and fileformat is not a supported format.
    """
    col_list, row_list = self._mw_and_logp_tranches(mw_range, logp_range)
    has_filters = any((availability, bioactive, biogenic, reactivity))

    if has_filters or fileformat in ("xml", "csv", "js", "json", "db", "solv"):
        subset_kind = "CS"
        url_list = self._tranche_with_filters_url_list(
            col_list, row_list, availability, bioactive, biogenic, reactivity,
            fileformat=fileformat)
    elif fileformat in ("smi", "txt"):
        subset_kind = "2D"
        url_list = self._urls_for_tranches_2d(col_list, row_list, fileformat)
    elif fileformat in ("sdf", "mol2", "db2"):
        subset_kind = "3D"
        url_list = self._urls_for_tranches_3d(col_list, row_list, fileformat)
    else:
        raise InvalidFileFormat(f"{fileformat} is not a valid file format.")

    self._download_batch_of_files(url_list, download_path, fileformat, subset_kind, tree, ignore_failures)
def draw(self, file_name: str, img_size: Tuple[int, int] = (500, 500), legend: str = "") -> None:
    """ Draw a 2d representation of the pharmacophore.

        This is a drawing of the ligand with the pharmacophoric features
        highlighted. Every atom of a pharmacophoric point is annotated with
        the short name of the point; an atom that belongs to several points
        gets all the short names joined with "|".

        Parameters
        ----------
        file_name : str
            File where the drawing will be saved. Must be a png file.

        img_size : 2-tuple of int, optional
            The size of the image. (Default=(500,500))

        legend : str, optional
            Image legend.

        Raises
        ------
        NoLigandsError
            If the pharmacophore has no ligand.
        InvalidFileFormat
            If file_name does not end with ".png".
    """
    if self.ligand is None:
        raise NoLigandsError("Cannot draw pharmacophore if there is no ligand")

    if not file_name.endswith(".png"):
        raise InvalidFileFormat("File must be a png.")

    # Work on a copy without conformers or hydrogens so that self.ligand is
    # left untouched.
    ligand = copy.deepcopy(self.ligand)
    ligand.RemoveAllConformers()
    ligand = Chem.RemoveHs(ligand)

    bond_colors = {}
    atom_highlights = defaultdict(list)
    highlight_radius = {}

    for point in self.pharmacophoric_points:
        indices = point.atom_indices
        for idx in indices:
            atom = ligand.GetAtomWithIdx(idx)
            atom_highlights[idx].append(
                get_color_from_palette_for_feature(point.feature_name))
            highlight_radius[idx] = 0.6

            # Draw aromatic rings bonds: only bonds between two atoms of the
            # same pharmacophoric point are highlighted.
            if point.feature_name == "aromatic ring":
                for neighbor in atom.GetNeighbors():
                    nbr_idx = neighbor.GetIdx()
                    if nbr_idx not in indices:
                        continue
                    bond = ligand.GetBondBetweenAtoms(idx, nbr_idx).GetIdx()
                    bond_colors[bond] = [
                        get_color_from_palette_for_feature("aromatic ring")
                    ]

            # If an atom has more than one feature the label will contain
            # both names. An existing note is the only signal needed for
            # that, so no separate record of visited atoms is kept.
            if atom.HasProp("atomNote"):
                label = atom.GetProp("atomNote") + "|" + str(point.short_name)
            else:
                label = point.short_name
            atom.SetProp("atomNote", label)

    drawing = rdMolDraw2D.MolDraw2DCairo(img_size[0], img_size[1])
    drawing.DrawMoleculeWithHighlights(
        ligand, legend, dict(atom_highlights), bond_colors, highlight_radius, {})
    drawing.FinishDrawing()
    drawing.WriteDrawingText(file_name)
def draw(self, file_name: str, img_size: Tuple[int, int] = (500, 500), legend: str = "", freq_threshold: float = 0.2) -> None:
    """ Draw a 2d representation of the dynamic pharmacophore.

        This is a drawing of the ligand with the pharmacophoric features
        highlighted and the frequency of each one. The frequency is written
        as an integer percentage ("f=NN") on the last atom of the
        pharmacophoric point, unless one of its atoms already carries a note
        with a higher frequency.

        Parameters
        ----------
        file_name : str
            Name or path of the file where the drawing will be saved. Must be
            a png file.

        img_size : 2-tuple of int, optional
            The size of the image (default=(500,500))

        legend : str, optional
            Image legend.

        freq_threshold : float, optional
            The minimum frequency of a pharmacophoric point to be drawn.
            Number between 0.0 and 1.0 (default=0.2).

        Raises
        ------
        ValueError
            If freq_threshold is not between 0 and 1.
        InvalidFileFormat
            If file_name does not end with ".png".
        NoLigandsError
            If the first pharmacophore has no ligand.
    """
    if freq_threshold < 0.0 or freq_threshold > 1.0:
        raise ValueError("Freqency threshold must be a value between 0 and 1")

    if not file_name.endswith(".png"):
        raise InvalidFileFormat("File must be a png.")

    # Extract a ligand
    if self.pharmacophores[0].ligand is None:
        raise NoLigandsError("Ligand could not be extracted")
    ligand = copy.deepcopy(self.pharmacophores[0].ligand)
    ligand.RemoveAllConformers()

    seen_atoms = set()
    bond_colors = {}
    atom_highlights = defaultdict(list)
    highlight_radius = {}

    for up in self.unique_pharmacophoric_points:
        if up.frequency < freq_threshold:
            continue

        indices = up.atom_indices
        # Notes store the frequency as an integer percentage, so every
        # comparison below is done in that unit as well.
        frequency = int(up.frequency * 100)
        update_freq = True
        last_idx = None

        for idx in indices:
            atom = ligand.GetAtomWithIdx(idx)

            # If an atom has more than one feature keep higher frequency value
            if idx in seen_atoms and atom.HasProp("atomNote"):
                noted_frequency = int(atom.GetProp("atomNote")[2:])
                if noted_frequency > frequency:
                    update_freq = False

            seen_atoms.add(idx)
            last_idx = idx

            if "hydrophobicity" in up.feature_name:
                feat_name = "hydrophobicity"
            else:
                # Only the first two words of the feature name are used to
                # look up the color.
                feat_name = " ".join(up.feature_name.split()[0:2])

            atom_highlights[idx].append(
                get_color_from_palette_for_feature(feat_name))
            highlight_radius[idx] = 0.6

            # Draw aromatic rings bonds
            if up.short_name == "R":
                for neighbor in atom.GetNeighbors():
                    nbr_idx = neighbor.GetIdx()
                    if nbr_idx not in indices:
                        continue
                    bond = ligand.GetBondBetweenAtoms(idx, nbr_idx).GetIdx()
                    bond_colors[bond] = [
                        get_color_from_palette_for_feature("aromatic ring")
                    ]

        # A point without atoms has nothing to label; without this guard the
        # note would land on an atom left over from a previous point.
        if update_freq and last_idx is not None:
            ligand.GetAtomWithIdx(last_idx).SetProp("atomNote", f"f={frequency}")

    drawing = rdMolDraw2D.MolDraw2DCairo(img_size[0], img_size[1])
    drawing.DrawMoleculeWithHighlights(
        ligand, legend, dict(atom_highlights), bond_colors, highlight_radius, {})
    drawing.FinishDrawing()
    drawing.WriteDrawingText(file_name)