def __init__(self, **kwargs):
    """Set up the chemical component cache area and load the component index.

    Args:
        cachePath (str, optional): top-level cache directory (default: ".")
        ccFileNamePrefix (str, optional): file name prefix for the component index (default: "cc")
    """
    cachePath = kwargs.get("cachePath", ".")
    self.__cachePath = cachePath
    self.__dirPath = os.path.join(cachePath, "chem_comp")
    self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
    # marshal utility anchored in the component cache directory
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    self.__ccIdxD = self.__reload(**kwargs)
def getCCDefFile(self, ccFilePath, molBuildType="model-xyz", suppressHydrogens=False):
    """Fetch the molecule definition (ccPath) and build OE molecules for comparison.

    Args:
        ccFilePath (str): path to a chemical component definition (mmCIF) file
        molBuildType (str, optional): OE molecule build mode. Defaults to "model-xyz".
        suppressHydrogens (bool, optional): return the hydrogen-suppressed graph molecule. Defaults to False.

    Returns:
        tuple: (chemical component id, OE molecule, feature dictionary)
    """
    #
    mU = MarshalUtil(workPath=self.__workPath)
    rdCcObjL = mU.doImport(ccFilePath, fmt="mmcif")
    oemf = OeMoleculeFactory()
    if not self.__verbose:
        oemf.setQuiet()
    # build from the first (only) definition container in the file
    ccId = oemf.setChemCompDef(rdCcObjL[0])
    oemf.build(molBuildType=molBuildType)
    if self.__verbose:
        logger.info(" CCId = %s", ccId)
        logger.info(" Title = %s", oemf.getTitle())
        logger.info(" SMILES = %s", oemf.getCanSMILES())
        logger.info(" SMILES (stereo) = %s", oemf.getIsoSMILES())
        logger.info(" Formula (Hill) = %s", oemf.getFormula())
        logger.info(" InChI key = %s", oemf.getInChIKey())
        logger.info(" InChI = %s", oemf.getInChI())
    # Fix: removed a redundant "fD = {}" that was immediately overwritten by this literal.
    fD = {"Formula": oemf.getFormula(), "SMILES": oemf.getCanSMILES(), "SMILES_STEREO": oemf.getIsoSMILES(), "InChI": oemf.getInChI(), "InChIKey": oemf.getInChIKey()}
    if suppressHydrogens:
        tMol = oemf.getGraphMolSuppressH()
    else:
        tMol = oemf.getMol()
    fD["OEMOL"] = tMol
    # NOTE(review): xyzType="model" is used regardless of molBuildType — confirm intended
    fD["xyz"] = oemf.getAtomDetails(xyzType="model")
    return (ccId, tMol, fD)
def __init__(self, **kwargs):
    """Utilities build and deliver OE molecules for search applications.

    Source molecular definitions are taken from SMILES descriptors generated by
    ChemCompSearchIndexProvider()

    Args:
        cachePath (str, optional): path to the directory containing cache files (default: '.')
        ccFileNamePrefix (str, optional) file name prefix for chemical component search index (default: "cc")
        oeFileNamePrefix (str, optional) file name prefix for all generated databases (default: "oe")
    """
    # Database file names will be prefixed with base prefix plus the molecular build type and perception options
    oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe")
    # Fix: the default was "oe", contradicting the documented default ("cc") for the search index prefix.
    self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
    limitPerceptions = kwargs.get("limitPerceptions", False)
    if limitPerceptions:
        self.__oeFileNamePrefix = oeFileNamePrefixBase + "-limit"
    else:
        self.__oeFileNamePrefix = oeFileNamePrefixBase
    #
    cachePath = kwargs.get("cachePath", ".")
    self.__dirPath = os.path.join(cachePath, "oe_mol")
    #
    self.__fpDbD = {}
    self.__ssDb = None
    self.__oeMolD = {}
    self.__oeMolDb = None
    self.__oeMolDbTitleD = None
    #
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    self.__reload(**kwargs)
def __init__(self, **kwargs):
    """Load current repository holdings content and identifier lists (with fallbacks).

    Args:
        holdingsDirPath (str, optional): cache directory for holdings files (default: ".")
        useCache (bool, optional): use cached files when available (default: True)
        baseUrl (str, optional): base URL for holdings resources
    """
    self.__dirPath = kwargs.get("holdingsDirPath", ".")
    useCache = kwargs.get("useCache", True)
    baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
    # Fix: all four targets previously read the same keyword "currentTargetUrl" (copy/paste),
    # so one override clobbered every URL. Distinct keywords now select each resource;
    # the prior defaults are preserved.
    urlTargetContent = kwargs.get("currentTargetUrl", os.path.join(baseUrl, "current_holdings.json.gz"))
    urlFallbackTargetContent = kwargs.get("fallbackTargetUrl", os.path.join(baseUrl, "current_holdings.json.gz"))
    #
    urlTargetIds = kwargs.get("currentIdsTargetUrl", os.path.join(baseUrl, "current_pdb_ids.json.gz"))
    urlFallbackTargetIds = kwargs.get("fallbackIdsTargetUrl", os.path.join(baseUrl, "current_pdb_ids.json.gz"))
    #
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    self.__invD = self.__reloadEntryContent(urlTargetContent, urlFallbackTargetContent, self.__dirPath, useCache=useCache)
    self.__idD = self.__reloadEntryIds(urlTargetIds, urlFallbackTargetIds, self.__dirPath, useCache=useCache)
def testExtractAndSerialize(self):
    """Extract an example sequence cluster data set and serialize each result document."""
    try:
        cdp = ClusterDataPrep(workPath=self.__workPath)
        cifD, docBySequenceD, docByClusterD = cdp.extract(
            self.__dataSetId, clusterSetLocator=self.__pathClusterData, levels=self.__levels, clusterType="entity"
        )
        mU = MarshalUtil(workPath=self.__workPath)
        # export each representation and require success
        exportL = [
            (self.__pathSaveStyleCif, cifD),
            (self.__pathSaveStyleDocSequence, docBySequenceD),
            (self.__pathSaveStyleDocCluster, docByClusterD),
        ]
        for savePath, obj in exportL:
            self.assertTrue(mU.doExport(savePath, obj, fmt="json", indent=3))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def setUp(self):
    """Prepare configuration, cache locations, and the marshal utility for the tests."""
    self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
    configName = "site_info_configuration"
    configPath = os.path.join(self.__mockTopPath, "config", "dbload-setup-example.yml")
    self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
    #
    self.__cachePath = os.path.join(TOPDIR, "CACHE")
    self.__workPath = os.path.join(HERE, "test-output")
    # configuration-derived cache subdirectories
    taxDir = self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=configName)
    self.__taxonomyDataPath = os.path.join(self.__cachePath, taxDir)
    exdbDir = self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName)
    self.__exdbCacheDirPath = os.path.join(self.__cachePath, exdbDir)
    self.__cacheKwargs = {"fmt": "json", "indent": 3}
    #
    self.__mU = MarshalUtil()
    self.__entryLimitTest = 18
    self.__startTime = time.time()
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def __getApi(self, dictLocators, **kwargs):
    """Return a DictionaryApi instance assembled from the cached copies of the input dictionary locators.

    Args:
        dictLocators (list): dictionary locator URLs/paths
        consolidate (bool, optional): consolidate dictionary content (default: True)
        replaceDefinitions (bool, optional): replace duplicate definitions (default: True)
        verbose (bool, optional): verbose output (default: True)

    Returns:
        DictionaryApi or None: API object, or None when the reload fails
    """
    consolidate = kwargs.get("consolidate", True)
    replaceDefinition = kwargs.get("replaceDefinitions", True)
    verbose = kwargs.get("verbose", True)
    #
    dApi = None
    if self.__reload(dictLocators, self.__dirPath, useCache=self.__useCache):
        mU = MarshalUtil()
        containerList = []
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(self.__dirPath, self.__fileU.getFileName(dictLocator))
            containerList.extend(mU.doImport(cacheFilePath, fmt="mmcif-dict"))
        dApi = DictionaryApi(containerList=containerList, consolidate=consolidate, replaceDefinition=replaceDefinition, verbose=verbose)
    return dApi
def testSearchExec(self):
    """Test case: search cli — export a query list and run the CCDC search CLI in a shell."""
    try:
        mL = glob.glob(os.path.join(self.__molFileDirPath, "*"))
        logger.info("search list length %d", len(mL))
        mU = MarshalUtil()
        ok = mU.doExport(self.__queryListFilePath, mL, fmt="list")
        # Fix: the export status was previously discarded (overwritten below without a check).
        self.assertTrue(ok)
        exU = ExecUtils()
        logger.info("Executing shell for %s", self.__queryListFilePath)
        cmdPath = os.path.join(TOPDIR, "rcsb", "utils", "ccdc", "CcdcSearchExec.py")
        logger.info("cmdPath %r", cmdPath)
        ok = exU.runShell(
            "%s %s --mol_list_path %s --result_path %s --search_type %s --csdhome %s"
            % (self.__pythonBinPath, cmdPath, self.__queryListFilePath, self.__ssResultPath, "substructure", self.__csdHome),
            outPath=self.__logPath,
            outAppend=False,
            timeOut=60,
            suppressStderr=False,
        )
        self.assertTrue(ok)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def __init__(self, filePath, **kwargs):
    """Initialize from the input file, engaging bypass mode when setup fails.

    Args:
        filePath (str): path to the source data file
        verbose (bool, optional): emit warnings for missing values (default: False)
    """
    self.__filePath = filePath
    self.__tD = {}
    self.__mU = MarshalUtil()
    # verbose=False turns off warnings for missing values
    self.__verbose = kwargs.get("verbose", False)
    # bypass mode is engaged whenever setup does not succeed
    ok = self.__setup(self.__filePath)
    self.__byPassMode = not ok
def __init__(self, **kwargs):
    """Utilities to build and deliver OE molecule databases from PDB chemical component definition data.

    Args:
        cachePath (str, optional): path to the directory containing cache files (default: '.')
        molBuildType (str, optional): data source for building OE molecules (default: "model-xyz")
        oeFileNamePrefix (str, optional): file name prefix for all generated databases (default: "oe")
    """
    # Database file names are prefixed with the base prefix plus the molecular build type and perception options
    prefixBase = kwargs.get("oeFileNamePrefix", "oe")
    limitPerceptions = kwargs.get("limitPerceptions", False)
    molBuildType = kwargs.get("molBuildType", "model-xyz")
    useLimit = limitPerceptions and molBuildType in ["oe-smiles", "oe-iso-smiles", "inchi"]
    self.__oeFileNamePrefix = prefixBase + "-" + molBuildType + ("-limit" if useLimit else "")
    #
    cachePath = kwargs.get("cachePath", ".")
    self.__dirPath = os.path.join(cachePath, "oe_mol")
    #
    self.__fpDbD = {}
    self.__ssDb = None
    self.__oeMolD = {}
    self.__oeMolDb = None
    self.__oeMolDbTitleD = None
    #
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    self.__molCount = self.__reload(**kwargs)
def setUp(self):
    """Prepare configuration, schema provider, and sandbox paths for the tests."""
    self.__verbose = True
    self.__updateId = "2018_25"
    self.__export = False
    #
    self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
    self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
    self.__cachePath = os.path.join(TOPDIR, "CACHE")
    #
    configName = "site_info_configuration"
    self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
    self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
    self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
    #
    self.__mU = MarshalUtil(workPath=self.__cachePath)
    self.__startTime = time.time()
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def __init__(
    self,
    cfgOb,
    databaseName="pdbx_core",
    collectionName="pdbx_core_polymer_entity",
    polymerType="Protein",
    referenceDatabaseName="UniProt",
    provSource="PDB",
    maxChunkSize=100,
    fetchLimit=None,
    **kwargs
):
    """Assemble annotation resource providers and reload reference sequence mapping data."""
    self.__cfgOb = cfgOb
    self.__polymerType = polymerType
    self.__maxChunkSize = maxChunkSize
    self.__statusList = []
    self.__mU = MarshalUtil()
    # all resource providers are configured from the default configuration section
    sectionName = self.__cfgOb.getDefaultSectionName()
    self.__pfP = self.__fetchPfamProvider(self.__cfgOb, sectionName, **kwargs)
    self.__ipP = self.__fetchInterProProvider(self.__cfgOb, sectionName, **kwargs)
    self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, sectionName, **kwargs)
    self.__goP = self.__fetchGoProvider(self.__cfgOb, sectionName, **kwargs)
    self.__ecP = self.__fetchEcProvider(self.__cfgOb, sectionName, **kwargs)
    self.__refIdMapD, self.__matchD, self.__refD = self.__reload(
        databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs
    )
def __init__(self, **kwargs):
    """Provider for GlyGen glycan and glycoprotein mapping data.

    Args:
        cachePath (str, optional): top-level cache directory (default: ".")
        useCache (bool, optional): use cached files when available (default: True)
        glygenBaseUrl (str, optional): base URL for GlyGen release data
        glygenFallbackUrl (str, optional): fallback URL for GlyGen data
    """
    dirName = "glygen"
    cachePath = kwargs.get("cachePath", ".")
    self.__dirPath = os.path.join(cachePath, dirName)
    super(GlyGenProvider, self).__init__(cachePath, [dirName])
    useCache = kwargs.get("useCache", True)
    #
    # Fix: the base-URL keyword was misspelled ("glygenBasetUrl"); accept the corrected
    # spelling while still honoring the legacy key for backward compatibility.
    defaultBaseUrl = "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/"
    baseUrl = kwargs.get("glygenBaseUrl", kwargs.get("glygenBasetUrl", defaultBaseUrl))
    fallbackUrl = kwargs.get(
        "glygenFallbackUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/"
    )
    #
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    self.__glycanD = self.__reloadGlycans(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache)
    self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl, fallbackUrl, self.__dirPath, useCache=useCache)
def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
    """Create the schema definition file for a given database (i.e., the files under 'schema_definitions')

    Args:
        databaseName (str): database name (e.g., 'pdbx_comp_model_core')
        dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)
        saveSchema (bool, optional): whether to save the schema to schemaCachePath or not (default False)

    Returns:
        dict: schema definition dictionary
    """
    schemaDef = None
    try:
        builder = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
        schemaDef = builder.build(dataTyping=dataTyping, encodingType="rcsb")
        if schemaDef and saveSchema:
            # persist next to the cached schema definitions
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            exportPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            mU = MarshalUtil(workPath=self.__workPath)
            mU.doExport(exportPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
    except Exception as e:
        logger.exception("Building schema %s failing with %s", databaseName, str(e))
    return schemaDef
def __init__(self, **kwargs):
    """Set up the CARD targets cache directory and load target data.

    Args:
        cachePath (str, optional): top-level cache directory (default: ".")
    """
    cachePath = kwargs.get("cachePath", ".")
    self.__cachePath = cachePath
    self.__dirPath = os.path.join(cachePath, "CARD-targets")
    #
    self.__mU = MarshalUtil(workPath=self.__dirPath)
    # reload returns the target dictionary and its data version
    self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)
def __init__(self, **kwargs):
    """Provider for Pfam clan annotations and PDB-to-Pfam residue-level mappings.

    Args:
        cachePath (str, optional): top-level cache directory (default: ".")
        useCache (bool, optional): use cached files when available (default: True)
        urlTargetPfam (str, optional): URL for the Pfam clan membership file
        urlTargetMapPfam (str, optional): URL for the PDB-to-Pfam mapping file
    """
    dirName = "pfam"
    cachePath = kwargs.get("cachePath", ".")
    dirPath = os.path.join(cachePath, dirName)
    super(PfamProvider, self).__init__(cachePath, [dirName])
    useCache = kwargs.get("useCache", True)
    self.__version = "34.0"
    #
    self.__mU = MarshalUtil(workPath=dirPath)
    # Pfam clan membership data (with a stable fallback mirror)
    urlTargetPfam = kwargs.get("urlTargetPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz")
    urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
    self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB, dirPath, useCache)
    # PDB entry to Pfam mapping data (with a stable fallback mirror)
    urlTargetMapPfam = kwargs.get(
        "urlTargetMapPfam", "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
    )
    urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
    self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam, urlTargetMapPfamFB, dirPath, useCache)
def __init__(self, **kwargs):
    """Wrapper class for batch chemical search/depiction operations.

    Path and prefix data for wrapper class may be set as keyword arguments as environmental variables.

    Args:
        ccUrlTarget (str, optional): path to concatenated chemical component definition file. Defaults to public data file.
        birdUrlTarget (str, optional): path to the concatenated BIRD definition file. Defaults to public data file.
        cachePath (str): path to top-level cache directory used to store search index file dependencies
            (default environment variable CHEM_SEARCH_CACHE_PATH or ".")
        numProc (int): multi-process cores to reserve. Default to 6.
        chunkSize (int): multi-process batch size. Defaults to 50.
    """
    self.__startTime = time.time()
    # processing controls
    self.__useCache = kwargs.get("useCache", True)
    self.__numProc = kwargs.get("numProc", 6)
    self.__chunkSize = kwargs.get("chunkSize", 50)
    # source data locators (None selects the public data files)
    self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
    self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
    # prefix and cache path may also come from the environment
    self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", os.environ.get("CHEM_SEARCH_CC_PREFIX", "cc-full"))
    self.__cachePath = kwargs.get("cachePath", os.environ.get("CHEM_SEARCH_CACHE_PATH", "."))
    # ---
    self.__mU = MarshalUtil(workPath=self.__cachePath)
    # ---
    self.__ccsw = self.__reload()
def setUp(self):
    """Set input/output paths, remote locators, and the marshal utility for the tests."""
    self.__verbose = True
    self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic")
    self.__pathJsonTestFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "vrpt_dictmap.json")
    self.__pathIndexFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "update-lists", "all-pdb-list")
    self.__pathCifFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
    #
    # Fix: removed a dead assignment of self.__locatorCifFile (the 100d.cif.gz URL) that was
    # immediately overwritten by the 6hrg.cif.gz URL below.
    self.__locatorCifFileBad = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/00/100dx.cif.gz"
    self.__locatorCifFile = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/hr/6hrg.cif.gz"
    #
    self.__workPath = os.path.join(HERE, "test-output")
    self.__pathSaveDictionaryFile = os.path.join(self.__workPath, "mmcif_pdbx_v5_next.dic")
    self.__pathSaveJsonTestFile = os.path.join(self.__workPath, "json-content.json")
    self.__pathSaveIndexFile = os.path.join(self.__workPath, "all-pdb-list")
    self.__pathSaveCifFile = os.path.join(self.__workPath, "cif-content.cif")
    #
    self.__pathFastaFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "sequence", "pdb_seq_prerelease.fasta")
    self.__pathSaveFastaFile = os.path.join(self.__workPath, "test-pre-release.fasta")
    #
    self.__urlTarget = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
    self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
    #
    self.__mU = MarshalUtil()
    self.__startTime = time.time()
    logger.debug("Running tests on version %s", __version__)
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def readDefaultDataTypeMap(self, locator, dataTyping="ANY"):
    """Read data file containing application default data type mapping

        data_rcsb_data_type_map
          loop_
          _pdbx_data_type_application_map.application_name
          _pdbx_data_type_application_map.type_code
          _pdbx_data_type_application_map.app_type_code
          _pdbx_data_type_application_map.app_precision_default
          _pdbx_data_type_application_map.app_width_default
          # .... type mapping data ...

    Return (dict): map[cifType] -> appType, width, precision
                   mapD['cif_type_code'] -> ['application_name', 'app_type_code', 'app_precision_default', 'app_width_default', 'type_code']
    """
    try:
        mapD = {}
        mU = MarshalUtil(workPath=self.__workPath)
        containerList = mU.doImport(locator, fmt="mmcif", enforceAscii=True, useCharRefs=True, raiseExceptions=True)
        for container in containerList:
            if container.getName() != "rcsb_data_type_map":
                continue
            catObj = container.getObj("pdbx_data_type_application_map")
            for ii in range(catObj.getRowCount()):
                rowD = catObj.getRowAttributeDict(ii)
                if rowD["application_name"] != dataTyping:
                    continue
                # string-valued attributes plus integer precision/width defaults
                entryD = {k: rowD[k] for k in ["app_type_code", "application_name", "type_code"]}
                entryD.update({k: int(rowD[k]) for k in ["app_precision_default", "app_width_default"]})
                mapD[rowD["type_code"]] = entryD
        return mapD
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return {}
def __parseFasta(self, fastaPath, taxonPath, cachePath, dirPath, addTaxonomy=False):
    """Parse the raw ChEMBL target FASTA file and export a normalized FASTA (and optional taxon list).

    Args:
        fastaPath (str): output path for the normalized FASTA file
        taxonPath (str): output path for the sequence-id/taxonomy mapping list
        cachePath (str): cache directory for the marshal utility and taxonomy mapping provider
        dirPath (str): directory containing the raw input file (chembl_targets_raw.fa.gz)
        addTaxonomy (bool, optional): add NCBI taxonomy identifiers to each sequence record. Defaults to False.

    Returns:
        bool: True for success or False otherwise
    """
    # input paths
    chemblTargetRawPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
    mU = MarshalUtil(workPath=cachePath)
    oD = {}
    missTax = 0  # diagnostic count of sequences lacking a taxonomy mapping
    taxonL = []
    try:
        if addTaxonomy:
            umP = UniProtIdMappingProvider(cachePath)
            umP.reload(useCache=True)
        #
        fD = mU.doImport(chemblTargetRawPath, fmt="fasta", commentStyle="default")
        #
        for seqId, sD in fD.items():
            # header carries "<chemblId> ... [<uniprotId>]"
            chemblId = seqId.strip().split(" ")[0].strip()
            unpId = seqId[seqId.find("[") + 1:seqId.find("]")]
            seq = sD["sequence"]
            cD = {"sequence": seq, "uniprotId": unpId, "chemblId": chemblId}
            if addTaxonomy:
                taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                cD["taxId"] = taxId if taxId else -1
                if not taxId:
                    missTax += 1
            # Rebuild the export identifier from all non-sequence fields as "value|key" pairs.
            # Fix: removed an unused accumulator (uD) and a redundant seqId = "" reset.
            cL = []
            for k, v in cD.items():
                if k in ["sequence"]:
                    continue
                cL.append(str(v))
                cL.append(str(k))
            seqId = "|".join(cL)
            oD[seqId] = cD
            if addTaxonomy:
                taxonL.append("%s\t%s" % (seqId, taxId))
        #
        ok1 = mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
        ok3 = True
        if addTaxonomy:
            ok3 = mU.doExport(taxonPath, taxonL, fmt="list")
        # Fix: use logical "and" rather than bitwise "&" on booleans
        return ok1 and ok3
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    return False
def testSubsetBuildMoleculeCacheFiltered(self):
    """Test construction of a filtered selection of chemical component definitions."""
    missedD = MarshalUtil().doImport(self.__missedIdsPath, fmt="json")
    # select only the identifiers recorded in the missed-id list
    filterIdD = dict.fromkeys(missedD["filteredIdList"], True)
    self.__testBuildMoleculeCacheFiles(filterIdD=filterIdD, ccFileNamePrefix="cc-filtered")
def reloadDump(self, fmt="json"):
    """Reload PubChem reference data store from saved dump.

    Args:
        fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

    Returns:
        (int): number of objects restored.
    """
    numUpd = 0
    try:
        # Read from disk backup and update object store -
        if fmt in ["json", "pickle"]:
            # Fix: the dump path was previously always resolved with fmt="json",
            # which broke restores requested with fmt="pickle".
            fp = self.__getdumpFilePath(fmt=fmt)
            logger.info("Restoring object store from %s", fp)
            mU = MarshalUtil(workPath=self.__dirPath)
            matchD = mU.doImport(fp, fmt=fmt)
            numUpd = self.__reloadDump(
                matchD, self.__databaseName, self.__matchIndexCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"]
            )
    except Exception as e:
        logger.exception("Failing for %r with %s", self.__dirPath, str(e))
    # --
    return numUpd
def getSchemaInfo(self, databaseName, dataTyping="ANY"):
    """Convenience method to return essential schema details for the input repository content type.

    Args:
        databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
        dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

    Returns:
        tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list
    """
    sd = None
    dbName = None
    collectionNameList = []
    docIndexD = {}
    try:
        mU = MarshalUtil(workPath=self.__workPath)
        schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        # rebuild the schema in place, otherwise recover a cached copy
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
        else:
            filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)
        if not filePath:
            logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
        logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
        schemaDef = mU.doImport(filePath, fmt="json")
        if schemaDef:
            logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
            sd = SchemaDefAccess(schemaDef)
            if sd:
                dbName = sd.getDatabaseName()
                collectionInfoList = sd.getCollectionInfo()
                logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                for collectionD in collectionInfoList:
                    name = collectionD["NAME"]
                    collectionNameList.append(name)
                    docIndexD[name] = sd.getDocumentIndices(name)
    except Exception as e:
        logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))
    return sd, dbName, collectionNameList, docIndexD
def fetch(self):
    """Return the cached provenance dictionary, or an empty dict on failure."""
    try:
        cacheFilePath = self.__reload(self.__provenanceLocator, self.__provenanceCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        return mU.doImport(cacheFilePath, fmt="json")
    except Exception as e:
        logger.exception("Failed retreiving provenance with %s", str(e))
    return {}
def exportBranchedEntityDetails(self, filePath, fmt="json"):
    """Export branched entity details (BIRD mapping and WURCS descriptors)"""
    detailsD = self.getBranchedDetails()
    # ----
    ok = MarshalUtil().doExport(filePath, detailsD, fmt=fmt, indent=3)
    logger.info("Exporting (%d) branched entities status %r", len(detailsD), ok)
    return ok
def __getRegistry(self, registryPath):
    """Import the registry file and return its 'mmcif_dictionary_registry' section (None on failure)."""
    try:
        registryD = MarshalUtil().doImport(registryPath, fmt="json")
        return registryD["mmcif_dictionary_registry"]
    except Exception as e:
        logger.exception("Failing for %r with %s", registryPath, str(e))
    return None
def setUp(self):
    """Set output/data paths and the marshal utility for the tests."""
    outDir = os.path.join(HERE, "test-output")
    self.__cachePath = os.path.join(outDir, "CACHE")
    self.__fastaPath = os.path.join(outDir, "chembl-targets.fa")
    self.__taxonPath = os.path.join(outDir, "chembl-targets-taxon.tdd")
    self.__dataPath = os.path.join(HERE, "test-data")
    self.__mU = MarshalUtil(workPath=self.__cachePath)
def __init__(self, cfgOb, refDbName, **kwargs):
    """Cache reference sequence assignments for the given reference database."""
    self.__cfgOb = cfgOb
    self.__refDbName = refDbName
    self.__mU = MarshalUtil()
    # collect the identifiers assigned to this reference resource, then build/refresh the cache
    idList = self.__getReferenceAssignments(refDbName, **kwargs)
    self.__refIdList = idList
    self.__refD, self.__matchD = self.__rebuildCache(refDbName, idList, **kwargs)
def __reload(self, dirPath, baseVersion, useCache, **kwargs):
    """Fetch (or reuse cached) ChEMBL UniProt mapping and target FASTA data.

    Args:
        dirPath (str): cache directory for downloaded/derived files
        baseVersion (int): starting ChEMBL release number to probe for FASTA data
        useCache (bool): reuse a previously built mapping file when present
        ChEMBLDbUrl (str, optional): base URL of the ChEMBL FTP release area

    Returns:
        dict: mapping of ChEMBL identifier -> mapping row tuple
    """
    startTime = time.time()
    mU = MarshalUtil(workPath=dirPath)
    chemblDbUrl = kwargs.get("ChEMBLDbUrl", "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
    ok = False
    fU = FileUtil()
    fU.mkdir(dirPath)
    #
    # ChEMBL current version <baseVersion>,...
    # template: chembl_<baseVersion>.fa.gz
    #
    targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
    mappingFileName = "chembl_uniprot_mapping.txt"
    #
    chemblTargetPath = os.path.join(dirPath, targetFileName)
    chemblMappingPath = os.path.join(dirPath, mappingFileName)
    # derived JSON form of the mapping - the cache sentinel file
    mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
    #
    mapD = {}
    if useCache and fU.exists(mappingFilePath):
        # cached path: reuse the previously derived JSON mapping
        logger.info("useCache %r using %r and %r and %r", useCache, chemblTargetPath, chemblMappingPath, mappingFilePath)
        mapD = mU.doImport(mappingFilePath, fmt="json")
    else:
        # Get the ChEMBL UniProt mapping file
        url = os.path.join(chemblDbUrl, mappingFileName)
        ok = fU.get(url, chemblMappingPath)
        logger.info("Fetched %r url %s path %s", ok, url, chemblMappingPath)
        logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
        # raw mapping file is tab-delimited: id, then three mapped fields
        rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
        for row in rowL:
            mapD[row[0]] = (row[1], row[2], row[3])
        ok = mU.doExport(mappingFilePath, mapD, fmt="json")
        logger.info("Processed mapping path %s (%d) %r", mappingFilePath, len(mapD), ok)
        #
        # Get the target FASTA files --
        # Probe successive release numbers until a fetch succeeds; the first hit
        # is saved under a fixed raw-file name and sets the provider version.
        for vers in range(baseVersion, baseVersion + 10):
            logger.info("Now fetching version %r", vers)
            self.__version = vers
            targetFileName = "chembl_" + str(vers) + ".fa.gz"
            chemblTargetPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
            url = os.path.join(chemblDbUrl, targetFileName)
            ok = fU.get(url, chemblTargetPath)
            logger.info("Fetched %r url %s path %s", ok, url, chemblTargetPath)
            if ok:
                break
    #
    logger.info("Completed reload at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
    #
    return mapD
def __init__(self, **kwargs):
    """Hold configuration, cache, and sandbox settings for subsequent cache operations.

    Args:
        cfgOb (object, optional): configuration object (default: None)
        cachePath (str, optional): top-level cache directory (default: None)
        sandboxPath (str, optional): exchange sandbox path (default: None)
        filterType (str, optional): filter option string (default: "")
    """
    self.__cfgOb = kwargs.get("cfgOb", None)
    self.__cachePath = kwargs.get("cachePath", None)
    self.__sandboxPath = kwargs.get("sandboxPath", None)
    filterType = kwargs.get("filterType", "")
    self.__filterType = filterType
    # date assignment is enabled via the "assign-dates" token in the filter string
    self.__assignDates = "assign-dates" in filterType
    #
    self.__mU = MarshalUtil(workPath=self.__cachePath)
    self.__currentCacheD = None