def __loadStatus(self, statusList):
     """Append status records to the data exchange status collection.

     The target database and collection names are read from the
     "data_exchange_configuration" section of the configuration object.

     Args:
         statusList: status documents handed to the loader as ``documentList``

     Returns:
         the value returned by ``DocumentLoader.load()``
     """
     cfgSection = "data_exchange_configuration"
     loader = DocumentLoader(
         self.__cfgOb,
         self.__cachePath,
         resourceName=self.__resourceName,
         numProc=self.__numProc,
         chunkSize=self.__chunkSize,
         documentLimit=None,
         verbose=self.__verbose,
         readBackCheck=self.__readBackCheck,
     )
     dbName = self.__cfgOb.get("DATABASE_NAME", sectionName=cfgSection)
     statusCollection = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=cfgSection)
     # Status records are always appended to the existing collection.
     return loader.load(
         dbName,
         statusCollection,
         loadType="append",
         documentList=statusList,
         indexAttributeList=["update_id", "database_name", "object_name"],
         keyNames=None,
     )
 def loadStatus(self, statusList, readBackCheck=True):
     """Append status records to the data exchange status collection.

     Uses a loader bound to the "MONGO_DB" resource with a single process
     and a chunk size of 2. Database and collection names are read from the
     "data_exchange_configuration" configuration section.

     Args:
         statusList: status documents handed to the loader as ``documentList``
         readBackCheck (bool, optional): forwarded to the loader. Defaults to True.

     Returns:
         the value returned by ``DocumentLoader.load()``, or False if any
         exception is raised (the exception is logged, not propagated)
     """
     try:
         loader = DocumentLoader(
             self.__cfgOb,
             self.__cachePath,
             "MONGO_DB",
             numProc=1,
             chunkSize=2,
             documentLimit=None,
             verbose=False,
             readBackCheck=readBackCheck,
         )
         cfgSection = "data_exchange_configuration"
         dbName = self.__cfgOb.get("DATABASE_NAME", sectionName=cfgSection)
         statusCollection = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=cfgSection)
         return loader.load(
             dbName,
             statusCollection,
             loadType="append",
             documentList=statusList,
             indexAttributeList=["update_id", "database_name", "object_name"],
             keyNames=None,
         )
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return False
    def testLoadExchangeStatus(self):
        """Test case - load data exchange status objects.

        Builds one status record per iteration and loads it into the status
        collection. The first iteration uses loadType "full"; every later
        iteration uses "append".

        Configuration options read from section "data_exchange_configuration"
        (example values):

        DATABASE_NAME=data_exchange
        COLLECTION_UPDATE_STATUS=rcsb_data_exchange_status

        """
        try:
            # The loader and the target names do not change between iterations,
            # so they are resolved once rather than on every pass of the loop.
            sectionName = "data_exchange_configuration"
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            statusCollectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            #
            for ii in range(1, 100):
                # Synthetic object name recorded in the status record; this is
                # not the collection the record is loaded into.
                objectName = "my_collection_" + str(ii)
                desp = DataExchangeStatus()
                tS = desp.setStartTime()
                self.assertGreaterEqual(len(tS), 15)
                ok = desp.setObject("my_database", objectName)
                self.assertTrue(ok)
                ok = desp.setStatus(updateId=None, successFlag="Y")
                self.assertTrue(ok)
                #
                tS = desp.setEndTime()
                self.assertGreaterEqual(len(tS), 15)
                dList = [desp.getStatus()]
                logger.debug("Status record %r", dList[0])
                #
                # The first record is loaded with "full"; the rest are appended.
                loadType = "full" if ii == 1 else "append"
                ok = dl.load(
                    databaseName,
                    statusCollectionName,
                    loadType=loadType,
                    documentList=dList,
                    indexAttributeList=["update_id", "database_name", "object_name"],
                    keyNames=None,
                )
                self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
 def testLoadCluster(self):
     """Test case - load example sequence cluster document data.

     Loads the entity member, cluster member and provenance documents into
     the "sequence_clusters" database with loadType "full" and asserts that
     each load returns a truthy status.
     """
     try:
         loader = DocumentLoader(
             self.__cfgOb,
             self.__cachePath,
             self.__resourceName,
             numProc=self.__numProc,
             chunkSize=self.__chunkSize,
             documentLimit=self.__documentLimit,
             verbose=self.__verbose,
             readBackCheck=self.__readBackCheck,
         )

         def _assertFullLoad(collectionName, documents, indexAttributes):
             # Load a single collection and require a truthy loader status.
             status = loader.load(
                 "sequence_clusters",
                 collectionName,
                 loadType="full",
                 documentList=documents,
                 indexAttributeList=indexAttributes,
                 keyNames=None,
             )
             self.assertTrue(status)

         bySequenceD, byClusterD = self.__testExtract(
             dataSetId=self.__dataSetId,
             dataLocator=self.__pathClusterData,
             levels=self.__levels,
         )
         # Entity-level membership documents
         _assertFullLoad(
             "entity_members",
             bySequenceD[self.__entitySchemaName],
             ["data_set_id", "entry_id", "entity_id"],
         )
         # Cluster-level membership documents
         _assertFullLoad(
             "cluster_members",
             byClusterD[self.__clusterSchemaName],
             ["data_set_id", "identity", "cluster_id"],
         )
         # Single provenance document, loaded without index attributes
         _assertFullLoad("cluster_provenance", [self.__fetchProvenance()], None)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
Beispiel #5
0
def loadStatus(statusList, cfgOb, cachePath, readBackCheck=True):
    """Append status records to the data exchange status collection.

    Uses a loader bound to the "MONGO_DB" resource with two processes and a
    chunk size of 2. Database and collection names are read from the
    "data_exchange_configuration" section of ``cfgOb``.

    Args:
        statusList: status documents handed to the loader as ``documentList``
        cfgOb: configuration object providing ``get(key, sectionName=...)``
        cachePath: cache path forwarded to the loader
        readBackCheck (bool, optional): forwarded to the loader. Defaults to True.

    Returns:
        the value returned by ``DocumentLoader.load()``
    """
    cfgSection = "data_exchange_configuration"
    loader = DocumentLoader(
        cfgOb,
        cachePath,
        "MONGO_DB",
        numProc=2,
        chunkSize=2,
        documentLimit=None,
        verbose=False,
        readBackCheck=readBackCheck,
    )
    dbName = cfgOb.get("DATABASE_NAME", sectionName=cfgSection)
    statusCollection = cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=cfgSection)
    return loader.load(
        dbName,
        statusCollection,
        loadType="append",
        documentList=statusList,
        indexAttributeList=["update_id", "database_name", "object_name"],
        keyNames=None,
    )
Beispiel #6
0
    def load(self, updateId, extResource, loadType="full"):
        """Load chemical reference integrated data for the input external resource.

        Only the "DrugBank" resource is handled. Any other resource name is
        logged and rejected without touching the database.

        Args:
            updateId: update identifier recorded with the load status
            extResource (str): external resource name (only "DrugBank" is supported)
            loadType (str, optional): load type passed to the document loader.
                Defaults to "full".

        Returns:
            bool: True if the load steps ran without raising an exception; False
                for an unsupported resource or on any exception. The loader status
                is passed to the status update and does not affect the return value.
        """
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            # Reject unknown resources explicitly instead of falling through to
            # unbound locals further down.
            if extResource != "DrugBank":
                logger.error("Unsupported external resource %r", extResource)
                return False
            #
            databaseName = "drugbank_core"
            configName = self.__cfgOb.getDefaultSectionName()
            user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
            pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
            #
            dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=self.__useCache, username=user, password=pw)
            #
            crExt = ChemRefExtractor(self.__cfgOb)
            idD = crExt.getChemCompAccessionMapping(extResource)
            dList = dbP.getDocuments(mapD=idD)
            #
            logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
            logger.debug("Objects %r", dList[:2])
            sD, _, collectionList, _ = self.__schP.getSchemaInfo(databaseName)
            # The first collection listed for the database is the load target.
            collectionName = collectionList[0] if collectionList else "unassigned"
            indexL = sD.getDocumentIndex(collectionName, "primary")
            logger.info("Database %r collection %r index attributes %r", databaseName, collectionName, indexL)
            #
            # No extra values (such as a schema version) are attached to the
            # loaded documents.
            addValues = {}
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(
                databaseName,
                collectionName,
                loadType=loadType,
                documentList=dList,
                indexAttributeList=indexL,
                keyNames=None,
                addValues=addValues,
            )
            self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)

            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
    def load(self, updateId, loadType="full"):
        """Load legacy repository holdings and status data.

        Five holdings document sets (update, current, unreleased, removed and
        combined) are fetched in turn, loaded into their configured collections
        and a status entry is recorded for each.

        Configuration options read from the worker's own configuration section:

        DISCOVERY_MODE (default "local")
        PDB_REPO_URL (default "https://ftp.wwpdb.org/pub")
        PDB_REPO_FALLBACK_URL (default "https://ftp.wwpdb.org/pub")
        RCSB_EDMAP_LIST_PATH (default None)

        Configuration options read from section "repository_holdings_configuration"
        (example values):

        DATABASE_NAME=repository_holdings
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry

        Args:
            updateId: update identifier passed to the holdings data accessors and
                recorded with each load status
            loadType (str, optional): load type passed to the document loader.
                Defaults to "full".

        Returns:
            bool: True if every step ran without raising an exception, False
                otherwise. Individual loader statuses are recorded via the status
                update and do not affect the return value.
        """
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()

            cfgSection = self.__cfgSectionName
            defaultRepoUrl = "https://ftp.wwpdb.org/pub"
            discoveryMode = self.__cfgOb.get("DISCOVERY_MODE", sectionName=cfgSection, default="local")
            # ---
            baseUrlPDB = self.__cfgOb.getPath("PDB_REPO_URL", sectionName=cfgSection, default=defaultRepoUrl)
            fallbackUrlPDB = self.__cfgOb.getPath("PDB_REPO_FALLBACK_URL", sectionName=cfgSection, default=defaultRepoUrl)
            edMapUrl = self.__cfgOb.getPath("RCSB_EDMAP_LIST_PATH", sectionName=cfgSection, default=None)
            #
            holdingsParts = ("pdb", "holdings")
            statusParts = ("pdb", "data", "status", "latest")
            remoteOptions = {
                "holdingsTargetUrl": os.path.join(baseUrlPDB, *holdingsParts),
                "holdingsFallbackUrl": os.path.join(fallbackUrlPDB, *holdingsParts),
                "edmapsLocator": edMapUrl,
                "updateTargetUrl": os.path.join(baseUrlPDB, *statusParts),
                "updateFallbackUrl": os.path.join(fallbackUrlPDB, *statusParts),
                "filterType": self.__filterType,
            }
            # --- the remote options are only consumed outside "local" discovery mode
            if discoveryMode == "local":
                rhdp = RepoHoldingsDataPrep(
                    cfgOb=self.__cfgOb,
                    sandboxPath=self.__sandboxPath,
                    cachePath=self.__cachePath,
                    filterType=self.__filterType,
                )
            else:
                rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath, **remoteOptions)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            sectionName = "repository_holdings_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            addValues = None
            #
            # (accessor method name, collection configuration key) in load order.
            # Accessors are resolved by name inside the loop so each one is only
            # looked up when its turn comes.
            stageL = (
                ("getHoldingsUpdateEntry", "COLLECTION_HOLDINGS_UPDATE"),
                ("getHoldingsCurrentEntry", "COLLECTION_HOLDINGS_CURRENT"),
                ("getHoldingsUnreleasedEntry", "COLLECTION_HOLDINGS_UNRELEASED"),
                ("getHoldingsRemovedEntry", "COLLECTION_HOLDINGS_REMOVED"),
                ("getHoldingsCombinedEntry", "COLLECTION_HOLDINGS_COMBINED"),
            )
            for accessorName, collectionKey in stageL:
                dList = getattr(rhdp, accessorName)(updateId=updateId)
                collectionName = self.__cfgOb.get(collectionKey, sectionName=sectionName)
                ok = dl.load(
                    databaseName,
                    collectionName,
                    loadType=loadType,
                    documentList=dList,
                    indexAttributeList=["update_id", "entry_id"],
                    keyNames=None,
                    addValues=addValues,
                )
                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
            #
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Beispiel #8
0
    def load(self, updateId, loadType="full", doLoad=True):
        """Load tree node lists and status data.

        Node lists are built for GO, CATH, SCOP, SCOP2, ECOD, EC, taxonomy and
        ATC. When doLoad is set, each list is loaded into its own collection of
        the "tree_node_lists" database and a status entry is recorded. The
        database and collection names are fixed in this method rather than
        read from the configuration:

            tree_go_node_list, tree_cath_node_list, tree_scop_node_list,
            tree_scop2_node_list, tree_ecod_node_list, tree_ec_node_list,
            tree_taxonomy_node_list, tree_atc_node_list

        Args:
            updateId: update identifier recorded with each load status
            loadType (str, optional): load type passed to the document loader.
                Defaults to "full".
            doLoad (bool, optional): when False the node lists are still built
                and logged but nothing is loaded. Defaults to True.

        Returns:
            bool: True if all operations ran without raising an exception, False
                otherwise. Per-collection loader statuses are passed to the status
                update and do not affect the return value.
        """
        try:
            useCache = self.__useCache
            logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
            #
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = "tree_node_lists"
            addValues = None

            def _loadNodeList(collectionName, nodeL):
                # Load one node list and record its status; a no-op when doLoad is False.
                if not doLoad:
                    return
                ok = dl.load(
                    databaseName,
                    collectionName,
                    loadType=loadType,
                    documentList=nodeL,
                    indexAttributeList=["update_id"],
                    keyNames=None,
                    addValues=addValues,
                    schemaLevel=None,
                )
                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)

            # --- GO
            goP = GeneOntologyProvider(goDirPath=os.path.join(self.__cachePath, "go"), useCache=useCache)
            # Surface the cache test result, which was previously discarded.
            logger.info("GO provider cache status %r", goP.testCache())
            anEx = AnnotationExtractor(self.__cfgOb)
            goIdL = anEx.getUniqueIdentifiers("GO")
            logger.info("Unique GO assignments %d", len(goIdL))
            nL = goP.exportTreeNodeList(goIdL)
            logger.info("GO tree node list length %d", len(nL))
            _loadNodeList("tree_go_node_list", nL)
            # ---- CATH
            ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = ccu.getTreeNodeList()
            logger.info("Starting load CATH node tree length %d", len(nL))
            _loadNodeList("tree_cath_node_list", nL)
            # ---- SCOP
            scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = scu.getTreeNodeList()
            logger.info("Starting load SCOP node tree length %d", len(nL))
            _loadNodeList("tree_scop_node_list", nL)
            # --- SCOP2
            scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = scu.getTreeNodeList()
            logger.info("Starting load SCOP2 node tree length %d", len(nL))
            _loadNodeList("tree_scop2_node_list", nL)
            # ---- Ecod
            ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = ecu.getTreeNodeList()
            logger.info("Starting load ECOD node tree length %d", len(nL))
            _loadNodeList("tree_ecod_node_list", nL)
            # ---- EC
            edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = edbu.getTreeNodeList()
            logger.info("Starting load of EC node tree length %d", len(nL))
            _loadNodeList("tree_ec_node_list", nL)
            # ---- Taxonomy
            # Get the taxon coverage in the current data set -
            epe = TaxonomyExtractor(self.__cfgOb)
            tL = epe.getUniqueTaxons()
            logger.info("Taxon coverage length %d", len(tL))
            #
            tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
            # Filter is a set seeded with taxon 1 (presumably the taxonomy root -
            # TODO confirm) and extended with the lineage of every covered taxon.
            fD = {1}
            for taxId in tL:
                fD.update(tU.getLineage(taxId))
            logger.info("Taxon filter dictionary length %d", len(fD))
            #
            nL = tU.exportNodeList(filterD=fD)
            self.__checkTaxonNodeList(nL)
            logger.info("Starting load of taxonomy node tree length %d", len(nL))
            logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
            _loadNodeList("tree_taxonomy_node_list", nL)
            logger.info("Tree loading operations completed.")
            #
            # ---  ATC
            crEx = ChemRefExtractor(self.__cfgOb)
            atcFilterD = crEx.getChemCompAccessionMapping("ATC")
            logger.info("Length of ATC filter %d", len(atcFilterD))
            atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
            nL = atcP.getTreeNodeList(filterD=atcFilterD)
            logger.debug("ATC node list length %d %r", len(nL), nL[:5])
            # The ATC load honors doLoad like every other section.
            _loadNodeList("tree_atc_node_list", nL)
            #
            # ---
            logger.info("Completed tree node list loading operations.\n")
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
 def etl(self, dataSetId, dataLocator=None, loadType="full"):
     """Prepare and load sequence cluster data by entity and by cluster identifier.

     Three collections are loaded in order - entity members, cluster members
     and cluster provenance - and a status entry is recorded after each load.

     Args:
         dataSetId: data set identifier passed to the extraction step and
             recorded with each load status
         dataLocator (optional): data location passed to the extraction step.
             Defaults to None.
         loadType (str, optional): load type passed to the document loader.
             Defaults to "full".

     Returns:
         the combined (``and``) status of the three loads, or False if any
         exception is raised (the exception is logged, not propagated)
     """
     try:
         self.__statusList = []
         desp = DataExchangeStatus()
         statusStartTimestamp = desp.setStartTime()
         #
         bySequenceD, byClusterD = self.__extract(
             dataSetId=dataSetId,
             dataLocator=dataLocator,
             levels=self.__identityLevels,
         )
         #
         loader = DocumentLoader(
             self.__cfgOb,
             self.__cachePath,
             self.__resourceName,
             numProc=self.__numProc,
             chunkSize=self.__chunkSize,
             documentLimit=self.__documentLimit,
             verbose=self.__verbose,
             readBackCheck=self.__readBackCheck,
         )
         #
         databaseName = self.__databaseName
         addValues = None

         def _loadAndRecord(collectionName, documents, indexAttributes):
             # Load one collection, record its status and hand back the loader status.
             status = loader.load(
                 databaseName,
                 collectionName,
                 loadType=loadType,
                 documentList=documents,
                 indexAttributeList=indexAttributes,
                 keyNames=None,
                 addValues=addValues,
             )
             self.__updateStatus(dataSetId, databaseName, collectionName, status, statusStartTimestamp)
             return status

         # Entity-level membership documents
         ok1 = _loadAndRecord(
             self.__entityMemberCollection,
             bySequenceD[self.__entitySchemaName],
             self.__entityMemberCollectionIndexL,
         )
         # Cluster-level membership documents
         ok2 = _loadAndRecord(
             self.__clusterMembersCollection,
             byClusterD[self.__clusterSchemaName],
             self.__clusterMembersCollectionIndexL,
         )
         # Single provenance document, loaded without index attributes
         pD = self.__fetchProvenance()
         ok3 = _loadAndRecord(self.__clusterProvenanceCollection, [pD], None)
         #
         return ok1 and ok2 and ok3
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return False
Beispiel #10
0
    def testLoadHoldingsRemote(self):
        """Test case - load a small sample of remote repository holdings data.

        For each holdings collection (removed, unreleased, update, current and
        combined, in that order) the documents returned by
        RepoHoldingsRemoteDataPrep are truncated to at most ``maxDoc`` entries
        and loaded with ``loadType="full"``.  Every load must report success.

        Configuration section and options read by this test (example values):

        [repository_holdings_configuration]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry

        """
        sectionName = "repository_holdings_configuration"
        # Upper bound on documents loaded per collection; a falsy value disables the cap.
        maxDoc = 5
        try:
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath,
                                              filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            logger.info("databaseName %r", databaseName)
            # No extra values are stamped onto the loaded documents.
            addValues = None
            #
            # (document provider, configuration key naming the target collection)
            loadPlan = (
                (rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED"),
                (rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED"),
                (rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE"),
                (rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT"),
                (rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED"),
            )
            for getDocuments, collectionKey in loadPlan:
                dList = getDocuments(updateId=self.__updateId)
                dList = dList[:maxDoc] if maxDoc else dList
                collectionName = self.__cfgOb.get(collectionKey,
                                                  sectionName=sectionName)
                logger.info("collectionName %r", collectionName)
                ok = dl.load(databaseName,
                             collectionName,
                             loadType="full",
                             documentList=dList,
                             indexAttributeList=["update_id", "entry_id"],
                             keyNames=None,
                             addValues=addValues)
                logger.info("Collection %r length %d load status %r",
                            collectionName, len(dList), ok)
                # Name the collection so a failing load is identifiable in the report.
                self.assertTrue(
                    ok, "Load failed for collection %r" % (collectionName,))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # Carry the cause into the unittest report rather than failing silently.
            self.fail("Failing with %s" % str(e))
Beispiel #11
0
    def load(self, updateId, loadType="full"):
        """Load legacy repository holdings and status data -

        The update, current, unreleased, removed and combined holdings
        collections are loaded in that order.  A failed load does not stop the
        remaining collections from being attempted; each outcome is passed to
        the status tracker and folded into the return value.

        Relevant configuration options:

        [DEFAULT]
        RCSB_EXCHANGE_SANDBOX_PATH=MOCK_EXCHANGE_SANDBOX

        [repository_holdings_configuration]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry

        Args:
            updateId: update identifier passed to the holdings data provider
                and to the status tracker.
            loadType (str): load mode forwarded to DocumentLoader.load()
                (default "full").

        Returns:
            bool: True only if every collection load reported success; False
            if any load failed or an exception was raised.
        """
        sectionName = "repository_holdings_configuration"
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()

            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        cachePath=self.__cachePath,
                                        filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            # No extra values are stamped onto the loaded documents.
            addValues = None
            #
            # (document provider, configuration key naming the target collection)
            loadPlan = (
                (rhdp.getHoldingsUpdateEntry, "COLLECTION_HOLDINGS_UPDATE"),
                (rhdp.getHoldingsCurrentEntry, "COLLECTION_HOLDINGS_CURRENT"),
                (rhdp.getHoldingsUnreleasedEntry, "COLLECTION_HOLDINGS_UNRELEASED"),
                (rhdp.getHoldingsRemovedEntry, "COLLECTION_HOLDINGS_REMOVED"),
                (rhdp.getHoldingsCombinedEntry, "COLLECTION_HOLDINGS_COMBINED"),
            )
            allOk = True
            for getDocuments, collectionKey in loadPlan:
                dList = getDocuments(updateId=updateId)
                collectionName = self.__cfgOb.get(collectionKey,
                                                  sectionName=sectionName)
                ok = dl.load(databaseName,
                             collectionName,
                             loadType=loadType,
                             documentList=dList,
                             indexAttributeList=["update_id", "entry_id"],
                             keyNames=None,
                             addValues=addValues)
                self.__updateStatus(updateId, databaseName, collectionName, ok,
                                    statusStartTimestamp)
                if not ok:
                    # Keep loading the remaining collections, but make sure the
                    # caller learns that this one failed.
                    logger.error("Load failed for database %r collection %r",
                                 databaseName, collectionName)
                    allOk = False
            #
            return allOk
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data for an external resource.

        Only ``extResource == "UniProt"`` is handled (database "uniprot_core").
        Any other value is logged as unsupported and the method returns False
        without validating or loading anything.

        Args:
            updateId: update identifier passed to the status tracker.
            extResource (str): name of the external resource to load.
            loadType (str): load mode forwarded to DocumentLoader.load()
                (default "full").

        Returns:
            False when the resource is unsupported, when the reference sequence
            provider reports failure, or when an exception is raised; otherwise
            the value of ``ok and okS`` (load status and status-update result).
        """
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            if extResource != "UniProt":
                # Stop here: there is no database/collection to validate or load,
                # and falling through would only fail later on unset locals.
                logger.error("Unsupported external resource %r", extResource)
                return False
            #
            databaseName = "uniprot_core"
            ok, rsP = self.__getReferenceSequenceProvider()
            if not ok:
                return False
            #
            dList = rsP.getDocuments()
            logger.info("Resource %r extracted mapped document length %d",
                        extResource, len(dList))
            logger.debug("Objects %r", dList[:2])
            #
            # The first collection listed for the database is the load target.
            cDL = self.__docHelper.getCollectionInfo(databaseName)
            collectionName = cDL[0]["NAME"]
            collectionVersion = cDL[0]["VERSION"]
            indexL = self.__docHelper.getDocumentIndexAttributes(
                collectionName, "primary")
            logger.info(
                "Database %r collection %r version %r index attributes %r",
                databaseName, collectionName, collectionVersion, indexL)
            addValues = {}
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName,
                                                     collectionName,
                                                     schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName,
                                       collectionName,
                                       dObj,
                                       label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName,
                         collectionName,
                         loadType=loadType,
                         documentList=dList,
                         indexAttributeList=indexL,
                         keyNames=None,
                         addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName,
                                      ok, statusStartTimestamp)

            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False