Exemple #1
0
    def update(self, databaseName, collectionName, updateDL):
        """Apply each selection/update instruction in updateDL to the target collection.

        Every instruction is passed to MongoDbUtil.update() with upsertFlag=True.
        Nothing is updated when the collection does not exist.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            updateDL (list): instructions of the form [{selectD: ..., updateD: ... }, ....] where,
                selectD = {'ky1': 'val1', 'ky2': 'val2',  ...}
                updateD = {'key1.subkey1...': 'val1', 'key2.subkey2..': 'val2', ...}

        Returns:
            (int): sum of the values returned by the individual update calls
        """
        try:
            numUpdated = 0
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    docCount = mongoUtil.count(databaseName, collectionName)
                    logger.debug("%s %s document count is %d", databaseName, collectionName, docCount)
                    for instructionD in updateDL:
                        # Positional order expected by MongoDbUtil.update(): update content, then selection.
                        numUpdated += mongoUtil.update(
                            databaseName,
                            collectionName,
                            instructionD["updateD"],
                            instructionD["selectD"],
                            upsertFlag=True,
                        )
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numUpdated
Exemple #2
0
    def testInsertSingle(self):
        """Test case - create a collection, insert one document and fetch it back by its object id."""
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                self.assertTrue(mongoUtil.createCollection(self.__dbName, self.__collectionName))
                self.assertTrue(mongoUtil.databaseExists(self.__dbName))
                self.assertTrue(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
                #
                docD = self.__makeDataObj(2, 5, 5)
                objectId = mongoUtil.insert(self.__dbName, self.__collectionName, docD)
                self.assertTrue(objectId is not None)
                # The insert mutates docD by adding an '_id' key, so the inserted and
                # fetched documents are compared without any further adjustment.
                fetchedD = mongoUtil.fetchOne(self.__dbName, self.__collectionName, "_id", objectId)
                logger.debug("Return Object %s", pprint.pformat(fetchedD))
                self.assertEqual(len(docD), len(fetchedD))
                self.assertEqual(docD, fetchedD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
 def getEntityIds(self, entryIdList):
     """Fetch the polymer entity container identifiers for each input entry identifier.

     Queries collection 'pdbx_core_polymer_entity' in database 'pdbx_core'.

     Args:
         entryIdList (list): entry identifiers to look up

     Returns:
         (dict): {entryId: [<rcsb_polymer_entity_container_identifiers content>, ...], ...}
                 (empty or partial if the collection is missing or a failure occurs)
     """
     dbName = "pdbx_core"
     collectionName = "pdbx_core_polymer_entity"
     idKey = "rcsb_polymer_entity_container_identifiers"
     docD = {}
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             if mongoUtil.collectionExists(dbName, collectionName):
                 logger.info("%s %s document count is %d", dbName, collectionName, mongoUtil.count(dbName, collectionName))
                 for entryId in entryIdList:
                     queryD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                     selectL = [idKey]
                     resultL = mongoUtil.fetch(dbName, collectionName, selectL, queryD=queryD)
                     #
                     logger.debug("Selection %r fetch result count %d", selectL, len(resultL))
                     docD[entryId] = [resultD[idKey] for resultD in resultL]
         logger.debug("docD is %r", docD)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return docD
Exemple #4
0
    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return the documents (restricted to the '_id' attribute) matching the input selection query.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            selectionQueryD (dict): query conditions (a false value selects without conditions)

        Returns:
            (list): result of the fetch restricted to ["_id"], or an empty list if the
                    collection does not exist or a failure occurs
        """
        # Bind the result up front so the final return cannot raise UnboundLocalError
        # when the collection is missing or an exception is caught below.
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    # Copy the caller's query rather than passing it through directly.
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName,
                                  collectionName,
                                  selectL,
                                  queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
Exemple #5
0
 def testCreateDropCollection(self):
     """Test case - create a collection, drop it and confirm that it no longer exists."""
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             self.assertTrue(mongoUtil.createCollection(self.__dbName, self.__collectionName))
             self.assertTrue(mongoUtil.databaseExists(self.__dbName))
             self.assertTrue(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
             #
             logger.debug("Databases = %r", mongoUtil.getDatabaseNames())
             logger.debug("Collections = %r", mongoUtil.getCollectionNames(self.__dbName))
             self.assertTrue(mongoUtil.dropCollection(self.__dbName, self.__collectionName))
             logger.debug("Databases = %r", mongoUtil.getDatabaseNames())
             logger.debug("Collections = %r", mongoUtil.getCollectionNames(self.__dbName))
             # Removing the last collection may also remove the database. Results appear to
             # differ between mac and linux, so the database state is queried but not asserted.
             mongoUtil.databaseExists(self.__dbName)
             #
             self.assertFalse(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
             logger.debug("Collections = %r", mongoUtil.getCollectionNames(self.__dbName))
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
Exemple #6
0
 def testInsertList(self):
     """Test case - create a collection, insert a list of documents and fetch each one back."""
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             self.assertTrue(mongoUtil.createCollection(self.__dbName, self.__collectionName))
             self.assertTrue(mongoUtil.databaseExists(self.__dbName))
             self.assertTrue(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
             #
             docL = [self.__makeDataObj(2, 5, 5, ii) for ii in range(100)]
             #
             keyName = "DOC_ID"
             objectIdL = mongoUtil.insertList(self.__dbName, self.__collectionName, docL, keyNames=[keyName], salvage=True)
             self.assertEqual(len(objectIdL), len(docL))
             #
             # The inserted documents are mutated by the insert (an '_id' key is added), so
             # they can be compared directly with the fetched documents.
             #
             for objectId in objectIdL:
                 fetchedD = mongoUtil.fetchOne(self.__dbName, self.__collectionName, "_id", objectId)
                 logger.debug("Return Object %s", pprint.pformat(fetchedD))
                 # The text after the first four characters of DOC_ID is used as the index into docL
                 docIndex = int(fetchedD["DOC_ID"][4:])
                 self.assertEqual(len(docL[docIndex]), len(fetchedD))
                 self.assertEqual(docL[docIndex], fetchedD)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
    def __selectEntries(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit)

        Keyword Args:
            dbName (str): database name. Defaults to "pdbx_core".
            collectionName (str): collection name. Defaults to "pdbx_core_entry".
            entrySelectionQuery (dict): query conditions applied to the fetch. Defaults to {} (no conditions).

        Returns:
            (dict): {entryId: {"polymer_entity_ids": [...]}, ...} for the selected documents that
                    carry an entry_id and a non-empty polymer_entity_ids value
                    (empty or partial if the collection is missing or a failure occurs)
        """

        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("entrySelectionQuery", {})
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        # Copy the caller's conditions into the query. Updating qD with
                        # itself would silently drop the selection.
                        qD.update(selectionQueryD)
                    selectL = ["rcsb_entry_container_identifiers"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dD in dL:
                        idD = dD.get("rcsb_entry_container_identifiers")
                        if not idD or "entry_id" not in idD:
                            continue
                        entityIds = idD.get("polymer_entity_ids")
                        # Entries without polymer entities are skipped
                        if entityIds:
                            entryD[idD["entry_id"]] = {"polymer_entity_ids": entityIds}

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
    def __removeCollection(self, dbName, collectionName):
        """Drop collection within database

        Args:
            dbName (str): database name
            collectionName (str): name of the collection to drop

        Returns:
            (bool): True if the collection no longer exists after the drop, or False otherwise
                    (including when an exception is raised)
        """
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                logger.debug("Remove collection database %s collection %s",
                             dbName, collectionName)
                logger.debug("Starting databases = %r", mg.getDatabaseNames())
                logger.debug("Starting collections = %r",
                             mg.getCollectionNames(dbName))
                dropOk = mg.dropCollection(dbName, collectionName)
                logger.debug("Drop collection status %r", dropOk)
                logger.debug("Databases = %r", mg.getDatabaseNames())
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
                stillExists = mg.collectionExists(dbName, collectionName)
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
            # Success means the collection is gone, so the existence flag is inverted.
            # Returning it directly would report True only when the drop had failed.
            return not stillExists
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Exemple #9
0
    def testSchemaValidation1(self):
        """Test case -  create collection and insert data with schema validation (ext. schema assignment)

        The validator is attached to the already created collection with a 'collMod'
        database command. The first insert is expected to return None (presumably
        rejected by the validator) and the second document is expected to be stored
        and fetched back unchanged.
        """

        #  Example of a Mongo flavor of JsonSchema
        vexpr = {"$jsonSchema": self.__mongoSchema}

        # Build the command as an OrderedDict so that "collMod" stays the first key
        query = [("collMod", self.__collectionName), ("validator", vexpr),
                 ("validationLevel", "moderate")]
        query = OrderedDict(query)

        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                # Start from a clean database
                if mg.databaseExists(self.__dbName):
                    ok = mg.dropDatabase(self.__dbName)
                    self.assertTrue(ok)
                #
                ok = mg.createDatabase(self.__dbName)
                self.assertTrue(ok)
                #
                ok = mg.createCollection(self.__dbName, self.__collectionName)
                self.assertTrue(ok)
                ok = mg.databaseExists(self.__dbName)
                self.assertTrue(ok)
                ok = mg.collectionExists(self.__dbName, self.__collectionName)
                self.assertTrue(ok)
                #
                mg.databaseCommand(self.__dbName, query)
                dObj = {"x": 1}
                rId = mg.insert(self.__dbName, self.__collectionName, dObj)
                logger.info("rId is %r", rId)
                self.assertEqual(rId, None)
                #
                # The test string is written with XML character references. Pasting the
                # decoded characters (including a bare double quote) breaks the string
                # literal and leaves unescapeXmlCharRef() with nothing to convert.
                s2 = unescapeXmlCharRef(
                    " &quot; &Phi; &Psi; &alpha; &#xa3;  &#8453;  &#9734;  &#120171;"
                )
                dObj = {
                    "strField1": "test value",
                    "strField2": s2,
                    "intField1": 50,
                    "enumField1": "v3",
                    "dblField1": 100.1
                }
                rId = mg.insert(self.__dbName, self.__collectionName, dObj)
                logger.info("rId is %r", rId)
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "_id",
                                   rId)
                logger.debug("Return Object %s", pprint.pformat(rObj))
                self.assertEqual(len(dObj), len(rObj))
                self.assertEqual(dObj, rObj)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
    def getEntryInfo(self, **kwargs):
        """Return selected details for the entries matching an experimental method and resolution limit.

        Keyword Args:
            resLimit (float): upper limit applied to refine.0.ls_d_res_high. Defaults to 3.5.
            expMethod (str): value required for rcsb_entry_info.experimental_method. Defaults to "X-ray".
            dbName (str): database name. Defaults to "pdbx_core".
            collectionName (str): collection name. Defaults to "pdbx_core_entry".

        Returns:
            (dict): {entryId: {"polymer_composition": ..., "experimental_method": ..., "ls_d_res_high": ...}, ...}
                    where each inner key is present only if the fetched document provides it
        """
        resLimit = kwargs.get("resLimit", 3.5)
        expMethod = kwargs.get("expMethod", "X-ray")
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mongoUtil.count(dbName, collectionName))
                    queryD = {
                        "rcsb_entry_info.experimental_method": expMethod,
                        "refine.0.ls_d_res_high": {"$lte": resLimit},
                    }
                    selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
                    docL = mongoUtil.fetch(dbName, collectionName, selectL, queryD=queryD)
                    logger.info("Selection %r fetch result count %d", selectL, len(docL))
                    #
                    for docD in docL:
                        if "rcsb_entry_container_identifiers" not in docD:
                            continue
                        entryId = docD["rcsb_entry_container_identifiers"]["entry_id"]
                        # Register the entry first so it is recorded even if no details follow
                        infoD = entryD[entryId] = {}
                        if "rcsb_entry_info" in docD and "polymer_composition" in docD["rcsb_entry_info"]:
                            entryInfoD = docD["rcsb_entry_info"]
                            infoD.update(
                                {
                                    "polymer_composition": entryInfoD["polymer_composition"],
                                    "experimental_method": entryInfoD["experimental_method"],
                                }
                            )
                        # Only the first refine record is consulted for the resolution
                        refineL = docD["refine"] if "refine" in docD else None
                        if refineL and "ls_d_res_high" in refineL[0]:
                            infoD["ls_d_res_high"] = refineL[0]["ls_d_res_high"]
                            logger.debug("Got res %r", refineL[0]["ls_d_res_high"])

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
Exemple #11
0
    def __transform(self,
                    databaseName,
                    collectionName,
                    docSelectList,
                    logIncrement=100):
        """Fetch, optionally filter, and replace each selected document in the target collection.

        Each selected document is fetched by '_id'. If an adapter (self.__oAdapt) is
        set, the document is validated, filtered by the adapter and validated again.
        The result is then written back in place of the stored document.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            docSelectList (list): selection dictionaries; those without an '_id' key are skipped
            logIncrement (int, optional): progress is logged every logIncrement documents. Defaults to 100.

        Returns:
            (bool): True if every attempted replacement succeeded (or the collection does
                    not exist), or False if any replacement failed or an exception occurred
        """
        #
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName,
                                                 collectionName,
                                                 schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id",
                                           dD["_id"])
                        # The object id is removed before validation, filtering and replacement
                        del rObj["_id"]
                        #
                        fOk = True

                        if self.__oAdapt:
                            self.__validateObj(databaseName,
                                               collectionName,
                                               rObj,
                                               label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName,
                                               collectionName,
                                               rObj,
                                               label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName,
                                             rObj, dD)
                            # A None return from replace() is treated as a failure
                            if rOk is None:
                                tId = rObj[
                                    "rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing",
                                             databaseName, collectionName, tId)
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)",
                                        ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # An aborted run must not be reported as a success
            ok = False
        return ok
Exemple #12
0
 def __selectObjects(self, **kwargs):
     """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit)

     Keyword Args:
         databaseName (str): database name. Defaults to "pdbx_core".
         collectionName (str): collection name. Defaults to "pdbx_core_entry".
         selectionQuery (dict): query conditions. Defaults to {} (no conditions).
         uniqueAttributes (list): attributes whose values are joined with "." to form the
                                  key of each returned object. Defaults to ["rcsb_id"].
         objectLimit (int): stop after this many selected documents. Defaults to None (no limit).
         stripObjectId (bool): remove '_id' from each object; otherwise '_id' is converted
                               to a string. Defaults to False.
         logIncrement (int): progress is logged every logIncrement documents. Defaults to 10000.

     Returns:
         (dict): {<joined unique attribute values>: <shallow copy of the object>, ...}
                 (empty or partial if the collection is missing or a failure occurs)
     """
     databaseName = kwargs.get("databaseName", "pdbx_core")
     collectionName = kwargs.get("collectionName", "pdbx_core_entry")
     selectionQueryD = kwargs.get("selectionQuery", {})
     #
     uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
     #
     tV = kwargs.get("objectLimit", None)
     objLimit = int(tV) if tV is not None else None
     stripObjectId = kwargs.get("stripObjectId", False)
     logIncrement = kwargs.get("logIncrement", 10000)
     #
     objectD = {}
     try:
         with Connection(cfgOb=self.__cfgOb,
                         resourceName=self.__resourceName) as client:
             mg = MongoDbUtil(client)
             if mg.collectionExists(databaseName, collectionName):
                 logger.info("%s %s document count is %d", databaseName,
                             collectionName,
                             mg.count(databaseName, collectionName))
                 qD = {}
                 if selectionQueryD:
                     qD.update(selectionQueryD)
                 selectL = ["_id"]
                 # Normalize an empty/None fetch result so the count and the loop agree
                 dL = mg.fetch(databaseName,
                               collectionName,
                               selectL,
                               queryD=qD) or []
                 numDoc = len(dL)
                 logger.info("Selection %r fetch result count %d", selectL,
                             numDoc)
                 #
                 for ii, dD in enumerate(dL, 1):
                     if "_id" not in dD:
                         continue
                     rObj = mg.fetchOne(databaseName, collectionName, "_id",
                                        dD["_id"])
                     if not rObj:
                         # Skip this document rather than abort the whole selection
                         logger.debug("No object recovered for %r", dD["_id"])
                         continue
                     # Only touch '_id' when it is present - remove it or make it a plain string
                     if "_id" in rObj:
                         if stripObjectId:
                             rObj.pop("_id")
                         else:
                             rObj["_id"] = str(rObj["_id"])
                     #
                     stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                     objectD[stKey] = copy.copy(rObj)
                     if objLimit and ii >= objLimit:
                         break
                     logger.debug("Saving %d %s", ii, stKey)
                     if ii % logIncrement == 0 or ii == numDoc:
                         logger.info("Extracting object (%d of %d)", ii,
                                     numDoc)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return objectD
Exemple #13
0
 def count(self, databaseName, collectionName):
     """Return the document count reported for the collection (0 if the collection does not exist).

     Args:
         databaseName (str): target database name
         collectionName (str): target collection name

     Returns:
         (int): value returned by MongoDbUtil.count(), or 0 if the collection is missing
     """
     try:
         numTotal = 0
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             collectionFound = mongoUtil.collectionExists(databaseName, collectionName)
             if collectionFound:
                 numTotal = mongoUtil.count(databaseName, collectionName)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return numTotal
Exemple #14
0
    def testSchemaValidation3(self):
        """Test case -  create collection and insert data with schema validation (warn mode) (integrated schema assignment)

        The schema is passed to createCollection() with validationAction="warn".
        Both inserts are expected to return an object id, and the second document
        is expected to be fetched back unchanged.
        """
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                # Start from a clean database
                if mg.databaseExists(self.__dbName):
                    ok = mg.dropDatabase(self.__dbName)
                    self.assertTrue(ok)
                #
                ok = mg.createDatabase(self.__dbName)
                self.assertTrue(ok)
                #
                ok = mg.createCollection(self.__dbName,
                                         self.__collectionName,
                                         overWrite=True,
                                         bsonSchema=self.__mongoSchema,
                                         validationAction="warn")
                self.assertTrue(ok)
                ok = mg.databaseExists(self.__dbName)
                self.assertTrue(ok)
                ok = mg.collectionExists(self.__dbName, self.__collectionName)
                self.assertTrue(ok)
                #
                dObj = {"x": 1}
                rId = mg.insert(self.__dbName, self.__collectionName, dObj)
                logger.info("rId is %r", rId)
                self.assertNotEqual(rId, None)
                #
                # The test string is written with XML character references. Pasting the
                # decoded characters (including a bare double quote) breaks the string
                # literal and leaves unescapeXmlCharRef() with nothing to convert.
                s2 = unescapeXmlCharRef(
                    " &quot; &Phi; &Psi; &alpha; &#xa3;  &#8453;  &#9734;  &#120171;"
                )
                dObj = {
                    "strField1": "test value",
                    "strField2": s2,
                    "intField1": 50,
                    "enumField1": "v3a",
                    "dblField1": 100.1
                }
                rId = mg.insert(self.__dbName, self.__collectionName, dObj)
                self.assertNotEqual(rId, None)
                logger.info("rId is %r", rId)
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "_id",
                                   rId)
                logger.debug("Return Object %s", pprint.pformat(rObj))
                self.assertEqual(len(dObj), len(rObj))
                self.assertEqual(dObj, rObj)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Exemple #15
0
 def testCreateDatabase(self):
     """Test case - create a database and confirm that creating it a second time also succeeds."""
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             # Two passes: the second create is issued against the already created database
             for _ in range(2):
                 self.assertTrue(mongoUtil.createDatabase(self.__dbName))
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
Exemple #16
0
 def testReplaceList(self):
     """Test case - insert a document list and verify it, then replace/upsert a list twice as long."""
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             nDocs = 10
             mongoUtil = MongoDbUtil(client)
             self.assertTrue(mongoUtil.createCollection(self.__dbName, self.__collectionName))
             self.assertTrue(mongoUtil.databaseExists(self.__dbName))
             self.assertTrue(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
             #
             docL = [self.__makeDataObj(2, 5, 5, ii) for ii in range(nDocs)]
             #
             keyName = "DOC_ID"
             objectIdL = mongoUtil.insertList(self.__dbName, self.__collectionName, docL, keyNames=[keyName], salvage=True)
             self.assertEqual(len(objectIdL), len(docL))
             # Object ids are compared against the inserted documents position by position
             for docD, objectId in zip(docL, objectIdL):
                 fetchedD = mongoUtil.fetchOne(self.__dbName, self.__collectionName, "_id", objectId)
                 self.assertEqual(len(docD), len(fetchedD))
                 self.assertEqual(docD, fetchedD)
             #
             #  Replace with 2x the list length - half of the document ids are duplicates
             docL = [self.__makeDataObj(4, 10, 10, ii) for ii in range(nDocs + nDocs)]
             #
             upsertIdL = mongoUtil.replaceList(self.__dbName, self.__collectionName, docL, ["DOC_ID"], upsertFlag=True)
             #
             logger.info("Upserted id list length %d", len(upsertIdL))
             for ii, docD in enumerate(docL):
                 kVal = "DOC_%d" % ii
                 fetchedD = mongoUtil.fetchOne(self.__dbName, self.__collectionName, "DOC_ID", kVal)
                 if not fetchedD:
                     logger.info("Failing to recover doc %s", kVal)
                 # Object ids are excluded from the comparison on both sides
                 fetchedD.pop("_id", None)
                 docD.pop("_id", None)
                 self.assertEqual(len(docD), len(fetchedD))
                 self.assertEqual(docD, fetchedD)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
Exemple #17
0
    def testReplaceSingle(self):
        """Test case -  create collection and insert document  and then replace document -

        Three steps are verified by fetching the stored document after each one:
        an insert, a replace using the same DOC_ID, and a replace (upsert) using a
        different DOC_ID.
        """
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                ok = mg.createCollection(self.__dbName, self.__collectionName)
                self.assertTrue(ok)
                ok = mg.databaseExists(self.__dbName)
                self.assertTrue(ok)
                ok = mg.collectionExists(self.__dbName, self.__collectionName)
                self.assertTrue(ok)
                #
                dObj = self.__makeDataObj(2, 5, 5, 1)
                rId = mg.insert(self.__dbName, self.__collectionName, dObj)
                self.assertTrue(rId is not None)
                # Note that dObj is mutated by additional key '_id' that is added on insert -
                #
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "_id", rId)
                logger.debug("Return Object %s", pprint.pformat(rObj))
                self.assertEqual(len(dObj), len(rObj))
                self.assertEqual(dObj, rObj)
                #
                # Now replace with a new document with the same document id
                dObj = self.__makeDataObj(3, 2, 2, 1)
                logger.debug("Replace Object %s", pprint.pformat(dObj))

                rId = mg.replace(self.__dbName, self.__collectionName, dObj, {"DOC_ID": "DOC_1"}, upsertFlag=True)
                # self.assertTrue(rId is not None)
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "DOC_ID", "DOC_1")
                # Object ids are excluded from the comparison on both sides
                rObj.pop("_id", None)
                dObj.pop("_id", None)
                logger.debug("Return Object %s", pprint.pformat(rObj))
                self.assertEqual(len(dObj), len(rObj))
                self.assertEqual(dObj, rObj)
                #
                # Now replace with a new document with a different key
                dObj2 = self.__makeDataObj(5, 5, 5, 2)
                # Log the document that is actually sent in this step (dObj2, not dObj)
                logger.debug("Replace Object %s", pprint.pformat(dObj2))
                #
                rId = mg.replace(self.__dbName, self.__collectionName, dObj2, {"DOC_ID": "DOC_2"}, upsertFlag=True)
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "DOC_ID", "DOC_2")
                rObj.pop("_id", None)
                dObj2.pop("_id", None)
                logger.debug("Return Object %s", pprint.pformat(rObj))
                self.assertEqual(len(dObj2), len(rObj))
                self.assertEqual(dObj2, rObj)
                #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Exemple #18
0
    def createCollection(self,
                         databaseName,
                         collectionName,
                         indexAttributeNames=None,
                         indexName="primary",
                         checkExists=False,
                         bsonSchema=None):
        """Create a collection, optionally with a validation schema, and optionally add an index.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): attribute names for the index; no index is created
                                                  when this is empty or None. Defaults to None.
            indexName (str, optional): name given to the index. Defaults to "primary".
            checkExists (bool, optional): reuse an existing collection if True. Defaults to False.
            bsonSchema (object, optional): BSON compatible validation schema. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            logger.debug("Create database %s collection %s", databaseName, collectionName)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                # Reuse is only considered on request and only when both database and collection exist
                reuseExisting = checkExists and mongoUtil.databaseExists(databaseName) and mongoUtil.collectionExists(databaseName, collectionName)
                if reuseExisting:
                    ok1 = True
                else:
                    ok1 = mongoUtil.createCollection(databaseName, collectionName, bsonSchema=bsonSchema)
                # Confirm the end state independently of the creation status
                ok2 = mongoUtil.databaseExists(databaseName)
                ok3 = mongoUtil.collectionExists(databaseName, collectionName)
                if indexAttributeNames:
                    okI = mongoUtil.createIndex(
                        databaseName,
                        collectionName,
                        indexAttributeNames,
                        indexName=indexName,
                        indexType="DESCENDING",
                        uniqueFlag=False,
                    )
                else:
                    okI = True

            return ok1 and ok2 and ok3 and okI
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Exemple #19
0
    def __select(self, **kwargs):
        """Fetch documents matching the selection query and return them in a dictionary
        keyed by the dot-joined values of the unique attributes.

        Keyword options (with defaults): databaseName ("pdbx_core"), collectionName
        ("pdbx_core_entry"), selectionQuery ({}), uniqueAttributes (["rcsb_id"]),
        selectionList ([]), stripObjectId (False) and objectLimit (None).

        An empty (or partially filled) dictionary is returned if the collection is
        missing or a failure is logged along the way.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        queryFilterD = kwargs.get("selectionQuery", {})
        keyAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        projectionL = kwargs.get("selectionList", [])
        dropObjectId = kwargs.get("stripObjectId", False)
        #
        limitArg = kwargs.get("objectLimit", None)
        maxObjects = None if limitArg is None else int(limitArg)
        #
        resultD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if not mongoUtil.collectionExists(databaseName, collectionName):
                    return resultD
                docCount = mongoUtil.count(databaseName, collectionName)
                logger.info("%s %s document count is %d", databaseName, collectionName, docCount)
                # Work on a private copy of the caller's query.
                queryD = dict(queryFilterD) if queryFilterD else {}
                docL = mongoUtil.fetch(databaseName, collectionName, projectionL, queryD=queryD, suppressId=True)
                logger.info("Selection %r fetch result count %d", projectionL, len(docL))
                #
                for position, doc in enumerate(docL, 1):
                    compoundKey = ".".join(doc[attName] for attName in keyAttributes)
                    if dropObjectId and doc and "_id" in doc:
                        doc.pop("_id")
                    resultD[compoundKey] = copy.copy(doc)
                    # A limit of None (or zero) means no limit.
                    if maxObjects and position >= maxObjects:
                        break

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return resultD
Exemple #20
0
    def testSingleIndex(self):
        """Test case - create a collection with a unique single-key index, insert a document list,
        read each document back for comparison, then drop, recreate and rebuild the index."""
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                docCount = 100
                mongoUtil = MongoDbUtil(client)
                dbName, collName = self.__dbName, self.__collectionName
                self.assertTrue(mongoUtil.createCollection(dbName, collName))
                self.assertTrue(mongoUtil.databaseExists(dbName))
                self.assertTrue(mongoUtil.collectionExists(dbName, collName))
                #
                # The index is created before any document is inserted
                indexOk = mongoUtil.createIndex(dbName, collName, keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)
                self.assertTrue(indexOk)

                docL = [self.__makeDataObj(2, 5, 5, idx) for idx in range(docCount)]
                #
                keyName = "DOC_ID"
                insertedIdL = mongoUtil.insertList(dbName, collName, docL, keyNames=[keyName], salvage=True)
                self.assertEqual(len(docL), len(insertedIdL))
                #
                # Compare each stored document with its source, ignoring the '_id' key on both sides
                for idx, sourceDoc in enumerate(docL):
                    fetchedDoc = mongoUtil.fetchOne(dbName, collName, "DOC_ID", "DOC_%d" % idx)
                    fetchedDoc.pop("_id", None)
                    sourceDoc.pop("_id", None)
                    self.assertEqual(len(sourceDoc), len(fetchedDoc))
                    self.assertEqual(sourceDoc, fetchedDoc)
                #
                self.assertTrue(mongoUtil.dropIndex(dbName, collName, indexName="primary"))
                indexOk = mongoUtil.createIndex(dbName, collName, keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)
                self.assertTrue(indexOk)
                self.assertTrue(mongoUtil.reIndex(dbName, collName))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Exemple #21
0
 def testCreateCollectionDropDatabase(self):
     """Test case - create a collection, then drop its database and confirm both are gone."""
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             # Creation must leave both the database and the collection visible.
             self.assertTrue(mongoUtil.createCollection(self.__dbName, self.__collectionName))
             self.assertTrue(mongoUtil.databaseExists(self.__dbName))
             self.assertTrue(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
             #
             # After the drop neither the database nor the collection may be reported.
             self.assertTrue(mongoUtil.dropDatabase(self.__dbName))
             self.assertFalse(mongoUtil.databaseExists(self.__dbName))
             self.assertFalse(mongoUtil.collectionExists(self.__dbName, self.__collectionName))
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
    def __createCollection(self, dbName, collectionName, indexAttributeNames=None, checkExists=False, bsonSchema=None):
        """Create the database collection and, when index attributes are given, its 'primary' index.

        Args:
            dbName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): attribute names for the 'primary' index. Defaults to None (no index).
            checkExists (bool, optional): reuse an existing collection if True. Defaults to False.
            bsonSchema (object, optional): validation schema passed to the collection creation. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            logger.debug("Create database %s collection %s", dbName, collectionName)
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                alreadyThere = checkExists and mongoUtil.databaseExists(dbName) and mongoUtil.collectionExists(dbName, collectionName)
                # Creation is skipped only for an existing collection that the caller agreed to reuse.
                createdOk = True if alreadyThere else mongoUtil.createCollection(dbName, collectionName, bsonSchema=bsonSchema)
                databaseOk = mongoUtil.databaseExists(dbName)
                collectionOk = mongoUtil.collectionExists(dbName, collectionName)
                indexOk = True
                if indexAttributeNames:
                    indexOptions = {"indexName": "primary", "indexType": "DESCENDING", "uniqueFlag": False}
                    indexOk = mongoUtil.createIndex(dbName, collectionName, indexAttributeNames, **indexOptions)

            return createdOk and databaseOk and collectionOk and indexOk
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Exemple #23
0
    def delete(self, databaseName, collectionName, selectD):
        """Delete the documents matching the selection from the target collection.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            selectD (dict): selection conditions, e.g. {'ky1': 'val1', 'ky2': 'val2',  ...}

        Returns:
            the value returned by the underlying delete operation, or 0 if the
            collection does not exist or a failure was logged.
        """
        deletedCount = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                collectionFound = mongoUtil.collectionExists(databaseName, collectionName)
                if collectionFound:
                    currentCount = mongoUtil.count(databaseName, collectionName)
                    logger.info("%s %s document count is %d", databaseName, collectionName, currentCount)
                    deletedCount = mongoUtil.delete(databaseName, collectionName, selectD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return deletedCount
Exemple #24
0
    def __loadDocuments(self,
                        dbName,
                        collectionName,
                        docList,
                        loadType="full",
                        readBackCheck=False,
                        keyNames=None):
        """Load the input document list into the target database collection.

        Args:
            dbName (str): target database name
            collectionName (str): target collection name
            docList (list): documents to be inserted
            loadType (str, optional): when "replace" (and keyNames is provided) documents matching
                the input documents on keyNames are deleted before the insert. Defaults to "full".
            readBackCheck (bool, optional): fetch each inserted document and compare it with the
                input document (only performed when keyNames is provided). Defaults to False.
            keyNames (list, optional): key names identifying a document; required to resolve
                individual successes/failures and for the read back check. Defaults to None.

        Returns:
            (bool, list, list): status, list of successfully loaded documents, list of failed documents.
                On an exception the status is False and the whole input list is returned as failed.
        """
        #
        # Load database/collection with input document list -
        #
        failList = []
        rIdL = []
        successList = []
        logger.debug(
            "Loading dbName %s collectionName %s with document count %d keynames %r",
            dbName, collectionName, len(docList), keyNames)
        if keyNames:
            # map the document list to some document key if this is provided
            # indD maps the key values of each document to its position in docList;
            # indL is the full list of positions (left empty if the mapping fails).
            indD = {}
            indL = []
            try:
                for ii, doc in enumerate(docList):
                    dIdTup = self.__getKeyValues(doc, keyNames)
                    indD[dIdTup] = ii
                indL = list(range(len(docList)))
            except Exception as e:
                logger.exception("Failing ii %d d %r with %s", ii, doc, str(e))
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                # For a replacement load, first remove any documents matching the input documents on keyNames.
                if loadType == "replace" and keyNames:
                    dTupL = mg.deleteList(dbName, collectionName, docList,
                                          keyNames)
                    logger.debug("Deleted document status %r", (dTupL, ))
                #
                rIdL = mg.insertList(dbName,
                                     collectionName,
                                     docList,
                                     keyNames=keyNames)
                logger.debug("Insert returns rIdL length %r", len(rIdL))

                # ---
                #  If there is a failure then determine the specific successes and failures -
                #
                # Default outcome: every document succeeded (successList aliases docList, it is not a copy).
                successList = docList
                failList = []
                if len(rIdL) != len(docList):
                    if keyNames:
                        # Fetch each inserted document by its '_id' and map it back to its docList position.
                        successIndList = []
                        for rId in rIdL:
                            rObj = mg.fetchOne(dbName, collectionName, "_id",
                                               rId)
                            dIdTup = self.__getKeyValues(rObj, keyNames)
                            successIndList.append(indD[dIdTup])
                        # Any position that was not confirmed as inserted is treated as a failure.
                        failIndList = list(set(indL) - set(successIndList))
                        failList = [docList[ii] for ii in failIndList]
                        successList = [docList[ii] for ii in successIndList]
                    else:
                        # fail the whole batch if we don't have visibility into each document
                        failList = docList
                        successList = []
                #
                rbStatus = True
                if readBackCheck and keyNames:
                    #
                    # Note that objects in docList are mutated by the insert operation with the additional key '_id',
                    # hence, it is possible to compare the fetched object with the input object.
                    #
                    for ii, rId in enumerate(rIdL):
                        rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                        dIdTup = self.__getKeyValues(rObj, keyNames)
                        jj = indD[dIdTup]
                        # Stop at the first mismatch between the stored and the input document.
                        if rObj != docList[jj]:
                            rbStatus = False
                            break
                #
                if readBackCheck and not rbStatus:
                    return False, successList, failList
                #
            # Success requires one returned identifier per input document.
            return len(rIdL) == len(docList), successList, failList
        except Exception as e:
            logger.exception("Failing %r %r (len=%d) %s with %s", dbName,
                             collectionName, len(docList), keyNames, str(e))
        return False, [], docList
    def __selectPolymerEntities(self, entryD, **kwargs):
        """Skeleton entity selector recovering essential biological sequence mapping features
        for macromolecules (default type = protein).

        For every entry in entryD that does not already carry `resultKey`, the polymer entity
        documents of the entry are fetched and a reduced dictionary per entity is stored in
        entryD[entryId][resultKey], keyed by entity identifier.

        Args:
            entryD (dict): dictionary keyed by entry identifier (updated in place)
            **kwargs:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_polymer_entity")
                resultKey (str): key receiving the selected entities (default "selected_polymer_entities")
                entryLimit (int): stop after this many processed entries (default None, no limit)
                entitySelectionQuery (dict): extra query conditions
                    (default {"entity_poly.rcsb_entity_polymer_type": "Protein"})

        Returns:
            dict: the input entryD (possibly only partially updated if a failure was logged)

        Example of the content stored for an entry (each entity additionally carries
        'entity_polymer_length' and 'original_taxonomy_ids'):

         "1CP9": {
             "polymer_entity_ids": [
                "1",
                "2"
             ],
             "selected_polymer_entities": {
                "1": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "A"
                   ],
                   "auth_asym_ids": [
                      "A"
                   ],
                   "entity_id": "1",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [
                      {
                         "ncbi_taxonomy_id": 587,
                         "beg_seq_num": 1,
                         "end_seq_num": 205,
                         "ncbi_scientific_name": "Providencia rettgeri"
                      }
                   ],
                   "struct_ref": [
                      {
                         "id": "1",
                         "db_name": "UNP",
                         "pdbx_db_accession": "Q7WZI9",
                         "entity_id": "1",
                         "pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
                         "alignD": {
                            "A": [
                               {
                                  "align_id": "1",
                                  "ref_id": "1",
                                  "pdbx_PDB_id_code": "1CP9",
                                  "pdbx_strand_id": "A",
                                  "seq_align_beg": 1,
                                  "seq_align_end": 205,
                                  "pdbx_db_accession": "Q7WZI9",
                                  "db_align_beg": 24,
                                  "db_align_end": 228,
                                  "pdbx_auth_seq_align_beg": "1",
                                  "pdbx_auth_seq_align_end": "205",
                                  "rcsb_entity_id": "1"
                               }
                            ]
                         }
                      }
                   ]
                },
                "2": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "B"
                   ],
                   "auth_asym_ids": [
                      "B"
                   ],
                   "entity_id": "2",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [ ... as for entity "1" ... ],
                   "struct_ref": [ ... as for entity "1", with alignments keyed by strand "B" ... ]
                }
             }
           },

        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")

        entryLimit = kwargs.get("entryLimit", None)
        selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity.rcsb_multiple_source_flag",
                        "entity_poly.type",
                        "entity_poly.rcsb_entity_polymer_type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "rcsb_entity_source_organism.beg_seq_num",
                        "rcsb_entity_source_organism.end_seq_num",
                        "struct_ref.id",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_align_begin",
                        "struct_ref_seq",
                        #
                        "entity_src_nat.pdbx_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
                        "pdbx_entity_src_syn.ncbi_taxonomy_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        # Entries already carrying results are not fetched again.
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        qD.update(selectionQueryD)
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dD in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dD)
                            if "entity" in dD:
                                rD["rcsb_multiple_source_flag"] = dD["entity"].get("rcsb_multiple_source_flag", "N")
                            #
                            # The identifiers are read from the same key that is projected and tested
                            # (the former 'rcsb_entity_container_identifiers' key is never fetched).
                            if "rcsb_polymer_entity_container_identifiers" in dD:
                                idD = dD["rcsb_polymer_entity_container_identifiers"]
                                rD["asym_ids"] = idD.get("asym_ids", [])
                                rD["auth_asym_ids"] = idD.get("auth_asym_ids", [])
                                rD["entity_id"] = idD["entity_id"]
                            #
                            if "entity_poly" in dD:
                                polyD = dD["entity_poly"]
                                rD["type"] = polyD.get("type")
                                rD["rcsb_entity_polymer_type"] = polyD.get("rcsb_entity_polymer_type")
                                rD["entity_polymer_length"] = len(polyD["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in polyD else 0
                            #
                            rD["rcsb_entity_source_organism"] = list(dD["rcsb_entity_source_organism"]) if "rcsb_entity_source_organism" in dD else []
                            #
                            refDL = []
                            if "struct_ref" in dD:
                                for tD in dD["struct_ref"]:
                                    if "db_name" in tD:
                                        # Normalize the reference database name; TREMBL is reported as UNP.
                                        tD["db_name"] = str(tD["db_name"]).upper().strip()
                                        tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
                                    refDL.append(tD)
                                if "struct_ref_seq" in dD:
                                    # Attach to each reference its alignments, grouped by strand identifier.
                                    for refD in refDL:
                                        refId = refD["id"]
                                        alignL = [tD for tD in dD["struct_ref_seq"] if refId == tD["ref_id"]]
                                        for align in alignL:
                                            authAsymId = align["pdbx_strand_id"]
                                            refD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)

                            rD["struct_ref"] = refDL
                            #
                            # Collect the taxonomy identifiers from the natural, engineered and synthetic source records.
                            taxIdL = []
                            if "entity_src_nat" in dD:
                                for tD in dD["entity_src_nat"]:
                                    if "pdbx_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
                            if "entity_src_gen" in dD:
                                for tD in dD["entity_src_gen"]:
                                    if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
                                    if "pdbx_host_org_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
                            if "pdbx_entity_src_syn" in dD:
                                for tD in dD["pdbx_entity_src_syn"]:
                                    if "ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["ncbi_taxonomy_id"])
                            qL = []
                            for taxId in taxIdL:
                                # Values may be comma separated lists; str() also tolerates integer identifiers.
                                qL.extend(int(t.strip()) for t in str(taxId).split(",") if t.strip().isdigit())
                            logger.debug("TaxId list %r", qL)
                            rD["original_taxonomy_ids"] = list(set(qL))
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 1000 == 0:
                            logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return entryD
    def getPolymerEntities(self, entryD, **kwargs):
        """Add 'selected_polymer_entities' satisfying the input conditions to the input entry dictionary.

        For every entry in entryD lacking `resultKey`, the single-source protein entities of the
        entry are fetched and a reduced dictionary per entity (identifiers, sequence, source
        organism and reference sequence details) is stored in entryD[entryId][resultKey].
        The entry dictionary is exported to savePath after every 2000 processed entries and
        once more at the end.

        Args:
            entryD (dict): dictionary keyed by entry identifier (updated in place)
            **kwargs:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_polymer_entity")
                resultKey (str): key receiving the selected entities (default "selected_polymer_entities")
                savePath (str): export file path (default "entry-data.pic")
                entryLimit (int): stop after this many processed entries (default None, no limit)
                saveKwargs (dict): options passed to the export (default {"fmt": "pickle"})

        Returns:
            dict: the input entryD (possibly only partially updated if a failure was logged)
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        # Bound up front so the final log statement is valid even when the collection does not exist.
        iCount = 0
        #
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    for entryId in entryD:
                        # Entries already carrying results are not fetched again.
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        logger.debug("%s query %r fetch result count %d",
                                     entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            idD = dV.get("rcsb_polymer_entity_container_identifiers", {})
                            if "asym_ids" in idD:
                                rD["asym_ids"] = idD["asym_ids"]
                                rD["entity_id"] = idD["entity_id"]
                            polyD = dV.get("entity_poly", {})
                            if "type" in polyD:
                                rD["type"] = polyD["type"]
                                # The canonical sequence may be absent; record None rather than abort the run.
                                rD["seq_one_letter_code_can"] = polyD.get("pdbx_seq_one_letter_code_can")

                            # Only the first source organism is reported; an empty list is skipped.
                            organismL = dV.get("rcsb_entity_source_organism")
                            if organismL:
                                rD["ncbi_taxonomy_id"] = organismL[0].get("ncbi_taxonomy_id")
                                rD["ncbi_scientific_name"] = organismL[0].get("ncbi_scientific_name")

                            # Reference details are recorded only for an unambiguous (single) reference.
                            if "struct_ref" in dV and len(dV["struct_ref"]) == 1:
                                refD = dV["struct_ref"][0]
                                rD["seq_one_letter_code_ref"] = refD.get("pdbx_seq_one_letter_code")
                                rD["db_accession"] = refD.get("pdbx_db_accession")
                                rD["db_name"] = refD.get("db_name")
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache[
                                    dbAccession] if dbAccession in self.__seqCache else None

                                # Fetch and cache the reference sequence for UNP accessions not cached yet.
                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD[
                                    "db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info(
                                "Completed polymer entities fetch %d/%d entries",
                                iCount, len(entryD))
                        # Periodic checkpoint of the accumulated results.
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer entity results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
    def getEntityInstances(self, entryD, **kwargs):
        """Fetch instance-level validation results for the selected polymer entities and store their analysis.

        For each entry in entryD and each entity in entryD[entryId]['selected_polymer_entities'],
        the 'pdbx_vrpt_instance_results' content is fetched for every instance in the entity's
        'asym_ids'. Two lists of selected attributes are extracted per instance
        ('pdbx_vrpt_instance_results_seq' and 'pdbx_vrpt_instance_results_occ'), passed to
        analEntity(), and the result is stored in place as:

        entryD[entryId]['selected_polymer_entities'][entityId]['anal_instances'] = analD

        Instances with no matching document are logged and left out of the data passed to analEntity().
        Progress is logged every 500 entries, entryD is exported to savePath every 2000 entries
        and once more after the loop completes.

        Args:
            entryD (dict): entry dictionary (updated in place); each value must contain
                           'selected_polymer_entities' and each entity must contain 'asym_ids'
            **kwargs:
                dbName (str): database name (default: 'pdbx_core')
                collectionName (str): collection name (default: 'pdbx_core_polymer_entity_instance')
                savePath (str): export path for entryD (default: 'entry-data.pic')
                saveKwargs (dict): keyword arguments passed to the export method (default: {'fmt': 'pickle'})
                entryLimit (int): stop after processing this many entries (default: None, no limit)

        Returns:
            entryD: the updated input dictionary. Failures are logged rather than raised, so a
                    partially updated dictionary is returned if processing fails part way.
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        # (result key, attributes copied from each instance result, log method used on failure)
        # A missing attribute in the first list is reported as an error, in the second only at debug level.
        extractL = [
            ("pdbx_vrpt_instance_results_seq", ("label_seq_id", "label_comp_id"), logger.error),
            ("pdbx_vrpt_instance_results_occ", ("OWAB", "label_seq_id", "label_comp_id"), logger.debug),
        ]
        #
        try:
            iCount = 0
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    #
                    # entryV is deliberately distinct from the per-instance dictionary (instD) built below -
                    # the two previously shared one name, rebinding the loop variable inside its own loop.
                    for entryId, entryV in entryD.items():
                        for entityId, peD in entryV["selected_polymer_entities"].items():
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id": entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id": asymId,
                                }
                                tL = mg.fetch(dbName, collectionName, ["pdbx_vrpt_instance_results"], queryD=qD)
                                if not tL:
                                    logger.info("No validation data for %s %s %s(%s)", dbName, collectionName, entryId, asymId)
                                    continue
                                # Only the first matching document is used
                                docD = tL[0]
                                logger.debug(">>> %s %s (%s) dict key length %d ", collectionName, entryId, asymId, len(docD))
                                #
                                instD = {}
                                for resultKey, atNameT, logMethod in extractL:
                                    # Each extraction is independent - a failure leaves only its own key unset
                                    try:
                                        irdL = docD["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in docD else []
                                        instD[resultKey] = [{atName: ird[atName] for atName in atNameT} for ird in irdL]
                                    except Exception as e:
                                        logMethod(
                                            "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                            entryId,
                                            entityId,
                                            asymId,
                                            str(e),
                                        )
                                vD[asymId] = instD
                                #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][entityId]["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # Periodic checkpoint of the accumulated results
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity instance results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info("Saved polymer instance results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
Exemple #28
0
    def testSingleIndexSelect(self):
        """Test case - unique single-key index on a new collection, document load, and read-back checks.

        Exercises collection creation, index create/drop/recreate/reindex, per-key fetchOne
        comparisons against the inserted documents, attribute-selected fetches (with and
        without a query), distinct value listing and filtered counts.
        """
        try:
            logger.debug("Starting testSingleIndexSelect")
            dbName = self.__dbName
            collectionName = self.__collectionName
            keyName = "DOC_ID"
            nDocs = 100
            nRows = 5
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)

                def createPrimaryIndex():
                    # Same index definition is used before the insert and again after the drop
                    return mg.createIndex(dbName, collectionName, keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)

                self.assertTrue(mg.createCollection(dbName, collectionName))
                self.assertTrue(mg.databaseExists(dbName))
                self.assertTrue(mg.collectionExists(dbName, collectionName))
                #
                # Index is created before any document is inserted
                self.assertTrue(createPrimaryIndex())
                #
                docL = [self.__makeDataObj(2, 5, nRows, iDoc) for iDoc in range(nDocs)]
                rIdL = mg.insertList(dbName, collectionName, docL, keyNames=[keyName], salvage=True)
                self.assertEqual(len(docL), len(rIdL))
                #
                # Read each document back by key and compare with what was inserted ("_id" excluded on both sides)
                for iDoc, docD in enumerate(docL):
                    rObj = mg.fetchOne(dbName, collectionName, "DOC_ID", "DOC_%d" % iDoc)
                    rObj.pop("_id", None)
                    docD.pop("_id", None)
                    self.assertEqual(len(docD), len(rObj))
                    self.assertEqual(docD, rObj)
                #
                # Drop, recreate and rebuild the index on the populated collection
                self.assertTrue(mg.dropIndex(dbName, collectionName, indexName="primary"))
                self.assertTrue(createPrimaryIndex())
                self.assertTrue(mg.reIndex(dbName, collectionName))
                #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                logger.debug("collection length %d", mg.count(dbName, collectionName))
                #
                # Selected-attribute fetches: first unfiltered, then restricted by a query - both expect nDocs results
                fetchArgL = [
                    (["DOC_ID"], {}),
                    (["category_0.attribute_0"], {"queryD": {"category_0.attribute_0": "val_0_0"}}),
                ]
                for selectL, fetchKwargs in fetchArgL:
                    fetchL = mg.fetch(dbName, collectionName, selectL, **fetchKwargs)
                    self.assertEqual(len(fetchL), nDocs)
                    logger.debug("Fetch length %d", len(fetchL))
                    for iFetch, fetchD in enumerate(fetchL):
                        logger.debug("Fetch num %d: %r", iFetch, fetchD)
                #
                atName = "category_0.attribute_0"
                vL0 = mg.distinct(dbName, collectionName, atName)
                self.assertEqual(len(vL0), nRows + 2)
                logger.debug("vL0 %r", vL0)
                vL1 = mg.distinct(dbName, collectionName, "category_1.attribute_0")
                self.assertEqual(len(vL1), nRows + 2)
                #
                # Lower bound of 100 coincides with nDocs set above
                for val in vL0:
                    num = mg.count(dbName, collectionName, countFilter={atName: val})
                    logger.debug("%s value %s (%d)", atName, val, num)
                    self.assertGreaterEqual(num, 100)
                #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()