Example #1
    def doTransform(self, **kwargs):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList
        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        okS = True  # guard: otherwise okS is unbound below when no updateId is provided
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS
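A hedged usage sketch for the method above: ObjectTransformer is a hypothetical stand-in for the unnamed class that owns doTransform(), and only keyword arguments the method actually reads are passed.

    # Hypothetical caller -- ObjectTransformer and its constructor arguments are
    # assumptions, not shown in the source snippet.
    oTr = ObjectTransformer(cfgOb, cachePath)
    ok = oTr.doTransform(databaseName="pdbx_core",
                         collectionName="pdbx_core_entry",
                         selectionQuery={},
                         fetchLimit=50,
                         updateId="2019_14")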
Example #2
    def testTimeStamps(self):
        """ Verify time stamp operations.
        """
        try:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True)
            logger.debug("TS (UTC) = %s(%d)", tS, len(tS))
            self.assertTrue(len(tS) >= 32)
            #
            tS = tU.getTimestamp(useUtc=False)
            logger.debug("TS = %s(%d)", tS, len(tS))
            self.assertTrue(len(tS) >= 32)

            # self.assertTrue(ok)
            wS1 = tU.getCurrentWeekSignature()
            logger.debug("Current week signature %s", wS1)
            td = datetime.date.today()

            wS2 = tU.getWeekSignature(td.year, td.month, td.day)
            logger.debug("Computed week signature %s", wS2)
            self.assertEqual(wS1, wS2)
            #
            tS = tU.getTimestamp(useUtc=True)
            logger.debug("TS (UTC) = %s(%d)", tS, len(tS))
            self.assertTrue(len(tS) >= 32)
            dt = tU.getDateTimeObj(tS)
            logger.debug("Recycled DT (UTC) %s", dt.isoformat(" "))
            #
            tS = tU.getTimestamp(useUtc=False)
            logger.debug("TS (local) = %s(%d)", tS, len(tS))
            self.assertTrue(len(tS) >= 32)
            #
            dt = tU.getDateTimeObj(tS)
            logger.debug("Recycled DT (local) %s", dt.isoformat(" "))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
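The test above exercises the core TimeUtil operations. A condensed sketch of the same API follows; the import path is an assumption based on the rcsb.utils packaging, and the week-signature format (yyyy_<week_in_year>) is taken from the DataExchangeStatus docstring in Example #7.

    # Condensed sketch of the TimeUtil calls exercised by the test above.
    # The import path is an assumption, not confirmed by this page.
    import datetime
    from rcsb.utils.io.TimeUtil import TimeUtil

    tU = TimeUtil()
    tS = tU.getTimestamp(useUtc=True)      # ISO-format timestamp string
    dt = tU.getDateTimeObj(tS)             # parse the timestamp back to a datetime
    td = datetime.date.today()
    wS1 = tU.getCurrentWeekSignature()     # e.g. "2019_14" (yyyy_<week_in_year>)
    wS2 = tU.getWeekSignature(td.year, td.month, td.day)
    assert wS1 == wS2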
Example #3
    def testTreeLoader(self):
        """Test case - extract entity polymer info"""
        try:
            tU = TimeUtil()
            updateId = tU.getCurrentWeekSignature()
            rhw = TreeNodeListWorker(
                self.__cfgOb,
                self.__cachePath,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=self.__readBackCheck,
                useCache=self.__useCache,
            )
            #
            ok = rhw.load(updateId, loadType=self.__loadType, doLoad=self.__doLoad)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #4
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--full",
        default=True,
        action="store_true",
        help="Fresh full load in a new tables/collections (Default)")
    #
    parser.add_argument("--etl_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="ETL entity sequence clusters")
    parser.add_argument("--etl_repository_holdings",
                        default=False,
                        action="store_true",
                        help="ETL repository holdings")
    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
    # parser.add_argument("--etl_tree_node_lists", default=False, action='store_true', help="ETL tree node lists")

    parser.add_argument(
        "--data_set_id",
        default=None,
        help="Data set identifier (default= 2018_14 for current week)")
    #
    parser.add_argument(
        "--sequence_cluster_data_path",
        default=None,
        help="Sequence cluster data path (default set by configuration)")
    parser.add_argument(
        "--sandbox_data_path",
        default=None,
        help="Data exchange sandbox data path (default set by configuration)")

    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")

    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")

    # parser.add_argument("--document_style", default="rowwise_by_name_with_cardinality",
    #                    help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name")
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    #
    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--document_limit",
                        default=None,
                        help="Load document limit for testing")
    parser.add_argument("--prune_document_size",
                        default=None,
                        help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path",
                        default=None,
                        help="Path containing cache directories")
    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached resource files")
    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    #
    #
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    # useCache = args.use_cache
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature()
        seqDataLocator = args.sequence_cluster_data_path if args.sequence_cluster_data_path else cfgOb.getPath(
            "RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        sandboxPath = args.sandbox_data_path if args.sandbox_data_path else cfgOb.getPath(
            "RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(args.document_limit) if args.document_limit else None

        loadType = "full" if args.full else "replace"
        # loadType = 'replace' if args.replace else 'full'

        cachePath = args.cache_path if args.cache_path else "."
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        #
        # if args.document_style not in ['rowwise_by_name', 'rowwise_by_name_with_cardinality', 'columnwise_by_name', 'rowwise_by_id', 'rowwise_no_name']:
        #    logger.error("Unsupported document style %s" % args.document_style)

        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)
    ##
    okS = True  # default when no ETL option is selected below
    if args.db_type == "mongo":
        if args.etl_entity_sequence_clusters:
            cw = SequenceClustersEtlWorker(cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = loadStatus(cw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.etl_repository_holdings:
            rhw = RepoHoldingsEtlWorker(cfgOb,
                                        sandboxPath,
                                        cachePath,
                                        numProc=numProc,
                                        chunkSize=chunkSize,
                                        documentLimit=documentLimit,
                                        verbose=debugFlag,
                                        readBackCheck=readBackCheck)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        logger.info("Operation completed with status %r " % ok and okS)
Example #5
    def load(self, op, **kwargs):
        # if not self.__cacheStatus:
        #    logger.error("Resource cache test or rebuild has failed - exiting")
        #    return False
        # argument processing
        if op not in ["pdbx-loader", "etl-repository-holdings", "etl-entity-sequence-clusters"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            fileLimit = int(kwargs.get("fileLimit")) if "fileLimit" in kwargs else None
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            failedFilePath = kwargs.get("failFileListPath", None)
            loadFileListPath = kwargs.get("loadFileListPath", None)
            saveInputFileListPath = kwargs.get("saveFileListPath", None)
            schemaLevel = kwargs.get("schemaLevel", "min") if kwargs.get("schemaLevel") in ["min", "full"] else "min"
            loadType = kwargs.get("loadType", "full")  # or replace
            updateSchemaOnReplace = kwargs.get("updateSchemaOnReplace", True)
            pruneDocumentSize = float(kwargs.get("pruneDocumentSize")) if "pruneDocumentSize" in kwargs else None

            # "Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name",
            documentStyle = kwargs.get("documentStyle",
                                       "rowwise_by_name_with_cardinality")
            dbType = kwargs.get("dbType", "mongo")
            #
            databaseName = kwargs.get("databaseName", None)
            databaseNameList = self.__cfgOb.get(
                "DATABASE_NAMES_ALL",
                sectionName="database_catalog_configuration").split(",")
            collectionNameList = kwargs.get("collectionNameList", None)
            mergeValidationReports = kwargs.get("mergeValidationReports", True)
            #
            tU = TimeUtil()
            dataSetId = kwargs.get(
                "dataSetId"
            ) if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            seqDataLocator = self.__cfgOb.getPath(
                "RCSB_SEQUENCE_CLUSTER_DATA_PATH",
                sectionName=self.__configName)
            sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                               sectionName=self.__configName)

        except Exception as e:
            logger.exception(
                "Argument and configuration processing failing with %s",
                str(e))
            return False
        #

        if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
            okS = True
            try:
                inputPathList = None
                if loadFileListPath:
                    mu = MarshalUtil(workPath=self.__cachePath)
                    inputPathList = mu.doImport(loadFileListPath, fmt="list")
                    if not inputPathList:
                        logger.error(
                            "Operation %r missing or empty input file path list %s - exiting",
                            op, loadFileListPath)
                        return False
            except Exception as e:
                logger.exception(
                    "Operation %r processing input path list failing with %s",
                    op, str(e))
                return False
            #
            try:
                mw = PdbxLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    resourceName="MONGO_DB",
                    numProc=numProc,
                    chunkSize=chunkSize,
                    fileLimit=fileLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                )
                ok = mw.load(
                    databaseName,
                    collectionLoadList=collectionNameList,
                    loadType=loadType,
                    inputPathList=inputPathList,
                    styleType=documentStyle,
                    dataSelectors=["PUBLIC_RELEASE"],
                    failedFilePath=failedFilePath,
                    saveInputFileListPath=saveInputFileListPath,
                    pruneDocumentSize=pruneDocumentSize,
                    validationLevel=schemaLevel,
                    mergeContentTypes=["vrpt"]
                    if mergeValidationReports else None,
                    updateSchemaOnReplace=updateSchemaOnReplace,
                )
                okS = self.loadStatus(mw.getLoadStatus(),
                                      readBackCheck=readBackCheck)
            except Exception as e:
                logger.exception("Operation %r database %r failing with %s",
                                 op, databaseName, str(e))
        elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
            cw = SequenceClustersEtlWorker(self.__cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=self.__debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=self.__cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = self.loadStatus(cw.getLoadStatus(),
                                  readBackCheck=readBackCheck)
        elif op == "etl-repository-holdings" and dbType == "mongo":
            rhw = RepoHoldingsEtlWorker(
                self.__cfgOb,
                sandboxPath,
                self.__cachePath,
                numProc=numProc,
                chunkSize=chunkSize,
                documentLimit=documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=readBackCheck,
            )
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = self.loadStatus(rhw.getLoadStatus(),
                                  readBackCheck=readBackCheck)

        logger.info("Completed operation %r with status %r", op, ok and okS)

        return ok and okS
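A hedged sketch of invoking the dispatcher above. The class that owns this load() method is not named on this page, so the instance wf below is hypothetical; only keyword arguments parsed by load() are supplied.

    # Hypothetical invocation -- "wf" is an instance of the unnamed class defining load().
    ok = wf.load(
        "pdbx-loader",
        databaseName="pdbx_core",
        loadType="replace",
        numProc=4,
        chunkSize=10,
        readBackCheck=True,
        mergeValidationReports=True,
    )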
Example #6
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    parser.add_argument(
        "--data_set_id",
        default=None,
        help="Data set identifier (default= 2019_14 for current week)")
    parser.add_argument(
        "--full",
        default=True,
        action="store_true",
        help="Fresh full load in a new tables/collections (Default)")
    parser.add_argument("--etl_chemref",
                        default=False,
                        action="store_true",
                        help="ETL integrated chemical reference data")
    parser.add_argument("--etl_uniprot_core",
                        default=False,
                        action="store_true",
                        help="ETL UniProt core reference data")
    parser.add_argument("--etl_tree_node_lists",
                        default=False,
                        action="store_true",
                        help="ETL tree node lists")
    parser.add_argument("--upd_ref_seq",
                        default=False,
                        action="store_true",
                        help="Update reference sequence assignments")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")
    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--document_limit",
                        default=None,
                        help="Load document limit for testing")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument(
        "--cache_path",
        default=None,
        help="Top cache path for external and local resource files")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached files from remote resources")
    # parser.add_argument("--test_req_seq_cache", default=False, action="store_true", help="Test reference sequence cached files")
    #
    #
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    rebuildCache = args.rebuild_cache
    useCache = not args.rebuild_cache

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=configName,
                           mockTopPath=mockTopPath)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature()
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(args.document_limit) if args.document_limit else None
        loadType = "full" if args.full else "replace"
        cachePath = args.cache_path if args.cache_path else "."

        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    ##
    #  Rebuild or check resource cache
    okS = True
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)
    # if not useCache:
    #    buildResourceCache(cfgOb, configName, cachePath, rebuildCache=True)
    #
    if args.db_type == "mongo":
        if args.etl_tree_node_lists:
            rhw = TreeNodeListWorker(cfgOb,
                                     cachePath,
                                     numProc=numProc,
                                     chunkSize=chunkSize,
                                     documentLimit=documentLimit,
                                     verbose=debugFlag,
                                     readBackCheck=readBackCheck,
                                     useCache=useCache)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.etl_chemref:
            crw = ChemRefEtlWorker(cfgOb,
                                   cachePath,
                                   numProc=numProc,
                                   chunkSize=chunkSize,
                                   documentLimit=documentLimit,
                                   verbose=debugFlag,
                                   readBackCheck=readBackCheck,
                                   useCache=useCache)
            ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
            okS = loadStatus(crw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.etl_uniprot_core:
            crw = UniProtCoreEtlWorker(cfgOb,
                                       cachePath,
                                       numProc=numProc,
                                       chunkSize=chunkSize,
                                       documentLimit=documentLimit,
                                       verbose=debugFlag,
                                       readBackCheck=readBackCheck,
                                       useCache=useCache)
            ok = crw.load(dataSetId, extResource="UniProt", loadType=loadType)
            okS = loadStatus(crw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.upd_ref_seq:
            databaseName = "pdbx_core"
            collectionName = "pdbx_core_polymer_entity"
            polymerType = "Protein"
            ok = doReferenceSequenceUpdate(cfgOb,
                                           databaseName,
                                           collectionName,
                                           polymerType,
                                           cachePath,
                                           useCache,
                                           fetchLimit=documentLimit,
                                           refChunkSize=100)
            okS = ok
        #
        logger.info("Operation completed with status %r " % ok and okS)
Example #7
class DataExchangeStatus(object):
    """
    Create status records for data exchange operations.

    For example,

    loop_
     _rcsb_data_exchange_status.update_id
     _rcsb_data_exchange_status.database
     _rcsb_data_exchange_status.object
     _rcsb_data_exchange_status.update_status_flag
     _rcsb_data_exchange_status.update_begin_timestamp
     _rcsb_data_exchange_status.update_end_timestamp
    2018_23 chem_comp_v5 chem_comp Y '2018-07-11 11:51:37.958508+00:00' '2018-07-11 11:55:03.966508+00:00'
    # ... abbreviated ...

    """
    def __init__(self, **kwargs):
        self.__startTimestamp = None
        self.__endTimestamp = None
        self.__updateId = "unset"
        self.__statusFlag = "N"
        self.__databaseName = "unset"
        self.__objectName = "unset"
        self.__tU = TimeUtil()
        self.__kwargs = kwargs

    def setObject(self, databaseName, objectName):
        """Set the object for current status record.

        Args:
            databaseName (str): database container name
            objectName (str): object name (collection/table) within database

        Returns:
            bool: True for success or False otherwise
        """
        try:
            self.__databaseName = databaseName
            self.__objectName = objectName
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def setStartTime(self, tS=None, useUtc=True):
        """Set the start time for the current exchange operation.

        Args:
            tS (str, optional): timestamp for the start of the update operation (default=current time)
            useUtc (bool, optional): Report times in UTC

        Returns:
            str: isoformat timestamp or None otherwise
        """
        try:
            self.__startTimestamp = tS if tS else self.__tU.getTimestamp(useUtc=useUtc)
            return self.__startTimestamp
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def setEndTime(self, tS=None, useUtc=True):
        """Set the end time for the current exchange operation.

        Args:
            tS (str, optional): timestamp for the end of the update operation (default=current time)
            useUtc (bool, optional): Report times in UTC

        Returns:
            str: isoformat timestamp or None otherwise
        """
        try:
            self.__endTimestamp = tS if tS else self.__tU.getTimestamp(useUtc=useUtc)
            return self.__endTimestamp
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def setStatus(self, updateId=None, successFlag="Y"):
        """Set the update identifier (yyyy_<week_in_year>) and success flag for the current exchange operation.

        Args:
            updateId (str, optional): Update identifier (default=yyyy_<week_in_year>)
            successFlag (str, optional): 'Y'/'N'

        Returns:
            bool: True for success or False otherwise
        """
        try:
            self.__statusFlag = successFlag
            self.__updateId = updateId if updateId else self.__tU.getCurrentWeekSignature()
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getStatus(self, useTimeStrings=False):
        """Get the current data exchange status document.

        Returns:
            dict: the current status record

        """
        try:
            if useTimeStrings:
                sD = {
                    "update_id": self.__updateId,
                    "database_name": self.__databaseName,
                    "object_name": self.__objectName,
                    "update_status_flag": self.__statusFlag,
                    "update_begin_timestamp": self.__startTimestamp,
                    "update_end_timestamp": self.__endTimestamp,
                }
            else:
                sD = {
                    "update_id": self.__updateId,
                    "database_name": self.__databaseName,
                    "object_name": self.__objectName,
                    "update_status_flag": self.__statusFlag,
                    "update_begin_timestamp": self.__tU.getDateTimeObj(self.__startTimestamp),
                    "update_end_timestamp": self.__tU.getDateTimeObj(self.__endTimestamp),
                }
            return sD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
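Based only on the methods defined above, a typical status-record lifecycle looks like the following sketch; the import path is an assumption.

    # Lifecycle sketch for DataExchangeStatus; the import path is assumed.
    from rcsb.db.utils.DataExchangeStatus import DataExchangeStatus

    desp = DataExchangeStatus()
    desp.setStartTime()                                 # begin-of-update timestamp (UTC)
    desp.setObject("pdbx_core", "pdbx_core_entry")      # database and collection names
    ok = True                                           # outcome of the exchange operation
    desp.setStatus(updateId=None, successFlag="Y" if ok else "N")
    desp.setEndTime()                                   # end-of-update timestamp (UTC)
    statusD = desp.getStatus(useTimeStrings=True)       # dict mirroring the loop_ fields above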
Example #8
    def load(self, op, **kwargs):
        logger.info("Starting operation %r\n", op)
        if not self.__cacheStatus:
            logger.error("Resource cache test or rebuild has failed - exiting")
            return False
        # argument processing
        if op not in ["etl_tree_node_lists", "etl_chemref", "etl_uniprot_core", "upd_ref_seq", "upd_ref_seq_comp_models", "refresh_pubchem"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            # test mode and UniProt accession primary match minimum count for doReferenceSequenceUpdate()
            testMode = kwargs.get("testMode", False)
            minMatchPrimaryPercent = kwargs.get("minMatchPrimaryPercent", None)
            minMissing = kwargs.get("minMissing", 0)
            #
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            refChunkSize = int(kwargs.get("refChunkSize", 100))
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            loadType = kwargs.get("loadType", "full")  # or replace
            dbType = kwargs.get("dbType", "mongo")
            tU = TimeUtil()
            dataSetId = kwargs.get("dataSetId") if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            #  Rebuild or reuse reference sequence cache
            rebuildSequenceCache = kwargs.get("rebuildSequenceCache", False)
            useSequenceCache = not rebuildSequenceCache
            #
        except Exception as e:
            logger.exception("Argument or configuration processing failing with %s", str(e))
            return False
        #
        okS = ok = False
        if dbType == "mongo":
            if op == "etl_tree_node_lists":
                rhw = TreeNodeListWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = rhw.load(dataSetId, loadType=loadType)
                okS = self.loadStatus(rhw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "etl_chemref":
                crw = ChemRefEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "etl_uniprot_core":
                crw = UniProtCoreEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="UniProt", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)

            elif op == "upd_ref_seq":
                databaseName = "pdbx_core"
                collectionName = "pdbx_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
            elif op == "upd_ref_seq_comp_models":
                databaseName = "pdbx_comp_model_core"
                collectionName = "pdbx_comp_model_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
        #
        logger.info("Completed operation %r with status %r\n", op, ok and okS)
        return ok and okS
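For completeness, a hedged call into the ETL dispatcher of Example #8. The owning class is again unnamed here, so the instance wf is hypothetical; only keyword arguments read by this load() are passed.

    # Hypothetical invocation of the ETL dispatcher above.
    ok = wf.load(
        "etl_chemref",
        dataSetId="2019_14",
        loadType="full",
        numProc=2,
        chunkSize=10,
        readBackCheck=True,
    )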