def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full", default=False, action="store_true", help="Fresh full load in new tables/collections")
    parser.add_argument("--replace", default=False, action="store_true", help="Load with replacement in an existing table/collection (default)")
    #
    parser.add_argument("--load_chem_comp_ref", default=False, action="store_true", help="Load Chemical Component reference definitions (public subset)")
    parser.add_argument("--load_chem_comp_core_ref", default=False, action="store_true", help="Load Chemical Component Core reference definitions (public subset)")
    parser.add_argument("--load_bird_chem_comp_ref", default=False, action="store_true", help="Load Bird Chemical Component reference definitions (public subset)")
    parser.add_argument("--load_bird_chem_comp_core_ref", default=False, action="store_true", help="Load Bird Chemical Component Core reference definitions (public subset)")
    parser.add_argument("--load_bird_ref", default=False, action="store_true", help="Load Bird reference definitions (public subset)")
    parser.add_argument("--load_bird_family_ref", default=False, action="store_true", help="Load Bird Family reference definitions (public subset)")
    parser.add_argument("--load_entry_data", default=False, action="store_true", help="Load PDBx entry data (current released subset)")
    parser.add_argument("--load_pdbx_core", default=False, action="store_true", help="Load all PDBx core collections (current released subset)")
    parser.add_argument("--load_pdbx_core_merge", default=False, action="store_true", help="Load all PDBx core collections with merged content (current released subset)")
    parser.add_argument("--load_pdbx_core_entry", default=False, action="store_true", help="Load PDBx core entry (current released subset)")
    parser.add_argument("--load_pdbx_core_entity", default=False, action="store_true", help="Load PDBx core entity (current released subset)")
    parser.add_argument("--load_pdbx_core_entity_monomer", default=False, action="store_true", help="Load PDBx core entity monomer (current released subset)")
    parser.add_argument("--load_pdbx_core_assembly", default=False, action="store_true", help="Load PDBx core assembly (current released subset)")
    parser.add_argument("--load_ihm_dev", default=False, action="store_true", help="Load I/HM DEV model data (current released subset)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
    parser.add_argument(
        "--document_style",
        default="rowwise_by_name_with_cardinality",
        help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)",
    )
    parser.add_argument("--read_back_check", default=False, action="store_true", help="Perform read back check on all documents")
    parser.add_argument("--schema_level", default=None, help="Schema validation level (full|min, default=None)")
    #
    parser.add_argument("--load_file_list_path", default=None, help="Input file containing load file path list (overrides automatic repository scan)")
    parser.add_argument("--fail_file_list_path", default=None, help="Output file containing file paths that fail to load")
    parser.add_argument("--save_file_list_path", default=None, help="Save repo file paths from automatic file system scan in this path")
    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--file_limit", default=None, help="Load file limit for testing")
    parser.add_argument("--prune_document_size", default=None, help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Cache path for resource files")
    parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
    parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    parser.add_argument("--vrpt_repo_path", default=None, help="Path to validation report repository")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ------------------------- Configuration Details -------------------------
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
        if args.vrpt_repo_path:
            vrptPath = args.vrpt_repo_path
            if not os.access(vrptPath, os.R_OK):
                logger.error("Unreadable validation report repository path %r", vrptPath)
            envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
            os.environ[envName] = vrptPath
            logger.info("Using alternate validation report path %s", os.getenv(envName))
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    try:
        readBackCheck = args.read_back_check
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        failedFilePath = args.fail_file_list_path
        fPath = args.load_file_list_path
        schemaLevel = args.schema_level if args.schema_level in ["min", "full", "minimum"] else None
        # "replace" is the documented default; "--full" selects a fresh full load
        loadType = "full" if args.full else "replace"
        saveInputFileListPath = args.save_file_list_path
        pruneDocumentSize = float(args.prune_document_size) if args.prune_document_size else None
        cachePath = args.cache_path if args.cache_path else "."
        cachePath = os.path.abspath(cachePath)
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        if args.document_style not in ["rowwise_by_name", "rowwise_by_name_with_cardinality", "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"]:
            logger.error("Unsupported document style %s", args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ------------------- Rebuild or check resource cache ---------------------
    okS = True
    ok = buildResourceCache(cfgOb, configName, cachePath, rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r", rebuildCache, cachePath)
        exit(1)
    # ------------------------ Read any input path lists ----------------------
    inputPathList = None
    if fPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(fPath, fmt="list")
        if not inputPathList:
            logger.error("Missing or empty input file path list %s", fPath)
            exit(1)
    #
    if args.db_type == "mongo":
        mw = PdbxLoader(
            cfgOb,
            cachePath,
            resourceName="MONGO_DB",
            numProc=numProc,
            chunkSize=chunkSize,
            fileLimit=fileLimit,
            verbose=debugFlag,
            readBackCheck=readBackCheck,
            rebuildSchemaFlag=rebuildSchemaFlag,
        )
        # Keyword arguments shared by all of the load operations below
        loadKwArgs = {
            "loadType": loadType,
            "inputPathList": inputPathList,
            "styleType": args.document_style,
            "dataSelectors": ["PUBLIC_RELEASE"],
            "failedFilePath": failedFilePath,
            "saveInputFileListPath": saveInputFileListPath,
            "pruneDocumentSize": pruneDocumentSize,
            "validationLevel": schemaLevel,
        }
        if args.load_chem_comp_ref:
            ok = mw.load("chem_comp", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_chem_comp_core_ref:
            ok = mw.load("chem_comp_core", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_chem_comp_ref:
            ok = mw.load("bird_chem_comp", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_chem_comp_core_ref:
            ok = mw.load("bird_chem_comp_core", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_ref:
            ok = mw.load("bird", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_family_ref:
            # Bird family uses its own data selector
            ok = mw.load("bird_family", **dict(loadKwArgs, dataSelectors=["BIRD_FAMILY_PUBLIC_RELEASE"]))
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_entry_data:
            ok = mw.load("pdbx", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_pdbx_core:
            ok = mw.load("pdbx_core", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_merge:
            # Merge validation report ("vrpt") content into the core collections
            ok = mw.load("pdbx_core", mergeContentTypes=["vrpt"], **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entity"], **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity_monomer:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entity_monomer"], **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entry:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entry"], **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_pdbx_core_assembly:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_assembly"], **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_ihm_dev:
            ok = mw.load("ihm_dev", **loadKwArgs)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
    #
    logger.info("Operation completed with status %r", ok and okS)
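# Example invocation (an illustrative sketch; the script name and option values
# below are assumptions -- only the command-line flags are defined above):
#
#   python RepoLoadExec.py --config_path ./dbload-setup.yml \
#       --config_name site_info_configuration --db_type mongo \
#       --load_chem_comp_ref --num_proc 4 --chunk_size 10 --schema_level min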
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full", default=True, action="store_true", help="Fresh full load in new tables/collections (Default)")
    #
    parser.add_argument("--etl_entity_sequence_clusters", default=False, action="store_true", help="ETL entity sequence clusters")
    parser.add_argument("--etl_repository_holdings", default=False, action="store_true", help="ETL repository holdings")
    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
    # parser.add_argument("--etl_tree_node_lists", default=False, action="store_true", help="ETL tree node lists")
    parser.add_argument("--data_set_id", default=None, help="Data set identifier (default=current week signature, e.g. 2018_14)")
    #
    parser.add_argument("--sequence_cluster_data_path", default=None, help="Sequence cluster data path (default set by configuration)")
    parser.add_argument("--sandbox_data_path", default=None, help="Data exchange sandbox data path (default set by configuration)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
    # parser.add_argument("--document_style", default="rowwise_by_name_with_cardinality",
    #                     help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)")
    parser.add_argument("--read_back_check", default=False, action="store_true", help="Perform read back check on all documents")
    #
    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--document_limit", default=None, help="Load document limit for testing")
    parser.add_argument("--prune_document_size", default=None, help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Path containing cache directories")
    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
    parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    #
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ------------------------- Configuration Details -------------------------
    configPath = args.config_path
    configName = args.config_name
    # useCache = args.use_cache
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature()
        seqDataLocator = args.sequence_cluster_data_path if args.sequence_cluster_data_path else cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        sandboxPath = args.sandbox_data_path if args.sandbox_data_path else cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(args.document_limit) if args.document_limit else None
        loadType = "full" if args.full else "replace"
        # loadType = "replace" if args.replace else "full"
        cachePath = args.cache_path if args.cache_path else "."
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        # if args.document_style not in ["rowwise_by_name", "rowwise_by_name_with_cardinality", "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"]:
        #     logger.error("Unsupported document style %s", args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ------------------- Rebuild or check resource cache ---------------------
    # (okS is initialized here so the final status report works when no ETL
    #  option is selected)
    okS = True
    ok = buildResourceCache(cfgOb, configName, cachePath, rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r", rebuildCache, cachePath)
        exit(1)
    #
    if args.db_type == "mongo":
        if args.etl_entity_sequence_clusters:
            cw = SequenceClustersEtlWorker(cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = loadStatus(cw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.etl_repository_holdings:
            rhw = RepoHoldingsEtlWorker(cfgOb, sandboxPath, cachePath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
    logger.info("Operation completed with status %r", ok and okS)
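# Example invocation (an illustrative sketch; the script name and data set id
# are assumptions):
#
#   python EtlExec.py --config_path ./dbload-setup.yml --etl_repository_holdings \
#       --data_set_id 2019_23 --num_proc 2 --read_back_check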
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--update_chem_comp_ref", default=False, action="store_true", help="Update schema for Chemical Component reference definitions")
    parser.add_argument("--update_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Chemical Component reference definitions")
    parser.add_argument("--update_bird_chem_comp_ref", default=False, action="store_true", help="Update schema for Bird Chemical Component reference definitions")
    parser.add_argument("--update_bird_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Bird Chemical Component reference definitions")
    parser.add_argument("--update_bird_ref", default=False, action="store_true", help="Update schema for Bird reference definitions")
    parser.add_argument("--update_bird_family_ref", default=False, action="store_true", help="Update schema for Bird Family reference definitions")
    parser.add_argument("--update_pdbx", default=False, action="store_true", help="Update schema for PDBx entry data")
    parser.add_argument("--update_pdbx_core", default=False, action="store_true", help="Update schema for PDBx core entry/entity data")
    parser.add_argument("--update_pdbx_comp_model_core", default=False, action="store_true", help="Update schema for PDBx computational model core entry/entity data")
    #
    parser.add_argument("--update_repository_holdings", default=False, action="store_true", help="Update schema for repository holdings")
    parser.add_argument("--update_entity_sequence_clusters", default=False, action="store_true", help="Update schema for entity sequence clusters")
    parser.add_argument("--update_data_exchange", default=False, action="store_true", help="Update schema for data exchange status")
    parser.add_argument("--update_ihm_dev", default=False, action="store_true", help="Update schema for I/HM dev entry data")
    parser.add_argument("--update_drugbank_core", default=False, action="store_true", help="Update DrugBank schema")
    #
    parser.add_argument("--update_config_all", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument("--update_config_deployed", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument("--update_config_test", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    #
    parser.add_argument("--cache_path", default=None, help="Schema cache directory path")
    parser.add_argument("--encoding_types", default=None, help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument("--validation_levels", default=None, help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only", default=False, action="store_true", help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ------------------------- Configuration Details -------------------------
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    databaseNameList = []
    if args.update_chem_comp_ref:
        databaseNameList.append("chem_comp")
    if args.update_bird_chem_comp_ref:
        databaseNameList.append("bird_chem_comp")
    if args.update_chem_comp_core_ref:
        databaseNameList.append("chem_comp_core")
    if args.update_bird_chem_comp_core_ref:
        databaseNameList.append("bird_chem_comp_core")
    if args.update_bird_ref:
        databaseNameList.append("bird")
    if args.update_bird_family_ref:
        databaseNameList.append("bird_family")
    if args.update_pdbx:
        databaseNameList.append("pdbx")
    if args.update_pdbx_core:
        databaseNameList.append("pdbx_core")
    if args.update_pdbx_comp_model_core:
        databaseNameList.append("pdbx_comp_model_core")
    if args.update_repository_holdings:
        databaseNameList.append("repository_holdings")
    if args.update_entity_sequence_clusters:
        databaseNameList.append("sequence_clusters")
    if args.update_data_exchange:
        databaseNameList.append("data_exchange")
    if args.update_ihm_dev:
        databaseNameList.append("ihm_dev")
    if args.update_drugbank_core:
        databaseNameList.append("drugbank_core")
    #
    if args.update_config_deployed:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_DEPLOYED", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_DEPLOYED", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_DEPLOYED", sectionName="database_catalog_configuration")
    if args.update_config_all:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_ALL", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_ALL", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_ALL", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_ALL", sectionName="database_catalog_configuration")
    if args.update_config_test:
        databaseNameList = cfgOb.getList("DATABASE_NAMES_TEST", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList("DATATYPING_TEST", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList("ENCODING_TYPES_TEST", sectionName="database_catalog_configuration")
    #
    scnD = cfgOb.get("document_collection_names", sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName, dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r", difPathList)
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName, collectionName, encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)
    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    for dataTyping in dataTypingList:
                        logger.info("Creating schema definition for content type %s data typing %s", databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
                else:
                    if databaseName in scnD:
                        for dD in scnD[databaseName]:
                            collectionName = dD["NAME"]
                            for validationLevel in validationLevels:
                                logger.info("Creating %r schema for content type %s collection %s", encodingType, databaseName, collectionName)
                                schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=validationLevel, saveSchema=True)
def main():
    parser = argparse.ArgumentParser()
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--scanType", default="full", help="Repository scan type (full|incr)")
    #
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--scan_chem_comp_ref", default=False, action="store_true", help="Scan Chemical Component reference definitions (public subset)")
    group.add_argument("--scan_chem_comp_core_ref", default=False, action="store_true", help="Scan Chemical Component Core reference definitions (public subset)")
    group.add_argument("--scan_bird_chem_comp_ref", default=False, action="store_true", help="Scan Bird Chemical Component reference definitions (public subset)")
    group.add_argument("--scan_bird_chem_comp_core_ref", default=False, action="store_true", help="Scan Bird Chemical Component Core reference definitions (public subset)")
    group.add_argument("--scan_bird_ref", default=False, action="store_true", help="Scan Bird reference definitions (public subset)")
    group.add_argument("--scan_bird_family_ref", default=False, action="store_true", help="Scan Bird Family reference definitions (public subset)")
    group.add_argument("--scan_entry_data", default=False, action="store_true", help="Scan PDB entry data (current released subset)")
    group.add_argument("--scan_obsolete_entry_data", default=False, action="store_true", help="Scan obsolete PDB entry data")
    group.add_argument("--scan_comp_model_data", default=False, action="store_true", help="Scan computational model files (mock-data subset)")
    group.add_argument("--scan_ihm_dev", default=False, action="store_true", help="Scan PDBDEV I/HM entry data (current released subset)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    parser.add_argument("--input_file_list_path", default=None, help="Input file containing file paths to scan")
    parser.add_argument("--output_file_list_path", default=None, help="Output file containing file paths scanned")
    parser.add_argument("--fail_file_list_path", default=None, help="Output file containing file paths that fail scan")
    parser.add_argument("--scan_data_file_path", default=None, help="Output working file storing scan data (Pickle)")
    parser.add_argument("--coverage_file_path", default=None, help="Coverage map (JSON) output path")
    parser.add_argument("--coverage_item_file_path", default=None, help="Coverage by item (tdd) output path")
    parser.add_argument("--type_map_file_path", default=None, help="Type map (JSON) output path")
    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--file_limit", default=None, help="Load file limit for testing")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Cache path and working directory for temporary files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ------------------------- Configuration Details -------------------------
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    try:
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        #
        failedFilePath = args.fail_file_list_path
        scanType = args.scanType
        #
        inputFileListPath = args.input_file_list_path
        outputFileListPath = args.output_file_list_path
        scanDataFilePath = args.scan_data_file_path
        dataCoverageFilePath = args.coverage_file_path
        dataCoverageItemFilePath = args.coverage_item_file_path
        dataTypeFilePath = args.type_map_file_path
        cachePath = args.cache_path if args.cache_path else "."
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ------------------------ Read any input path lists ----------------------
    inputPathList = None
    if inputFileListPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(inputFileListPath, fmt="list")
    #
    # Map the selected scan option to a content type. Note that the non-core
    # reference flags map onto the corresponding core content types.
    if args.scan_chem_comp_ref:
        contentType = "chem_comp_core"
    elif args.scan_chem_comp_core_ref:
        contentType = "chem_comp_core"
    elif args.scan_bird_chem_comp_ref:
        contentType = "bird_chem_comp_core"
    elif args.scan_bird_chem_comp_core_ref:
        contentType = "bird_chem_comp_core"
    elif args.scan_bird_ref:
        contentType = "bird"
    elif args.scan_bird_family_ref:
        contentType = "bird_family"
    elif args.scan_entry_data:
        contentType = "pdbx"
    elif args.scan_obsolete_entry_data:
        contentType = "pdbx_obsolete"
    elif args.scan_comp_model_data:
        contentType = "pdbx_comp_model_core"
    elif args.scan_ihm_dev:
        contentType = "ihm_dev"
    else:
        # Guard against an otherwise unbound contentType when no scan option is given
        logger.error("No scan option specified")
        parser.print_help(sys.stderr)
        exit(1)

    ok = scanRepo(
        cfgOb,
        contentType,
        scanDataFilePath,
        numProc,
        chunkSize,
        fileLimit,
        scanType=scanType,
        inputPathList=inputPathList,
        pathListFilePath=outputFileListPath,
        dataCoverageFilePath=dataCoverageFilePath,
        dataCoverageItemFilePath=dataCoverageItemFilePath,
        dataTypeFilePath=dataTypeFilePath,
        failedFilePath=failedFilePath,
        cachePath=cachePath,
    )
    logger.info("Operation completed with status %r", ok)
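# Example invocation (an illustrative sketch; the script name and output paths
# are assumptions):
#
#   python ScanRepoExec.py --config_path ./dbload-setup.yml --scan_entry_data \
#       --scan_data_file_path ./scan-data.pic --type_map_file_path ./type-map.json \
#       --num_proc 4 --cache_path ./CACHE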
class ReferenceSequenceUtilsTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(ReferenceSequenceUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        # Caution: this is a very site-specific setting!
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            self.__cfgOb.replaceSectionName("site_info_configuration", configName)
        #
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__entityPolymerCachePath = os.path.join(self.__workPath, "entity-polymer-data-cache.pic")
        self.__entityPolymerCacheKwargs = {"fmt": "pickle"}
        self.__useEntityPolymerCache = True
        #
        self.__refDbCachePath = os.path.join(self.__workPath, "unp-data-test-cache.json")
        self.__refDbCacheKwargs = {"fmt": "json", "indent": 3}
        self.__refDbUseCache = True
        self.__fetchLimit = 500
        #
        self.__mU = MarshalUtil()
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testUpdateUniProtCache(self):
        """Test case - extract entity polymer info and update reference sequence cache"""
        try:
            refDbName = "UNP"
            rsu = ReferenceSequenceUtils(
                self.__cfgOb,
                refDbName,
                referenceCachePath=self.__refDbCachePath,
                referenceCacheKwargs=self.__refDbCacheKwargs,
                useReferenceCache=self.__refDbUseCache,
                entityPolymerCachePath=self.__entityPolymerCachePath,
                entityPolymerCacheKwargs=self.__entityPolymerCacheKwargs,
                useEntityPolymerCache=self.__useEntityPolymerCache,
                fetchLimit=self.__fetchLimit,
            )
            numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()
            self.assertGreaterEqual(numPrimary, 70)
            logger.info("For %r matched primary: %d secondary: %d none %d", refDbName, numPrimary, numSecondary, numNone)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class EntityPolymerExtractorFullTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFullTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        # Caution: this is a very site-specific setting!
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            self.__cfgOb.replaceSectionName("site_info_configuration", configName)
        #
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__fullCacheKwargs = {"fmt": "pickle"}
        self.__fullEntitySaveCachePath = os.path.join(self.__workPath, "entity-polymer-data-cache.pic")
        #
        self.__mU = MarshalUtil()
        self.__entryLimitFull = 50
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    @unittest.skip("rebuild cache")
    def testRebuildCache(self):
        """Test case - extract entity polymer info - rebuild full cache of extracted entity polymer data"""
        try:
            epe = EntityPolymerExtractor(
                self.__cfgOb,
                saveCachePath=self.__fullEntitySaveCachePath,
                useCache=False,
                saveCacheKwargs=self.__fullCacheKwargs,
                entryLimit=self.__entryLimitFull,
            )
            eCount = epe.getEntryCount()
            if self.__entryLimitFull is not None:
                self.assertGreaterEqual(eCount, self.__entryLimitFull)
            else:
                self.assertGreaterEqual(eCount, 10)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitFull)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            testOp = False
            if testOp:
                for entryId in ["1CP9"]:
                    for entityId in ["1", "2"]:
                        uL = epe.getEntityRefSeqAccessions("UNP", entryId, entityId)
                        logger.debug("UNP for %s %s %r", entryId, entityId, uL)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access cached entity polymer info from full cache"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            #
            logger.info("UNP reference sequences per entity %r", dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r", dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyEntityPolymerReadCache(self):
        """Test case - evaluate taxonomy - from full cache"""
        try:
            taxIdList = [562, 9606, 3701]
            for taxId in taxIdList:
                tU = TaxonomyUtils(taxDirPath=self.__workPath)
                tL = tU.getLineage(taxId)
                logger.info("Taxonomy lineage for %d %r", taxId, tL)
            #
            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
            rD = epe.countRefSeqAccessionByTaxon(dbNameList=["UNP"])
            logger.info("Unique taxons %d", len(list(rD.keys())))
            #
            # Note: tU and taxId carry over from the last iteration of the loop
            # above, so the match below is against the final taxId in taxIdList.
            numT = 0
            for tId, aL in rD.items():
                tL = tU.getLineage(tId)
                if taxId in tL:
                    tc = len(set(aL))
                    logger.info("Matched %5d %s (%r)", tc, tU.getScientificName(tId), tId)
                    numT += tc
            logger.info("Total matched accessions %d", numT)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
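# These test classes can be run with the standard unittest runner, e.g.
# (the module name below is an assumption):
#
#   python -m unittest testEntityPolymerExtractor -v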