def test_materialize_from_identifier(self):
    """Materializing a bag referenced by an ARK identifier should not raise."""
    logger.info(self.getTestHeader('test materialize from identifier'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        bdb.materialize("ark:/57799/b91H6JHBS1u2FTG")
    except Exception as exc:
        self.fail(bdbag.get_typed_exception(exc))
    finally:
        # always restore the working directory for subsequent tests
        os.chdir(saved_cwd)
def test_materialize_from_dir(self):
    """Materializing an existing bag directory should not raise."""
    logger.info(self.getTestHeader('test materialize from dir'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        bdb.materialize(self.test_bag_fetch_http_dir)
    except Exception as exc:
        self.fail(bdbag.get_typed_exception(exc))
    finally:
        # always restore the working directory for subsequent tests
        os.chdir(saved_cwd)
def test_materialize_from_file(self):
    """Materializing a bag from a local zip archive should not raise."""
    logger.info(self.getTestHeader('test materialize from file'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        archive_path = ospj(self.test_archive_dir, 'test-bag-fetch-http.zip')
        bdb.materialize(archive_path)
    except Exception as exc:
        self.fail(bdbag.get_typed_exception(exc))
    finally:
        # always restore the working directory for subsequent tests
        os.chdir(saved_cwd)
def test_materialize_from_url(self):
    """Materializing a bag archive fetched over HTTPS should not raise."""
    logger.info(self.getTestHeader('test materialize from URL'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        remote_archive = (
            "https://github.com/fair-research/bdbag/raw/master/test/test-data/test-archives/"
            "test-bag.zip")
        bdb.materialize(remote_archive)
    except Exception as exc:
        self.fail(bdbag.get_typed_exception(exc))
    finally:
        # always restore the working directory for subsequent tests
        os.chdir(saved_cwd)
def test_materialize_non_bag(self):
    """Materializing a plain (non-bag) directory should succeed but not yield a bag."""
    logger.info(self.getTestHeader('test materialize non-bag'))
    saved_cwd = os.getcwd()
    os.chdir(self.tmpdir)
    try:
        result_path = bdb.materialize(self.test_data_dir)
        # a non-bag input must not be reported as a bag after materialization
        self.assertFalse(bdb.is_bag(result_path))
    except Exception as exc:
        self.fail(bdbag.get_typed_exception(exc))
    finally:
        # always restore the working directory for subsequent tests
        os.chdir(saved_cwd)
def restore(self, **kwargs):
    """
    Perform the catalog restore operation.

    The restore process is broken up into six phases:

    1. Pre-process the input path.
        - If the input path is a file, it is assumed that it is a compressed archive file that can be
          extracted into an input directory via a supported codec: `tar`,`tgz`,`bz2`, or `zip`.
        - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
    2. The catalog schema will be restored first. The schema is restored from a ERMRest JSON schema document file.
       The schema document file must be named `catalog-schema.json` and must appear at the root of the input
       directory. The restore process can be configured to exclude the restoration of an enumerated set both
       schema and tables.
    3. The catalog table data will be restored, if present. The table date restoration process is resilient to
       interruption and may be restarted. However, if the catalog schema or data is mutated outside of the scope
       of the restore function in-between such restarts, the restored catalog's consistency cannot be guaranteed.
       The restore process can be configured to exclude the restoration of table data for a set of tables.
    4. The catalog foreign keys will be restored.
    5. The catalog assets will be restored, if present.
    6. On success, the restore state marker annotations will be deleted and the catalog history will be truncated.

    :param kwargs: currently unused by the visible body; accepted for interface compatibility.
    :return: None on the normal path. NOTE(review): there is an early bare `return` below when destination
             foreign keys already exist (restore assumed complete).
    :raises DerivaRestoreError: on invalid input path, failed strict bag validation, or source/destination
             schema incompatibilities.
    """
    success = True
    start = datetime.datetime.now()

    # --- Phase 1: pre-process input ---------------------------------------
    logging.info("Processing input path: %s" % self.input_path)
    is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
    if not (is_file or is_dir or is_uri):
        raise DerivaRestoreError(
            "Invalid input path [%s]. If the specified input path refers to a locally mounted "
            "file or directory, it does not exist or cannot be accessed. If the specified "
            "path is a URI, the scheme component of the URI could not be determined." % self.input_path)
    if is_file or is_dir:
        self.input_path = os.path.abspath(self.input_path)
    if is_file:
        logging.info(
            "The input path [%s] is a file. Assuming input file is a directory archive and extracting..."
            % self.input_path)
        self.input_path = bdb.extract_bag(self.input_path)

    try:
        if not self.no_bag_materialize:
            # materialize resolves/fetches any remote bag payload references
            self.input_path = bdb.materialize(self.input_path)
    except bdb.bdbagit.BagValidationError as e:
        if self.strict_bag_validation:
            raise DerivaRestoreError(format_exception(e))
        else:
            # best-effort mode: proceed with whatever payload is present
            logging.warning("Input bag validation failed and strict validation mode is disabled. %s" %
                            format_exception(e))

    is_bag = bdb.is_bag(self.input_path)

    # bag payloads live under "data/"; plain directories hold the schema doc at their root
    src_schema_file = os.path.abspath(
        os.path.join(self.input_path, "data" if is_bag else "", "catalog-schema.json"))
    # the src_catalog_stub created below will never be "connected" in any kind of network sense,
    # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
    src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
    src_model = Model.fromfile(src_catalog_stub, src_schema_file)

    # initialize/connect to destination catalog; create a fresh one if no id was configured
    if not self.catalog_id:
        self.catalog_id = self.server.create_ermrest_catalog().catalog_id
        self.server_args["catalog_id"] = self.catalog_id
        logging.info("Created new target catalog with ID: %s" % self.catalog_id)
    self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

    # init dcctx cid to a default
    self.dst_catalog.dcctx['cid'] = self.__class__.__name__

    # build up the model content we will copy to destination
    dst_model = self.dst_catalog.getCatalogModel()

    logging.info("Restoring %s to catalog: %s" % (self.input_path, self.dst_catalog.get_server_uri()))
    # set top-level config right away and find fatal usage errors...
    if self.restore_policy:
        logging.info("Restoring top-level catalog ACLs...")
        if not src_model.acls:
            logging.info("Source schema does not contain any ACLs.")
        else:
            # keep the current owners in addition to the restored ones so we don't lock ourselves out
            src_model.acls.owner.extend(dst_model.acls.owner)
            self.dst_catalog.put('/acl', json=src_model.acls)
    if self.restore_annotations:
        logging.info("Restoring top-level catalog annotations...")
        self.dst_catalog.put('/annotation', json=src_model.annotations)

    # re-fetch the destination model: the ACL/annotation writes above may have changed it
    dst_model = self.dst_catalog.getCatalogModel()
    new_model = []
    new_columns = []  # ERMrest does not currently allow bulk column creation
    new_keys = []  # ERMrest does not currently allow bulk key creation
    restore_states = {}  # (sname, tname) -> None | 1 (data pending) | 2 (data done)
    fkeys_deferred = {}  # fkeys are applied last (stage 2) so data loads aren't constrained
    exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas

    try:
        # --- Phase 2: diff source model against destination -----------------
        for sname, schema in src_model.schemas.items():
            if sname in exclude_schemas:
                continue
            if sname not in dst_model.schemas:
                new_model.append(self.copy_sdef(schema))

            for tname, table in schema.tables.items():
                if table.kind != 'table':
                    # views etc. cannot be restored
                    logging.warning('Skipping restore of %s %s:%s' % (table.kind, sname, tname))
                    continue
                if 'RID' not in table.column_definitions.elements:
                    raise DerivaRestoreError(
                        "Source table %s.%s lacks system-columns and cannot be restored." % (sname, tname))

                # make sure the source table is pruned of any existing restore state markers
                if table.annotations.get(CLONE_STATE_URL) is not None:
                    del table.annotations[CLONE_STATE_URL]
                if table.annotations.get(self.RESTORE_STATE_URL) is not None:
                    del table.annotations[self.RESTORE_STATE_URL]

                if sname not in dst_model.schemas or tname not in dst_model.schemas[sname].tables:
                    # brand-new table: create it (sans fkeys) and mark its data for restore
                    new_model.append(self.copy_tdef_core(table))
                    restore_states[(sname, tname)] = 1 if self.restore_data else None
                    fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)
                else:
                    # table already exists (e.g. resumed run): reconcile columns and keys
                    src_columns = {c.name: c for c in table.column_definitions}
                    dst_columns = {c.name: c for c in dst_model.schemas[sname].tables[tname].column_definitions}

                    for cname in src_columns:
                        if cname not in dst_columns:
                            new_columns.append(self.copy_cdef(src_columns[cname]))
                        else:
                            self.check_column_compatibility(src_columns[cname], dst_columns[cname])

                    for cname in dst_columns:
                        if cname not in src_columns:
                            # destination has drifted beyond the source snapshot -- cannot reconcile
                            raise DerivaRestoreError(
                                "Destination column %s.%s.%s does not exist in source catalog."
                                % (sname, tname, cname))

                    # keys are compared by their (sorted) column-name tuples, not by key name
                    src_keys = {tuple(sorted(c.name for c in key.unique_columns)): key for key in table.keys}
                    dst_keys = {tuple(sorted(c.name for c in key.unique_columns)): key
                                for key in dst_model.schemas[sname].tables[tname].keys}

                    for utuple in src_keys:
                        if utuple not in dst_keys:
                            new_keys.append(self.copy_kdef(src_keys[utuple]))

                    for utuple in dst_keys:
                        if utuple not in src_keys:
                            raise DerivaRestoreError(
                                "Destination key %s.%s(%s) does not exist in source catalog."
                                % (sname, tname, ', '.join(utuple)))

                    # resume point recorded on the destination table from a prior run (may be None)
                    restore_states[(sname, tname)] = \
                        dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                    if dst_model.schemas[sname].tables[tname].foreign_keys:
                        # assume that presence of any destination foreign keys means we already completed
                        # NOTE(review): this returns from restore() entirely after optionally uploading
                        # assets, skipping all remaining tables -- confirm that is intended.
                        if self.restore_assets:
                            self.upload_assets()
                        return
                    else:
                        fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)

        # apply the stage 1 model to the destination in bulk
        logging.info("Restoring catalog schema...")
        if new_model:
            self.dst_catalog.post("/schema", json=new_model).raise_for_status()

        # columns and keys must be created one-by-one (no bulk API; see notes above)
        # NOTE(review): unpacking (sname, tname, cdef)/(sname, tname, kdef) assumes copy_cdef/copy_kdef
        # return 3-tuples -- defined outside this view; verify.
        for sname, tname, cdef in new_columns:
            self.dst_catalog.post("/schema/%s/table/%s/column" % (urlquote(sname), urlquote(tname)),
                                  json=cdef).raise_for_status()
        for sname, tname, kdef in new_keys:
            self.dst_catalog.post("/schema/%s/table/%s/key" % (urlquote(sname), urlquote(tname)),
                                  json=kdef).raise_for_status()

        # --- Phase 3: copy data in stage 2 -----------------------------------
        if self.restore_data:
            logging.info("Restoring catalog data...")
            for sname, tname in restore_states.keys():
                tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                if restore_states[(sname, tname)] == 1:
                    # determine current position in (partial?) copy: highest RID already present
                    row = self.dst_catalog.get("/entity/%s@sort(RID::desc::)?limit=1" % tname_uri).json()
                    if row:
                        last = row[0]['RID']
                        logging.info(
                            "Existing data detected in table [%s] -- will attempt partial restore of "
                            "remaining records following last known RID: %s" % (tname_uri, last))
                    else:
                        last = None

                    # stream the source rows in chunks, skipping everything up to `last`
                    table = self.get_json_recordset(
                        self.open_json_stream_file(self.get_table_path(sname, tname, is_bag)),
                        self.data_chunk_size, after=last)

                    total = 0
                    table_success = True
                    try:
                        for chunk in table:
                            if chunk:
                                # nondefaults preserves the original system-column values
                                self.dst_catalog.post(
                                    "/entity/%s?nondefaults=RID,RCT,RCB" % tname_uri, json=chunk)
                                total += len(chunk)
                            else:
                                break
                    except:
                        # NOTE(review): bare except swallows everything (incl. KeyboardInterrupt);
                        # the failure is only logged below and the loop proceeds to the next table.
                        table_success = False
                    finally:
                        table.close()
                        if table_success:
                            logging.info("Restoration of table data [%s] successful. %s rows restored."
                                         % (tname_uri, total))
                        else:
                            logging.warning("Restoration of table data [%s] failed. %s rows restored."
                                            % (tname_uri, total))

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname),
                            urlquote(tname),
                            urlquote(self.RESTORE_STATE_URL),
                        ), json=2)
                elif restore_states[(sname, tname)] is None and (sname, tname) in {
                    ('public', 'ERMrest_Client'),
                    ('public', 'ERMrest_Group'),
                }:
                    # special sync behavior for magic ermrest tables
                    # HACK: these are assumed small enough to join via local merge of arrays
                    want = sorted(self.load_json_file(self.get_table_path(sname, tname, is_bag)),
                                  key=lambda r: r['ID'])
                    have = sorted(self.dst_catalog.get("/entity/%s?limit=none" % tname_uri).json(),
                                  key=lambda r: r['ID'])
                    create = []
                    update = []
                    pos_want = 0
                    pos_have = 0
                    # classic sorted-merge over the two row lists, keyed on 'ID'
                    while pos_want < len(want):
                        while pos_have < len(have) and have[pos_have]['ID'] < want[pos_want]['ID']:
                            # dst-only rows will be retained as is
                            pos_have += 1
                        if pos_have >= len(have) or have[pos_have]['ID'] > want[pos_want]['ID']:
                            # src-only rows will be inserted
                            create.append(want[pos_want])
                            pos_want += 1
                        else:
                            # overlapping rows will be updated
                            update.append(want[pos_want])
                            pos_want += 1

                    self.dst_catalog.post("/entity/%s?nondefaults=RCT,RCB" % tname_uri, json=create)
                    # bulk-update all non-system, non-key columns for the overlapping rows
                    self.dst_catalog.put(
                        "/attributegroup/%s/ID;%s" % (
                            tname_uri,
                            ",".join([
                                urlquote(c.name)
                                for c in src_model.schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}
                            ])),
                        json=update)

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname),
                            urlquote(tname),
                            urlquote(self.RESTORE_STATE_URL),
                        ), json=2)

        # --- Phase 4: apply stage 2 model in bulk only... we won't get here unless preceding succeeded
        logging.info("Restoring foreign keys...")
        new_fkeys = []
        for fkeys in fkeys_deferred.values():
            new_fkeys.extend(fkeys)

        # restore fkeys
        if new_fkeys:
            self.dst_catalog.post("/schema", json=new_fkeys)

        # --- Phase 5: restore assets ----------------------------------------
        if self.restore_assets:
            self.upload_assets()

        # --- Phase 6: cleanup (state markers, history truncation) -----------
        self.cleanup_restored_catalog()
    except:
        # note the failure for the summary log, then propagate to the caller
        success = False
        raise
    finally:
        elapsed_time = datetime.datetime.now() - start
        total_secs = elapsed_time.total_seconds()
        elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
        logging.info("Restore of catalog %s %s. %s" % (
            self.dst_catalog.get_server_uri(),
            "completed successfully" if success else "failed",
            ("Elapsed time: %s" % elapsed) if (total_secs > 0) else ""))