Example 1
 def test_materialize_from_identifier(self):
     logger.info(self.getTestHeader('test materialize from identifier'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bdb.materialize("ark:/57799/b91H6JHBS1u2FTG")
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example 2
 def test_materialize_from_dir(self):
     logger.info(self.getTestHeader('test materialize from dir'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bdb.materialize(self.test_bag_fetch_http_dir)
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example 3
 def test_materialize_from_file(self):
     logger.info(self.getTestHeader('test materialize from file'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bdb.materialize(
             ospj(self.test_archive_dir, 'test-bag-fetch-http.zip'))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example 4
 def test_materialize_from_url(self):
     logger.info(self.getTestHeader('test materialize from URL'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bdb.materialize(
             "https://github.com/fair-research/bdbag/raw/master/test/test-data/test-archives/"
             "test-bag.zip")
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
Example 5
 def test_materialize_non_bag(self):
     logger.info(self.getTestHeader('test materialize non-bag'))
     curdir = os.getcwd()
     os.chdir(self.tmpdir)
     try:
         bag_path = bdb.materialize(self.test_data_dir)
         self.assertFalse(bdb.is_bag(bag_path))
     except Exception as e:
         self.fail(bdbag.get_typed_exception(e))
     finally:
         os.chdir(curdir)
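Examples 1-5 all follow the same pattern: switch into a scratch directory, call `bdb.materialize()` on some input (a resolvable identifier, a bag directory, a local archive file, or a remote URL), and restore the previous working directory afterwards. The sketch below distills that pattern outside the unittest harness; it assumes the conventional `from bdbag import bdbag_api as bdb` import used in the bdbag test suite, and `materialize_any` is a hypothetical helper name.

 import os
 import tempfile

 import bdbag
 from bdbag import bdbag_api as bdb

 def materialize_any(input_path):
     """Materialize a bag from an identifier, directory, archive file, or URL."""
     curdir = os.getcwd()
     workdir = tempfile.mkdtemp(prefix="bdbag_")  # scratch dir, like self.tmpdir above
     os.chdir(workdir)
     try:
         # materialize() extracts and fetches as needed, returning the result path
         bag_path = bdb.materialize(input_path)
         return bag_path, bdb.is_bag(bag_path)
     except Exception as e:
         # surface the typed exception text, as the tests do via self.fail()
         raise RuntimeError(bdbag.get_typed_exception(e)) from e
     finally:
         os.chdir(curdir)

As Example 5 shows, a successful `materialize()` call does not guarantee the result is a bag, so the helper also reports `bdb.is_bag()` for the returned path.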
Example 6
    def restore(self, **kwargs):
        """
        Perform the catalog restore operation. The restore process is broken up into six phases:

        1. Pre-process the input path.
            - If the input path is a file, it is assumed to be a compressed archive file that can be extracted
            into an input directory via a supported codec: `tar`, `tgz`, `bz2`, or `zip`.
            - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
        2. The catalog schema will be restored first. The schema is restored from an ERMrest JSON schema document file.
            The schema document file must be named `catalog-schema.json` and must appear at the root of the input
            directory. The restore process can be configured to exclude the restoration of an enumerated set of both
            schemas and tables.
        3. The catalog table data will be restored, if present. The table data restoration process is resilient to
            interruption and may be restarted. However, if the catalog schema or data is mutated outside the scope of
            the restore function between such restarts, the restored catalog's consistency cannot be guaranteed.
            The restore process can be configured to exclude the restoration of table data for a set of tables.
        4. The catalog foreign keys will be restored.
        5. The catalog assets will be restored, if present.
        6. On success, the restore state marker annotations will be deleted and the catalog history will be truncated.

        :param kwargs:
        :return:
        """
        success = True
        start = datetime.datetime.now()

        # pre-process input
        logging.info("Processing input path: %s" % self.input_path)
        is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
        if not (is_file or is_dir or is_uri):
            raise DerivaRestoreError(
                "Invalid input path [%s]. If the specified input path refers to a locally mounted "
                "file or directory, it does not exist or cannot be accessed. If the specified "
                "path is a URI, the scheme component of the URI could not be determined."
                % self.input_path)
        if is_file or is_dir:
            self.input_path = os.path.abspath(self.input_path)
        if is_file:
            logging.info(
                "The input path [%s] is a file. Assuming input file is a directory archive and extracting..."
                % self.input_path)
            self.input_path = bdb.extract_bag(self.input_path)

        try:
            if not self.no_bag_materialize:
                self.input_path = bdb.materialize(self.input_path)
        except bdb.bdbagit.BagValidationError as e:
            if self.strict_bag_validation:
                raise DerivaRestoreError(format_exception(e))
            else:
                logging.warning(
                    "Input bag validation failed and strict validation mode is disabled. %s"
                    % format_exception(e))
        is_bag = bdb.is_bag(self.input_path)

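        # a BagIt bag keeps its payload under the data/ subdirectory, so the
        # schema document is sought there when the input is a bag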
        src_schema_file = os.path.abspath(
            os.path.join(self.input_path, "data" if is_bag else "",
                         "catalog-schema.json"))
        # the src_catalog_stub created below will never be "connected" in any kind of network sense,
        # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
        src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
        src_model = Model.fromfile(src_catalog_stub, src_schema_file)

        # initialize/connect to destination catalog
        if not self.catalog_id:
            self.catalog_id = self.server.create_ermrest_catalog().catalog_id
            self.server_args["catalog_id"] = self.catalog_id
            logging.info("Created new target catalog with ID: %s" %
                         self.catalog_id)
        self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

        # init dcctx cid to a default
        self.dst_catalog.dcctx['cid'] = self.__class__.__name__

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        logging.info("Restoring %s to catalog: %s" %
                     (self.input_path, self.dst_catalog.get_server_uri()))
        # set top-level config right away and find fatal usage errors...
        if self.restore_policy:
            logging.info("Restoring top-level catalog ACLs...")
            if not src_model.acls:
                logging.info("Source schema does not contain any ACLs.")
            else:
                src_model.acls.owner.extend(dst_model.acls.owner)
                self.dst_catalog.put('/acl', json=src_model.acls)

        if self.restore_annotations:
            logging.info("Restoring top-level catalog annotations...")
            self.dst_catalog.put('/annotation', json=src_model.annotations)

        # refresh the destination model after the top-level config changes above
        dst_model = self.dst_catalog.getCatalogModel()

        new_model = []
        new_columns = []  # ERMrest does not currently allow bulk column creation
        new_keys = []  # ERMrest does not currently allow bulk key creation
        restore_states = {}
        fkeys_deferred = {}
        exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas

        try:
            for sname, schema in src_model.schemas.items():
                if sname in exclude_schemas:
                    continue
                if sname not in dst_model.schemas:
                    new_model.append(self.copy_sdef(schema))

                for tname, table in schema.tables.items():
                    if table.kind != 'table':
                        logging.warning('Skipping restore of %s %s:%s' %
                                        (table.kind, sname, tname))
                        continue

                    if 'RID' not in table.column_definitions.elements:
                        raise DerivaRestoreError(
                            "Source table %s.%s lacks system-columns and cannot be restored."
                            % (sname, tname))

                    # make sure the source table is pruned of any existing restore state markers
                    if table.annotations.get(CLONE_STATE_URL) is not None:
                        del table.annotations[CLONE_STATE_URL]
                    if table.annotations.get(
                            self.RESTORE_STATE_URL) is not None:
                        del table.annotations[self.RESTORE_STATE_URL]

                    if (sname not in dst_model.schemas
                            or tname not in dst_model.schemas[sname].tables):
                        new_model.append(self.copy_tdef_core(table))
                        restore_states[(sname, tname)] = 1 if self.restore_data else None
                        fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)
                    else:
                        src_columns = {c.name: c for c in table.column_definitions}
                        dst_columns = {
                            c.name: c
                            for c in dst_model.schemas[sname].tables[tname].column_definitions
                        }

                        for cname in src_columns:
                            if cname not in dst_columns:
                                new_columns.append(
                                    self.copy_cdef(src_columns[cname]))
                            else:
                                self.check_column_compatibility(
                                    src_columns[cname], dst_columns[cname])

                        for cname in dst_columns:
                            if cname not in src_columns:
                                raise DerivaRestoreError(
                                    "Destination column %s.%s.%s does not exist in source catalog."
                                    % (sname, tname, cname))

                        src_keys = {
                            tuple(sorted(c.name for c in key.unique_columns)): key
                            for key in table.keys
                        }
                        dst_keys = {
                            tuple(sorted(c.name for c in key.unique_columns)): key
                            for key in dst_model.schemas[sname].tables[tname].keys
                        }

                        for utuple in src_keys:
                            if utuple not in dst_keys:
                                new_keys.append(
                                    self.copy_kdef(src_keys[utuple]))

                        for utuple in dst_keys:
                            if utuple not in src_keys:
                                raise DerivaRestoreError(
                                    "Destination key %s.%s(%s) does not exist in source catalog."
                                    % (sname, tname, ', '.join(utuple)))

                        restore_states[(sname, tname)] = \
                            dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                        if dst_model.schemas[sname].tables[tname].foreign_keys:
                            # the presence of any destination foreign keys means a prior
                            # restore already completed; upload assets if requested and stop
                            if self.restore_assets:
                                self.upload_assets()
                            return
                        else:
                            fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)

            # apply the stage 1 model to the destination in bulk
            logging.info("Restoring catalog schema...")
            if new_model:
                self.dst_catalog.post("/schema",
                                      json=new_model).raise_for_status()

            for sname, tname, cdef in new_columns:
                self.dst_catalog.post("/schema/%s/table/%s/column" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=cdef).raise_for_status()

            for sname, tname, kdef in new_keys:
                self.dst_catalog.post("/schema/%s/table/%s/key" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=kdef).raise_for_status()

            # copy data in stage 2
            if self.restore_data:
                logging.info("Restoring catalog data...")
                for sname, tname in restore_states.keys():
                    tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                    if restore_states[(sname, tname)] == 1:
                        # determine current position in (partial?) copy
                        row = self.dst_catalog.get(
                            "/entity/%s@sort(RID::desc::)?limit=1" %
                            tname_uri).json()
                        if row:
                            last = row[0]['RID']
                            logging.info(
                                "Existing data detected in table [%s] -- will attempt partial restore of "
                                "remaining records following last known RID: %s"
                                % (tname_uri, last))
                        else:
                            last = None

                        table = self.get_json_recordset(
                            self.open_json_stream_file(
                                self.get_table_path(sname, tname, is_bag)),
                            self.data_chunk_size,
                            after=last)

                        total = 0
                        table_success = True
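                        # stream the source rows in chunks; nondefaults=RID,RCT,RCB
                        # preserves the original system-column values on insert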
                        try:
                            for chunk in table:
                                if chunk:
                                    self.dst_catalog.post(
                                        "/entity/%s?nondefaults=RID,RCT,RCB" %
                                        tname_uri,
                                        json=chunk)
                                    total += len(chunk)
                                else:
                                    break
                        except Exception:
                            table_success = False
                        finally:
                            table.close()
                            if table_success:
                                logging.info(
                                    "Restoration of table data [%s] successful. %s rows restored."
                                    % (tname_uri, total))
                            else:
                                logging.warning(
                                    "Restoration of table data [%s] failed. %s rows restored."
                                    % (tname_uri, total))

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)
                    elif (restore_states[(sname, tname)] is None and
                          (sname, tname) in {('public', 'ERMrest_Client'),
                                             ('public', 'ERMrest_Group')}):
                        # special sync behavior for magic ermrest tables
                        # HACK: these are assumed small enough to join via local merge of arrays
                        want = sorted(
                            self.load_json_file(self.get_table_path(sname, tname, is_bag)),
                            key=lambda r: r['ID'])
                        have = sorted(
                            self.dst_catalog.get("/entity/%s?limit=none" % tname_uri).json(),
                            key=lambda r: r['ID'])
                        create = []
                        update = []

                        pos_want = 0
                        pos_have = 0
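                        # two-pointer sorted-merge over the ID-ordered lists: rows only
                        # in `have` are retained, rows only in `want` are inserted, and
                        # rows present in both are updated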
                        while pos_want < len(want):
                            while (pos_have < len(have)
                                   and have[pos_have]['ID'] < want[pos_want]['ID']):
                                # dst-only rows will be retained as is
                                pos_have += 1
                            if (pos_have >= len(have)
                                    or have[pos_have]['ID'] > want[pos_want]['ID']):
                                # src-only rows will be inserted
                                create.append(want[pos_want])
                                pos_want += 1
                            else:
                                # overlapping rows will be updated
                                update.append(want[pos_want])
                                pos_want += 1

                        self.dst_catalog.post(
                            "/entity/%s?nondefaults=RCT,RCB" % tname_uri,
                            json=create)
                        self.dst_catalog.put(
                            "/attributegroup/%s/ID;%s" % (tname_uri, ",".join([
                                urlquote(c.name)
                                for c in src_model.schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}
                            ])),
                            json=update)

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)

            # apply stage 2 model in bulk only... we won't get here unless preceding succeeded
            logging.info("Restoring foreign keys...")
            new_fkeys = []
            for fkeys in fkeys_deferred.values():
                new_fkeys.extend(fkeys)

            # restore fkeys
            if new_fkeys:
                self.dst_catalog.post("/schema", json=new_fkeys)

            # restore assets
            if self.restore_assets:
                self.upload_assets()

            # cleanup
            self.cleanup_restored_catalog()
        except:
            success = False
            raise
        finally:
            elapsed_time = datetime.datetime.now() - start
            total_secs = elapsed_time.total_seconds()
            elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
            logging.info("Restore of catalog %s %s. %s" %
                         (self.dst_catalog.get_server_uri(),
                          "completed successfully" if success else "failed",
                          ("Elapsed time: %s" % elapsed) if
                          (total_secs > 0) else ""))
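The special-case sync for the `public:ERMrest_Client` and `public:ERMrest_Group` tables above is an ordinary sorted-merge diff over two row lists. Pulled out of context, its core logic looks like the following sketch; `diff_sorted_rows` is an invented name for illustration, and unique `ID` values are assumed, as in the restore loop.

 def diff_sorted_rows(want, have, key='ID'):
     """Split source rows into inserts and updates against destination rows.

     Both lists must be sorted ascending by `key`. Rows present only in
     `have` (the destination) are left untouched, matching the loop above.
     """
     create, update = [], []
     pos_have = 0
     for row in want:
         # skip destination-only rows with smaller keys
         while pos_have < len(have) and have[pos_have][key] < row[key]:
             pos_have += 1
         if pos_have >= len(have) or have[pos_have][key] > row[key]:
             create.append(row)  # source-only row: insert
         else:
             update.append(row)  # overlapping row: update
     return create, update

The restore code then POSTs the `create` rows as new entities and PUTs the `update` rows through the `/attributegroup/` API keyed on `ID`, leaving destination-only rows intact.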