Code example #1
File: utils.py  Project: fair-research/concierge
def extract_bag(local_bag_archive_path):
    """Unachive a local bdbag, and return the local path. Places the unachived
    bag next to the archived one, minus the archived bag's extension."""
    local_bag, _ = os.path.splitext(local_bag_archive_path)
    bdbag_api.extract_bag(local_bag_archive_path, os.path.dirname(local_bag))
    bagit_bag = bagit.Bag(local_bag)
    return bagit_bag
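The examples in this listing call bdbag_api.extract_bag in three ways: with a positional output directory, with the output_path keyword, and with temp=True for extraction into a temporary directory. Below is a minimal sketch of those patterns based only on the calls shown here; the archive path, the output directory, and the use of the return value are illustrative assumptions:

import os
from bdbag import bdbag_api

archive = "my-bag.zip"  # hypothetical local archive path

# Extract next to the archive, into a directory named after it (as in example #1)
bag_dir = bdbag_api.extract_bag(archive, os.path.dirname(os.path.abspath(archive)))

# Extract into an explicit output directory (as in example #2)
bag_dir = bdbag_api.extract_bag(archive, output_path="/tmp/extracted-bags")

# Extract into a temporary directory; clean up afterwards (as in the test examples)
tmp_bag_dir = bdbag_api.extract_bag(archive, temp=True)
bdbag_api.cleanup_bag(os.path.dirname(tmp_bag_dir))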
Code example #2
def extract_bag(bdbag_zip_path, output_directory=None, project_name=None):
    """Extract BDBag contents into named output directory in original BDBag location."""
    (before, sep, after) = bdbag_zip_path.rpartition('.zip')
    prefix = os.path.basename(before)
    if project_name:
        prefix = project_name
    outdir = os.path.dirname(before)
    if output_directory:
        outdir = output_directory
    outdir = os.path.normpath(outdir)
    bdbag_api.extract_bag(bdbag_zip_path, output_path=outdir)
    return os.path.join(outdir, prefix, "data")
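A hypothetical call to the wrapper above; the archive name and output directory are assumptions, and the returned paths follow from the rpartition/basename logic in the function:

# Illustrative only: assumes the archive's top-level directory matches its basename
data_dir = extract_bag("/tmp/my-project.zip")
# -> "/tmp/my-project/data"

data_dir = extract_bag("/tmp/my-project.zip", output_directory="/tmp/out")
# -> "/tmp/out/my-project/data"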
Code example #3
File: test_api.py  Project: sterlingbaldwin/bdbag
    def test_extract_bag_archive_zip_with_relocate_existing(self):
        logger.info(self.getTestHeader('extract bag zip format, relocate existing'))
        try:
            bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
            self.assertTrue(ospe(bag_path))
            self.assertTrue(bdb.is_bag(bag_path))
            bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
            self.assertTrue(ospe(bag_path))
            self.assertTrue(bdb.is_bag(bag_path))
            bdb.cleanup_bag(os.path.dirname(bag_path))
            output = self.stream.getvalue()
            self.assertExpectedMessages(["moving existing directory"], output)
        except Exception as e:
            self.fail(get_typed_exception(e))
Code example #4
    def parse(self, bag_archive, output_path="out"):
        """ Analyze the bag, consuming BagIt-RO metadata into a structure downstream code emitters can use. """
        manifest = {}
        """ Extract the bag. """
        bag_path = bdbag_api.extract_bag(bag_archive, output_path=output_path)
        if bdbag_api.is_bag(bag_path):

            logger.debug("Initializing metadata datasets")
            manifest['path'] = bag_path
            manifest['datasets'] = {}
            datasets = manifest['datasets']
            data_path = os.path.join(bag_path, "data")
            """ Extract tarred files. """
            tar_data_files = glob.glob(os.path.join(data_path, "*.csv.gz"))
            for f in tar_data_files:
                with gzip.open(f, 'rb') as zipped:
                    extracted = f.replace(".gz", "")
                    with open(extracted, "wb") as stream:
                        file_content = zipped.read()
                        stream.write(file_content)
            """ Collect metadata for each file. """
            data_files = glob.glob(os.path.join(data_path, "*.csv"))
            csv_filter = CSVFilter()
            for f in data_files:
                csv_filter.filter_data(f)
                logger.debug(f"  --collecting metadata for: {f}")
                jsonld_context = self._get_jsonld_context(f)
                datasets[f] = jsonld_context
                context = datasets[f]['@context']
                datasets[f]['columns'] = {
                    k: None
                    for k in context if isinstance(context[k], dict)
                }
        return manifest
Code example #5
File: bundle.py  Project: inab/ro-curate
def validate(ro_path):
    """
    Validates the research object at the path `ro_path`, yielding errors as
    they are encountered. Possible errors include urllib.error.HTTPError,
    ValidationError, MissingManifestError, and json.decoder.JSONDecodeError.
    :param ro_path: relative or absolute path to the root directory of the
    research object
    """
    # Extract bag to temp directory and process the RO as a directory
    ro_path = bdbag_api.extract_bag(ro_path, temp=True)
    ro_path = os.path.abspath(ro_path)

    # Validate BagIt RO bag
    # try:
    #     bdbag_api.validate_bag(ro_path)
    #     bdbag_api.validate_bag_structure(ro_path)
    # except BagValidationError as err:
    #     yield err

    # Get graphs for manifest and main profile
    try:
        manifest_path = find_manifest(ro_path)
    except MissingManifestError as err:
        yield err
        return

    manifest_graph = get_graph(path_to_uri(manifest_path), base=ro_path)

    for err in validate_graph(manifest_graph, base=ro_path):
        yield err
Code example #6
File: validation.py  Project: nih-cfde/cfde-submit
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using frictionless.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    if os.path.isfile(data_path):
        archive_file = data_path
        try:
            data_path = bdbag_api.extract_bag(data_path, temp=True)
        except Exception as e:
            raise InvalidInput("Error extracting %s: %s" % (archive_file, e))
        if not bdbag_api.is_bag(data_path):
            raise InvalidInput(
                "Input %s does not appear to be a valid BDBag. This tool requires a"
                " prepared BDBag archive when invoked on an existing archive file."
                % archive_file)

    # If data_path is a directory, find JSON
    if os.path.isdir(data_path):
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            raise ValidationException("No TableSchema JSON file found")
        elif len(desc_file_list) > 1:
            raise ValidationException("Mutiple JSON files found in directory")
        else:
            data_path = os.path.join(data_path, desc_file_list[0])

    # Read into Package
    try:
        pkg = Package(data_path)
        report = validate(pkg, schema=schema)
    except FrictionlessException as e:
        raise ValidationException("Validation error\n%s" % e.error.message)

    if not report.valid:
        if report.errors:
            msg = report.errors[0]['message']
        else:
            for task in report['tasks']:
                if not task.valid:
                    msg = task['resource']['path'] + "\n"
                    msg += task['errors'][0]['message']
        raise ValidationException("Validation error in %s" % msg)
Code example #7
File: test_api.py  Project: sterlingbaldwin/bdbag
    def test_extract_bag_archive_tar(self):
        logger.info(self.getTestHeader('extract bag tar format'))
        try:
            bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
            self.assertTrue(ospe(bag_path))
            self.assertTrue(bdb.is_bag(bag_path))
            bdb.cleanup_bag(os.path.dirname(bag_path))
        except Exception as e:
            self.fail(get_typed_exception(e))
Code example #8
File: test_api.py  Project: kylechard/bdbag
    def test_extract_bag_archive_tar(self):
        logger.info(self.getTestHeader('extract bag tar format'))
        try:
            bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
            self.assertTrue(ospe(bag_path))
            self.assertTrue(bdb.is_bag(bag_path))
            bdb.cleanup_bag(os.path.dirname(bag_path))
        except Exception as e:
            self.fail(bdbag.get_named_exception(e))
Code example #9
def get_gtex_files_by_id():
    lbags = [
        fname for fname in os.listdir(BAG_DIR)
        if fname.endswith('.zip') and fname not in BLACKLISTED_BAGS
    ]

    bag_info = {}
    for local_bag_archive in lbags:
        local_bag_archive = os.path.abspath(BAG_DIR + '/' + local_bag_archive)
        local_bag, _ = os.path.splitext(local_bag_archive)
        if not os.path.exists(local_bag):
            try:
                bdbag_api.extract_bag(local_bag_archive,
                                      os.path.dirname(local_bag))
            except RuntimeError:
                continue
        with open(local_bag + '/fetch.txt') as lbfh:
            content = lbfh.read()
            cram_fname = content.split('\t')[0]
            cram_basename = os.path.basename(cram_fname)
            gtex_id = cram_basename.split('.', 1)[0]
            if bag_info.get(gtex_id):
                raise ValueError('Another bag exists with this run info: '
                                 '\n1: {}\n2: {}'.format(
                                     bag_info.get(gtex_id)['file'],
                                     local_bag_archive))
            bag_info[gtex_id] = {
                'id':
                gtex_id,
                'file':
                local_bag_archive,
                'basename':
                os.path.basename(local_bag_archive),
                'size':
                os.stat(local_bag_archive).st_size,
                'md5':
                md5(local_bag_archive),
                'location':
                'https://bags.fair-research.org/{}'
                ''.format(os.path.basename(local_bag_archive))
            }

    return bag_info
Code example #10
File: utils.py  Project: ini-bdds/data-concierge
def fetch_bags(minids):
    """Given a list of minid bag models, follow their location and
    fetch the data associated with them, if it doesn't already
    exist on the filesystem. Returns a list of bagit bag objects"""
    bags = _resolve_minids_to_bags(minids)
    bagit_bags = []
    for bag in bags:
        bag_name = os.path.basename(bag.location)
        local_bag_archive = os.path.join(settings.BAG_STAGING_DIR, bag_name)
        if not os.path.exists(local_bag_archive):
            r = requests.get(bag.location, stream=True)
            if r.status_code == 200:
                with open(local_bag_archive, 'wb') as f:
                    for chunk in r.iter_content(HTTP_CHUNK_SIZE):
                        f.write(chunk)
        local_bag, _ = os.path.splitext(local_bag_archive)
        if not os.path.exists(local_bag):
            bdbag_api.extract_bag(local_bag_archive,
                                  os.path.dirname(local_bag))
        bagit_bag = bagit.Bag(local_bag)
        bagit_bags.append(bagit_bag)
    return bagit_bags
Code example #11
    def testExportBag(self):
        tale = self._create_water_tale()
        export_path = '/tale/{}/export'.format(str(tale['_id']))
        resp = self.request(path=export_path,
                            method='GET',
                            params={'taleFormat': 'bagit'},
                            isJson=False,
                            user=self.user)
        dirpath = tempfile.mkdtemp()
        bag_file = os.path.join(dirpath, "{}.zip".format(str(tale['_id'])))
        with open(bag_file, 'wb') as fp:
            for content in resp.body:
                fp.write(content)
        temp_path = bdb.extract_bag(bag_file, temp=True)
        try:
            bdb.validate_bag_structure(temp_path)
        except bagit.BagValidationError:
            pass  # TODO: Goes without saying that we should not be doing that...
        shutil.rmtree(dirpath)

        # Test dataSetCitation
        resp = self.request(path='/tale/{_id}'.format(**tale),
                            method='PUT',
                            type='application/json',
                            user=self.user,
                            body=json.dumps({
                                'dataSet': [],
                                'imageId': str(tale['imageId']),
                                'public': tale['public'],
                            }))
        self.assertStatusOk(resp)
        tale = resp.json
        self.assertEqual(tale['dataSetCitation'], [])

        self.model('tale', 'wholetale').remove(tale)
        self.model('collection').remove(self.data_collection)
Code example #12
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using the Datapackage package.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    # If data_path is BDBag archive, unarchive to temp dir
    try:
        data_path = bdbag_api.extract_bag(data_path, temp=True)
    # data_path is not archive
    except RuntimeError:
        pass
    # If data_path is dir (incl. if was unarchived), find JSON desc
    if os.path.isdir(data_path):
        # If 'data' dir present, search there instead
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        # Find .json file (cannot be hidden)
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            return {
                "is_valid":
                False,
                "raw_errors":
                [FileNotFoundError("No TableSchema JSON file found.")],
                "error":
                "No TableSchema JSON file found."
            }
        elif len(desc_file_list) > 1:
            return {
                "is_valid":
                False,
                "raw_errors":
                [RuntimeError("Multiple JSON files found in directory.")],
                "error":
                "Multiple JSON files found in directory."
            }
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # data_path should/must be file now (JSON desc)
    if not os.path.isfile(data_path):
        return {
            "is_valid":
            False,
            "raw_errors": [
                ValueError(
                    "Path '{}' does not refer to a file".format(data_path))
            ],
            "error":
            "Path '{}' does not refer to a file".format(data_path)
        }

    # Read into Package (identical to DataPackage), return error on failure
    try:
        pkg = Package(descriptor=data_path, strict=True)
    except Exception as e:
        return {
            "is_valid": False,
            "raw_errors": e.errors,
            "error": "\n".join([str(err) for err in pkg.errors])
        }
    # Check and return package validity based on non-Exception-throwing Package validation
    if not pkg.valid:
        return {
            "is_valid": pkg.valid,
            "raw_errors": pkg.errors,
            "error": "\n".join([str(err) for err in pkg.errors])
        }
    # Perform manual validation as well
    for resource in pkg.resources:
        try:
            resource.read()
        except CastError as e:
            return {
                "is_valid": False,
                "raw_errors": e.errors,
                "error": "\n".join([str(err) for err in e.errors])
            }
        except Exception as e:
            return {"is_valid": False, "raw_errors": repr(e), "error": str(e)}
    return {"is_valid": True, "raw_errors": [], "error": None}
Code example #13
def main():

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)

    archive = None
    temp_path = None
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not (
                (args.validate or args.validate_profile or args.resolve_fetch)
                    and not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile
                  or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path,
                True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(
                path,
                force=True if args.resolve_fetch == 'all' else False,
                keychain_file=args.keychain_file,
                config_file=args.config_file,
                filter_expr=args.fetch_filter)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(
                    temp_path if temp_path else path,
                    fast=True if args.validate == 'fast' else False,
                    config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(
                temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path,
                                           profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)

    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    if not args.quiet:
        sys.stderr.write('\n')

    return result
Code example #14
File: deriva_restore.py  Project: emirdad/deriva-py
    def restore(self, **kwargs):
        """
        Perform the catalog restore operation. The restore process is broken up into six phases:

        1. Pre-process the input path.
            - If the input path is a file, it is assumed that it is a compressed archive file that can be extracted
            into an input directory via a supported codec: `tar`, `tgz`, `bz2`, or `zip`.
            - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
        2. The catalog schema will be restored first. The schema is restored from an ERMRest JSON schema document file.
            The schema document file must be named `catalog-schema.json` and must appear at the root of the input
            directory. The restore process can be configured to exclude the restoration of an enumerated set of both
            schemas and tables.
        3. The catalog table data will be restored, if present. The table data restoration process is resilient to
            interruption and may be restarted. However, if the catalog schema or data is mutated outside of the scope of
            the restore function in-between such restarts, the restored catalog's consistency cannot be guaranteed.
            The restore process can be configured to exclude the restoration of table data for a set of tables.
        4. The catalog foreign keys will be restored.
        5. The catalog assets will be restored, if present.
        6. On success, the restore state marker annotations will be deleted and the catalog history will be truncated.

        :param kwargs:
        :return:
        """
        success = True
        start = datetime.datetime.now()

        # pre-process input
        logging.info("Processing input path: %s" % self.input_path)
        is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
        if not (is_file or is_dir or is_uri):
            raise DerivaRestoreError(
                "Invalid input path [%s]. If the specified input path refers to a locally mounted "
                "file or directory, it does not exist or cannot be accessed. If the specified "
                "path is a URI, the scheme component of the URI could not be determined."
                % self.input_path)
        if is_file or is_dir:
            self.input_path = os.path.abspath(self.input_path)
        if is_file:
            logging.info(
                "The input path [%s] is a file. Assuming input file is a directory archive and extracting..."
                % self.input_path)
            self.input_path = bdb.extract_bag(self.input_path)

        try:
            if not self.no_bag_materialize:
                self.input_path = bdb.materialize(self.input_path)
        except bdb.bdbagit.BagValidationError as e:
            if self.strict_bag_validation:
                raise DerivaRestoreError(format_exception(e))
            else:
                logging.warning(
                    "Input bag validation failed and strict validation mode is disabled. %s"
                    % format_exception(e))
        is_bag = bdb.is_bag(self.input_path)

        src_schema_file = os.path.abspath(
            os.path.join(self.input_path, "data" if is_bag else "",
                         "catalog-schema.json"))
        # the src_catalog_stub created below will never be "connected" in any kind of network sense,
        # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
        src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
        src_model = Model.fromfile(src_catalog_stub, src_schema_file)

        # initialize/connect to destination catalog
        if not self.catalog_id:
            self.catalog_id = self.server.create_ermrest_catalog().catalog_id
            self.server_args["catalog_id"] = self.catalog_id
            logging.info("Created new target catalog with ID: %s" %
                         self.catalog_id)
        self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

        # init dcctx cid to a default
        self.dst_catalog.dcctx['cid'] = self.__class__.__name__

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        logging.info("Restoring %s to catalog: %s" %
                     (self.input_path, self.dst_catalog.get_server_uri()))
        # set top-level config right away and find fatal usage errors...
        if self.restore_policy:
            logging.info("Restoring top-level catalog ACLs...")
            if not src_model.acls:
                logging.info("Source schema does not contain any ACLs.")
            else:
                src_model.acls.owner.extend(dst_model.acls.owner)
                self.dst_catalog.put('/acl', json=src_model.acls)

        if self.restore_annotations:
            logging.info("Restoring top-level catalog annotations...")
            self.dst_catalog.put('/annotation', json=src_model.annotations)

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        new_model = []
        new_columns = [
        ]  # ERMrest does not currently allow bulk column creation
        new_keys = []  # ERMrest does not currently allow bulk key creation
        restore_states = {}
        fkeys_deferred = {}
        exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas

        try:
            for sname, schema in src_model.schemas.items():
                if sname in exclude_schemas:
                    continue
                if sname not in dst_model.schemas:
                    new_model.append(self.copy_sdef(schema))

                for tname, table in schema.tables.items():
                    if table.kind != 'table':
                        logging.warning('Skipping restore of %s %s:%s' %
                                        (table.kind, sname, tname))
                        continue

                    if 'RID' not in table.column_definitions.elements:
                        raise DerivaRestoreError(
                            "Source table %s.%s lacks system-columns and cannot be restored."
                            % (sname, tname))

                    # make sure the source table is pruned of any existing restore state markers
                    if table.annotations.get(CLONE_STATE_URL) is not None:
                        del table.annotations[CLONE_STATE_URL]
                    if table.annotations.get(
                            self.RESTORE_STATE_URL) is not None:
                        del table.annotations[self.RESTORE_STATE_URL]

                    if sname not in dst_model.schemas or tname not in dst_model.schemas[
                            sname].tables:
                        new_model.append(self.copy_tdef_core(table))
                        restore_states[(
                            sname, tname)] = 1 if self.restore_data else None
                        fkeys_deferred[(sname,
                                        tname)] = self.copy_tdef_fkeys(table)
                    else:
                        src_columns = {
                            c.name: c
                            for c in table.column_definitions
                        }
                        dst_columns = {
                            c.name: c
                            for c in dst_model.schemas[sname].tables[tname].
                            column_definitions
                        }

                        for cname in src_columns:
                            if cname not in dst_columns:
                                new_columns.append(
                                    self.copy_cdef(src_columns[cname]))
                            else:
                                self.check_column_compatibility(
                                    src_columns[cname], dst_columns[cname])

                        for cname in dst_columns:
                            if cname not in src_columns:
                                raise DerivaRestoreError(
                                    "Destination column %s.%s.%s does not exist in source catalog."
                                    % (sname, tname, cname))

                        src_keys = {
                            tuple(sorted(c.name
                                         for c in key.unique_columns)): key
                            for key in table.keys
                        }
                        dst_keys = {
                            tuple(sorted(c.name
                                         for c in key.unique_columns)): key
                            for key in
                            dst_model.schemas[sname].tables[tname].keys
                        }

                        for utuple in src_keys:
                            if utuple not in dst_keys:
                                new_keys.append(
                                    self.copy_kdef(src_keys[utuple]))

                        for utuple in dst_keys:
                            if utuple not in src_keys:
                                raise DerivaRestoreError(
                                    "Destination key %s.%s(%s) does not exist in source catalog."
                                    % (sname, tname, ', '.join(utuple)))

                        restore_states[(sname, tname)] = \
                            dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                        if dst_model.schemas[sname].tables[tname].foreign_keys:
                            # assume that presence of any destination foreign keys means we already completed
                            if self.restore_assets:
                                self.upload_assets()
                            return
                        else:
                            fkeys_deferred[(
                                sname, tname)] = self.copy_tdef_fkeys(table)

            # apply the stage 1 model to the destination in bulk
            logging.info("Restoring catalog schema...")
            if new_model:
                self.dst_catalog.post("/schema",
                                      json=new_model).raise_for_status()

            for sname, tname, cdef in new_columns:
                self.dst_catalog.post("/schema/%s/table/%s/column" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=cdef).raise_for_status()

            for sname, tname, kdef in new_keys:
                self.dst_catalog.post("/schema/%s/table/%s/key" %
                                      (urlquote(sname), urlquote(tname)),
                                      json=kdef).raise_for_status()

            # copy data in stage 2
            if self.restore_data:
                logging.info("Restoring catalog data...")
                for sname, tname in restore_states.keys():
                    tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                    if restore_states[(sname, tname)] == 1:
                        # determine current position in (partial?) copy
                        row = self.dst_catalog.get(
                            "/entity/%s@sort(RID::desc::)?limit=1" %
                            tname_uri).json()
                        if row:
                            last = row[0]['RID']
                            logging.info(
                                "Existing data detected in table [%s] -- will attempt partial restore of "
                                "remaining records following last known RID: %s"
                                % (tname_uri, last))
                        else:
                            last = None

                        table = self.get_json_recordset(
                            self.open_json_stream_file(
                                self.get_table_path(sname, tname, is_bag)),
                            self.data_chunk_size,
                            after=last)

                        total = 0
                        table_success = True
                        try:
                            for chunk in table:
                                if chunk:
                                    self.dst_catalog.post(
                                        "/entity/%s?nondefaults=RID,RCT,RCB" %
                                        tname_uri,
                                        json=chunk)
                                    total += len(chunk)
                                else:
                                    break
                        except:
                            table_success = False
                        finally:
                            table.close()
                            if table_success:
                                logging.info(
                                    "Restoration of table data [%s] successful. %s rows restored."
                                    % (tname_uri, total))
                            else:
                                logging.warning(
                                    "Restoration of table data [%s] failed. %s rows restored."
                                    % (tname_uri, total))

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)
                    elif restore_states[(sname, tname)] is None and (
                            sname, tname) in {
                                ('public', 'ERMrest_Client'),
                                ('public', 'ERMrest_Group'),
                            }:
                        # special sync behavior for magic ermrest tables
                        # HACK: these are assumed small enough to join via local merge of arrays
                        want = sorted(self.load_json_file(
                            self.get_table_path(sname, tname, is_bag)),
                                      key=lambda r: r['ID'])
                        have = sorted(self.dst_catalog.get(
                            "/entity/%s?limit=none" % tname_uri).json(),
                                      key=lambda r: r['ID'])
                        create = []
                        update = []

                        pos_want = 0
                        pos_have = 0
                        while pos_want < len(want):
                            while pos_have < len(have) and have[pos_have][
                                    'ID'] < want[pos_want]['ID']:
                                # dst-only rows will be retained as is
                                pos_have += 1
                            if pos_have >= len(have) or have[pos_have][
                                    'ID'] > want[pos_want]['ID']:
                                # src-only rows will be inserted
                                create.append(want[pos_want])
                                pos_want += 1
                            else:
                                # overlapping rows will be updated
                                update.append(want[pos_want])
                                pos_want += 1

                        self.dst_catalog.post(
                            "/entity/%s?nondefaults=RCT,RCB" % tname_uri,
                            json=create)
                        self.dst_catalog.put(
                            "/attributegroup/%s/ID;%s" % (tname_uri, ",".join([
                                urlquote(c.name) for c in src_model.
                                schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}
                            ])),
                            json=update)

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2)

            # apply stage 2 model in bulk only... we won't get here unless preceding succeeded
            logging.info("Restoring foreign keys...")
            new_fkeys = []
            for fkeys in fkeys_deferred.values():
                new_fkeys.extend(fkeys)

            # restore fkeys
            if new_fkeys:
                self.dst_catalog.post("/schema", json=new_fkeys)

            # restore assets
            if self.restore_assets:
                self.upload_assets()

            # cleanup
            self.cleanup_restored_catalog()
        except:
            success = False
            raise
        finally:
            elapsed_time = datetime.datetime.now() - start
            total_secs = elapsed_time.total_seconds()
            elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
            logging.info("Restore of catalog %s %s. %s" %
                         (self.dst_catalog.get_server_uri(),
                          "completed successfully" if success else "failed",
                          ("Elapsed time: %s" % elapsed) if
                          (total_secs > 0) else ""))
Code example #15
def download_data(location, local_path):
    """Download data from a remote host to the configured machine.
    (Many sources to one destination)

    Arguments:
        location (str): The location of the data.
        local_path (str): The path to the local storage location.

    Returns:
        str: The local path to the extracted bag.
    """
    filename = None
    # If the local_path is a file and not a directory, use the directory
    if ((os.path.exists(local_path) and not os.path.isdir(local_path))
            or (not os.path.exists(local_path) and local_path[-1] != "/")):
        # Save the filename for later
        filename = os.path.basename(local_path)
        local_path = os.path.dirname(local_path) + "/"

    os.makedirs(local_path, exist_ok=True)

    loc_info = urllib.parse.urlparse(location)
    # HTTP(S)
    if loc_info.scheme.startswith("http"):
        # Get default filename and extension
        http_filename = os.path.basename(loc_info.path)
        if not http_filename:
            http_filename = "archive"
        ext = os.path.splitext(http_filename)[1]
        if not ext:
            ext = ".archive"

        # Fetch file
        with requests.get(location, stream=True) as res:
            if res.status_code >= 300:
                logger.error(
                    f"Error {res.status_code} downloading file '{location}': "
                    f"{res.content}")
                raise IOError("File download failed: {}".format(res.content))
            else:
                logger.debug(
                    f"Downloaded file {location} with status code {res.status_code}"
                )
            # Get filename from header if present
            con_disp = res.headers.get("Content-Disposition", "")
            filename_start = con_disp.find("filename=")
            if filename_start >= 0:
                filename_end = con_disp.find(";", filename_start)
                if filename_end < 0:
                    filename_end = None
                http_filename = con_disp[filename_start +
                                         len("filename="):filename_end]
                http_filename = http_filename.strip("\"'; ")

            # Create path for file
            archive_path = os.path.join(local_path, filename or http_filename)
            # Download and save file
            with open(archive_path, 'wb') as out:
                shutil.copyfileobj(res.raw, out)
            logger.debug("Saved HTTP file: {}".format(archive_path))

        # Assume data is BDBag, extract
        bag_path = bdbag_api.extract_bag(archive_path, local_path)
    # Not supported
    else:
        # Nothing to do
        raise IOError(
            "Invalid data location: '{}' is not a recognized protocol "
            "(from {}).".format(loc_info.scheme, str(location)))
    # Return path to bag
    return bag_path
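A hypothetical call to download_data above; the URL and destination directory are illustrative only. The function downloads the archive over HTTP(S), extracts it with bdbag_api.extract_bag, and returns the path to the extracted bag:

bag_path = download_data("https://example.org/bags/my-bag.zip", "/tmp/downloads/")
print("Extracted bag at:", bag_path)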
Code example #16
File: bdbag_cli.py  Project: kylechard/bdbag
def main():

    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive = None
    temp_path = None
    error = None
    result = 0

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)

    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    sys.stderr.write('\n')

    return result
Code example #17
def submit_job(input_minid, wf_minid, api_key=None):
    #### BASIC ASSUMPTIONS:
    # 1. User has a globus ID and has account in GG
    # 2. User has created an API key
    # 3. User does not have the workflow setup on their account

    #### A. Get workflow GA file from the workflow MINID
    #### B. Push GA file to the instance url
    gi = GalaxyInstance(URL, api_key)

    QUERY_BASE = "http://minid.bd2k.org/minid/landingpage/"
    tmp_path = tempfile.mkdtemp()
    wf_mine = None
    try:
        # A.
        BASE_DOWNLOAD_PATH = "/%s" % (tmp_path)
        query = "%s/%s" % (QUERY_BASE, wf_minid)
        # print("Executing query: %s" % query)
        r = requests.get(query, headers={"Accept": "application/json"})
        location = r.json()["locations"][0]['link']
        filename = location.split("/")[-1]
        path = "%s/%s" % (BASE_DOWNLOAD_PATH, filename)
        # print("Downloading result: %s" % location)

        # Save the bag from the minid location
        response = requests.get(location, stream=True)
        with open(path, 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)

        extract_path = ".".join(path.split(".")[0:-1])
        output_path = "%s/%s" % (extract_path, ".".join(
            filename.split(".")[0:-1]))
        # print("Extracting bag and resolving fetch: %s" % output_path)
        bdbag_api.extract_bag(path, extract_path)
        time.sleep(5)
        # print('resolving fetch')
        bdbag_api.resolve_fetch(output_path, True)
        ga_file = glob.glob("%s/data/*.ga" % (output_path))[0]

        # B.
        ga_dict = None
        with open(ga_file) as handle:
            ga_dict = json.loads(handle.read())
        if ga_dict is not None:
            wf_mine = gi.workflows.import_workflow_dict(ga_dict)

    finally:
        shutil.rmtree(tmp_path)
        # print('finished!')

    # published_workflow_id = "6f1411e6cfea8ef7"
    # workflow_name = "imported: RNA-seq-Gtex-stage1-v2.0-bags_transfer"
    #
    ## check if workflow exists
    # workflows = gi.workflows.get_workflows(name=workflow_name)
    # wf_mine = None
    # if len(workflows) > 0:
    #    wf_mine = workflows[-1]
    # else:
    #    # workflow does not exist, need to import from published
    #    wf_mine = gi.workflows.import_shared_workflow(published_workflow_id)

    # create a history
    history_name = "topmed_history_%s" % time.strftime(
        "%a_%b_%d_%Y_%-I:%M:%S_%p", time.localtime(time.time()))
    history = gi.histories.create_history(name=history_name)
    wf_data = {}
    wf_data['workflow_id'] = wf_mine['id']
    wf_data['ds_map'] = {}
    parameters = {}
    parameters['0'] = {'minid': input_minid}
    parameters['5'] = {
        'historyid': history['id'],
        'userapi': api_key,
        'url': URL
    }
    wf_data['parameters'] = parameters

    # print('super close to finishing!')

    res = gi.workflows.invoke_workflow(wf_data['workflow_id'],
                                       wf_data['ds_map'],
                                       params=wf_data['parameters'],
                                       history_id=history['id'],
                                       import_inputs_to_history=False)
    return {
        'history_name': history_name,
        'history_id': res['history_id'],
        'res': res
    }
Code example #18
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using the Datapackage package.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    # If data_path is BDBag archive, unarchive to temp dir
    try:
        data_path = bdbag_api.extract_bag(data_path, temp=True)
    # data_path is not archive
    except RuntimeError:
        pass
    # If data_path is dir (incl. if was unarchived), find JSON desc
    if os.path.isdir(data_path):
        # If 'data' dir present, search there instead
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        # Find .json file (cannot be hidden)
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            return {
                "is_valid":
                False,
                "raw_errors":
                [FileNotFoundError("No TableSchema JSON file found.")],
                "error":
                "No TableSchema JSON file found."
            }
        elif len(desc_file_list) > 1:
            return {
                "is_valid":
                False,
                "raw_errors":
                [RuntimeError("Multiple JSON files found in directory.")],
                "error":
                "Multiple JSON files found in directory."
            }
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # data_path should/must be file now (JSON desc)
    if not os.path.isfile(data_path):
        return {
            "is_valid":
            False,
            "raw_errors": [
                ValueError(
                    "Path '{}' does not refer to a file".format(data_path))
            ],
            "error":
            "Path '{}' does not refer to a file".format(data_path)
        }

    # Read into Package, return error on failure
    try:
        pkg = Package(descriptor=data_path, strict=True)
    except Exception as e:
        return {"is_valid": False, "raw_errors": e.errors, "error": e.errors}

    if schema:
        # Download reference schema
        schema_path = os.path.join(os.path.dirname(data_path),
                                   "validation_schema.json")
        try:
            with open(schema_path, "wb") as f:
                f.write(requests.get(schema).content)
        except Exception as e:
            return {
                "is_valid": False,
                "raw_errors": [e],
                "error": "Error while downloading schema: {}".format(str(e))
            }
        # TODO: Validate against downloaded schema
        print(
            "Warning: Currently unable to validate data against existing schema '{}'."
            .format(schema))

    # Actually check and return package validity based on Package validation
    return {
        "is_valid": pkg.valid,
        "raw_errors": pkg.errors,
        "error": "\n".join([str(err) for err in pkg.errors])
    }