Example no. 1
0
    def _get_content_metadata(self):
        """Get ALL metadata for all dataset content.

        Returns
        -------
        generator((location, metadata_dict))
        """
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Start core metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='Core metadata extraction',
            unit=' Files',
        )
        if not isinstance(self.ds.repo, AnnexRepo):
            for p in self.paths:
                # this extractor gives a response for ANY file, as it serves
                # as an indicator of file presence (i.e. a file list) in the
                # content metadata, even if we know nothing but the filename
                # about a file
                yield (p, dict())
            log_progress(
                lgr.info,
                'extractordataladcore',
                'Finished core metadata extraction from %s', self.ds
            )
            return
        valid_paths = None
        # if the combined length of all requested paths is huge, passing them
        # to `whereis` as command-line arguments is not practical; query the
        # whole repository instead and filter the results against this set
        if self.paths and sum(len(i) for i in self.paths) > 500000:
            valid_paths = set(self.paths)
        # Availability information
        for file, whereis in self.ds.repo.whereis(
                self.paths if self.paths and valid_paths is None else '.',
                output='full').items():
            if file.startswith('.datalad') or (valid_paths and file not in valid_paths):
                # do not report on our own internal annexed files (e.g. metadata
                # blobs), nor on files outside the requested paths
                continue
            log_progress(
                lgr.info,
                'extractordataladcore',
                'Extracted core metadata from %s', file,
                update=1,
                increment=True)
            # pull out proper (public) URLs
            # TODO possibly extend with special remote info later on
            meta = {'url': whereis[remote].get('urls', [])
                    for remote in whereis
                    # "web" remote
                    if remote == "00000000-0000-0000-0000-000000000001" and
                    whereis[remote].get('urls', None)}
            yield (file, meta)
        log_progress(
            lgr.info,
            'extractordataladcore',
            'Finished core metadata extraction from %s', self.ds
        )
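All of the snippets in this collection share the same three-phase progress-reporting idiom: one call announcing the total, one incremental call per processed item, and one closing call. Below is a minimal, self-contained sketch of that pattern, assuming `log_progress` from `datalad.log` and a standard logger; the function name and progress-bar id are illustrative, not DataLad API.

import logging

from datalad.log import log_progress  # same helper as used in the snippets above

lgr = logging.getLogger('datalad.example')


def iter_with_progress(items, pbar_id='example_progress'):
    """Yield `items` while reporting start/update/finish progress.

    `items` must be sized, because len() is used for the total.
    """
    log_progress(
        lgr.info, pbar_id,
        'Start processing %d items', len(items),
        total=len(items),
        label='Processing',
        unit=' Items',
    )
    for item in items:
        log_progress(
            lgr.info, pbar_id,
            'Processing %s', item,
            update=1,
            increment=True,
        )
        yield item
    log_progress(
        lgr.info, pbar_id,
        'Finished processing %d items', len(items),
    )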
Example no. 2
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        log_progress(
            lgr.info,
            'extractoraudio',
            'Start audio metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='audio metadata extraction',
            unit=' Files',
        )
        contentmeta = []
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractoraudio',
                'Extract audio metadata from %s', absfp,
                update=1,
                increment=True)
            info = audiofile(absfp, easy=True)
            if info is None:
                continue
            meta = {vocab_map.get(k, k): info[k][0]
                    if isinstance(info[k], list) and len(info[k]) == 1 else info[k]
                    for k in info}
            if hasattr(info, 'mime') and len(info.mime):
                meta['format'] = 'mime:{}'.format(info.mime[0])
            for k in ('length', 'channels', 'bitrate', 'sample_rate'):
                if hasattr(info.info, k):
                    val = getattr(info.info, k)
                    if k == 'length':
                        # duration comes in seconds, cap at millisecond level
                        val = round(val, 3)
                    meta[vocab_map.get(k, k)] = val
            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractoraudio',
            'Finished audio metadata extraction from %s', self.ds
        )
        return {
            '@context': {
                'music': {
                    '@id': 'http://purl.org/ontology/mo/',
                    'description': 'Music Ontology with main concepts and properties for describing music',
                    'type': vocabulary_id,
                },
                'duration(s)': {
                    "@id": 'time:Duration',
                    "unit": "uo:0000010",
                    'unit_label': 'second',
                },
            },
        }, \
            contentmeta
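The dictionary comprehension in get_metadata above does two things at once: it maps keys through vocab_map and unwraps single-element lists returned by the tag reader. The unwrapping step in isolation amounts to a small helper like this (the name is hypothetical, shown only to make the comprehension easier to read):

def _unwrap_single(value):
    """Return the sole element of a one-item list, otherwise the value unchanged."""
    if isinstance(value, list) and len(value) == 1:
        return value[0]
    return value

# e.g. _unwrap_single(['Some Artist']) -> 'Some Artist'
#      _unwrap_single(['a', 'b'])      -> ['a', 'b']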
Example no. 3
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorimage',
            'Start image metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='image metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractorimage',
                'Extract image metadata from %s', absfp,
                update=1,
                increment=True)
            try:
                img = Image.open(absfp)
            except Exception as e:
                lgr.debug("Image metadata extractor failed to load %s: %s",
                          absfp, exc_str(e))
                continue
            meta = {
                'type': 'dctype:Image',
            }

            # run all extractors
            meta.update({k: v(img) for k, v in self._extractors.items()})
            # filter useless fields (anything with zero length, e.g. empty
            # strings or empty tuples)
            meta = {k: v for k, v in meta.items()
                    if not (hasattr(v, '__len__') and not len(v))}
            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractorimage',
            'Finished image metadata extraction from %s', self.ds
        )
        return {
            '@context': vocabulary,
        }, \
            contentmeta
Example no. 4
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        log_progress(
            lgr.info,
            'extractorexif',
            'Start EXIF metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='EXIF metadata extraction',
            unit=' Files',
        )
        contentmeta = []
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractorexif',
                'Extract EXIF metadata from %s', absfp,
                update=1,
                increment=True)
            # TODO we might want to do some more elaborate extraction in the future
            # but for now plain EXIF, no maker extensions, no thumbnails
            with open(absfp, 'rb') as exif_file:
                info = process_file(exif_file, details=False)
            if not info:
                # got nothing, likely nothing there
                continue
            meta = {k.split()[-1]: _return_as_appropriate_dtype(info[k].printable)
                    for k in info}
            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractorexif',
            'Finished EXIF metadata extraction from %s', self.ds
        )
        return {
            '@context': {
                'exif': {
                    '@id': 'http://www.w3.org/2003/12/exif/ns/',
                    'description': 'Vocabulary to describe an Exif format picture data',
                    'type': vocabulary_id,
                },
            },
        }, \
            contentmeta
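The _return_as_appropriate_dtype helper referenced above is not shown in this snippet. A hypothetical re-implementation of such a coercion, turning EXIF's printable strings into numbers where possible, could look like the sketch below; it is purely illustrative and not necessarily DataLad's actual helper.

def _return_as_appropriate_dtype(value):
    """Coerce an EXIF 'printable' string to int or float where possible."""
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        # not numeric, keep the original string
        return value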
Example no. 5
0
def branch_blobs(repo, branch):
    """Get all blobs for `branch`.

    Parameters
    ----------
    repo : GitRepo
    branch : str

    Returns
    -------
    A generator object that returns (hexsha, content, file name) for each blob
    in `branch`.  Note: By design a blob isn't tied to a particular file name;
    the returned file name matches what is returned by 'git rev-list'.
    """
    git = repo.repo.git
    # Note: This might be nicer with rev-list's --filter and
    # --filter-print-omitted, but those aren't available until Git v2.16.
    lines = git.rev_list(branch, objects=True).splitlines()
    # Trees and blobs have an associated path printed.
    objects = (ln.split() for ln in lines)
    blob_trees = [obj for obj in objects if len(obj) == 2]

    num_objects = len(blob_trees)

    log_progress(lgr.info, "repodates_branch_blobs",
                 "Checking %d objects", num_objects,
                 label="Checking objects", total=num_objects, unit=" objects")
    # This is inefficient.  It makes a git call for each object, some of which
    # aren't even blobs.  We could instead use 'git cat-file --batch'.
    for obj, fname in blob_trees:
        log_progress(lgr.info, "repodates_branch_blobs",
                     "Checking %s", obj,
                     increment=True, update=1)
        try:
            yield obj, git.cat_file("blob", obj), fname
        except GitCommandError:  # The object was a tree.
            continue
    log_progress(lgr.info, "repodates_branch_blobs",
                 "Finished checking %d objects", num_objects)
Example no. 6
0
def branch_blobs_in_tree(repo, branch):
    """Get all blobs for the current tree of `branch`.

    Parameters
    ----------
    repo : GitRepo
    branch : str, optional

    Returns
    -------
    A generator object that returns (hexsha, content, file name) for each blob.
    Note: If there are multiple files in the tree that point to the blob, only
    the first file name that is reported by 'git ls-tree' is used (i.e., one
    entry per blob is yielded).
    """
    seen_blobs = set()
    git = repo.repo.git
    out = git.ls_tree(branch, z=True, r=True)
    if out:
        lines = out.strip("\0").split("\0")
        num_lines = len(lines)
        log_progress(lgr.info,
                     "repodates_blobs_in_tree",
                     "Checking %d objects in git-annex tree", num_lines,
                     label="Checking objects", total=num_lines,
                     unit=" objects")
        for line in lines:
            # each entry is "<mode> <type> <object>\t<file name>"; limit the
            # split so that file names containing whitespace stay intact
            _, obj_type, obj, fname = line.split(maxsplit=3)
            log_progress(lgr.info, "repodates_blobs_in_tree",
                         "Checking %s", obj,
                         increment=True, update=1)
            if obj_type == "blob" and obj not in seen_blobs:
                yield obj, git.cat_file("blob", obj), fname
            seen_blobs.add(obj)
        log_progress(lgr.info, "repodates_blobs_in_tree",
                     "Finished checking %d blobs", num_lines)
Example no. 7
0
def add_extra_filename_values(filename_format, rows, urls, dry_run):
    """Extend `rows` with values for special formatting fields.
    """
    file_fields = list(get_fmt_names(filename_format))
    if any(i.startswith("_url") for i in file_fields):
        for row, url in zip(rows, urls):
            row.update(get_url_parts(url))

    if any(i.startswith("_url_filename") for i in file_fields):
        if dry_run:  # Don't waste time making requests.
            dummy = get_file_parts("BASE.EXT", "_url_filename")
            for idx, row in enumerate(rows):
                row.update(
                    {k: v + str(idx) for k, v in dummy.items()})
        else:
            num_urls = len(urls)
            log_progress(lgr.info, "addurls_requestnames",
                         "Requesting file names for %d URLs", num_urls,
                         label="Requesting names", total=num_urls,
                         unit=" Files")
            for row, url in zip(rows, urls):
                # If we run into any issues here, we're just going to raise an
                # exception and then abort inside dlplugin.  It'd be good to
                # disentangle this from `extract` so that we could yield an
                # individual error, drop the row, and keep going.
                filename = get_url_filename(url)
                if filename:
                    row.update(get_file_parts(filename, "_url_filename"))
                else:
                    raise ValueError(
                        "{} does not contain a filename".format(url))
                log_progress(lgr.info, "addurls_requestnames",
                             "%s returned for %s", url, filename,
                             update=1, increment=True)
            log_progress(lgr.info, "addurls_requestnames",
                         "Finished requesting file names")
Example no. 8
0
def _yield_res_from_pre2019_extractor(ds, name, extractor_cls, process_type,
                                      paths):  # pragma: no cover
    """This implements dealing with our first extractor class concept"""

    want_dataset_meta = process_type in ('all', 'dataset') \
        if process_type else ds.config.obtain(
            'datalad.metadata.extract-dataset-{}'.format(
                name.replace('_', '-')),
            default=True,
            valtype=EnsureBool())
    want_content_meta = process_type in ('all', 'content') \
        if process_type else ds.config.obtain(
            'datalad.metadata.extract-content-{}'.format(
                name.replace('_', '-')),
            default=True,
            valtype=EnsureBool())

    if not (want_dataset_meta or want_content_meta):  # pragma: no cover
        log_progress(
            lgr.info,
            'metadataextractors',
            'Skipping %s metadata extraction from %s, '
            'disabled by configuration',
            name,
            ds,
        )
        return

    try:
        extractor = extractor_cls(ds, paths)
    except Exception as e:  # pragma: no cover
        log_progress(
            lgr.error,
            'metadataextractors',
            'Failed %s metadata extraction from %s',
            name,
            ds,
        )
        raise ValueError(
            "Failed to load metadata extractor for '{}', "
            "broken dataset configuration ({})?: {}".format(
                name, ds, exc_str(e)))

    # this is the old way of extractor operation
    dsmeta_t, contentmeta_t = extractor.get_metadata(
        dataset=want_dataset_meta,
        content=want_content_meta,
    )
    # fake the new way of reporting results directly
    # extractors had no way to report errors, hence
    # everything is unconditionally 'ok'
    for loc, meta in contentmeta_t or []:
        yield dict(
            status='ok',
            path=loc,
            type='file',
            metadata=meta,
        )
    yield dict(
        status='ok',
        path=ds.path,
        type='dataset',
        metadata=dsmeta_t,
    )
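The shim above yields plain result records. A hypothetical consumer that reassembles them into the pre-2019 (dataset-metadata, content-metadata) shape could look like this; the function name and return shape are illustrative, not part of DataLad's API.

def split_metadata_results(results):
    """Separate the shim's result records into dataset and per-file metadata."""
    dsmeta = {}
    contentmeta = {}
    for res in results:
        if res['status'] != 'ok':
            continue
        if res['type'] == 'dataset':
            dsmeta = res['metadata']
        elif res['type'] == 'file':
            contentmeta[res['path']] = res['metadata']
    return dsmeta, contentmeta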
Example no. 9
0
    def __call__(
        url,
        name,
        dataset=None,
        storage_name=None,
        post_update_hook=False,
        shared=None,
        group=None,
        storage_sibling=True,
        existing='error',
        trust_level=None,
        recursive=False,
        recursion_limit=None,
        disable_storage__=None,
    ):
        if disable_storage__ is not None:
            import warnings
            warnings.warn(
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.",
                DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided")

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(status='error', message=str(e), **res_kwargs)
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only surfaces later in a subdataset can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info,
                pbar_id,
                'Start checking pre-existing sibling configuration %s',
                ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(lgr.info,
                             pbar_id,
                             'Discovered sibling %s in dataset at %s',
                             r['name'],
                             r['path'],
                             update=1,
                             increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info,
                pbar_id,
                'Finished checking pre-existing sibling configuration %s',
                ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.

        create_store(
            SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path),
            '1')

        yield from _create_sibling_ria(ds, url, name, storage_sibling,
                                       storage_name, existing, shared, group,
                                       post_update_hook, trust_level,
                                       res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(subds, url, name,
                                               storage_sibling, storage_name,
                                               existing, shared, group,
                                               post_update_hook, trust_level,
                                               res_kwargs)
Example no. 10
0
    def __call__(
            url,
            name,
            *,  # note that `name` is required but not posarg in CLI
            dataset=None,
            storage_name=None,
            alias=None,
            post_update_hook=False,
            shared=None,
            group=None,
            storage_sibling=True,
            existing='error',
            new_store_ok=False,
            trust_level=None,
            recursive=False,
            recursion_limit=None,
            disable_storage__=None,
            push_url=None):
        if disable_storage__ is not None:
            import warnings
            warnings.warn(
                "datalad-create-sibling-ria --no-storage-sibling "
                "is deprecated, use --storage-sibling off instead.",
                DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided")

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create RIA sibling(s)')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.
        try:
            ssh_host, base_path, rewritten_url = \
                verify_ria_url(push_url if push_url else url, ds.config)
        except ValueError as e:
            yield get_status_dict(status='error', message=str(e), **res_kwargs)
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only surfaces later in a subdataset can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            failed = False
            for dpath, sname in _yield_ds_w_matching_siblings(
                    ds, (name, storage_name),
                    recursive=recursive,
                    recursion_limit=recursion_limit):
                res = get_status_dict(
                    status='error',
                    message=(
                        "a sibling %r is already configured in dataset %r",
                        sname, dpath),
                    type='sibling',
                    name=sname,
                    ds=ds,
                    **res_kwargs,
                )
                failed = True
                yield res
            if failed:
                return
        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option

        io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO()
        try:
            # determine the existence of a store by trying to read its layout.
            # Because this raises a FileNotFoundError (or a comparable remote
            # error) if the store does not exist, we need to catch it
            io.read_file(Path(base_path) / 'ria-layout-version')
        except (FileNotFoundError, RIARemoteError,
                RemoteCommandFailedError) as e:
            if not new_store_ok:
                # we're instructed to only act in case of an existing RIA store
                res = get_status_dict(status='error',
                                      message="No store found at '{}'. Forgot "
                                      "--new-store-ok ?".format(
                                          Path(base_path)),
                                      **res_kwargs)
                yield res
                return

        log_progress(
            lgr.info,
            'create-sibling-ria',
            'Creating a new RIA store at %s',
            Path(base_path),
        )
        create_store(io, Path(base_path), '1')

        yield from _create_sibling_ria(ds, url, push_url, name,
                                       storage_sibling, storage_name, alias,
                                       existing, shared, group,
                                       post_update_hook, trust_level,
                                       res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(state='present',
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        return_type='generator',
                                        result_renderer='disabled',
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    push_url,
                    name,
                    storage_sibling,
                    storage_name,
                    None,  # subdatasets can't have the same alias as the parent
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
Example no. 11
0
    def __call__(url,
                 name,
                 dataset=None,
                 ria_remote_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 ria_remote=True,
                 existing='error',
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not ria_remote and ria_remote_name:
            lgr.warning(
                "RIA remote setup disabled, but a ria-remote name was provided"
            )

        if ria_remote and not ria_remote_name:
            ria_remote_name = "{}-ria".format(name)

        if ria_remote and name == ria_remote_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only surfaces later in a subdataset can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info,
                pbar_id,
                'Start checking pre-existing sibling configuration %s',
                ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(lgr.info,
                             pbar_id,
                             'Discovered sibling %s in dataset at %s',
                             r['name'],
                             r['path'],
                             update=1,
                             increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if ria_remote_name and r['name'] == ria_remote_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(ria_remote_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info,
                pbar_id,
                'Finished checking pre-existing sibling configuration %s',
                ds,
            )
            if failed:
                return

        yield from _create_sibling_ria(ds, url, name, ria_remote,
                                       ria_remote_name, existing, shared,
                                       group, post_update_hook, res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(subds, url, name, ria_remote,
                                               ria_remote_name, existing,
                                               shared, group, post_update_hook,
                                               res_kwargs)
Example no. 12
0
    def __call__(dataset, urlfile, urlformat, filenameformat,
                 input_type="ext", exclude_autometa=None, meta=None,
                 message=None, dry_run=False, fast=False, ifexists=None,
                 missing_value=None, save=True, version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type,
                                         url_format, filename_format,
                                         exclude_autometa, meta,
                                         dry_run,
                                         missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s",
                         row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info("Metadata: %s",
                         sorted(u"{}={}".format(k, v)
                                for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator'):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning(
                    "Not creating subdataset at existing path: %s",
                    spath)
            else:
                for r in dataset.create(spath, result_xfm=None,
                                        return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({"filename_abs": filename_abs,
                        "ds": ds_current,
                        "ds_filename": ds_filename})

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioning %d URLs", num_urls,
                         label="Versioning URLs",
                         total=num_urls, unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s",
                                row["url"], exc_str(exc))
                log_progress(lgr.info, "addurls_versionurls",
                             "Versioned result for %s: %s", url, row["url"],
                             update=1, increment=True)
            log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            if save:
                for r in dataset.save(path=files_to_add, message=msg, recursive=True):
                    yield r
Example no. 13
0
def clone_dataset(srcs,
                  destds,
                  reckless=None,
                  description=None,
                  result_props=None,
                  cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or adding
    in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations, i.e.
      sacrifice data safety for performance or resource footprint.
    description : str, optional
      Location description for the annex of the dataset clone (if there is any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to get_status_dict().
    cfg : ConfigManager, optional
      Configuration will be queried from this instance (i.e. from a particular
      dataset). If None is given, the global DataLad configuration will be
      queried.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any non-path stringification
                # pass through unmodified, but we do not want any potential crash due to
                # pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or get_local_file_url(track_url, compatibility='git') == src \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds, src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and is not empty, '
                    'refusing to clone into it',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s',
        destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate all error messages, keyed by source URL
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(lgr.info,
                     'cloneds',
                     'Attempting to clone from %s to %s',
                     cand['giturl'],
                     dest_path,
                     update=1,
                     increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and PY35
            # doesn't make it happen seamlessly
            GitRepo.clone(path=str(dest_path),
                          url=cand['giturl'],
                          clone_options=clone_opts,
                          create=True)

        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)", cand['giturl'],
                      exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$",
                                    e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(lgr.info, 'cloneds',
                             'Completed clone attempts for %s', destds)
                yield get_status_dict(status='error',
                                      message=re_match.group(1).strip()
                                      if re_match else "stderr: " + e_stderr,
                                      **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(lgr.info, 'cloneds', 'Completed clone attempts for %s',
                 destds)

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join('{}\n  {}'.format(url, exc_str(exc))
                                         for url, exc in error_msgs.items())
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'succesful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(status='error',
                              message=(error_msg, error_args),
                              **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(destds, reckless, description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
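The candidate-expansion step near the top of clone_dataset duplicates each decoded source record once per URL variant. In isolation the pattern looks like the sketch below; the function and helper names are stand-ins for the internal helpers used above.

def expand_candidates(decoded_sources, url_variants):
    """Duplicate each decoded source record for every variant of its git URL."""
    return [
        # copy the record, overriding only the 'giturl' property
        dict(props, giturl=variant)
        for props in decoded_sources
        for variant in url_variants(props['giturl'])
    ]

# e.g. with url_variants=lambda u: [u, u + '/.git'], a single record with
# giturl='https://example.com/ds' becomes two records differing only in 'giturl'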
Example no. 14
0
    def __call__(target, opts=None, dataset=None):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='ORA archive export')
        ds_repo = ds.repo

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        keypaths = [
            k for k in annex_objs.glob(op.join('**', '*')) if k.is_file()
        ]

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s',
            ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(lgr.info,
                         'oraarchiveexport',
                         'Export key %s to %s',
                         key,
                         hashdir,
                         update=1,
                         increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keydir))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(lgr.info, 'oraarchiveexport',
                     'Finished ORA archive export from %s', ds)
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
                # raise on non-zero exit, so a failed 7z run is reported as an
                # error result below rather than silently yielding 'ok'
                check=True,
            )
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='ok',
                                  **res_kwargs)
        except Exception as e:
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='error',
                                  message=('7z failed: %s', exc_str(e)),
                                  **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
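The hard-link-with-copy-fallback used in the export loop above can be expressed as a small standalone helper; a minimal sketch follows (the function name is illustrative, and unlike the loop above it retries the hard link on every call rather than switching permanently after the first failure).

import os
import shutil


def link_or_copy(src, dest):
    """Hard-link src to dest; fall back to copying when hard links are
    unsupported (e.g. across filesystems)."""
    try:
        os.link(src, dest)
    except OSError:
        shutil.copyfile(src, dest)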
Example no. 15
0
def _push(dspath,
          content,
          target,
          data,
          force,
          jobs,
          res_kwargs,
          pbars,
          got_path_arg=False):
    force_git_push = force in ('all', 'gitpush')

    # nothing recursive in here, we only need a repo to work with
    ds = Dataset(dspath)
    repo = ds.repo

    res_kwargs.update(type='dataset', path=dspath)

    # content will be unique for every push (even on the same dataset)
    pbar_id = 'push-{}-{}'.format(target, id(content))
    # register for final orderly take down
    pbars[pbar_id] = ds
    log_progress(
        lgr.info,
        pbar_id,
        'Determine push target',
        unit=' Steps',
        label='Push',
        total=4,
    )
    # pristine input arg
    _target = target
    # verified or auto-detected
    target = None
    if not _target:
        try:
            # let Git figure out what needs doing
            # we will reuse the result further down again, so nothing is wasted
            wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run'])
            # we did not get an explicit push target, get it from Git
            target = set(p.get('remote', None) for p in wannabe_gitpush)
            # handle case where a pushinfo record did not have a 'remote'
            # property -- should not happen, but be robust
            target.discard(None)
        except Exception as e:
            lgr.debug(
                'Dry-run push to determine default push target failed, '
                'assume no configuration: %s', e)
            target = set()
        if not len(target):
            yield dict(
                res_kwargs,
                status='impossible',
                message='No push target given, and none could be '
                'auto-detected, please specify via --to',
            )
            return
        elif len(target) > 1:
            # dunno if this can ever happen, but if it does, report
            # nicely
            yield dict(res_kwargs,
                       status='error',
                       message=(
                           'No push target given, '
                           'multiple candidates auto-detected: %s',
                           list(target),
                       ))
            return
        else:
            # can only be a single one at this point
            target = target.pop()

    if not target:
        if _target not in repo.get_remotes():
            yield dict(res_kwargs,
                       status='error',
                       message=("Unknown target sibling '%s'.", _target))
            return
        target = _target

    log_progress(lgr.info,
                 pbar_id,
                 "Push refspecs",
                 label="Push to '{}'".format(target),
                 update=1,
                 total=4)

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(target)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = ensure_list(ds.config.get(depvar, []))
    if publish_depends:
        lgr.debug("Discovered publication dependencies for '%s': %s'", target,
                  publish_depends)

    # cache repo type
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # TODO prevent this when `target` is a special remote
    # (possibly redo) a push attempt to figure out what needs pushing
    # do this on the main target only, and apply the result to all
    # dependencies
    try:
        if _target:
            # only do it when an explicit target was given, otherwise
            # we can reuse the result from the auto-probing above
            wannabe_gitpush = repo.push(remote=target,
                                        git_options=['--dry-run'])
    except Exception as e:
        lgr.debug(
            'Dry-run push to check push configuration failed, '
            'assume no configuration: %s', e)
        wannabe_gitpush = []
    refspecs2push = [
        # if an upstream branch is set, go with it
        p['from_ref'] if ds.config.get(
            # refs come in as refs/heads/<branchname>
            # need to cut the prefix
            'branch.{}.remote'.format(p['from_ref'][11:]),
            None) == target
        and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None)
        # if not, define target refspec explicitly to avoid having to
        # set an upstream branch, which would happen implicitly from
        # a user's POV, and may also be hard to decide when publication
        # dependencies are present
        else '{}:{}'.format(p['from_ref'], p['to_ref'])
        for p in wannabe_gitpush
        # TODO: what if a publication dependency doesn't have it yet
        # should we not attempt to push, because the main target has it?
        if 'uptodate' not in p['operations'] and (
            # cannot think of a scenario where we would want to push a
            # managed branch directly, instead of the corresponding branch
            'refs/heads/adjusted' not in p['from_ref'])
    ]
    # TODO this is not right with managed branches
    active_branch = repo.get_active_branch()
    if active_branch and is_annex_repo:
        # we could face a managed branch, in which case we need to
        # determine the actual one and make sure it is sync'ed with the
        # managed one, and push that one instead. following methods can
        # be called unconditionally
        repo.localsync(managed_only=True)
        active_branch = repo.get_corresponding_branch(
            active_branch) or active_branch

    if not refspecs2push and not active_branch:
        # nothing was set up for push, and there is no active branch either.
        # This is a weird one, let's confess and stop here; I don't think
        # we need to support such a scenario
        yield dict(
            res_kwargs,
            status='impossible',
            message='There is no active branch, cannot determine remote '
            'branch')
        return

    # make sure that we always push the active branch (the context for the
    # potential path arguments) and the annex branch -- because we claim
    # to know better than any git config
    must_have_branches = [active_branch] if active_branch else []
    if is_annex_repo:
        must_have_branches.append('git-annex')
    for branch in must_have_branches:
        _append_branch_to_refspec_if_needed(ds, refspecs2push, branch)

    # we know what to push and where, now dependency processing first
    for r in publish_depends:
        # simply make a call to this function again, all the same, but
        # target is different
        yield from _push(
            dspath,
            content,
            # to this particular dependency
            r,
            data,
            force,
            jobs,
            res_kwargs.copy(),
            pbars,
            got_path_arg=got_path_arg,
        )

    # and lastly the primary push target
    target_is_git_remote = repo.config.get('remote.{}.url'.format(target),
                                           None) is not None

    # git-annex data copy
    #
    if is_annex_repo:
        if data != "nothing":
            log_progress(lgr.info,
                         pbar_id,
                         "Transfer data",
                         label="Transfer data to '{}'".format(target),
                         update=2,
                         total=4)
            yield from _push_data(
                ds,
                target,
                content,
                data,
                force,
                jobs,
                res_kwargs.copy(),
                got_path_arg=got_path_arg,
            )
        else:
            lgr.debug("Data transfer to '%s' disabled by argument", target)
    else:
        lgr.debug("No data transfer: %s is not a git annex repository", repo)

    if not target_is_git_remote:
        # there is nothing that we need to push or sync with on the git-side
        # of things with this remote
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Update availability information",
                 label="Update availability for '{}'".format(target),
                 update=3,
                 total=4)

    # TODO fetch is only needed if anything was actually transferred. Collect this
    # info and make the following conditional on it

    # after file transfer the remote might have different commits to
    # the annex branch. They have to be merged locally, otherwise a
    # push of it further down will fail
    try:
        # fetch remote, let annex sync them locally, so that the push
        # later on works.
        # We have to fetch via the push url (if there is any),
        # not a pull url.
        # The latter might be dumb and without the execution of a
        # post-update hook we might not be able to retrieve the
        # server-side git-annex branch updates (and git-annex does
        # not trigger the hook on copy), but we know we have
        # full access via the push url -- we have just used it to copy.
        lgr.debug("Fetch 'git-annex' branch updates from '%s'", target)
        fetch_cmd = ['fetch', target, 'git-annex']
        pushurl = repo.config.get('remote.{}.pushurl'.format(target), None)
        if pushurl:
            # for some reason overwriting remote.{target}.url
            # does not have any effect...
            fetch_cmd = [
                '-c', 'url.{}.insteadof={}'.format(
                    pushurl,
                    repo.config.get('remote.{}.url'.format(target), None))
            ] + fetch_cmd
            lgr.debug("Sync local annex branch from pushurl after remote "
                      'availability update.')
        repo.call_git(fetch_cmd)
        # If no CommandError was raised, it means that remote has git-annex
        # but local repo might not be an annex yet. Since there is nothing to "sync"
        # from us, we just skip localsync without mutating repo into an AnnexRepo
        if is_annex_repo:
            repo.localsync(target)
    except CommandError as e:
        # it is OK if the remote doesn't have a git-annex branch yet
        # (e.g. fresh repo)
        # TODO is this possible? we just copied? Maybe check if anything
        # was actually copied?
        if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower():
            raise
        lgr.debug('Remote does not have a git-annex branch: %s', e)

    if not refspecs2push:
        lgr.debug('No refspecs found that need to be pushed')
        return

    # and push all relevant branches, plus the git-annex branch to announce
    # local availability info too
    yield from _push_refspecs(
        repo,
        target,
        refspecs2push,
        force_git_push,
        res_kwargs.copy(),
    )
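
The refspec selection above hinges on whether a branch already tracks the push target. A minimal, self-contained sketch of that rule follows; `config` stands in for git configuration lookups, and the helper name is illustrative, not part of DataLad's API.

def select_refspec(from_ref, to_ref, target, config):
    """Return `from_ref` when its branch tracks `target`, else an explicit refspec."""
    # refs come in as refs/heads/<branchname> -- cut the prefix
    branch = from_ref[len('refs/heads/'):]
    tracks_target = (
        config.get('branch.{}.remote'.format(branch)) == target
        and config.get('branch.{}.merge'.format(branch)) is not None
    )
    # with a configured upstream the plain ref suffices; otherwise spell out
    # source and destination to avoid implicitly setting an upstream branch
    return from_ref if tracks_target else '{}:{}'.format(from_ref, to_ref)

# example: 'main' tracks 'origin', 'feature' does not
cfg = {'branch.main.remote': 'origin', 'branch.main.merge': 'refs/heads/main'}
assert select_refspec('refs/heads/main', 'refs/heads/main', 'origin', cfg) == 'refs/heads/main'
assert select_refspec('refs/heads/feature', 'refs/heads/feature', 'origin', cfg) == \
    'refs/heads/feature:refs/heads/feature'
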
Esempio n. 16
0
    def __call__(self, dataset, refcommit, process_type, status):
        if process_type not in ('all', 'dataset'):
            return None
        ds = dataset
        log_progress(
            lgr.info,
            'extractorstudyminimeta',
            'Start studyminimeta metadata extraction from {path}'.format(path=ds.path),
            total=len(tuple(status)) + 1,
            label='Studyminimeta metadata extraction',
            unit=' Files',
        )

        source_file = self._get_absolute_studyminimeta_file_name(dataset)
        try:
            with open(source_file, "rt") as input_stream:
                metadata_object = yaml.safe_load(input_stream)
        except FileNotFoundError:
            yield {
                "status": "error",
                "metadata": {},
                "type": process_type,
                "message": "file " + source_file + " could not be opened"
            }
            return
        except yaml.YAMLError as e:
            yield {
                "status": "error",
                "metadata": {},
                "type": process_type,
                "message": "YAML parsing failed with: " + str(e)
            }
            return

        ld_creator_result = LDCreator(
            dataset.id,
            refcommit,
            self._get_relative_studyminimeta_file_name(dataset)
        ).create_ld_from_spec(metadata_object)

        if ld_creator_result.success:
            log_progress(
                lgr.info,
                'extractorstudyminimeta',
                'Finished studyminimeta metadata extraction from {path}'.format(path=ds.path)
            )
            yield {
                "status": "ok",
                "metadata": ld_creator_result.json_ld_object,
                "type": process_type
            }

        else:
            log_progress(
                lgr.error,
                'extractorstudyminimeta',
                'Error in studyminimeta metadata extraction from {path}'.format(path=ds.path)
            )
            yield {
                "status": "error",
                "metadata": {},
                "type": process_type,
                "message": "data structure conversion to JSON-LD failed"
            }
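
The extractor above reports problems as result records rather than raising. A small sketch of that pattern, assuming PyYAML is available; the helper and its name are illustrative only.

import yaml

def load_spec(path, process_type):
    """Return (spec, None) on success, or (None, error_record) on failure."""
    try:
        with open(path, "rt") as stream:
            return yaml.safe_load(stream), None
    except FileNotFoundError:
        return None, {"status": "error", "metadata": {}, "type": process_type,
                      "message": "file " + path + " could not be opened"}
    except yaml.YAMLError as e:
        return None, {"status": "error", "metadata": {}, "type": process_type,
                      "message": "YAML parsing failed with: " + str(e)}
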
Esempio n. 17
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        contentmeta = []
        log_progress(
            lgr.info,
            'extractornifti1',
            'Start NIfTI1 metadata extraction from %s',
            self.ds,
            total=len(self.paths),
            label='NIfTI1 metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(lgr.info,
                         'extractornifti1',
                         'Extract NIfTI1 metadata from %s',
                         absfp,
                         update=1,
                         increment=True)
            try:
                header = nibabel.load(absfp).header
            except Exception as e:
                lgr.debug("NIfTI metadata extractor failed to load %s: %s",
                          absfp, exc_str(e))
                continue
            if not isinstance(header, nibabel.Nifti1Header):
                # all we can do for now
                lgr.debug("Ignoring non-NIfTI1 file %s", absfp)
                continue

            # blunt conversion of the entire header
            meta = {
                self._key2stdkey.get(k, k):
                [np.asscalar(i) for i in v] if len(v.shape)
                # scalar
                else np.asscalar(v)
                for k, v in header.items() if k not in self._ignore
            }
            # more convenient info from nibabel's support functions
            meta.update({k: v(header) for k, v in self._extractors.items()})
            # filter useless fields (empty strings and NaNs)
            meta = {
                k: v
                for k, v in meta.items()
                if not (isinstance(v, float) and isnan(v))
                and not (hasattr(v, '__len__') and not len(v))
            }
            # a few more convenient targeted extracts from the header
            # spatial resolution in millimeter
            spatial_unit = header.get_xyzt_units()[0]
            # by what factor to multiply by to get to 'mm'
            if spatial_unit == 'unknown':
                lgr.debug(
                    "unit of spatial resolution for '{}' unknown, assuming 'millimeter'"
                    .format(absfp))
            spatial_unit_conversion = {
                'unknown': 1,
                'meter': 1000,
                'mm': 1,
                'micron': 0.001
            }.get(spatial_unit, None)
            if spatial_unit_conversion is None:
                lgr.debug(
                    "unexpected spatial unit code '{}' from NiBabel".format(
                        spatial_unit))
            # TODO does not see the light of day
            meta['spatial_resolution(mm)'] = \
                [(i * spatial_unit_conversion) for i in header.get_zooms()[:3]]
            # time
            if len(header.get_zooms()) > 3:
                # got a 4th dimension
                rts_unit = header.get_xyzt_units()[1]
                if rts_unit == 'unknown':
                    lgr.warn(
                        "RTS unit '{}' unknown, assuming 'seconds'".format(
                            absfp))
                # normalize to seconds, if possible
                rts_unit_conversion = {
                    'msec': 0.001,
                    'micron': 0.000001
                }.get(rts_unit, 1.0)
                if rts_unit not in ('hz', 'ppm', 'rads'):
                    meta['temporal_spacing(s)'] = \
                        header.get_zooms()[3] * rts_unit_conversion

            contentmeta.append((f, meta))

            # Decode entries which might be bytes
            # TODO: consider doing that in above "metalad" logic
            for k, v in meta.items():
                if isinstance(v, bytes):
                    meta[k] = v.decode()

        log_progress(lgr.info, 'extractornifti1',
                     'Finished NIfTI1 metadata extraction from %s', self.ds)

        return {
            '@context': vocabulary,
        }, \
            contentmeta
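
The targeted extracts above normalize NIfTI zooms to millimeters and seconds. A compact sketch of that conversion, using the same factor tables as the example; function names are illustrative.

def zooms_to_mm(zooms, spatial_unit):
    # by what factor to multiply to get to 'mm'
    factor = {'unknown': 1, 'meter': 1000, 'mm': 1, 'micron': 0.001}.get(spatial_unit, 1)
    return [z * factor for z in zooms[:3]]

def temporal_spacing_s(zooms, rts_unit):
    # only meaningful with a 4th dimension carrying a time unit
    if len(zooms) <= 3 or rts_unit in ('hz', 'ppm', 'rads'):
        return None
    return zooms[3] * {'msec': 0.001, 'micron': 0.000001}.get(rts_unit, 1.0)

print(zooms_to_mm((2.0, 2.0, 2.0, 1.5), 'meter'))        # [2000.0, 2000.0, 2000.0]
print(temporal_spacing_s((2.0, 2.0, 2.0, 1.5), 'msec'))  # 0.0015
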
Esempio n. 18
0
def _proc(ds, refcommit, sources, status, extractors, process_type):
    dsmeta = dict()
    contentmeta = {}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s',
        ds,
        total=len(sources),
        label='Metadata extraction',
        unit=' extractors',
    )
    for msrc in sources:
        msrc_key = msrc
        extractor = extractors[msrc]
        log_progress(lgr.info,
                     'metadataextractors',
                     'Engage %s metadata extractor',
                     msrc_key,
                     update=1,
                     increment=True)

        # actually pull the metadata records out of the extractor
        for res in _run_extractor(extractor['class'], msrc, ds, refcommit,
                                  status, extractor['process_type']):
            # always have a path, use any absolute path coming in,
            # make any relative path absolute using the dataset anchor,
            # use the dataset path if nothing is coming in (better than
            # no path at all)
            # for now normalize the reported path to be a plain string
            # until DataLad as a whole can deal with pathlib objects
            if 'path' in res:
                res['path'] = text_type(Path(res['path']))
            res.update(
                path=ds.path if 'path' not in res
                else res['path'] if op.isabs(res['path'])
                else op.join(ds.path, res['path']))

            # the following two conditionals are untested, as a test would
            # require a metadata extractor to yield broken metadata, and in
            # order to have such one, we need a mechanism to have the test
            # inject one on the fly. MIH thinks that the code needed to do that
            # has more chances of being broken than the code it would test
            if success_status_map.get(res['status'],
                                      False) != 'success':  # pragma: no cover
                yield res
                # no further processing of broken stuff
                continue
            else:  # pragma: no cover
                # if the extractor was happy check the result
                if not _ok_metadata(res, msrc, ds, None):
                    res.update(
                        # this will prevent further processing a few lines down
                        status='error',
                        # TODO have _ok_metadata report the real error
                        message=('Invalid metadata (%s)', msrc),
                    )
                    yield res
                    continue

            # we do not want to report info that there was no metadata
            if not res['metadata']:  # pragma: no cover
                lgr.debug(
                    'Skip %s %s metadata in record of %s: '
                    'extractor reported nothing', msrc_key,
                    res.get('type', ''), res['path'])
                continue

            if res['type'] == 'dataset':
                # TODO warn if two dataset records are generated by the same
                # extractor
                dsmeta[msrc_key] = res['metadata']
            else:
                # this is file metadata, _ok_metadata() checks unknown types
                # assign; we only ask each metadata extractor once, hence no
                # conflict possible
                loc_dict = contentmeta.get(res['path'], {})
                loc_dict[msrc_key] = res['metadata']
                contentmeta[res['path']] = loc_dict

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s',
        ds,
    )
    # top-level code relies on the fact that any dataset metadata
    # is yielded before content metadata
    if process_type in (None, 'all', 'dataset') and \
            dsmeta and ds is not None and ds.is_installed():
        yield get_status_dict(
            ds=ds,
            metadata=dsmeta,
            # any errors will have been reported before
            status='ok',
        )

    for p in contentmeta:
        res = get_status_dict(
            # TODO avoid is_installed() call
            path=op.join(ds.path, p) if ds.is_installed() else p,
            metadata=contentmeta[p],
            type='file',
            # any errors will have been reported before
            status='ok',
        )
        # TODO avoid is_installed() call, check if such info is
        # useful and accurate at all
        if ds.is_installed():
            res['parentds'] = ds.path
        yield res
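
The path handling above keeps absolute paths, anchors relative ones on the dataset, and falls back to the dataset path itself. A minimal sketch of that rule; `ds_path` stands in for ds.path, and the helper is illustrative.

import os.path as op

def normalize_result_path(res, ds_path):
    p = res.get('path')
    if p is None:
        # no path reported at all -- fall back to the dataset itself
        return ds_path
    return p if op.isabs(p) else op.join(ds_path, p)

print(normalize_result_path({}, '/data/ds'))                    # /data/ds
print(normalize_result_path({'path': 'sub/file'}, '/data/ds'))  # /data/ds/sub/file
print(normalize_result_path({'path': '/abs/file'}, '/data/ds')) # /abs/file
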
Esempio n. 19
0
    def __call__(dataset=None,
                 path=None,
                 sources=None,
                 process_type=None,
                 format='native'):
        ds = require_dataset(dataset or curdir,
                             purpose="extract metadata",
                             check_installed=not path)

        # check what extractors we want as sources, and whether they are
        # available
        if not sources:
            sources = ['metalad_core', 'metalad_annex'] \
                + assure_list(get_metadata_type(ds))
        # keep local, who knows what some extractors might pull in
        from pkg_resources import iter_entry_points  # delayed heavy import
        extractors = {}
        for ep in iter_entry_points('datalad.metadata.extractors'):
            if ep.name not in sources:
                # not needed here
                continue
            rec = dict(entrypoint=ep)
            if ep.name in extractors:  # pragma: no cover
                # potential conflict
                if extractors[
                        ep.name]['entrypoint'].dist.project_name == 'datalad':
                    # this is OK, just state it is happening
                    lgr.debug('Extractor %s overrides datalad-core variant',
                              ep)
                    extractors[ep.name] = rec
                elif ep.dist.project_name == 'datalad':
                    # also OK
                    lgr.debug('Prefer extractor %s over datalad-core variant',
                              ep)
                else:
                    msg = ('At least two DataLad extensions provide metadata '
                           'extractor %s: %s vs. %s', ep.name, ep.dist,
                           extractors[ep.name]['entrypoint'].dist)
                    if ep.name in sources:
                        # this extractor is required -> blow hard
                        raise RuntimeError(msg[0] % msg[1:])
                    else:
                        # still moan
                        lgr.warn(msg)
                    # ignore the newcomer, is listed second in sys.path
            else:
                # this one is fresh and unique
                extractors[ep.name] = rec
        for msrc in sources:
            if msrc not in extractors:
                # we said that we want to fail, rather than just moan about
                # less metadata
                raise ValueError(
                    "Enabled metadata extractor '{}' not available".format(
                        msrc), )
            # load extractor implementation
            rec = extractors[msrc]
            rec['process_type'] = process_type \
                if process_type and not process_type == 'extractors' \
                else ds.config.obtain(
                    'datalad.metadata.extract-from-{}'.format(
                        msrc.replace('_', '-')),
                    default='all')
            # load the extractor class, no instantiation yet
            try:
                rec['class'] = rec['entrypoint'].load()
            except Exception as e:  # pragma: no cover
                msg = ('Failed %s metadata extraction from %s: %s', msrc, ds,
                       exc_str(e))
                log_progress(lgr.error, 'metadataextractors', *msg)
                raise ValueError(msg[0] % msg[1:])

        res_props = dict(
            action='meta_extract',
            logger=lgr,
        )

        # build report on extractors and their state info
        if process_type == 'extractors':
            for ename, eprops in iteritems(extractors):
                state = {}
                # do not trip over old extractors
                if hasattr(eprops['class'], 'get_state'):
                    state.update(eprops['class']().get_state(ds))

                yield dict(action='meta_extract',
                           path=ds.path,
                           status='ok',
                           logger=lgr,
                           extractor=ename,
                           state=dict(
                               state,
                               process_type=eprops['process_type'],
                           ))
            return

        # build a representation of the dataset's content (incl subds
        # records)
        # go through a high-level command (not just the repo methods) to
        # get all the checks and sanitization of input arguments
        # this call is relatively expensive, but already anticipates
        # demand for information by our core extractors that always run
        # unconditionally, hence no real slowdown here
        # TODO this could be a dict, but MIH cannot think of an access
        # pattern that does not involve iteration over all items
        status = []
        exclude_paths = [
            ds.pathobj / PurePosixPath(e) for e in
            (list(exclude_from_metadata) +
             assure_list(ds.config.get('datalad.metadata.exclude-path', [])))
        ]
        if ds.is_installed():
            # we can make use of status
            res_props.update(refds=ds.path)

            for r in ds.status(
                    # let status sort out all path arg handling
                    # but this will likely make it impossible to use this
                    # command to just process an individual file independent
                    # of a dataset
                    path=path,
                    # it is safe to ask for annex info even when a dataset is
                    # plain Git
                    # NOTE changing to 'annex=availability' has substantial
                    # performance costs, as it involved resolving each annex
                    # symlink on the file-system, which can be really slow
                    # depending on the FS and the number of annexed files
                    annex='basic',
                    # TODO we never want to aggregate metadata from untracked
                    # content, but we might just want to see what we can get
                    # from a file
                    untracked='no',
                    # this command cannot and will not work recursively
                    recursive=False,
                    result_renderer='disabled'):
                # path reports are always absolute and anchored on the dataset
                # (no repo) path
                p = Path(r['path'])
                if p in exclude_paths or \
                        any(e in p.parents for e in exclude_paths):
                    # this needs to be ignored for any further processing
                    continue
                # strip useless context information
                status.append({
                    k: v
                    for k, v in iteritems(r)
                    if (k not in ('refds', 'parentds', 'action',
                                  'status') and not k.startswith('prev_'))
                })

            # determine the commit that we are describing
            refcommit = get_refcommit(ds)
            if refcommit is None or not len(status):
                # this seems extreme, but without a single commit there is
                # nothing we can have, or describe -> blow
                yield dict(
                    res_props,
                    status='error',
                    message=\
                    'No metadata-relevant repository content found. ' \
                    'Cannot determine reference commit for metadata ID',
                    type='dataset',
                    path=ds.path,
                )
                return
            # stamp every result
            res_props['refcommit'] = refcommit
        else:
            # no dataset at hand, take path arg at face value and hope
            # for the best
            # TODO we have to resolve the given path to make it match what
            # status is giving (abspath with ds (not repo) anchor)
            status = [dict(path=p, type='file') for p in assure_list(path)]
            # just for compatibility, mandatory argument list below
            refcommit = None

        if ds.is_installed():
            # check availability requirements and obtain data as needed
            needed_paths = set()
            for rec in extractors.values():
                if hasattr(rec['class'], 'get_required_content'):
                    needed_paths.update(
                        # new extractors do not need any instantiation args
                        s['path'] for s in rec['class']().get_required_content(
                            ds, rec['process_type'], status))
            if needed_paths:
                for r in ds.get(path=needed_paths,
                                return_type='generator',
                                result_renderer='disabled'):
                    if success_status_map.get(
                            r['status'],
                            False) != 'success':  # pragma: no cover
                        # only complain when something goes wrong
                        yield r

        contexts = {}
        nodes_by_context = {}
        try:
            for res in _proc(ds, refcommit, sources, status, extractors,
                             process_type):
                if format == 'native':
                    # that is what we pass around internally
                    res.update(**res_props)
                    yield res
                elif format == 'jsonld':
                    collect_jsonld_metadata(ds.pathobj, res, nodes_by_context,
                                            contexts)
        finally:
            # extractors can come from any source with no guarantee for
            # proper implementation. Let's make sure that we bring the
            # dataset back into a sane state (e.g. no batch processes
            # hanging around). We should do this here, as it is not
            # clear whether extraction results will be saved to the
            # dataset (which would have a similar sanitization effect)
            if ds.repo:
                ds.repo.precommit()
        if format == 'jsonld':
            yield dict(status='ok',
                       type='dataset',
                       path=ds.path,
                       metadata=format_jsonld_metadata(nodes_by_context),
                       **res_props)
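
Status results above are skipped when they fall under an excluded path. A small pathlib-only sketch of that test; names and the excluded paths are assumptions for the example.

from pathlib import Path

def is_excluded(path, exclude_paths):
    p = Path(path)
    # excluded when the path itself, or any of its parents, is on the list
    return p in exclude_paths or any(e in p.parents for e in exclude_paths)

excl = [Path('/ds/.datalad')]
print(is_excluded('/ds/.datalad/config', excl))   # True
print(is_excluded('/ds/code/analysis.py', excl))  # False
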
Esempio n. 20
0
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # we resolve here, because we need to perform inspection on what was given
        # as an input argument further down
        paths = [resolve_path(p, dataset) for p in assure_list(path)]

        ds = require_dataset(dataset, check_installed=True, purpose='pushing')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(res_kwargs,
                       status='error',
                       message="Unknown push target '{}'. {}".format(
                           to, 'Known targets: {}.'.format(', '.join(
                               repr(s) for s in sr))
                           if sr else 'No targets configured in dataset.'))
            return

        if since:
            # will blow with ValueError if unusable
            ds_repo.get_hexsha(since)

        if not since and since is not None:
            # special case: --since=''
            # figure out state of remote branch and set `since`
            since = _get_corresponding_remote_state(ds_repo, to)
            if not since:
                lgr.info("No tracked remote for active branch, "
                         "detection of last pushed state not in effect.")

        # obtain a generator for information on the datasets to process
        # idea is to turn the `paths` argument into per-dataset
        # content listings that can be acted upon
        ds_spec = _datasets_since_(
            # important to pass unchanged dataset arg
            dataset,
            since,
            paths,
            recursive,
            recursion_limit)

        # instead of a loop, this could all be done in parallel
        matched_anything = False
        for dspath, dsrecords in ds_spec:
            matched_anything = True
            lgr.debug('Attempt push of Dataset at %s', dspath)
            pbars = {}
            yield from _push(dspath,
                             dsrecords,
                             to,
                             force,
                             jobs,
                             res_kwargs.copy(),
                             pbars,
                             got_path_arg=True if path else False)
            # take down progress bars for this dataset
            for i, ds in pbars.items():
                log_progress(lgr.info, i, 'Finished push of %s', ds)
        if not matched_anything:
            yield dict(
                res_kwargs,
                status='notneeded',
                message=
                'Given constraints did not match any changes to publish',
                type='dataset',
                path=ds.path,
            )
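
The unknown-target handling above yields an error result instead of raising, so '--on-failure ignore' can skip datasets that do not know the target. A minimal sketch of that check; `known` stands in for the remotes reported by the repository, and the helper is illustrative.

def unknown_target_result(to, known, **res_kwargs):
    if to in known:
        return None
    hint = 'Known targets: {}.'.format(', '.join(repr(s) for s in known)) \
        if known else 'No targets configured in dataset.'
    return dict(res_kwargs, status='error',
                message="Unknown push target '{}'. {}".format(to, hint))

print(unknown_target_result('github', ['origin'], action='publish'))
# {'action': 'publish', 'status': 'error',
#  'message': "Unknown push target 'github'. Known targets: 'origin'."}
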
Esempio n. 21
0
def _push(dspath,
          content,
          target,
          force,
          jobs,
          res_kwargs,
          pbars,
          done_fetch=None,
          got_path_arg=False):
    if not done_fetch:
        done_fetch = set()
    # nothing recursive in here, we only need a repo to work with
    ds = Dataset(dspath)
    repo = ds.repo

    res_kwargs.update(type='dataset', path=dspath)

    # content will be unique for every push (even on the same dataset)
    pbar_id = 'push-{}-{}'.format(target, id(content))
    # register for final orderly take down
    pbars[pbar_id] = ds
    log_progress(
        lgr.info,
        pbar_id,
        'Determine push target',
        unit=' Steps',
        label='Push',
        total=4,
    )
    if not target:
        try:
            # let Git figure out what needs doing
            wannabe_gitpush = repo.push(remote=None, git_options=['--dry-run'])
            # we did not get an explicit push target, get it from Git
            target = set(p.get('remote', None) for p in wannabe_gitpush)
            # handle case where a pushinfo record did not have a 'remote'
            # property -- should not happen, but be robust
            target.discard(None)
        except Exception as e:
            lgr.debug(
                'Dry-run push to determine default push target failed, '
                'assume no configuration: %s', e)
            target = set()
        if not len(target):
            yield dict(
                res_kwargs,
                status='impossible',
                message='No push target given, and none could be '
                'auto-detected, please specify via --to',
            )
            return
        elif len(target) > 1:
            # dunno if this can ever happen, but if it does, report
            # nicely
            yield dict(res_kwargs,
                       status='error',
                       message=(
                           'No push target given, '
                           'multiple candidates auto-detected: %s',
                           list(target),
                       ))
            return
        else:
            # can only be a single one at this point
            target = target.pop()

    if target not in repo.get_remotes():
        yield dict(res_kwargs,
                   status='error',
                   message=("Unknown target sibling '%s'.", target))
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Push refspecs",
                 label="Push to '{}'".format(target),
                 update=1,
                 total=4)

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(target)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = assure_list(ds.config.get(depvar, []))
    if publish_depends:
        lgr.debug("Discovered publication dependencies for '%s': %s'", target,
                  publish_depends)

    # cache repo type
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # TODO prevent this when `target` is a special remote
    # (possibly redo) a push attempt to figure out what needs pushing
    # do this on the main target only, and apply the result to all
    # dependencies
    try:
        wannabe_gitpush = repo.push(remote=target, git_options=['--dry-run'])
    except Exception as e:
        lgr.debug(
            'Dry-run push to check push configuration failed, '
            'assume no configuration: %s', e)
        wannabe_gitpush = []
    refspecs2push = [
        # if an upstream branch is set, go with it
        p['from_ref'] if ds.config.get(
            # refs come in as refs/heads/<branchname>
            # need to cut the prefix
            'branch.{}.remote'.format(p['from_ref'][11:]),
            None) == target
        and ds.config.get('branch.{}.merge'.format(p['from_ref'][11:]), None)
        # if not, define target refspec explicitly to avoid having to
        # set an upstream branch, which would happen implicitly from
        # a user's POV, and may also be hard to decide when publication
        # dependencies are present
        else '{}:{}'.format(p['from_ref'], p['to_ref'])
        for p in wannabe_gitpush
        # TODO: what if a publication dependency doesn't have it yet
        # should we not attempt to push, because the main target has it?
        if 'uptodate' not in p['operations'] and (
            # cannot think of a scenario where we would want to push a
            # managed branch directly, instead of the corresponding branch
            'refs/heads/adjusted' not in p['from_ref'])
    ]
    if not refspecs2push:
        lgr.debug(
            'No refspecs configured for push, attempting to use active branch')
        # nothing was set up for push, push the current branch at minimum
        # TODO this is not right with managed branches
        active_branch = repo.get_active_branch()
        if not active_branch:
            yield dict(
                res_kwargs,
                status='impossible',
                message='There is no active branch, cannot determine remote '
                'branch')
            return
        if is_annex_repo:
            # we could face a managed branch, in which case we need to
            # determine the actual one and make sure it is sync'ed with the
            # managed one, and push that one instead. following methods can
            # be called unconditionally
            repo.localsync(managed_only=True)
            active_branch = repo.get_corresponding_branch(
                active_branch) or active_branch
        refspecs2push.append(
            # same dance as above
            active_branch if ds.config.
            get('branch.{}.merge'.format(active_branch), None
                ) else '{ab}:{ab}'.format(ab=active_branch))

    # we know what to push and where, now dependency processing first
    for r in publish_depends:
        # simply make a call to this function again, all the same, but
        # target is different, pass done_fetch to avoid duplicate
        # and expensive calls to git-fetch
        yield from _push(
            dspath,
            content,
            # to this particular dependency
            r,
            force,
            jobs,
            res_kwargs.copy(),
            pbars,
            done_fetch=None,
            got_path_arg=got_path_arg,
        )

    # and lastly the primary push target
    target_is_git_remote = repo.config.get('remote.{}.url'.format(target),
                                           None) is not None
    # only attempt, if Git knows about a URL, otherwise this is
    # a pure special remote that doesn't deal with the git repo
    if target_is_git_remote:
        # push the main branches of interest first, but not yet (necessarily)
        # the git-annex branch. We want to push first in order to hit any
        # conflicts or unknown history before we move data. Otherwise our
        # decision making done above (--since ...) might have been
        # inappropriate.
        push_ok = True
        for p in _push_refspecs(repo, target, refspecs2push, force,
                                res_kwargs.copy()):
            if p['status'] not in ('ok', 'notneeded'):
                push_ok = False
            yield p
        if not push_ok:
            # error-type results have been yielded, the local status quo is
            # outdated/invalid, stop to let user decide how to proceed.
            # TODO final global error result for the dataset?!
            return

    # git-annex data move
    #
    if not is_annex_repo:
        return

    if force == 'no-datatransfer':
        lgr.debug("Data transfer to '%s' disabled by argument", target)
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Transfer data",
                 label="Transfer data to '{}'".format(target),
                 update=2,
                 total=4)

    yield from _push_data(
        ds,
        target,
        content,
        force,
        jobs,
        res_kwargs.copy(),
        got_path_arg=got_path_arg,
    )

    if not target_is_git_remote:
        # there is nothing that we need to push or sync with on the git-side
        # of things with this remote
        return

    log_progress(lgr.info,
                 pbar_id,
                 "Update availability information",
                 label="Update availability for '{}'".format(target),
                 update=3,
                 total=4)

    # after file transfer the remote might have different commits to
    # the annex branch. They have to be merged locally, otherwise a
    # push of it further down will fail
    try:
        # fetch remote, let annex sync them locally, so that the push
        # later on works.
        # We have to fetch via the push url (if there is any),
        # not a pull url.
        # The latter might be dumb and without the execution of a
        # post-update hook we might not be able to retrieve the
        # server-side git-annex branch updates (and git-annex does
        # not trigger the hook on copy), but we know we have
        # full access via the push url -- we have just used it to copy.
        lgr.debug("Fetch 'git-annex' branch updates from '%s'", target)
        fetch_cmd = ['fetch', target, 'git-annex']
        pushurl = repo.config.get('remote.{}.pushurl'.format(target), None)
        if pushurl:
            # for some reason overwriting remote.{target}.url
            # does not have any effect...
            fetch_cmd = [
                '-c', 'url.{}.insteadof={}'.format(
                    pushurl,
                    repo.config.get('remote.{}.url'.format(target), None))
            ] + fetch_cmd
            lgr.debug("Sync local annex branch from pushurl after remote "
                      'availability update.')
        repo.call_git(fetch_cmd)
        repo.localsync(target)
    except CommandError as e:
        # it is OK if the remote doesn't have a git-annex branch yet
        # (e.g. fresh repo)
        # TODO is this possible? we just copied? Maybe check if anything
        # was actually copied?
        if "fatal: couldn't find remote ref git-annex" not in e.stderr.lower():
            raise
        lgr.debug('Remote does not have a git-annex branch: %s', e)
    # and push the annex branch to announce local availability info
    # too
    yield from _push_refspecs(
        repo,
        target,
        [
            'git-annex' if ds.config.get('branch.git-annex.merge', None) else
            'git-annex:git-annex'
        ],
        force,
        res_kwargs.copy(),
    )
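
Both push variants fetch the git-annex branch through the pushurl when one is configured, by mapping the regular remote URL onto it for the duration of the call. A sketch of that command assembly; `config` stands in for git config lookups, and the helper is illustrative.

def annex_fetch_cmd(target, config):
    cmd = ['fetch', target, 'git-annex']
    pushurl = config.get('remote.{}.pushurl'.format(target))
    if pushurl:
        # rewrite the fetch URL to the pushurl just for this invocation
        cmd = ['-c', 'url.{}.insteadof={}'.format(
            pushurl, config.get('remote.{}.url'.format(target)))] + cmd
    return cmd

cfg = {'remote.origin.url': 'https://example.com/repo',
       'remote.origin.pushurl': 'ssh://example.com/repo'}
print(annex_fetch_cmd('origin', cfg))
# ['-c', 'url.ssh://example.com/repo.insteadof=https://example.com/repo',
#  'fetch', 'origin', 'git-annex']
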
Esempio n. 22
0
    def _get_cnmeta(self, bids):
        # TODO any custom handling of participants infos should eventually
        # be done by pybids in one way or another
        path_props = {}
        participants_fname = opj(self.ds.path, 'participants.tsv')
        if exists(participants_fname):
            try:
                for rx, info in yield_participant_info(bids):
                    path_props[rx] = {'subject': info}
            except Exception as exc:
                if isinstance(exc, ImportError):
                    raise exc
                lgr.warning(
                    "Failed to load participants info due to: %s. Skipping the rest of file",
                    exc_str(exc))

        log_progress(
            lgr.info,
            'extractorbids',
            'Start BIDS metadata extraction from %s',
            self.ds,
            total=len(self.paths),
            label='BIDS metadata extraction',
            unit=' Files',
        )
        # now go over all files in the dataset and query pybids for its take
        # on each of them
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(lgr.info,
                         'extractorbids',
                         'Extract BIDS metadata from %s',
                         absfp,
                         update=1,
                         increment=True)
            # BIDS carries a substantial portion of its metadata in JSON
            # sidecar files. we ignore them here completely
            # this might yield some false-negatives in theory, but
            # this case has not been observed in practice yet, hence
            # doing it cheap for now
            if f.endswith('.json'):
                continue
            md = {}
            try:
                md.update({
                    k: v
                    for k, v in bids.get_metadata(
                        opj(self.ds.path, f), include_entities=True).items()
                    # no nested structures for now (can be monstrous when DICOM
                    # metadata is embedded)
                    if not isinstance(v, dict)
                })
            except ValueError as e:
                lgr.debug(
                    'PyBIDS errored on file %s in %s: %s '
                    '(possibly not BIDS-compliant or not recognized)', f,
                    self.ds, exc_str(e))
                lgr.debug('no usable BIDS metadata for %s in %s: %s', f,
                          self.ds, exc_str(e))
                # do not raise here:
                # https://github.com/datalad/datalad-neuroimaging/issues/34
            except Exception as e:
                lgr.debug('no usable BIDS metadata for %s in %s: %s', f,
                          self.ds, exc_str(e))
                if cfg.get('datalad.runtime.raiseonerror'):
                    raise

            # now check all props from other sources and apply them
            for rx in path_props:
                if rx.match(f):
                    md.update(path_props[rx])
            yield f, md
        log_progress(lgr.info, 'extractorbids',
                     'Finished BIDS metadata extraction from %s', self.ds)
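
The BIDS extractor above keeps only flat, top-level metadata values and attaches participant info whose pattern matches the file path. A short sketch of that merge; `raw_md` stands in for bids.get_metadata(...) output, and all names are illustrative.

import re

def flatten_bids_md(raw_md, path_props, fname):
    # drop nested structures (can be monstrous when DICOM metadata is embedded)
    md = {k: v for k, v in raw_md.items() if not isinstance(v, dict)}
    # apply per-path properties, e.g. participant info keyed by a regex
    for rx, props in path_props.items():
        if rx.match(fname):
            md.update(props)
    return md

props = {re.compile(r'^sub-01/'): {'subject': 'sub-01'}}
print(flatten_bids_md({'RepetitionTime': 2.0, 'global': {'const': {}}},
                      props, 'sub-01/func/sub-01_task-rest_bold.nii.gz'))
# {'RepetitionTime': 2.0, 'subject': 'sub-01'}
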
Esempio n. 23
0
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative from the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    message=message,
                    **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except Exception:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content, so it must not exist prior to running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total=len(extracted_files),
                noninteractive_level=logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s" %
                            (extracted_path, link_path)
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                        Path(key_rpath).name.startswith(Path(
                            extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to next iteration if extracted_file is excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping %s since it matches exclude "
                                    "pattern %s", extracted_file, regexp)
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # and do the same for the original (pre-rename) target
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together a new target file name
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # the numbered-suffix loop below handles this case too
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new =  \
                                Path(p) / Path(file)
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since some times might still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  Explicitly checked to avoid any
            # possible "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
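
The 'archive-suffix'/'numeric-suffix' branch above keeps appending an incrementing numeric suffix in front of the extension until a non-colliding name is found. A minimal standalone sketch of that naming strategy (a hypothetical helper, not part of DataLad's API, with simplified extension handling via os.path.splitext):

import os.path as op


def find_free_name(target_path, archive_basename=None):
    # sketch only: mirror the collision-avoidance loop from the code above
    p, fn = op.split(target_path)
    fn_base, fn_ext = op.splitext(fn)
    if archive_basename:
        # 'archive-suffix' mode: tag the name with the source archive first
        fn_base += '-%s' % archive_basename
    suf, i = '', 0
    while op.lexists(op.join(p, fn_base + suf + fn_ext)):
        i += 1
        suf = '.%d' % i
    return op.join(p, fn_base + suf + fn_ext)

# e.g. find_free_name('data/table.csv') returns 'data/table.csv' if free,
# otherwise 'data/table.1.csv', 'data/table.2.csv', ...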
Example n. 24
0
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths), ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warning(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                           else [p for p, c, a in content_info if not c and a])
            )

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields',
        default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {ep.name: ep for ep in iter_entry_points('datalad.metadata.extractors')}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this '
                'installation' % mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s"
                % (mtype, ds, exc_str(e)))
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message
        #  actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in iteritems(val)}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(
                        v,
                        key=_unique_value_key)] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
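
The extractor lookup above relies on the 'datalad.metadata.extractors' entry-point group. A hedged sketch of how that discovery looks in isolation, plus a purely illustrative registration a third-party package could declare (package and class names are made up):

from pkg_resources import iter_entry_points

# enumerate all metadata extractors visible in the current environment;
# ep.load() returns the extractor class that _get_metadata() instantiates
available = {ep.name: ep
             for ep in iter_entry_points('datalad.metadata.extractors')}
print(sorted(available))

# hypothetical setup.py snippet registering a custom extractor:
#   entry_points={
#       'datalad.metadata.extractors': [
#           'myformat = mypkg.extractor:MyFormatExtractor',
#       ],
#   }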
Example n. 25
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorxmp',
            'Start XMP metadata extraction from %s',
            self.ds,
            total=len(self.paths),
            label='XMP metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(lgr.info,
                         'extractorxmp',
                         'Extract XMP metadata from %s',
                         absfp,
                         update=1,
                         increment=True)
            info = file_to_dict(absfp)
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {
                info[ns][0][0].split(':')[0]: {
                    '@id': ns,
                    'type': vocabulary_id
                }
                for ns in info
            }
            # TODO this is dirty and assumes that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
            # now pull out actual metadata
            # cannot do simple dict comprehension, because we need to beautify things a little

            meta = {}
            for ns in info:
                for key, val, props in info[ns]:
                    if not val:
                        # skip everything empty
                        continue
                    if key.count('[') > 1:
                        # this is a nested array
                        # MIH: I do not think it is worth going here
                        continue
                    if props['VALUE_IS_ARRAY']:
                        # we'll catch the actual array values later
                        continue
                    # normalize value
                    val = assure_unicode(val)
                    # non-breaking space
                    val = val.replace(u"\xa0", ' ')

                    field, idx, qual = xmp_field_re.match(key).groups()
                    normkey = u'{}{}'.format(field, qual)
                    if '/' in key:
                        normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                    if idx:
                        # array
                        arr = meta.get(normkey, [])
                        arr.append(val)
                        meta[normkey] = arr
                    else:
                        meta[normkey] = val
            # compact
            meta = {
                k: v[0] if isinstance(v, list) and len(v) == 1 else v
                for k, v in meta.items()
            }

            contentmeta.append((f, meta))

        log_progress(lgr.info, 'extractorxmp',
                     'Finished XMP metadata extraction from %s', self.ds)
        return {
            '@context': context,
        }, \
            contentmeta
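
The key normalization above expects xmp_field_re to split an XMP key such as 'dc:date[2]/stEvt:when' into a (field, array index, qualifier) triple. The actual pattern is not part of this listing; a plausible stand-in that produces the same groups could look like the following (an assumption, the real regex in DataLad may differ):

import re

# illustrative stand-in for xmp_field_re
xmp_field_re = re.compile(r'^([^\[]+)(?:\[(\d+)\])?(.*)$')

for key in ('dc:creator', 'dc:creator[1]', 'dc:date[2]/stEvt:when'):
    field, idx, qual = xmp_field_re.match(key).groups()
    print(field, idx, qual)
# -> ('dc:creator', None, ''), ('dc:creator', '1', ''), ('dc:date', '2', '/stEvt:when')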
Example n. 26
0
def _yield_ds_w_matching_siblings(
        ds, names, recursive=False, recursion_limit=None):
    """(Recursively) inspect a dataset for siblings with particular name(s)

    Parameters
    ----------
    ds: Dataset
      The dataset to be inspected.
    names: iterable
      Sibling names (str) to test for.
    recursive: bool, optional
      Whether to recurse into subdatasets.
    recursion_limit: int, optional
      Recursion depth limit.

    Yields
    ------
    str, str
      Path to the dataset with a matching sibling, and name of the matching
      sibling in that dataset.
    """

    def _discover_all_remotes(ds, refds, **kwargs):
        """Helper to be run on all relevant datasets via foreach
        """
        # Note that `siblings` doesn't tell us about non-enabled special
        # remotes. There could still be conflicting names we need to know
        # about in order to properly deal with the `existing` switch.

        repo = ds.repo
        # list of known git remotes
        if isinstance(repo, AnnexRepo):
            remotes = repo.get_remotes(exclude_special_remotes=True)
            remotes.extend([v['name']
                            for k, v in repo.get_special_remotes().items()]
                           )
        else:
            remotes = repo.get_remotes()
        return remotes

    if not recursive:
        for name in _discover_all_remotes(ds, ds):
            if name in names:
                yield ds.path, name
        return

    # in recursive mode this check could take a substantial amount of
    # time: employ a progress bar (or rather a counter, because we don't
    # know the total in advance)
    pbar_id = 'check-siblings-{}'.format(id(ds))
    log_progress(
        lgr.info, pbar_id,
        'Start checking pre-existing sibling configuration %s', ds,
        label='Query siblings',
        unit=' Siblings',
    )

    for res in ds.foreach_dataset(
            _discover_all_remotes,
            recursive=recursive,
            recursion_limit=recursion_limit,
            return_type='generator',
            result_renderer='disabled',
    ):
        # unwind result generator
        if 'result' in res:
            for name in res['result']:
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    name, res['path'],
                    update=1,
                    increment=True)
                if name in names:
                    yield res['path'], name

    log_progress(
        lgr.info, pbar_id,
        'Finished checking pre-existing sibling configuration %s', ds,
    )
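
A caller could use this generator to refuse configuring siblings whose names are already taken anywhere in a dataset hierarchy. A hedged usage sketch, assuming `ds` is an installed DataLad Dataset instance:

# illustrative only
conflicts = list(_yield_ds_w_matching_siblings(
    ds, names={'origin', 'github'}, recursive=True))
for path, name in conflicts:
    print('sibling %r already configured in dataset at %s' % (name, path))
if not conflicts:
    print('safe to configure the new siblings')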
Example n. 27
0
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 data='auto-if-wanted',
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # push uses '^' to annotate the previously pushed committish, and None for default
        # behavior. '' was/is (to be deprecated) used in `publish`. Alert user about the mistake
        if since == '':
            raise ValueError("'since' should point to commitish or use '^'.")
        # we resolve here, because we need to perform inspection on what was given
        # as an input argument further down
        paths = [resolve_path(p, dataset) for p in ensure_list(path)]

        ds = require_dataset(dataset, check_installed=True, purpose='push')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(res_kwargs,
                       status='error',
                       path=ds.path,
                       message="Unknown push target '{}'. {}".format(
                           to, 'Known targets: {}.'.format(', '.join(
                               repr(s) for s in sr))
                           if sr else 'No targets configured in dataset.'))
            return
        if since == '^':
            # figure out state of remote branch and set `since`
            since = _get_corresponding_remote_state(ds_repo, to)
            if not since:
                lgr.info("No tracked remote for active branch, "
                         "detection of last pushed state not in effect.")
        elif since:
            # will blow with ValueError if unusable
            ds_repo.get_hexsha(since)

        # obtain a generator for information on the datasets to process
        # idea is to turn the `paths` argument into per-dataset
        # content listings that can be acted upon
        ds_spec = _datasets_since_(
            # important to pass unchanged dataset arg
            dataset,
            since,
            paths,
            recursive,
            recursion_limit)

        # instead of a loop, this could all be done in parallel
        matched_anything = False
        for dspath, dsrecords in ds_spec:
            matched_anything = True
            lgr.debug('Attempt push of Dataset at %s', dspath)
            pbars = {}
            yield from _push(dspath,
                             dsrecords,
                             to,
                             data,
                             force,
                             jobs,
                             res_kwargs.copy(),
                             pbars,
                             got_path_arg=True if path else False)
            # take down progress bars for this dataset
            for i, ds in pbars.items():
                log_progress(lgr.info, i, 'Finished push of %s', ds)
        if not matched_anything:
            potential_remote = False
            if not to and len(paths) == 1:
                # if we get a remote name without --to, provide a hint
                sr = ds_repo.get_remotes(**get_remote_kwargs)
                potential_remote = [p for p in ensure_list(path) if p in sr]
            if potential_remote:
                hint = "{} matches a sibling name and not a path. " \
                      "Forgot --to?".format(potential_remote)
                yield dict(
                    res_kwargs,
                    status='notneeded',
                    message=hint,
                    hints=hint,
                    type='dataset',
                    path=ds.path,
                )
                # there's no matching path and we have generated a hint on
                # fixing the call - we can return now
                return
            yield dict(
                res_kwargs,
                status='notneeded',
                message=
                'Given constraints did not match any changes to publish',
                type='dataset',
                path=ds.path,
            )
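
Because the error handling above emits result records instead of raising, a caller can combine since='^' with on_failure='ignore' and inspect the records itself. A hedged sketch of that calling pattern, assuming `ds` is a Dataset with a configured sibling named 'storage' and that the standard result-record keyword arguments are available:

# illustrative only
for res in ds.push(to='storage', since='^', recursive=True,
                   on_failure='ignore', return_type='generator',
                   result_renderer='disabled'):
    if res['status'] not in ('ok', 'notneeded'):
        print('push problem for %s: %s'
              % (res.get('path'), res.get('message')))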
Example n. 28
0
    def __call__(
            path=None,
            *,
            dataset=None,
            recursive=False,
            recursion_limit=None):
        refds = require_dataset(dataset, check_installed=True,
                                purpose="unlock")

        # Before passing the results to status()
        #   * record explicitly specified non-directory paths so that we can
        #     decide whether to yield a result for reported paths
        #   * filter out and yield results for paths that don't exist
        res_paths_nondir = set()
        paths_lexist = None
        res_paths = list()
        if path:
            # Note that we need unresolved versions of the path input to be
            # passed on to status. See gh-5456 for example.
            path = ensure_list(path)
            res_paths = resolve_path(path, ds=dataset)
            paths_lexist = []
            res_paths_lexist = []
            for p, p_r in zip(path, res_paths):
                if p_r.exists() or p_r.is_symlink():
                    paths_lexist.append(p)
                    res_paths_lexist.append(p_r)
                if not p_r.is_dir():
                    res_paths_nondir.add(p_r)

        res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path)
        if res_paths:
            for p in set(res_paths).difference(set(res_paths_lexist)):
                yield get_status_dict(
                    status="impossible",
                    path=str(p),
                    type="file",
                    message="path does not exist",
                    **res_kwargs)
        if not (paths_lexist or paths_lexist is None):
            return

        # Collect information on the paths to unlock.
        to_unlock = defaultdict(list)  # ds => paths (relative to ds)
        for res in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=paths_lexist,
                untracked="normal" if res_paths_nondir else "no",
                annex="availability",
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer="disabled",
                return_type="generator",
                on_failure="ignore"):
            if res["action"] != "status" or res["status"] != "ok":
                yield res
                continue
            has_content = res.get("has_content")
            if has_content:
                parentds = res["parentds"]
                to_unlock[parentds].append(op.relpath(res["path"], parentds))
            elif res_paths_nondir and Path(res["path"]) in res_paths_nondir:
                if has_content is False:
                    msg = "no content present"
                    status = "impossible"
                elif res["state"] == "untracked":
                    msg = "untracked"
                    status = "impossible"
                else:
                    # This is either a regular git file or an unlocked annex
                    # file.
                    msg = "non-annex file"
                    status = "notneeded"
                yield get_status_dict(
                    status=status,
                    path=res["path"],
                    type="file",
                    message="{}; cannot unlock".format(msg),
                    **res_kwargs)

        # Do the actual unlocking.
        for ds_path, files in to_unlock.items():
            # register for final orderly take down
            pbar_id = f'unlock-{ds_path}'
            nfiles = len(files)
            log_progress(
                lgr.info, pbar_id,
                'Unlocking files',
                unit=' Files',
                label='Unlocking',
                total=nfiles,
                noninteractive_level=logging.INFO,
            )
            ds = Dataset(ds_path)
            for r in ds.repo._call_annex_records_items_(
                    ["unlock"],
                    files=files,
            ):
                log_progress(
                    lgr.info, pbar_id,
                    "Files to unlock %i", nfiles,
                    update=1, increment=True,
                    noninteractive_level=logging.DEBUG)
                nfiles -= 1
                yield get_status_dict(
                    path=op.join(ds.path, r['file']),
                    status='ok' if r['success'] else 'error',
                    type='file',
                    **res_kwargs)
                if nfiles < 1:
                    # git-annex will spend considerable time after the last
                    # file record to finish things up, let this be known
                    log_progress(
                        lgr.info, pbar_id,
                        "Recording unlocked state in git",
                        update=0, increment=True,
                        noninteractive_level=logging.INFO)

            log_progress(
                lgr.info, pbar_id,
                "Completed unlocking files",
                noninteractive_level=logging.INFO)
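
As with push, unlock yields one result record per file, so batch callers typically collect the records rather than render them. A hedged usage sketch, assuming `ds` is an installed Dataset with annexed content under 'derivatives/':

# illustrative only
results = ds.unlock(path='derivatives', recursive=True,
                    on_failure='ignore', return_type='list',
                    result_renderer='disabled')
failed = [r['path'] for r in results if r.get('status') == 'error']
print('%d records, %d failures' % (len(results), len(failed)))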
Example n. 29
0
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab':
        'http://docs.datalad.org/schema_v{}.json'.format(vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warning('{} files have no content present, '
                        'some extractors will not operate on {}'.format(
                            nocontent, 'them' if nocontent > 10 else
                            [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in assure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))
    ]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {
        ep.name: ep
        for ep in iter_entry_points('datalad.metadata.extractors')
    }

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s',
        ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(lgr.info,
                     'metadataextractors',
                     'Engage %s metadata extractor',
                     mtype_key,
                     update=1,
                     increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this '
                'installation' % mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s"
                % (mtype, ds, exc_str(e)))
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s',
                    mtype_key,
                    ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(dsmeta_t,
                                                   maxsize=max_fieldsize,
                                                   blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude",
                                           set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message
        #  actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(meta,
                                           maxsize=max_fieldsize,
                                           blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain('datalad.metadata.generate-unique-{}'.format(
                    mtype_key.replace('_', '-')),
                                default=True,
                                valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {
                        k: _ensure_serializable(v)
                        for k, v in iteritems(val)
                    }
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [
                    _ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)
                ] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s',
        ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
Example n. 30
0
    def __call__(target, dataset=None, opts=None):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='RIA archive export')
        ds_repo = ds.repo

        # TODO remove once datalad 0.12rc7 or later is released
        if not hasattr(ds_repo, 'dot_git'):
            from datalad.support.gitrepo import GitRepo
            ds_repo.dot_git = ds_repo.pathobj / GitRepo.get_git_dir(ds_repo)

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-ria-archive",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ria_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        keypaths = [
            k for k in annex_objs.glob(op.join('**', '*')) if k.is_file()
        ]

        log_progress(
            lgr.info,
            'riaarchiveexport',
            'Start RIA archive export %s',
            ds,
            total=len(keypaths),
            label='RIA archive export',
            unit=' Keys',
        )

        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(lgr.info,
                         'riaarchiveexport',
                         'Export key %s to %s',
                         key,
                         hashdir,
                         update=1,
                         increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            os.link(str(keypath), str(keydir / key))

        log_progress(lgr.info, 'riaarchiveexport',
                     'Finished RIA archive export from %s', ds)
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
                check=True,
            )
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='ok',
                                  **res_kwargs)
        except Exception as e:
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='error',
                                  message=('7z failed: %s', exc_str(e)),
                                  **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
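
The hard-link tree built above mirrors git-annex's two-level hash directories: for every key the export contains <hashdir>/<key>/<key>. A small self-contained sketch of how hashdir and key are derived from an annex object path (the example path is illustrative):

from pathlib import PurePosixPath

# layout: .git/annex/objects/<d1>/<d2>/<key>/<key>
keypath = PurePosixPath(
    '.git/annex/objects/Xq/p0/MD5E-s5--abc.txt/MD5E-s5--abc.txt')
key = keypath.name
hashdir = '/'.join(keypath.parts[-4:-2])
print(hashdir, key)   # -> Xq/p0 MD5E-s5--abc.txt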
Example n. 31
0
File: xmp.py Project: hanke/datalad
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorxmp',
            'Start XMP metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='XMP metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractorxmp',
                'Extract XMP metadata from %s', absfp,
                update=1,
                increment=True)
            info = file_to_dict(absfp)
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info}
            # TODO this is dirty and assumes that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
            # now pull out actual metadata
            # cannot do simple dict comprehension, because we need to beautify things a little

            meta = {}
            for ns in info:
                for key, val, props in info[ns]:
                    if not val:
                        # skip everything empty
                        continue
                    if key.count('[') > 1:
                        # this is a nested array
                        # MIH: I do not think it is worth going here
                        continue
                    if props['VALUE_IS_ARRAY']:
                        # we'll catch the actual array values later
                        continue
                    # normalize value
                    val = assure_unicode(val)
                    # non-breaking space
                    val = val.replace(u"\xa0", ' ')

                    field, idx, qual = xmp_field_re.match(key).groups()
                    normkey = u'{}{}'.format(field, qual)
                    if '/' in key:
                        normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                    if idx:
                        # array
                        arr = meta.get(normkey, [])
                        arr.append(val)
                        meta[normkey] = arr
                    else:
                        meta[normkey] = val
            # compact
            meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items()}

            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractorxmp',
            'Finished XMP metadata extraction from %s', self.ds
        )
        return {
            '@context': context,
        }, \
            contentmeta
Example n. 32
0
    def get_metadata(self, dataset, content):
        imgseries = {}
        imgs = {}
        log_progress(
            lgr.info,
            'extractordicom',
            'Start DICOM metadata extraction from %s',
            self.ds,
            total=len(self.paths),
            label='DICOM metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = op.join(self.ds.path, f)
            log_progress(lgr.info,
                         'extractordicom',
                         'Extract DICOM metadata from %s',
                         absfp,
                         update=1,
                         increment=True)

            if op.basename(f).startswith('PSg'):
                # ignore those dicom files, since they appear to not contain
                # any relevant metadata for image series, but cause trouble
                # (see gh-2210). We might want to change that whenever we get
                # a better understanding of how to deal with those files.
                lgr.debug("Ignoring DICOM file %s", f)
                continue

            try:
                d = dcm.read_file(absfp,
                                  defer_size=1000,
                                  stop_before_pixels=True)
            except InvalidDicomError:
                # we can only ignore
                lgr.debug('"%s" does not look like a DICOM file, skipped', f)
                continue

            if isinstance(d, DicomDir):
                lgr.debug(
                    "%s appears to be a DICOMDIR file. Extraction not yet"
                    " implemented, skipped", f)
                continue

            ddict = None
            if content:
                ddict = _struct2dict(d)
                imgs[f] = ddict
            if d.SeriesInstanceUID not in imgseries:
                # start with a copy of the metadata of the first dicom in a series
                series = _struct2dict(d) if ddict is None else ddict.copy()
                # store directory containing the image series (good for sorted
                # DICOM datasets)
                series_dir = op.dirname(f)
                series[
                    'SeriesDirectory'] = series_dir if series_dir else op.curdir
                series_files = []
            else:
                series, series_files = imgseries[d.SeriesInstanceUID]
                # compare incoming with existing metadata set
                series = {
                    k: series[k]
                    for k in series
                    # only keys that exist and have values that are identical
                    # across all images in the series
                    if _convert_value(getattr(d, k, None)) == series[k]
                }
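                # Illustration (hypothetical values): if the running series
                # record is {'Modality': 'MR', 'InstanceNumber': 1} and the
                # current image reports Modality='MR' but InstanceNumber=2,
                # only 'Modality' survives -- fields that vary across images
                # are dropped from the per-series metadata.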
            series_files.append(f)
            # store
            imgseries[d.SeriesInstanceUID] = (series, series_files)
        log_progress(lgr.info, 'extractordicom',
                     'Finished DICOM metadata extraction from %s', self.ds)

        dsmeta = {
            '@context': context,
            'Series': [info for info, files in imgseries.values()]
        }
        return (
            dsmeta,
            # yield the corresponding series description for each file
            imgs.items() if content else [])
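
The helpers _struct2dict and _convert_value are not included in the snippet above; the following is only a rough sketch, with pydicom, of what such a flattening step could look like -- not the actual DataLad implementation.

from pydicom.multival import MultiValue

def _convert_value(v):
    # reduce pydicom element values to JSON-friendly Python types (sketch)
    if v is None or isinstance(v, (int, float, str)):
        return v
    if isinstance(v, MultiValue):
        return [_convert_value(x) for x in v]
    return str(v)

def _struct2dict(d):
    # flatten a pydicom Dataset into {keyword: value}, skipping private
    # elements (empty keyword) and the pixel payload (sketch)
    return {
        elem.keyword: _convert_value(elem.value)
        for elem in d
        if elem.keyword and elem.keyword != 'PixelData'
    }
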
Example 33
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import get_ds_aggregate_db_locations
        dbloc, db_base_path = get_ds_aggregate_db_locations(self.ds)
        # what is the latest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hexsha(relpath(dbloc, start=self.ds.path))
        # use location common to all index types, they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug(
                    'Search index contains %i documents',
                    idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e)
                )
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass
            except ValueError as e:
                if 'unsupported pickle protocol' in str(e):
                    lgr.warning(
                        "Cannot open existing index %s (%s), will regenerate",
                        index_dir, exc_str(e)
                    )
                else:
                    raise

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pulls this info from a file
        dsinfo = self.ds.metadata(
            get_aggregates=True,
            return_type='list',
            result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural(
                            'document',
                            'documents',
                            idx_size - old_idx_size,
                            include_count=True),
                        old_ds_rpath)
                log_progress(lgr.info, 'autofieldidxbuild',
                             'Indexed dataset at %s', old_ds_rpath,
                             update=1, increment=True)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural(
                    'document',
                    'documents',
                    idx_size - old_idx_size,
                    include_count=True),
                old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        log_progress(
            lgr.info, 'autofieldidxbuild', 'Done building search index')

        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
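
Once self.idx_obj is populated, querying follows standard whoosh usage. A minimal sketch; the field name 'path' is just one of the admin fields added above:

from whoosh.qparser import QueryParser

def query_index(idx_obj, querystr, field='path', limit=20):
    # sketch: parse a query against one field of the schema and return the
    # stored fields of each matching document
    with idx_obj.searcher() as searcher:
        parser = QueryParser(field, schema=idx_obj.schema)
        return [hit.fields() for hit in
                searcher.search(parser.parse(querystr), limit=limit)]
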
Example 34
    def __call__(
            target,
            opts=None,
            *,  # opts is positional but optional in CLI
            dataset=None,
            remote=None,
            annex_wanted=None,
            froms=None,
            missing_content='error',):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(
            dataset, check_installed=True, purpose='export to ORA archive')
        ds_repo = ds.repo

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        froms = ensure_list(froms)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        def expr_to_opts(expr):
            opts = []
            expr = expr.replace('(', ' ( ').replace(')', ' ) ')
            for sub_expr in expr.split(' '):
                if len(sub_expr):
                    if sub_expr in '()':
                        opts.append(f"-{sub_expr}")
                    else:
                        opts.append(f"--{sub_expr}")
            return opts
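
        # Illustration (hypothetical expression): a preferred-content string
        # such as "include=*.nii.gz or (not copies=2)" becomes
        # ['--include=*.nii.gz', '--or', '-(', '--not', '--copies=2', '-)'],
        # ready to be passed to `git annex find` below.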

        find_filters = []
        if remote:
            find_filters = ['-('] + expr_to_opts(ds_repo.get_preferred_content('wanted', remote)) + ['-)']
        if annex_wanted:
            find_filters.extend(expr_to_opts(annex_wanted))
        # git-annex find results need to be deduplicated with a set, as
        # git-annex find will return duplicates if multiple symlinks point to
        # the same key.
        if froms:
            keypaths = set(
                annex_objs.joinpath(k)
                for treeish in froms
                for k in ds_repo.call_annex_items_([
                    'find', *find_filters, f"--branch={treeish}",
                    "--format=${hashdirmixed}${key}/${key}\\n"])
            )
        else:
            keypaths = set(annex_objs.joinpath(k) for k in ds_repo.call_annex_items_([
                'find', *find_filters,
                "--format=${hashdirmixed}${key}/${key}\\n"
            ]))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s', ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        if missing_content == 'continue':
            missing_file_lgr_func = lgr.warning
        elif missing_content == 'ignore':
            missing_file_lgr_func = lgr.debug

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(
                lgr.info,
                'oraarchiveexport',
                'Export key %s to %s', key, hashdir,
                update=1,
                increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except FileNotFoundError as e:
                if missing_content == 'error':
                    raise IOError('Key %s has no content available' % keypath)
                missing_file_lgr_func(
                    'Key %s has no content available',
                    str(keypath))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keypath))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Finished ORA archive export from %s', ds
        )
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
            )
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='ok',
                **res_kwargs)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='error',
                message=('7z failed: %s', ce),
                exception=ce,
                **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
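
For orientation, the keypath.parts[-4]/parts[-3] arithmetic above reproduces git-annex's hashdirmixed object layout inside the archive. A tiny illustration with a made-up key:

from pathlib import Path
import os.path as op

# hypothetical annex object path under the hashdirmixed layout:
#   .git/annex/objects/<h1>/<h2>/<KEY>/<KEY>
keypath = Path('.git/annex/objects/Xk/J3/MD5E-s5--d41d8cd9.dat/MD5E-s5--d41d8cd9.dat')
key = keypath.name
hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
print(hashdir, key)  # Xk/J3 MD5E-s5--d41d8cd9.dat -> stored as Xk/J3/<KEY>/<KEY> in the 7z
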
Example 35
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None, return_type='generator'):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.create(spath,
                                        result_xfm=None,
                                        return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            if save:
                for r in dataset.save(path=files_to_add,
                                      message=msg,
                                      recursive=True):
                    yield r
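
For context, addurls consumes a table of URLs plus Python format templates that reference its columns. A hedged sketch with made-up column names; the {_repindex} placeholder is the one mentioned in the collision message above:

# hypothetical url_file content and matching templates
csv_rows = """\
name,season,url
barry,1,http://example.com/s1/barry.mp4
cherry,2,http://example.com/s2/cherry.mp4
"""
url_format = "{url}"
filename_format = "season{season}/{name}.mp4"
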
Example 36
    def __call__(self, dataset, refcommit, process_type, status):
        # shortcut
        ds = dataset

        log_progress(
            lgr.info,
            'extractorcustom',
            'Start custom metadata extraction from %s',
            ds,
            total=len(status) + 1,
            label='Custom metadata extraction',
            unit=' Files',
        )
        if process_type in ('all', 'content'):
            mfile_expr = _get_fmeta_expr(ds)
            for rec in status:
                log_progress(lgr.info,
                             'extractorcustom',
                             'Extracted custom metadata from %s',
                             rec['path'],
                             update=1,
                             increment=True)
                # build metadata file path
                meta_fpath = _get_fmeta_objpath(ds, mfile_expr, rec)
                if meta_fpath is not None and op.exists(meta_fpath):
                    try:
                        meta = jsonload(text_type(meta_fpath))
                        if isinstance(meta, dict) and meta \
                                and '@id' not in meta:
                            # in case we have a single, top-level
                            # document, and it has no ID: assume that
                            # it describes the file and assign the
                            # datalad file ID
                            meta['@id'] = get_file_id(rec)
                        if meta:
                            yield dict(
                                path=rec['path'],
                                metadata=meta,
                                type=rec['type'],
                                status='ok',
                            )
                    except Exception as e:
                        yield dict(
                            path=rec['path'],
                            type=rec['type'],
                            status='error',
                            message=exc_str(e),
                        )

        if process_type in ('all', 'dataset'):
            for r in _yield_dsmeta(ds):
                yield r
            log_progress(lgr.info,
                         'extractorcustom',
                         'Extracted custom metadata from %s',
                         ds.path,
                         update=1,
                         increment=True)

        log_progress(lgr.info, 'extractorcustom',
                     'Finished custom metadata extraction from %s', ds.path)
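
The only transformation applied to a loaded sidecar document above is the '@id' injection. A standalone illustration with a stubbed get_file_id (the real helper derives the ID from the file's annexed content):

def get_file_id(rec):
    # stub standing in for DataLad's helper
    return 'datalad:{}'.format(rec.get('key', rec['path']))

def attach_id(meta, rec):
    # mirror the logic above: only single, top-level documents without an
    # '@id' get the file ID assigned
    if isinstance(meta, dict) and meta and '@id' not in meta:
        meta['@id'] = get_file_id(rec)
    return meta

print(attach_id({'license': 'CC0'}, {'path': 'sub-01/data.json'}))
# {'license': 'CC0', '@id': 'datalad:sub-01/data.json'}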