Example #1
def test_get_versioned_url():
    get_test_providers(
        's3://openfmri/tarballs'
    )  # verify we have credentials to access openfmri via S3
    for url_pref in ('http://openfmri.s3.amazonaws.com',
                     'https://s3.amazonaws.com/openfmri'):
        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")

        eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?param=1"),
            url_pref + "/tarballs/ds001_raw.tgz?param=1&versionId=null")

        # We don't duplicate the version if it already exists.
        eq_(
            get_versioned_url(url_pref +
                              "/tarballs/ds001_raw.tgz?versionId=null"),
            url_pref + "/tarballs/ds001_raw.tgz?versionId=null")

    # something is wrong there
    #print(get_versioned_url("http://openfmri.s3.amazonaws.com/ds001/demographics.txt"))

    eq_(get_versioned_url("someurl"),
        "someurl")  # should just return original one
    assert_raises(RuntimeError,
                  get_versioned_url,
                  "someurl",
                  guarantee_versioned=True)

    # TODO: on a bucket without versioning
    url = "http://datalad-test0-nonversioned.s3.amazonaws.com/2versions-removed-recreated.txt"
    eq_(get_versioned_url(url), url)
    eq_(get_versioned_url(url, return_all=True), [url])

    assert_raises(NotImplementedError, get_versioned_url, "s3://buga")

    urls = get_versioned_url(
        "http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt",
        return_all=True,
        verify=True)
    eq_(len(set(urls)), len(urls))  # all unique
    for url in urls:
        # make sure we didn't grab other files sharing the same prefix
        ok_startswith(
            url,
            'http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt?versionId='
        )

    # Update a versioned URL with a newer version tag.
    url_3ver = "http://datalad-test0-versioned.s3.amazonaws.com/3versions-allversioned.txt"
    url_3ver_input = url_3ver + "?versionId=b.qCuh7Sg58VIYj8TVHzbRS97EvejzEl"
    eq_(get_versioned_url(url_3ver_input), url_3ver_input)
    eq_(get_versioned_url(url_3ver_input, update=True),
        url_3ver + "?versionId=Kvuind11HZh._dCPaDAb0OY9dRrQoTMn")
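
The test above pins down the observable behavior of get_versioned_url. The sketch below restates that contract as a minimal usage example; it relies only on behaviors exercised in the test, and the import path datalad.support.s3 is an assumption about where the function lives.

# Minimal usage sketch of get_versioned_url (assumed to live in
# datalad.support.s3); the URLs are illustrative.
from datalad.support.s3 import get_versioned_url

url = "http://openfmri.s3.amazonaws.com/tarballs/ds001_raw.tgz"

# A plain S3 key URL comes back with a ?versionId=... query appended.
versioned = get_versioned_url(url)

# Existing query parameters are preserved; versionId is appended after them.
with_param = get_versioned_url(url + "?param=1")

# Non-S3 URLs pass through unchanged ...
assert get_versioned_url("someurl") == "someurl"

# ... unless a versioned URL is explicitly required, in which case it raises.
try:
    get_versioned_url("someurl", guarantee_versioned=True)
except RuntimeError:
    pass  # no versioned form exists for a non-S3 location

# return_all=True returns a list with one URL per stored version of the key.
all_versions = get_versioned_url(url, return_all=True)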
Example #2
def test_version_url_deleted():
    get_test_providers('s3://datalad-test0-versioned/', reload=True)  # verify we have credentials to access datalad-test0-versioned via S3
    # the file existed and then was removed
    fpath = "1version-removed.txt"
    url = "http://datalad-test0-versioned.s3.amazonaws.com/%s" % fpath
    turl = "http://datalad-test0-versioned.s3.amazonaws.com/%s" \
           "?versionId=eZ5Hgwo8azfBv3QT7aW9dmm2sbLUY.QP" % fpath
    eq_(get_versioned_url(url), turl)
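
A small sketch of the property this test asserts, assuming the same get_versioned_url as above: a key that has been removed from a versioned bucket still resolves to a versionId-qualified URL for its last stored version. The key name below matches the one used in the test.

# Sketch: a deleted key in a versioned bucket still yields a versioned URL.
from datalad.support.s3 import get_versioned_url

deleted = "http://datalad-test0-versioned.s3.amazonaws.com/1version-removed.txt"
resolved = get_versioned_url(deleted)
assert resolved.startswith(deleted + "?versionId=")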
Example #3
def test_get_versioned_url_anon():
    # The one without any authenticator was crashing.
    # It also triggered another bug caused by a '.' in the bucket name.
    url_on = "http://openneuro.org.s3.amazonaws.com/ds000001/dataset_description.json"
    url_on_versioned = get_versioned_url(url_on)
    ok_startswith(url_on_versioned, url_on + "?versionId=")
Example #4
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.add import Add
        from datalad.distribution.create import Create
        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator',
                                    save=save):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.create(spath,
                                        result_xfm=None,
                                        return_type='generator',
                                        save=save):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

            msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
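
The version_urls branch above is where addurls combines get_versioned_url with error handling before handing URLs to git-annex. Pulled out on its own, the pattern looks roughly like the helper below; pre_version_urls is a hypothetical name, and the row structure (a dict with a "url" key) mirrors the rows built by extract above.

# Hypothetical helper extracting the URL-versioning pattern used above:
# pin each URL to a specific S3 version where possible, otherwise keep the
# original URL and log a warning.
import logging

from datalad.support.s3 import get_versioned_url

lgr = logging.getLogger("datalad.plugin.addurls")

def pre_version_urls(rows):
    """Replace row['url'] with a versioned URL where possible (sketch)."""
    for row in rows:
        url = row["url"]
        try:
            row["url"] = get_versioned_url(url)
        except (ValueError, NotImplementedError) as exc:
            # get_versioned_url returns non-S3 URLs unchanged; it raises only
            # when an S3-looking URL cannot be handled.
            lgr.warning("error getting version of %s: %s", url, exc)
    return rows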
Example #5
    def __call__(dataset, urlfile, urlformat, filenameformat,
                 input_type="ext", exclude_autometa=None, meta=None,
                 message=None, dry_run=False, fast=False, ifexists=None,
                 missing_value=None, save=True, version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type,
                                         url_format, filename_format,
                                         exclude_autometa, meta,
                                         dry_run,
                                         missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s",
                         row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info("Metadata: %s",
                         sorted(u"{}={}".format(k, v)
                                for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator'):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning(
                    "Not creating subdataset at existing path: %s",
                    spath)
            else:
                for r in dataset.create(spath, result_xfm=None,
                                        return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({"filename_abs": filename_abs,
                        "ds": ds_current,
                        "ds_filename": ds_filename})

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info, "addurls_versionurls",
                         "Versioning %d URLs", num_urls,
                         label="Versioning URLs",
                         total=num_urls, unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s",
                                row["url"], exc_str(exc))
                log_progress(lgr.info, "addurls_versionurls",
                             "Versioned result for %s: %s", url, row["url"],
                             update=1, increment=True)
            log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

            msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            if save:
                for r in dataset.save(path=files_to_add, message=msg, recursive=True):
                    yield r