Example #1
def get_oracle_db(
        dbserver=None,
        port=1521,
        sid='ORCL',
        credential=None):
    dbserver = dbserver or cfg.obtain('datalad.externals.nda.dbserver',
                                      default=DEFAULT_SERVER)
    # This specific username has access to the 'Image' selection of NDA as of the time of writing
    #username = username \
    #           or cfg.get('externals:nda', 'username',
    #                default='halchenkoy_103924')
    if not credential:
        providers = Providers.from_config_files()
        credential = providers.get_provider(DEFAULT_SERVER).credential

    if not isinstance(credential, dict):
        credential = credential()

    import cx_Oracle   # you must have the beast if you want to access the dark side
    dsnStr = cx_Oracle.makedsn(dbserver, port, sid)
    db = cx_Oracle.connect(user=credential['user'],
                           password=credential['password'],
                           dsn=dsnStr)

    return db
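A minimal usage sketch (not part of the original source), assuming cx_Oracle is installed and an NDA credential is configured for the default provider; the table name below is purely illustrative:

db = get_oracle_db()
try:
    cursor = db.cursor()
    # 'image03' is a hypothetical table name used only for illustration
    cursor.execute("SELECT COUNT(*) FROM image03")
    print(cursor.fetchone())
finally:
    db.close()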
Example #2
    def token(self):
        if self._token is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            provider = providers.get_provider(self.API_URL)
            credential = provider.credential
            self._token = credential().get('token')
        return self._token
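A hedged sketch of the same lazy-token pattern as a self-contained property, assuming a provider with a token-type credential is configured for the (hypothetical) API_URL:

from datalad.downloaders.providers import Providers

class TokenClient:
    API_URL = 'https://api.example.com'  # hypothetical endpoint

    def __init__(self):
        self._token = None

    @property
    def token(self):
        # fetch the credential once and cache the token for later calls
        if self._token is None:
            credential = Providers.from_config_files().get_provider(self.API_URL).credential
            self._token = credential().get('token')
        return self._token

# e.g. headers = {'Authorization': 'token %s' % TokenClient().token}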
Example #3
def get_test_providers(url=None, reload=False):
    """Return reusable instance of our global providers + verify credentials for url"""
    _test_providers = Providers.from_config_files(reload=reload)
    if url is not None:
        # check if we have credentials for the url
        provider = _test_providers.get_provider(url, only_nondefault=True)
        if provider is None or provider.credential is None:
            # no registered provider, or no credential needed -- must be all kosher to access
            pass
        elif not provider.credential.is_known:
            raise SkipTest("This test requires known credentials for %s" % provider.credential.name)
    return _test_providers
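A hedged usage sketch inside a test, assuming an S3 URL for which credentials may or may not be configured (URL and test body are illustrative):

def test_private_s3_access():
    url = 's3://some-private-bucket/key'  # illustrative URL
    providers = get_test_providers(url)   # raises SkipTest unless credentials are known
    downloader = providers.get_provider(url).get_downloader(url)
    # ... exercise e.g. downloader.get_status(url) here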
Example #4
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about http headers etc which could provide
    information for cache/proxy servers for how long to retain etc

    TODO: theoretically it is not network specific at all -- and just a memoize
    pattern, but may be some time we would make it treat headers etc correctly.
    And ATM would support any URL we support via providers/downloaders

    Parameters
    ----------
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for.  <0 - would retain forever.  If None -
       would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:

        fage = (time.time() - os.stat(doc_fname).st_mtime)/(24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s", url, doc_fname)
                with open(doc_fname, 'rb') as f:
                    doc = pickle.load(f)
            except Exception as e:  # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        with open(doc_fname, 'wb') as f:
            pickle.dump(doc, f)
        lgr.debug("stored result of request to '{}' in {}".format(url, doc_fname))
    return doc
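A hedged usage sketch (URL and cache name are illustrative): fetch a document once and reuse the on-disk cache for up to a day:

doc = get_cached_url_content(
    'https://example.com/listing.html',  # illustrative URL
    name='example-listing',
    maxage=1,  # re-download once the cached copy is older than one day
)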
Example #5
def pipeline(dataset_id, url=TOPURL):
    lgr.info("Creating a pipeline for the BALSA dataset %s" % dataset_id)
    annex = Annexificator(create=False,
                          statusdb='json',
                          special_remotes=[ARCHIVES_SPECIAL_REMOTE, DATALAD_SPECIAL_REMOTE],
                          options=["-c",
                                   "annex.largefiles="
                                   "exclude=Makefile and exclude=LICENSE* and exclude=ISSUES*"
                                   " and exclude=CHANGES* and exclude=README*"
                                   " and exclude=*.[mc] and exclude=dataset*.json"
                                   " and exclude=*.txt"
                                   " and exclude=*.json"
                                   " and exclude=*.tsv"
                                   ])

    if not exists("_files"):
        makedirs("_files")

    def splitpath(data):
        data = data.copy()
        fullpath = data.pop('path')
        path = os.path.dirname(fullpath)
        if path:
            data['path'] = path
        data['filename'] = os.path.basename(fullpath)
        yield data

    files_url = opj(url, 'file/show/')  # files_url = https://balsa.wustl.edu/file/show/

    url = opj(url, 'study/')  # url = https://balsa.wustl.edu/study/
    dataset_url = '%sshow/%s' % (url, dataset_id)  # dataset_url = https://balsa.wustl.edu/study/show/[dataset_id]

    balsa = BalsaSupport(repo=annex.repo)

    from datalad.downloaders.providers import Providers
    providers = Providers.from_config_files()
    balsa_downloader = providers.get_provider(url).get_downloader(url)

    def get_disposition_filename(data):
        yield updated(data, {'filename': balsa_downloader.get_status(data['url']).filename})

    # BALSA has no versioning atm, so no changelog either
    return [
        annex.switch_branch('incoming'),
        [
            crawl_url(url),
            [
                assign({'dataset': dataset_id}),
                skip_if({'dataset': 'test study upload'}, re=True),
            ],
            [
                crawl_url(dataset_url),
                [
                    extract_meta,
                    annex,
                ],
                [
                    # canonical tarball
                    a_href_match('.*/download/.*', min_count=1),
                    # TODO:  in case of .zip files we must not use checksum backends
                    #        since those are regenerated irreproducibly each time
                    #        so we should use URL backend with those
                    annex,
                ],
                [
                    a_href_match(files_url, min_count=2),
                    assign({'path': '_files/%(url_text)s'}, interpolate=True),
                    sub({'path': {' / ': '/'}}),
                    splitpath,
                    crawl_url(),
                    a_href_match('.*/download/.*', max_count=1),
                    # so we could use it in our magical function
                    # because get_disposition will override it
                    assign({'target_filename': '%(filename)s'}, interpolate=True),
                    get_disposition_filename,
                    fix_the_filename,
                    annex,
                ],
            ],

        ],
        annex.remove_obsolete(),
        [
            annex.switch_branch('incoming-processed'),
            annex.merge_branch('incoming', one_commit_at_a_time=True, strategy='theirs', commit=False),
            [
               {'loop': True},
               find_files("\.(zip|tgz|tar(\..+)?)$", fail_if_none=True),
               annex.add_archive_content(
                   existing='archive-suffix',
                   strip_leading_dirs=True,
                   leading_dirs_depth=1,
                   delete=True,
                   exclude=[r'(^|%s)\._' % os.path.sep],
               ),
            ],
            balsa.verify_files,
            annex.switch_branch('master'),
            annex.merge_branch('incoming-processed', commit=True, allow_unrelated=True),
            annex.finalize(tag=True),
        ],
        annex.switch_branch('master'),
        annex.finalize(cleanup=True),
    ]
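A hedged sketch isolating the get_disposition_filename idea above: ask the provider-matched downloader for the remote status and read the server-suggested filename (the download URL is illustrative):

from datalad.downloaders.providers import Providers

url = 'https://balsa.wustl.edu/study/'
downloader = Providers.from_config_files().get_provider(url).get_downloader(url)
# get_status() queries the remote and returns an object with a .filename attribute,
# as used by get_disposition_filename above
status = downloader.get_status('https://balsa.wustl.edu/download/1234')  # hypothetical file URL
print(status.filename)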
Example #6
def _ls_s3(loc, fast=False, recursive=False, all_=False, long_=False,
           config_file=None, list_content=False):
    """List S3 bucket content"""
    if loc.startswith('s3://'):
        bucket_prefix = loc[5:]
    else:
        raise ValueError("passed location should be an s3:// url")

    import boto
    from hashlib import md5
    from boto.s3.key import Key
    from boto.s3.prefix import Prefix
    from boto.s3.connection import OrdinaryCallingFormat
    from boto.exception import S3ResponseError
    from ..support.configparserinc import SafeConfigParser  # provides PY2,3 imports

    if '/' in bucket_prefix:
        bucket_name, prefix = bucket_prefix.split('/', 1)
    else:
        bucket_name, prefix = bucket_prefix, None

    if prefix and '?' in prefix:
        ui.message("We do not care about URL options ATM, they get stripped")
        prefix = prefix[:prefix.index('?')]

    ui.message("Connecting to bucket: %s" % bucket_name)
    if config_file:
        config = SafeConfigParser()
        config.read(config_file)
        access_key = config.get('default', 'access_key')
        secret_key = config.get('default', 'secret_key')

        # TODO: remove duplication -- reuse logic within downloaders/s3.py to get connected
        kwargs = {}
        if '.' in bucket_name:
            kwargs['calling_format']=OrdinaryCallingFormat()
        conn = boto.connect_s3(access_key, secret_key, **kwargs)
        try:
            bucket = conn.get_bucket(bucket_name)
        except S3ResponseError as e:
            ui.message("E: Cannot access bucket %s by name" % bucket_name)
            all_buckets = conn.get_all_buckets()
            all_bucket_names = [b.name for b in all_buckets]
            ui.message("I: Found following buckets %s" % ', '.join(all_bucket_names))
            if bucket_name in all_bucket_names:
                bucket = all_buckets[all_bucket_names.index(bucket_name)]
            else:
                raise RuntimeError("E: no bucket named %s thus exiting" % bucket_name)
    else:
        # TODO: expose credentials
        # We don't need any provider here really but only credentials
        from datalad.downloaders.providers import Providers
        providers = Providers.from_config_files()
        provider = providers.get_provider(loc)

        if not provider:
            raise ValueError(
                "Don't know how to deal with this url %s -- no provider defined for %s. "
                "Define a new provider (DOCS: TODO) or specify just s3cmd config file instead for now."
                % (loc, loc)
            )
        downloader = provider.get_downloader(loc)

        # should authenticate etc, and when ready we will ask for a bucket ;)
        bucket = downloader.access(lambda url: downloader.bucket, loc)

    info = []
    for iname, imeth in [
        ("Versioning", bucket.get_versioning_status),
        ("   Website", bucket.get_website_endpoint),
        ("       ACL", bucket.get_acl),
    ]:
        try:
            ival = imeth()
        except Exception as e:
            ival = str(e).split('\n')[0]
        info.append(" {iname}: {ival}".format(**locals()))
    ui.message("Bucket info:\n %s" % '\n '.join(info))

    kwargs = {} if recursive else {'delimiter': '/'}

    ACCESS_METHODS = [
        bucket.list_versions,
        bucket.list
    ]

    prefix_all_versions = None
    got_versioned_list = False
    for acc in ACCESS_METHODS:
        try:
            prefix_all_versions = list(acc(prefix, **kwargs))
            got_versioned_list = acc is bucket.list_versions
            break
        except Exception as exc:
            lgr.debug("Failed to access via %s: %s", acc, exc_str(exc))

    if not prefix_all_versions:
        ui.error("No output was provided for prefix %r" % prefix)
    else:
        max_length = max((len(e.name) for e in prefix_all_versions))
        max_size_length = max((len(str(getattr(e, 'size', 0))) for e in prefix_all_versions))

    results = []
    for e in prefix_all_versions:
        results.append(e)
        if isinstance(e, Prefix):
            ui.message("%s" % (e.name, ),)
            continue

        base_msg = ("%%-%ds %%s" % max_length) % (e.name, e.last_modified)
        if isinstance(e, Key):
            if got_versioned_list and not (e.is_latest or all_):
                lgr.debug(
                    "Skipping Key since not all versions requested: %s", e)
                # Skip this one
                continue
            ui.message(base_msg + " %%%dd" % max_size_length % e.size, cr=' ')
            # OPT: delayed import
            from ..support.s3 import get_key_url
            url = get_key_url(e, schema='http')
            try:
                _ = urlopen(Request(url))
                urlok = "OK"
            except HTTPError as err:
                urlok = "E: %s" % err.code

            try:
                acl = e.get_acl()
            except S3ResponseError as exc:
                acl = exc.code if exc.code in ('AccessDenied',) else str(exc)

            content = ""
            if list_content:
                # IO intensive, make an option finally!
                try:
                    # _ = e.next()[:5]  if we are able to fetch the content
                    kwargs = dict(version_id=e.version_id)
                    if list_content in {'full', 'first10'}:
                        if list_content in 'first10':
                            kwargs['headers'] = {'Range': 'bytes=0-9'}
                        content = repr(e.get_contents_as_string(**kwargs))
                    elif list_content == 'md5':
                        digest = md5()
                        digest.update(e.get_contents_as_string(**kwargs))
                        content = digest.hexdigest()
                    else:
                        raise ValueError(list_content)
                    # content = "[S3: OK]"
                except S3ResponseError as err:
                    content = str(err)
                finally:
                    content = " " + content
            ui.message(
                "ver:%-32s  acl:%s  %s [%s]%s"
                % (getattr(e, 'version_id', None),
                   acl, url, urlok, content)
                if long_ else ''
            )
        else:
            ui.message(base_msg + " " + str(type(e)).split('.')[-1].rstrip("\"'>"))
    return results
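A hedged sketch of the provider-based branch above in isolation: obtain an authenticated boto bucket through whichever provider matches the URL (the bucket URL is illustrative):

from datalad.downloaders.providers import Providers

loc = 's3://openneuro'  # illustrative bucket URL
providers = Providers.from_config_files()
downloader = providers.get_provider(loc).get_downloader(loc)
# access() authenticates if needed and then invokes the supplied callable
bucket = downloader.access(lambda url: downloader.bucket, loc)
print(bucket.get_versioning_status())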
Example #7
def _ls_s3(loc, fast=False, recursive=False, all=False, config_file=None, list_content=False):
    """List S3 bucket content"""
    if loc.startswith('s3://'):
        bucket_prefix = loc[5:]
    else:
        raise ValueError("passed location should be an s3:// url")

    import boto
    from hashlib import md5
    from boto.s3.key import Key
    from boto.s3.prefix import Prefix
    from boto.exception import S3ResponseError
    from ..support.configparserinc import SafeConfigParser  # provides PY2,3 imports

    bucket_name, prefix = bucket_prefix.split('/', 1)

    if '?' in prefix:
        ui.message("We do not care about URL options ATM, they get stripped")
        prefix = prefix[:prefix.index('?')]

    ui.message("Connecting to bucket: %s" % bucket_name)
    if config_file:
        config = SafeConfigParser()
        config.read(config_file)
        access_key = config.get('default', 'access_key')
        secret_key = config.get('default', 'secret_key')

        # TODO: remove duplication -- reuse logic within downloaders/s3.py to get connected
        conn = boto.connect_s3(access_key, secret_key)
        try:
            bucket = conn.get_bucket(bucket_name)
        except S3ResponseError as e:
            ui.message("E: Cannot access bucket %s by name" % bucket_name)
            all_buckets = conn.get_all_buckets()
            all_bucket_names = [b.name for b in all_buckets]
            ui.message("I: Found following buckets %s" % ', '.join(all_bucket_names))
            if bucket_name in all_bucket_names:
                bucket = all_buckets[all_bucket_names.index(bucket_name)]
            else:
                raise RuntimeError("E: no bucket named %s thus exiting" % bucket_name)
    else:
        # TODO: expose credentials
        # We don't need any provider here really but only credentials
        from datalad.downloaders.providers import Providers
        providers = Providers.from_config_files()
        provider = providers.get_provider(loc)
        if not provider:
            raise ValueError("don't know how to deal with this url %s -- no downloader defined.  Specify just s3cmd config file instead")
        bucket = provider.authenticator.authenticate(bucket_name, provider.credential)

    info = []
    for iname, imeth in [
        ("Versioning", bucket.get_versioning_status),
        ("   Website", bucket.get_website_endpoint),
        ("       ACL", bucket.get_acl),
    ]:
        try:
            ival = imeth()
        except Exception as e:
            ival = str(e).split('\n')[0]
        info.append(" {iname}: {ival}".format(**locals()))
    ui.message("Bucket info:\n %s" % '\n '.join(info))

    kwargs = {} if recursive else {'delimiter': '/'}
    prefix_all_versions = list(bucket.list_versions(prefix, **kwargs))

    if not prefix_all_versions:
        ui.error("No output was provided for prefix %r" % prefix)
    else:
        max_length = max((len(e.name) for e in prefix_all_versions))
    for e in prefix_all_versions:
        if isinstance(e, Prefix):
            ui.message("%s" % (e.name, ),)
            continue
        ui.message(("%%-%ds %%s" % max_length) % (e.name, e.last_modified), cr=' ')
        if isinstance(e, Key):
            if not (e.is_latest or all):
                # Skip this one
                continue
            url = get_key_url(e, schema='http')
            try:
                _ = urlopen(Request(url))
                urlok = "OK"
            except HTTPError as err:
                urlok = "E: %s" % err.code

            try:
                acl = e.get_acl()
            except S3ResponseError as err:
                acl = err.message

            content = ""
            if list_content:
                # IO intensive, make an option finally!
                try:
                    # _ = e.next()[:5]  if we are able to fetch the content
                    kwargs = dict(version_id=e.version_id)
                    if list_content in {'full', 'first10'}:
                        if list_content in 'first10':
                            kwargs['headers'] = {'Range': 'bytes=0-9'}
                        content = repr(e.get_contents_as_string(**kwargs))
                    elif list_content == 'md5':
                        digest = md5()
                        digest.update(e.get_contents_as_string(**kwargs))
                        content = digest.hexdigest()
                    else:
                        raise ValueError(list_content)
                    #content = "[S3: OK]"
                except S3ResponseError as err:
                    content = err.message
                finally:
                    content = " " + content

            ui.message("ver:%-32s  acl:%s  %s [%s]%s" % (e.version_id, acl, url, urlok, content))
        else:
            if all:
                ui.message("del")
#!/usr/bin/env python3

import tqdm
from datalad.distribution.dataset import Dataset
import os.path as op
from time import sleep

from datalad.downloaders.providers import Providers
providers = Providers.from_config_files()
shub_downloader = providers.get_provider('shub://doesnot/matter').get_downloader('shub://doesnot/matter')

ds = Dataset(op.dirname(op.dirname(__file__)))

repo = ds.repo

containers = [s for s in ds.config.sections() if s.startswith('datalad.containers.')]

# we need to disable datalad remove due to
# https://git-annex.branchable.com/bugs/rmurl_marks_url_not_available_in_wrong_remote/?updated
reenable_datalad = False
if 'datalad' not in repo.get_remotes():
    if 'datalad' in repo.get_special_remotes():
        repo.call_annex(['enableremote', 'datalad'])
    else:
        repo.call_annex(['initremote', 'datalad', 'externaltype=datalad', 'type=external', 'encryption=none', 'autoenable=true'])

for c in containers:
    updateurl = ds.config.get_value(c, 'updateurl')
    image = ds.config.get_value(c, 'image')
    print(f"{image}: {updateurl}")
    try:
Example #9
    def __call__(self, data):

        stats = data.get('datalad_stats', None)
        url = "s3://%s" % self.bucket
        if self.prefix:
            url += "/" + self.prefix.lstrip('/')
        providers = Providers.from_config_files()
        downloader = providers.get_provider(url).get_downloader(url)

        # bucket = provider.authenticator.authenticate(bucket_name, provider.credential)
        try:
            _ = downloader.get_status(
                url)  # just to authenticate and establish connection
        except TargetFileAbsent as exc:
            lgr.debug(
                "Initial URL %s did not lead to something the downloader could fetch: %s",
                url, exc_str(exc))
        bucket = downloader.bucket
        assert (bucket is not None)

        if self.repo:
            versions_db = SingleVersionDB(self.repo)
            prev_version = versions_db.version
        else:
            prev_version, versions_db = None, None

        # TODO:  we could probably use headers to limit from previously crawled last-modified
        # for now will be inefficient -- fetch all, sort, proceed
        kwargs = {} if self.recursive else {'delimiter': '/'}
        all_versions = (bucket.list_versions if self.versioned else
                        bucket.list)(self.prefix, **kwargs)

        # Comparison becomes tricky since, as in our test bucket, we can have a collection
        # of rapid changes within the same ms, so entries cannot be sorted by last_modified alone.
        # We resolve ties by whether an entry is marked latest, whether its version id is non-null
        # (as could happen originally), and by placing a Delete after the creation it follows.
        # In real life last_modified should be enough, but life can be as tough as we made it for 'testing'
        def kf(k, f):
            """Some elements, such as Prefix wouldn't have any of attributes to sort by"""
            return getattr(k, f, '')

        # So ATM it would sort Prefixes first, but that is not necessarily correct...
        # Theoretically the only way to sort Prefixes with the rest is to traverse that Prefix
        # and take the latest last_modified there, but that is expensive, so -- big TODO if ever ;)
        # ACTUALLY -- maybe there is an API call to return entries sorted by last_modified; then we
        # would need only a single entry in the result to determine last_modified for the Prefix, thus TODO
        cmp = lambda k: (kf(k, 'last_modified'), k.name, kf(k, 'is_latest'),
                         kf(k, 'version_id') != 'null',
                         isinstance(k, DeleteMarker))

        versions_sorted = sorted(all_versions,
                                 key=cmp)  # attrgetter('last_modified'))
        # print '\n'.join(map(str, [cmp(k) for k in versions_sorted]))

        version_fields = ['last-modified', 'name', 'version-id']

        def get_version_cmp(k):
            # this one will return the actual version_id so we could uniquely identify
            return kf(k, 'last_modified'), k.name, kf(k, 'version_id')

        if prev_version:
            last_modified_, name_, version_id_ = [
                prev_version[f] for f in version_fields
            ]
            # roll forward until we get to the element > this
            # to not breed list copies
            for i, k in enumerate(versions_sorted):
                lm, n, vid = get_version_cmp(k)
                if lm > last_modified_:
                    start = i
                    break
                elif lm == last_modified_:
                    # go by name/version_id to be matched and then switch to the next one
                    if (n, vid) == (name_, version_id_):
                        start = i + 1  # from the next one
                        if stats:
                            stats.increment('skipped')
                        break
                stats.increment('skipped')
            versions_sorted = versions_sorted[start:]

        # a set of items which we have already seen/yielded so hitting any of them again
        # would mean conflict/versioning is necessary since two actions came for the same item
        staged = set()
        strategy = self.strategy
        e_prev = None
        ncommits = self.ncommits or 0

        # adding None so we could deal with the last commit within the loop without duplicating
        # logic later outside
        def update_versiondb(e, force=False):
            # this way we could recover easier after a crash
            # TODO: config crawl.crawl_s3.versiondb.saveaftereach=True
            if e is not None and (force or True):
                versions_db.version = dict(
                    zip(version_fields, get_version_cmp(e)))

        for e in versions_sorted + [None]:
            filename = e.name if e is not None else None
            if (self.strip_prefix and self.prefix):
                filename = _strip_prefix(filename, self.prefix)
            if filename and self.exclude and re.search(self.exclude, filename):
                stats.skipped += 1
                continue

            if filename in staged or e is None:
                # we should finish this one and commit
                if staged:
                    if self.versionfx and e_prev is not None:
                        version = self.versionfx(e_prev)
                        if version is not None and version not in stats.versions:
                            stats.versions.append(version)
                    if versions_db:
                        # save current "version" DB so we would know where to pick up from
                        # upon next rerun.  Record should contain
                        # last_modified, name, versionid
                        # TODO?  what if e_prev was a DeleteMarker???
                        update_versiondb(e_prev, force=True)
                    if strategy == 'commit-versions':
                        yield updated(data, {'datalad_action': 'commit'})
                        if self.ncommits:
                            ncommits += 1
                            if self.ncommits <= ncommits:
                                lgr.debug(
                                    "Interrupting on %dth commit since asked to do %d",
                                    ncommits, self.ncommits)
                                break
                    staged.clear()
                if e is None:
                    break  # we are done
            if filename:
                # might be empty if e.g. it was the self.prefix directory removed
                staged.add(filename)
            if isinstance(e, Key):
                if e.name.endswith('/'):
                    # signals a directory for which we don't care explicitly (git doesn't -- we don't! ;) )
                    continue
                url = get_key_url(e,
                                  schema=self.url_schema,
                                  versioned=self.versioned)
                # generate and pass along the status right away since we can
                yield updated(
                    data, {
                        'url':
                        url,
                        'url_status':
                        S3Downloader.get_key_status(e, dateformat='iso8601'),
                        'filename':
                        filename,
                        'datalad_action':
                        'annex',
                    })
                update_versiondb(e)
            elif isinstance(e, DeleteMarker):
                if strategy == 'commit-versions':
                    # Since git doesn't care about empty directories, this makes sense only
                    # when the DeleteMarker does not point to a subdirectory and the filename
                    # is not empty (as it would be if the original prefix directory was removed)
                    if filename and not filename.endswith('/'):
                        yield updated(data, {
                            'filename': filename,
                            'datalad_action': 'remove'
                        })
                    else:
                        # The situation there is much trickier since it seems that a "directory"
                        # could also be a key itself and be created/removed, which somewhat
                        # interferes with all our logic here
                        # For an interesting example see
                        #  s3://openneuro/ds000217/ds000217_R1.0.0/compressed
                        lgr.info("Ignoring DeleteMarker for %s", filename)

                update_versiondb(e)
            elif isinstance(e, Prefix):
                # so  we were provided a directory (in non-recursive traversal)
                assert (not self.recursive)
                yield updated(
                    data, {
                        'url': url,
                        'filename': filename.rstrip('/'),
                        'datalad_action': 'directory',
                    })
            else:
                raise ValueError("Don't know how to treat %s" % e)
            e_prev = e
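A hedged illustration of the tie-breaking comparator described in the comments above, using synthetic records: for identical last_modified values, entries sort by name, then by being marked latest, then by having a non-null version id, with a delete ordered after the creation it follows:

from collections import namedtuple

Rec = namedtuple('Rec', 'last_modified name is_latest version_id is_delete')
recs = [
    Rec('2020-01-01T00:00:00.000Z', 'a.txt', True, 'v2', True),      # delete marker
    Rec('2020-01-01T00:00:00.000Z', 'a.txt', False, 'null', False),  # original upload
    Rec('2020-01-01T00:00:00.000Z', 'a.txt', True, 'v2', False),     # re-upload
]
key = lambda k: (k.last_modified, k.name, k.is_latest,
                 k.version_id != 'null', k.is_delete)
for r in sorted(recs, key=key):
    print(r)  # original upload, then re-upload, then delete marker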
Example #10
def _ls_s3(loc,
           fast=False,
           recursive=False,
           all_=False,
           long_=False,
           config_file=None,
           list_content=False):
    """List S3 bucket content"""
    if loc.startswith('s3://'):
        bucket_prefix = loc[5:]
    else:
        raise ValueError("passed location should be an s3:// url")

    import boto
    from hashlib import md5
    from boto.s3.key import Key
    from boto.s3.prefix import Prefix
    from boto.exception import S3ResponseError
    from ..support.configparserinc import SafeConfigParser  # provides PY2,3 imports

    if '/' in bucket_prefix:
        bucket_name, prefix = bucket_prefix.split('/', 1)
    else:
        bucket_name, prefix = bucket_prefix, None

    if prefix and '?' in prefix:
        ui.message("We do not care about URL options ATM, they get stripped")
        prefix = prefix[:prefix.index('?')]

    ui.message("Connecting to bucket: %s" % bucket_name)
    if config_file:
        config = SafeConfigParser()
        config.read(config_file)
        access_key = config.get('default', 'access_key')
        secret_key = config.get('default', 'secret_key')

        # TODO: remove duplication -- reuse logic within downloaders/s3.py to get connected
        conn = boto.connect_s3(access_key, secret_key)
        try:
            bucket = conn.get_bucket(bucket_name)
        except S3ResponseError as e:
            ui.message("E: Cannot access bucket %s by name" % bucket_name)
            all_buckets = conn.get_all_buckets()
            all_bucket_names = [b.name for b in all_buckets]
            ui.message("I: Found following buckets %s" %
                       ', '.join(all_bucket_names))
            if bucket_name in all_bucket_names:
                bucket = all_buckets[all_bucket_names.index(bucket_name)]
            else:
                raise RuntimeError("E: no bucket named %s thus exiting" %
                                   bucket_name)
    else:
        # TODO: expose credentials
        # We don't need any provider here really but only credentials
        from datalad.downloaders.providers import Providers
        providers = Providers.from_config_files()
        provider = providers.get_provider(loc)

        if not provider:
            raise ValueError(
                "Don't know how to deal with this url %s -- no provider defined for %s. "
                "Define a new provider (DOCS: TODO) or specify just s3cmd config file instead for now."
                % (loc, loc))
        downloader = provider.get_downloader(loc)

        # should authenticate etc, and when ready we will ask for a bucket ;)
        bucket = downloader.access(lambda url: downloader.bucket, loc)

    info = []
    for iname, imeth in [
        ("Versioning", bucket.get_versioning_status),
        ("   Website", bucket.get_website_endpoint),
        ("       ACL", bucket.get_acl),
    ]:
        try:
            ival = imeth()
        except Exception as e:
            ival = str(e).split('\n')[0]
        info.append(" {iname}: {ival}".format(**locals()))
    ui.message("Bucket info:\n %s" % '\n '.join(info))

    kwargs = {} if recursive else {'delimiter': '/'}

    ACCESS_METHODS = [bucket.list_versions, bucket.list]

    prefix_all_versions = None
    for acc in ACCESS_METHODS:
        try:
            prefix_all_versions = list(acc(prefix, **kwargs))
            break
        except Exception as exc:
            lgr.debug("Failed to access via %s: %s", acc, exc_str(exc))

    if not prefix_all_versions:
        ui.error("No output was provided for prefix %r" % prefix)
    else:
        max_length = max((len(e.name) for e in prefix_all_versions))
        max_size_length = max(
            (len(str(getattr(e, 'size', 0))) for e in prefix_all_versions))

    results = []
    for e in prefix_all_versions:
        results.append(e)
        if isinstance(e, Prefix):
            ui.message("%s" % (e.name, ), )
            continue

        base_msg = ("%%-%ds %%s" % max_length) % (e.name, e.last_modified)
        if isinstance(e, Key):
            if not (e.is_latest or all_):
                # Skip this one
                continue
            ui.message(base_msg + " %%%dd" % max_size_length % e.size, cr=' ')
            # OPT: delayed import
            from ..support.s3 import get_key_url
            url = get_key_url(e, schema='http')
            try:
                _ = urlopen(Request(url))
                urlok = "OK"
            except HTTPError as err:
                urlok = "E: %s" % err.code

            try:
                acl = e.get_acl()
            except S3ResponseError as err:
                acl = err.message

            content = ""
            if list_content:
                # IO intensive, make an option finally!
                try:
                    # _ = e.next()[:5]  if we are able to fetch the content
                    kwargs = dict(version_id=e.version_id)
                    if list_content in {'full', 'first10'}:
                        if list_content in 'first10':
                            kwargs['headers'] = {'Range': 'bytes=0-9'}
                        content = repr(e.get_contents_as_string(**kwargs))
                    elif list_content == 'md5':
                        digest = md5()
                        digest.update(e.get_contents_as_string(**kwargs))
                        content = digest.hexdigest()
                    else:
                        raise ValueError(list_content)
                    # content = "[S3: OK]"
                except S3ResponseError as err:
                    content = err.message
                finally:
                    content = " " + content
            if long_:
                ui.message("ver:%-32s  acl:%s  %s [%s]%s" %
                           (e.version_id, acl, url, urlok, content))
            else:
                ui.message('')
        else:
            ui.message(base_msg + " " +
                       str(type(e)).split('.')[-1].rstrip("\"'>"))
    return results
Example #11
    def __init__(self, annex, **kwargs):
        super().__init__(annex)

        self._providers = Providers.from_config_files()
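A hedged sketch of how such a class might later use the stored providers to retrieve content for an arbitrary URL; the method name is hypothetical, and Providers.fetch routes the request through whichever configured provider matches:

    def _fetch_url(self, url):
        # delegate to the provider that matches this URL (see Example #4's use of providers.fetch)
        return self._providers.fetch(url)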