def init_remote_repo(path, ssh, shared, dataset, description=None):
    cmd = "git -C {} init{}".format(
        sh_quote(path),
        " --shared='{}'".format(sh_quote(shared)) if shared else '')
    try:
        ssh(cmd)
    except CommandError as e:
        lgr.error("Initialization of remote git repository failed at %s."
                  "\nError: %s\nSkipping ..." % (path, exc_str(e)))
        return False

    if isinstance(dataset.repo, AnnexRepo):
        # init remote git annex repo (part fix of #463)
        try:
            ssh(
                "git -C {} annex init {}".format(
                    sh_quote(path),
                    sh_quote(description) if description else '')
            )
        except CommandError as e:
            lgr.error("Initialization of remote git annex repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False
    return True
def _describe_extensions():
    infos = {}
    from pkg_resources import iter_entry_points
    from importlib import import_module

    for e in iter_entry_points('datalad.extensions'):
        info = {}
        infos[e.name] = info
        try:
            ext = e.load()
            info['load_error'] = None
            info['description'] = ext[0]
            info['module'] = e.module_name
            mod = import_module(e.module_name, package='datalad')
            info['version'] = getattr(mod, '__version__', None)
        except Exception as e:
            info['load_error'] = exc_str(e)
            continue
        info['entrypoints'] = entry_points = {}
        for ep in ext[1]:
            ep_info = {
                'module': ep[0],
                'class': ep[1],
                'names': ep[2:],
            }
            entry_points['{}.{}'.format(*ep[:2])] = ep_info
            try:
                import_module(ep[0], package='datalad')
                ep_info['load_error'] = None
            except Exception as e:
                ep_info['load_error'] = exc_str(e)
                continue
    return infos
def _handle_exception(e, bucket_name):
    """Helper to handle S3 connection exception"""
    if e.error_code == 'AccessDenied':
        raise AccessDeniedError(exc_str(e))
    else:
        raise DownloadError(
            "Cannot connect to %s S3 bucket. Exception: %s"
            % (bucket_name, exc_str(e)))
def ensure_initialized(self):
    """Ensures that the manager is initialized - knows socket_dir, previous connections
    """
    if self._socket_dir is not None:
        return
    from datalad import cfg
    self._socket_dir = Path(cfg.obtain('datalad.locations.sockets'))
    self._socket_dir.mkdir(exist_ok=True, parents=True)
    try:
        os.chmod(str(self._socket_dir), 0o700)
    except OSError as exc:
        lgr.warning(
            "Failed to (re)set permissions on the %s. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc)
        )

    try:
        self._prev_connections = [p
                                  for p in self.socket_dir.iterdir()
                                  if not p.is_dir()]
    except OSError as exc:
        self._prev_connections = []
        lgr.warning(
            "Failed to list %s for existing sockets. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc)
        )

    lgr.log(5, "Found %d previous connections", len(self._prev_connections))
def assure_initialized(self):
    """Assures that the manager is initialized - knows socket_dir, previous connections
    """
    if self._socket_dir is not None:
        return
    from ..config import ConfigManager
    from os import chmod
    cfg = ConfigManager()
    self._socket_dir = opj(cfg.obtain('datalad.locations.cache'), 'sockets')
    assure_dir(self._socket_dir)
    try:
        chmod(self._socket_dir, 0o700)
    except OSError as exc:
        lgr.warning(
            "Failed to (re)set permissions on the %s. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc))

    from os import listdir
    from os.path import isdir
    try:
        self._prev_connections = [
            opj(self.socket_dir, p)
            for p in listdir(self.socket_dir)
            if not isdir(opj(self.socket_dir, p))
        ]
    except OSError as exc:
        self._prev_connections = []
        lgr.warning(
            "Failed to list %s for existing sockets. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc))

    lgr.log(5, "Found %d previous connections", len(self._prev_connections))
def get_bucket(conn, bucket_name):
    """A helper to get a bucket

    Parameters
    ----------
    bucket_name: str
        Name of the bucket to connect to
    """
    try:
        bucket = conn.get_bucket(bucket_name)
    except S3ResponseError as e:
        # can initially deny or error to connect to the specific bucket by name,
        # and we would need to list which buckets are available under following
        # credentials:
        lgr.debug("Cannot access bucket %s by name: %s",
                  bucket_name, exc_str(e))
        try:
            all_buckets = conn.get_all_buckets()
        except S3ResponseError as e2:
            lgr.debug("Cannot access all buckets: %s", exc_str(e2))
            _handle_exception(e, 'any (originally requested %s)' % bucket_name)
        all_bucket_names = [b.name for b in all_buckets]
        lgr.debug("Found following buckets %s", ', '.join(all_bucket_names))
        if bucket_name in all_bucket_names:
            bucket = all_buckets[all_bucket_names.index(bucket_name)]
        else:
            _handle_exception(e, bucket_name)
    return bucket
def get_singularity_jobspec(cmd):
    """Extract the runscript of a singularity container used as an executable

    Parameters
    ----------
    cmd : list
        A command as an argument list.

    Returns
    -------
    None or str, None or list
        If no singularity is available, or the executable in the command is
        not a singularity image given by its path, None is returned.
        Otherwise the runscript of the container is returned as a string.
        The second value is None if the first is None, or a list of
        arguments to the runscript.
    """
    # get the path to the command's executable
    exec_path = cmd[0]
    runner = Runner()

    if not op.exists(exec_path):
        # probably a command from PATH
        return
    # this is a real file, not just a command on the path
    try:
        stdout, stderr = runner.run(
            ['singularity', '--version'],
            log_stdout=True,
            log_stderr=True,
            expect_stderr=True,
            expect_fail=True,
        )
        # TODO could be used to tailor handling to particular versions
    except CommandError as e:  # pragma: no cover
        # we do not have a singularity installation that we can handle
        # log debug, because there is no guarantee that the executable
        # actually was a singularity container
        lgr.debug('No suitable singularity version installed: %s', exc_str(e))
        return
    # we have singularity
    try:
        stdout, stderr = runner.run(
            # stringification only needed for pythons older than 3.6
            ['singularity', 'exec', exec_path, 'cat', '/singularity'],
            log_stdout=True,
            log_stderr=True,
            expect_stderr=True,
            expect_fail=True,
        )
        # TODO could be used to tailor handling to particular versions
    except CommandError as e:
        # we do not have a singularity installation that we can handle
        # log debug, because there is no guarantee that the executable
        # actually was a singularity container
        lgr.debug('%s is not a singularity image: %s', exec_path, exc_str(e))
        return
    # all but the container itself are the arguments
    return exec_path, cmd[1:]
def _visit_url(self, url, data):
    if url in self._seen:
        return
    # this is just a cruel first attempt
    lgr.debug("Visiting %s" % url)

    try:
        retry = 0
        orig_url = url
        if self._redirects_cache is not None:
            url = self._redirects_cache.get(url, url)
        while True:
            retry += 1
            if retry > 100:
                raise DownloadError(
                    "We have followed 100 redirects already. "
                    "Something is wrong!")
            try:
                self._seen.add(url)
                page = self._providers.fetch(url, allow_redirects=False)
                break
            except UnhandledRedirectError as exc:
                # since we care about tracking URL for proper full url construction
                # we should disallow redirects and handle them manually here
                lgr.debug("URL %s was redirected to %s" % (url, exc.url))
                if url == exc.url:
                    raise DownloadError(
                        "Was redirected to the same url upon %s"
                        % exc_str(exc))
                url = exc.url
                if self._redirects_cache is not None:
                    self._redirects_cache[orig_url] = exc.url
    except DownloadError as exc:
        lgr.warning("URL %s failed to download: %s" % (url, exc_str(exc)))
        if self.failed in {None, 'skip'}:
            # TODO: config -- crawl.failed='skip' should be a config option,
            # for now always skipping
            return
        raise  # otherwise -- kaboom

    data_ = updated(data, zip(self._output, (page, url)))
    yield data_

    # now recurse if matchers were provided
    matchers = self._matchers
    if matchers:
        lgr.debug("Looking for more URLs at %s using %s", url, matchers)
        for matcher in (matchers
                        if isinstance(matchers, (list, tuple))
                        else [matchers]):
            for data_matched in matcher(data_):
                if 'url' not in data_matched:
                    lgr.warning("Got data without a url from %s" % matcher)
                    continue
                # proxy findings
                for data_matched_ in self._visit_url(
                        data_matched['url'], data_matched):
                    yield data_matched_
def _describe_dataset(ds, sensitive):
    from datalad.interface.results import success_status_map
    from datalad.api import metadata

    try:
        infos = {
            'path': ds.path,
            'repo': ds.repo.__class__.__name__ if ds.repo else None,
        }
        if not sensitive:
            infos['metadata'] = _HIDDEN
        elif ds.id:
            ds_meta = metadata(
                dataset=ds,
                reporton='datasets',
                return_type='list',
                result_filter=lambda x: x['action'] == 'metadata' and
                                        success_status_map[x['status']] == 'success',
                result_renderer='disabled',
                on_failure='ignore')
            if ds_meta:
                ds_meta = [dm['metadata'] for dm in ds_meta]
                if len(ds_meta) == 1:
                    ds_meta = ds_meta.pop()
                infos['metadata'] = ds_meta
            else:
                infos['metadata'] = None
        return infos
    except InvalidGitRepositoryError as e:
        return {"invalid": exc_str(e)}
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info

    if hasattr(pl, 'dist'):
        dist = pl.dist()
    else:
        # Python 3.8 removed .dist but recommended "distro" is slow, so we
        # try it only if needed
        try:
            import distro
            dist = distro.linux_distribution(full_distribution_name=False)
        except ImportError:
            lgr.info(
                "Please install 'distro' package to obtain distribution information"
            )
            dist = tuple()
        except Exception as exc:
            lgr.warning(
                "No distribution information will be provided since 'distro' "
                "fails to import/run: %s", exc_str(exc)
            )
            dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([
            _t2s(dist),
            _t2s(pl.mac_ver()),
            _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def _get(self, filepath):
    if not lexists(filepath):
        return None

    # I wish I could just test using filesystem stats but that would not
    # be reliable, and also file might not even be here.
    # File might be under git, not annex so then we would need to assess size
    filestat = os.lstat(filepath)
    try:
        with disable_logger():
            info = self.annex.info(filepath, batch=True)
        size = info['size']
    except (CommandError, TypeError) as exc:
        # must be under git or a plain file
        lgr.debug(
            "File %s must not be under annex, since info failed: %s"
            % (filepath, exc_str(exc)))
        size = filestat.st_size

    # deduce mtime from the file or a content which it points to. Take the
    # oldest (I wonder if it would bite ;) XXX)
    mtime = filestat.st_mtime
    if islink(filepath):
        filepath_ = realpath(filepath)  # symlinked to
        if exists(filepath_):
            mtime_ = os.stat(filepath_).st_mtime
            mtime = min(mtime_, mtime)
    return FileStatus(
        size=size,
        mtime=mtime
    )
def get_run_info(message):
    """Extract run information from `message`

    Parameters
    ----------
    message : str
        A commit message.

    Returns
    -------
    A tuple with the command's message and a dict with run information. Both
    these values are None if `message` doesn't have a run command.

    Raises
    ------
    A ValueError if the information in `message` is invalid.
    """
    cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                   r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
    runinfo = re.match(cmdrun_regex, message, re.MULTILINE | re.DOTALL)
    if not runinfo:
        return None, None

    rec_msg, runinfo = runinfo.groups()

    try:
        runinfo = json.loads(runinfo)
    except Exception as e:
        raise ValueError(
            'cannot rerun command, command specification is not valid JSON: '
            '%s' % exc_str(e))
    if 'cmd' not in runinfo:
        raise ValueError("Looks like a run commit but does not have a command")
    return rec_msg.rstrip(), runinfo
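# A minimal usage sketch for the single-argument get_run_info() above (an
# added illustration, not part of the original module): the message layout
# mirrors cmdrun_regex, and the embedded JSON command record is hypothetical.
sample_message = (
    '[DATALAD RUNCMD] do something\n'
    '=== Do not change lines below ===\n'
    '{"cmd": "echo hi"}\n'
    '^^^ Do not change lines above ^^^'
)
msg, info = get_run_info(sample_message)
assert msg == "do something"
assert info["cmd"] == "echo hi"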
def close(self, allow_fail=True, ctrl_path=None):
    """Closes all connections, known to this instance.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, swallow exceptions which might be thrown during
      connection.close, and just log them at DEBUG level
    ctrl_path: str, Path, or list of str or Path, optional
      If specified, only the path(s) provided would be considered
    """
    if self._connections:
        ctrl_paths = [Path(p) for p in ensure_list(ctrl_path)]
        to_close = [c for c in self._connections
                    # don't close if connection wasn't opened by SSHManager
                    if self._connections[c].ctrl_path
                    not in self._prev_connections and
                    self._connections[c].ctrl_path.exists() and
                    (not ctrl_paths or
                     self._connections[c].ctrl_path in ctrl_paths)]
        if to_close:
            lgr.debug("Closing %d SSH connections..." % len(to_close))
        for cnct in to_close:
            f = self._connections[cnct].close
            if allow_fail:
                f()
            else:
                try:
                    f()
                except Exception as exc:
                    lgr.debug("Failed to close a connection: "
                              "%s", exc_str(exc))
        self._connections = dict()
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None):
    content_by_ds = {}
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return content_by_ds
    # loop over submodules not subdatasets to get the url right away
    # install using helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.repo.get_submodules():
        subds = Dataset(opj(ds.path, sub.path))
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if not subds.is_installed():
            try:
                lgr.info("Installing subdataset %s", subds.path)
                subds = _install_subds_from_flexible_source(
                    ds, sub.path, sub.url, reckless)
                # we want the entire thing, but mark this subdataset
                # as automatically installed
                content_by_ds[subds.path] = [curdir]
            except Exception as e:
                # skip, if we didn't manage to install subdataset
                lgr.warning(
                    "Installation of subdataset %s failed, skipped", subds)
                lgr.debug("Installation attempt failed with exception: %s",
                          exc_str(e))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        content_by_ds.update(_recursive_install_subds_underneath(
            subds,
            recursion_limit=recursion_limit - 1
            if isinstance(recursion_limit, int)
            else recursion_limit,
            reckless=reckless
        ))
    return content_by_ds
def _install_subds_from_flexible_source(ds, sm_path, sm_url, reckless):
    """Tries to obtain a given subdataset from several meaningful locations"""
    # compose a list of candidate clone URLs
    clone_urls = _get_flexible_source_candidates_for_submodule(
        ds, sm_path, sm_url)

    # now loop over all candidates and try to clone
    subds = Dataset(opj(ds.path, sm_path))
    try:
        clone_url = _clone_from_any_source(clone_urls, subds.path)
    except GitCommandError as e:
        raise InstallFailedError(
            msg="Failed to install %s from %s (%s)" % (
                subds, clone_urls, exc_str(e))
        )
    # do fancy update
    if sm_path in ds.get_subdatasets(absolute=False, recursive=False):
        lgr.debug("Update cloned subdataset {0} in parent".format(subds))
        ds.repo.update_submodule(sm_path, init=True)
    else:
        # submodule is brand-new and previously unknown
        ds.repo.add_submodule(sm_path, url=clone_url)
    _fixup_submodule_dotgit_setup(ds, sm_path)
    _handle_possible_annex_dataset(subds, reckless)
    return subds
def _revrange_as_results(dset, revrange):
    ds_repo = dset.repo
    rev_lines = ds_repo.get_revisions(
        revrange, fmt="%H %P", options=["--reverse", "--topo-order"])
    if not rev_lines:
        return

    for rev_line in rev_lines:
        # The strip() below is necessary because, with the format above, a
        # commit without any parent has a trailing space. (We could also use a
        # custom `rev-list --parents ...` call to avoid this.)
        fields = rev_line.strip().split(" ")
        rev, parents = fields[0], fields[1:]
        res = get_status_dict("run", ds=dset, commit=rev, parents=parents)
        full_msg = ds_repo.format_commit("%B", rev)
        try:
            msg, info = get_run_info(dset, full_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError("Error on {}'s message: {}".format(
                rev, exc_str(exc)))

        if info is not None:
            if len(parents) != 1:
                lgr.warning(
                    "%s has run information but is a %s commit; "
                    "it will not be re-executed",
                    rev,
                    "merge" if len(parents) > 1 else "root")
                continue
            res["run_info"] = info
            res["run_message"] = msg
        yield dict(res, status="ok")
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import os
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
def _describe_annex():
    from datalad.cmd import (
        GitWitlessRunner,
        StdOutErrCapture,
    )

    runner = GitWitlessRunner()
    try:
        out = runner.run(
            ['git', 'annex', 'version'],
            protocol=StdOutErrCapture)
    except CommandError as e:
        return dict(
            version='not available',
            message=exc_str(e),
        )
    info = {}
    for line in out['stdout'].split(os.linesep):
        key = line.split(':')[0]
        if not key:
            continue
        value = line[len(key) + 2:].strip()
        key = key.replace('git-annex ', '')
        if key.endswith('s'):
            value = value.split()
        info[key] = value

    return info
def _has_active_postupdate(ds, name, ssh):
    """Figure out whether the remote sibling has an active post-update hook

    Returns
    -------
    bool or None
      None if something went wrong and we could not figure it out
    """
    has_active_post_update = None
    try:
        # TODO -- we might need to expanduser taking .user into account
        # but then it must be done also on remote side
        out = CreateSibling._run_on_ds_ssh_remote(
            ds, name, ssh,
            'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no'
        )
        out = out.strip()
        assert out in ('yes', 'no')
        has_active_post_update = out == "yes"
    except CommandError as e:
        lgr.debug(
            "Could not figure out whether %s on remote %s has an active "
            "post_update hook due to %s",
            ds, name, exc_str(e)
        )
    return has_active_post_update
def __call__(astype, dataset, getcmdhelp=False, output=None, **kwargs):
    # get a handle on the relevant plugin module
    import datalad.export as export_mod
    try:
        exmod = import_module('.%s' % (astype,), package=export_mod.__package__)
    except ImportError as e:
        raise ValueError("cannot load exporter '{}': {}".format(
            astype, exc_str(e)))
    if getcmdhelp:
        # no result, but return the module to make the renderer do the rest
        return (exmod, None)

    ds = require_dataset(dataset, check_installed=True, purpose='exporting')
    # call the plugin, either with the argv array from the cmdline call
    # or directly with the kwargs
    if 'datalad_unparsed_args' in kwargs:
        result = exmod._datalad_export_plugin_call(
            ds, argv=kwargs['datalad_unparsed_args'], output=output)
    else:
        result = exmod._datalad_export_plugin_call(
            ds, output=output, **kwargs)
    return (exmod, result)
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them
    """
    from importlib import import_module
    from pkg_resources import iter_entry_points
    from .interface.base import get_api_name
    from datalad.dochelpers import exc_str

    import logging
    lgr = logging.getLogger('datalad.api')

    for entry_point in iter_entry_points('datalad.extensions'):
        try:
            lgr.debug(
                'Loading entrypoint %s from datalad.extensions for API building',
                entry_point.name)
            grp_descr, interfaces = entry_point.load()
            lgr.debug(
                'Loaded entrypoint %s from datalad.extensions',
                entry_point.name)
        except Exception as e:
            lgr.warning('Failed to load entrypoint %s: %s',
                        entry_point.name, exc_str(e))
            continue

        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0])
            intf = getattr(mod, intfspec[1])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously loaded implementation',
                    api_name, entry_point.name)
            globals()[api_name] = intf.__call__
def check_compress_file(ext, annex, path, name):
    # we base the archive name on the filename, in order to also
    # be able to properly test compressors where the corresponding
    # archive format has no capability of storing a filename
    # (i.e. where the archive name itself determines the filename
    # of the decompressed file, like .xz)
    archive = op.join(name, _filename + ext)
    compress_files([_filename], archive, path=path)
    assert_true(op.exists(archive))
    if annex:
        # It should work even when file is annexed and is a symlink to the
        # key
        from datalad.support.annexrepo import AnnexRepo
        repo = AnnexRepo(path, init=True)
        repo.add(_filename)
        repo.commit(files=[_filename], msg="commit")

    dir_extracted = name + "_extracted"
    try:
        decompress_file(archive, dir_extracted)
    except MissingExternalDependency as exc:
        raise SkipTest(exc_str(exc))
    _filepath = op.join(dir_extracted, _filename)

    ok_file_has_content(_filepath, 'content')
def _describe_system():
    import platform as pl
    from datalad import get_encoding_info
    from datalad.utils import get_linux_distribution
    try:
        dist = get_linux_distribution()
    except Exception as exc:
        lgr.warning("Failed to get distribution information: %s", exc_str(exc))
        dist = tuple()

    return {
        'type': os.name,
        'name': pl.system(),
        'release': pl.release(),
        'version': pl.version(),
        'distribution': ' '.join([
            _t2s(dist),
            _t2s(pl.mac_ver()),
            _t2s(pl.win32_ver())]).rstrip(),
        'max_path_length': get_max_path_length(getpwd()),
        'encoding': get_encoding_info(),
    }
def _transfer(self, cmd, key, path):

    akeys_tried = []
    # the same file could come from multiple files within the same archive
    # So far it doesn't make sense to "try all" of them since if one fails
    # it means the others would fail too, so it makes sense to immediately
    # prune the list so we keep only the ones from unique akeys.
    # May be whenever we support extraction directly from the tarballs
    # we should go through all and choose the one easiest to get or smth.
    for akey, afile in self._gen_akey_afiles(key, sorted=True, unique_akeys=True):
        akeys_tried.append(akey)
        try:
            akey_fpath = self.get_contentlocation(akey)
            if not akey_fpath:
                # TODO: make it more stringent?
                # Command could have failed to run if key was not present
                # locally yet. Thus retrieve the key using annex
                # TODO: we need to report user somehow about this happening
                # and progress on the download
                self.runner(["git-annex", "get", "--key", akey],
                            cwd=self.path, expect_stderr=True)

                akey_fpath = self.get_contentlocation(akey)
                if not akey_fpath:
                    raise RuntimeError(
                        "We were reported to fetch it alright but now can't "
                        "get its location. Check logic"
                    )

            akey_path = opj(self.repo.path, akey_fpath)
            assert exists(akey_path), "Key file %s is not present" % akey_path

            # Extract that bloody file from the bloody archive
            # TODO: implement/use caching, for now a simple one
            # actually patool doesn't support extraction of a single file
            # https://github.com/wummel/patool/issues/20
            # so
            pwd = getpwd()
            lgr.debug(
                "Getting file {afile} from {akey_path} while PWD={pwd}"
                .format(**locals()))
            apath = self.cache[akey_path].get_extracted_file(afile)
            link_file_load(apath, path)
            self.send('TRANSFER-SUCCESS', cmd, key)
            return
        except Exception as exc:
            # from celery.contrib import rdb
            # rdb.set_trace()
            from datalad.dochelpers import exc_str
            exc_ = exc_str(exc)
            self.debug(
                "Failed to fetch {akey} containing {key}: {exc_}"
                .format(**locals()))
            continue

    self.error(
        "Failed to fetch any archive containing {key}. Tried: {akeys_tried}"
        .format(**locals()))
def __contains__(self, url):
    try:
        return self._get_provider(url) in self._cookies_db
    except Exception as exc:
        lgr.warning("Failed to check for having a cookie for %s: %s",
                    url, exc_str(exc))
        return None
def _read(stream, input_type):
    if input_type in ["csv", "tsv"]:
        import csv
        csvrows = csv.reader(stream,
                             delimiter="\t" if input_type == "tsv" else ",")
        try:
            headers = next(csvrows)
        except StopIteration:
            raise ValueError("Failed to read {} rows from {}".format(
                input_type.upper(), stream))
        lgr.debug("Taking %s fields from first line as headers: %s",
                  len(headers), headers)
        idx_map = dict(enumerate(headers))
        rows = [dict(zip(headers, r)) for r in csvrows]
    elif input_type == "json":
        import json
        try:
            rows = json.load(stream)
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Failed to read JSON from stream {}: {}".format(
                stream, exc_str(e)))
        # For json input, we do not support indexing by position,
        # only names.
        idx_map = {}
    else:
        raise ValueError("input_type {} is invalid. Known values: {}".format(
            input_type, ", ".join(INPUT_TYPES)))
    return rows, idx_map
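# A minimal usage sketch for _read() above (an added illustration, assuming it
# runs in the module context where lgr is defined): parse an in-memory CSV
# stream and inspect the row dicts and the positional header index map.
import io

rows, idx_map = _read(io.StringIO("name,size\nfoo,1\nbar,2\n"), "csv")
assert rows == [{'name': 'foo', 'size': '1'}, {'name': 'bar', 'size': '2'}]
assert idx_map == {0: 'name', 1: 'size'}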
def get_commit_runinfo(repo, commit="HEAD"): """Return message and run record from a commit message If none found - returns None, None; if anything goes wrong - throws ValueError with the message describing the issue """ commit_msg = repo.repo.git.show(commit, "--format=%s%n%n%b", "--no-patch") cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \ r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^' runinfo = re.match(cmdrun_regex, commit_msg, re.MULTILINE | re.DOTALL) if not runinfo: return None, None rec_msg, runinfo = runinfo.groups() try: runinfo = json.loads(runinfo) except Exception as e: raise ValueError( 'cannot rerun command, command specification is not valid JSON: ' '%s' % exc_str(e)) if 'cmd' not in runinfo: raise ValueError( "{} looks like a run commit but does not have a command".format( repo.repo.git.rev_parse("--short", commit))) return rec_msg, runinfo
def __getitem__(self, url):
    try:
        return self._cookies_db[self._get_provider(url)]
    except Exception as exc:
        lgr.warning("Failed to get a cookie for %s: %s",
                    url, exc_str(exc))
        return None
def __contains__(self, url):
    try:
        return self._get_provider(url) in self.cookies_db
    except Exception as exc:
        lgr.warning("Failed to check for having a cookie for %s: %s",
                    url, exc_str(exc))
        return None
def _generate_extension_api():
    """Auto detect all available extensions and generate an API from them
    """
    from importlib import import_module
    from pkg_resources import iter_entry_points
    from .interface.base import get_api_name
    from datalad.dochelpers import exc_str

    import logging
    lgr = logging.getLogger('datalad.api')

    for entry_point in iter_entry_points('datalad.extensions'):
        try:
            lgr.debug(
                'Loading entrypoint %s from datalad.extensions for API building',
                entry_point.name)
            grp_descr, interfaces = entry_point.load()
            lgr.debug('Loaded entrypoint %s from datalad.extensions',
                      entry_point.name)
        except Exception as e:
            lgr.warning('Failed to load entrypoint %s: %s',
                        entry_point.name, exc_str(e))
            continue

        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0])
            intf = getattr(mod, intfspec[1])
            api_name = get_api_name(intfspec)
            if api_name in globals():
                lgr.debug(
                    'Command %s from extension %s is replacing a previously loaded implementation',
                    api_name, entry_point.name)
            globals()[api_name] = intf.__call__
def close(self, allow_fail=True):
    """Closes all connections, known to this instance.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, swallow exceptions which might be thrown during
      connection.close, and just log them at DEBUG level
    """
    if self._connections:
        to_close = [c for c in self._connections
                    # don't close if connection wasn't opened by SSHManager
                    if self._connections[c].ctrl_path
                    not in self._prev_connections and
                    exists(self._connections[c].ctrl_path)]
        if to_close:
            lgr.debug("Closing %d SSH connections..." % len(to_close))
        for cnct in to_close:
            f = self._connections[cnct].close
            if allow_fail:
                f()
            else:
                try:
                    f()
                except Exception as exc:
                    lgr.debug("Failed to close a connection: "
                              "%s", exc_str(exc))
        self._connections = dict()
def get_max_path_length(top_path=None, maxl=1000):
    """Deduce the maximal length of the filename in a given path
    """
    if not top_path:
        top_path = getpwd()
    import random
    from datalad import lgr
    from datalad.dochelpers import exc_str
    from datalad.support import path
    prefix = path.join(top_path, "dl%d" % random.randint(1, 100000))
    # some smart folks could implement binary search for this
    max_path_length = None
    for i in range(maxl - len(prefix)):
        filename = prefix + '_' * i
        path_length = len(filename)
        try:
            with open(filename, 'w') as f:
                max_path_length = path_length
        except Exception as exc:
            lgr.debug(
                "Failed to create sample file for length %d. Last succeeded was %s. Exception: %s",
                path_length, max_path_length, exc_str(exc))
            break
        unlink(filename)
    return max_path_length
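# A minimal usage sketch for get_max_path_length() above (an added
# illustration, assuming unlink and getpwd are available in the module
# context): probe how long a filename the filesystem under /tmp accepts.
# Note that the probe creates and removes throwaway "dl<N>_..." files.
longest = get_max_path_length("/tmp")
print("Longest creatable filename length under /tmp:", longest)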
def close(self, allow_fail=True, ctrl_path=None):
    """Closes all connections, known to this instance.

    Parameters
    ----------
    allow_fail: bool, optional
      If True, swallow exceptions which might be thrown during
      connection.close, and just log them at DEBUG level
    ctrl_path: str or list of str, optional
      If specified, only the path(s) provided would be considered
    """
    if self._connections:
        from datalad.utils import assure_list
        ctrl_paths = assure_list(ctrl_path)
        to_close = [c for c in self._connections
                    # don't close if connection wasn't opened by SSHManager
                    if self._connections[c].ctrl_path
                    not in self._prev_connections and
                    exists(self._connections[c].ctrl_path) and
                    (not ctrl_paths or
                     self._connections[c].ctrl_path in ctrl_paths)]
        if to_close:
            lgr.debug("Closing %d SSH connections..." % len(to_close))
        for cnct in to_close:
            f = self._connections[cnct].close
            if allow_fail:
                f()
            else:
                try:
                    f()
                except Exception as exc:
                    lgr.debug("Failed to close a connection: "
                              "%s", exc_str(exc))
        self._connections = dict()
def _read(stream, input_type):
    if input_type == "csv":
        import csv
        csvrows = csv.reader(stream)
        try:
            headers = next(csvrows)
        except StopIteration:
            raise ValueError("Failed to read CSV rows from {}".format(stream))
        lgr.debug("Taking %s fields from first line as headers: %s",
                  len(headers), headers)
        idx_map = dict(enumerate(headers))
        rows = [dict(zip(headers, r)) for r in csvrows]
    elif input_type == "json":
        import json
        try:
            rows = json.load(stream)
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Failed to read JSON from stream {}: {}".format(
                stream, exc_str(e)))
        # For json input, we do not support indexing by position,
        # only names.
        idx_map = {}
    else:
        raise ValueError("input_type must be 'csv', 'json', or 'ext'")
    return rows, idx_map
def _describe_dataset(ds, sensitive):
    from datalad.interface.results import success_status_map
    from datalad.api import metadata

    try:
        infos = {
            'path': ds.path,
            'repo': ds.repo.__class__.__name__ if ds.repo else None,
            'id': ds.id,
        }
        if not sensitive:
            infos['metadata'] = _HIDDEN
        elif ds.id:
            ds_meta = metadata(
                dataset=ds,
                reporton='datasets',
                return_type='list',
                result_filter=lambda x: x['action'] == 'metadata' and
                                        success_status_map[x['status']] == 'success',
                result_renderer='disabled',
                on_failure='ignore')
            if ds_meta:
                ds_meta = [dm['metadata'] for dm in ds_meta]
                if len(ds_meta) == 1:
                    ds_meta = ds_meta.pop()
                infos['metadata'] = ds_meta
            else:
                infos['metadata'] = None
        return infos
    except InvalidGitRepositoryError as e:
        return {"invalid": exc_str(e)}
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    contentmeta = []

    for f in self.paths:
        fpath = opj(self.ds.path, f)
        try:
            img = Image.open(fpath)
        except Exception as e:
            lgr.debug("Image metadata parser failed to load %s: %s",
                      fpath, exc_str(e))
            continue

        meta = {
            'type': 'dctype:Image',
        }

        # run all extractors
        meta.update({k: v(img) for k, v in self._extractors.items()})
        # filter useless fields (empty strings and NaNs)
        meta = {k: v for k, v in meta.items()
                if not (hasattr(v, '__len__') and not len(v))}
        contentmeta.append((f, meta))

    return {
        '@context': vocabulary,
    }, contentmeta
def _load(self):
    if self._cookies_db is not None:
        return
    if self._filename:
        filename = self._filename
        cookies_dir = os.path.dirname(filename)
    else:
        cookies_dir = os.path.join(
            appdirs.user_config_dir(),
            'datalad')  # FIXME prolly shouldn't hardcode 'datalad'
        filename = os.path.join(cookies_dir, 'cookies')

    # TODO: guarantee restricted permissions
    if not os.path.exists(cookies_dir):
        os.makedirs(cookies_dir)

    lgr.debug("Opening cookies DB %s", filename)
    try:
        self._cookies_db = shelve.open(filename, writeback=True, protocol=2)
    except Exception as exc:
        lgr.warning("Failed to open cookies DB %s: %s",
                    filename, exc_str(exc))
def import_modules(modnames, pkg, msg="Failed to import {module}", log=lgr.debug):
    """Helper to import a list of modules without failing if N/A

    Parameters
    ----------
    modnames: list of str
      List of module names to import
    pkg: str
      Package under which to import
    msg: str, optional
      Message template for .format() to log at DEBUG level if import fails.
      Keys {module} and {package} will be provided and ': {exception}'
      appended
    log: callable, optional
      Logger call to use for logging messages
    """
    from importlib import import_module
    _globals = globals()
    mods_loaded = []
    for modname in modnames:
        try:
            _globals[modname] = mod = import_module(
                '.{}'.format(modname), pkg)
            mods_loaded.append(mod)
        except Exception as exc:
            from datalad.dochelpers import exc_str
            log((msg + ': {exception}').format(
                module=modname, package=pkg, exception=exc_str(exc)))
    return mods_loaded
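# A minimal usage sketch for import_modules() above (an added illustration):
# urllib is used only as a stand-in package with importable submodules.
# Unavailable names are skipped with a debug-level log instead of an error.
loaded = import_modules(['parse', 'request'], pkg='urllib')
assert [m.__name__ for m in loaded] == ['urllib.parse', 'urllib.request']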
def check_crawl_autoaddtext(gz, ind, topurl, outd):
    ds = create(outd)
    ds.run_procedure("cfg_text2git")
    with chpwd(outd):  # TODO -- dataset argument
        template_kwargs = {
            'url': topurl,
            'a_href_match_': '.*',
        }
        if gz:
            template_kwargs['archives_re'] = r"\.gz$"
        crawl_init(
            template_kwargs,
            save=True,
            template='simple_with_archives'
        )
        try:
            crawl()
        except MissingExternalDependency as exc:
            raise SkipTest(exc_str(exc))
    ok_clean_git(outd)
    ok_file_under_git(outd, "anothertext", annexed=False)
    ok_file_under_git(outd, "d/textfile", annexed=False)
    ok_file_under_git(outd, "d/tooshort", annexed=True)

    if 'compressed.dat.gz' in TEST_TREE2:
        if gz:
            ok_file_under_git(outd, "compressed.dat", annexed=False)
            ok_file_has_content(op.join(outd, "compressed.dat"),
                                u"мама мыла раму")
        else:
            ok_file_under_git(outd, "compressed.dat.gz", annexed=True)
    else:
        raise SkipTest(
            "Need datalad >= 0.11.2 to test .gz files decompression")
def _install_necessary_subdatasets(
        ds, path, reckless, refds_path, description=None):
    """Installs subdatasets of `ds` that are necessary to obtain in order
    to have access to `path`.

    Gets the subdataset containing `path` regardless of whether or not it was
    already installed. While doing so, installs everything necessary in
    between the uppermost installed one and `path`.

    Note: `ds` itself has to be installed.

    Parameters
    ----------
    ds: Dataset
    path: str
    reckless: bool
    """
    # figuring out what dataset to start with, --contains limits --recursive
    # to visit only subdataset on the trajectory to the target path
    subds_trail = ds.subdatasets(contains=path, recursive=True)
    if not subds_trail:
        # there is not a single known subdataset (installed or not)
        # for this path -- job done
        return
    # otherwise we start with the one deepest down
    cur_subds = subds_trail[-1]

    while not GitRepo.is_valid_repo(cur_subds['path']):
        # install using helper that gives some flexibility regarding where to
        # get the module from
        try:
            sd = _install_subds_from_flexible_source(
                Dataset(cur_subds['parentds']),
                relpath(cur_subds['path'], start=cur_subds['parentds']),
                cur_subds['gitmodule_url'],
                reckless,
                description=description)
        except Exception as e:
            # skip all of downstairs, if we didn't manage to install subdataset
            yield get_status_dict(
                'install', path=cur_subds['path'], type='dataset',
                status='error', logger=lgr, refds=refds_path,
                message=("Installation of subdataset %s failed with exception: %s",
                         cur_subds['path'], exc_str(e)))
            return
        # report installation, whether it helped or not
        yield get_status_dict(
            'install', ds=sd, status='ok', logger=lgr, refds=refds_path,
            message=("Installed subdataset in order to get %s", path))

        # now check whether the just installed subds brought us any closer to
        # the target path
        subds_trail = sd.subdatasets(contains=path, recursive=False)
        if not subds_trail:
            # no (newly available) subdataset gets us any closer
            return
        # next round
        cur_subds = subds_trail[-1]
def _handle_exception(e, bucket_name):
    """Helper to handle S3 connection exception"""
    raise (
        AccessDeniedError
        if e.error_code == 'AccessDenied'
        else DownloadError)(
            "Cannot connect to %s S3 bucket. Exception: %s"
            % (bucket_name, exc_str(e))
        )
def _clone_from_any_source(sources, dest):
    # should not be the case, but we need to distinguish between failure
    # of git-clone, due to existing target and an unsuccessful clone
    # attempt. See below.
    existed = dest and exists(dest)
    for source_ in sources:
        try:
            lgr.debug("Retrieving a dataset from URL: "
                      "{0}".format(source_))
            with swallow_logs():
                GitRepo.clone(path=dest, url=source_, create=True)
            return source_  # do not bother with other sources if succeeded
        except GitCommandError as e:
            lgr.debug("Failed to retrieve from URL: "
                      "{0}".format(source_))
            if not existed and dest \
                    and exists(dest):
                lgr.debug("Wiping out unsuccessful clone attempt at "
                          "{}".format(dest))
                rmtree(dest)

            if source_ == sources[-1]:
                # Note: The following block is evaluated whenever we
                # fail even with the last try. Not nice, but currently
                # necessary until we get a more precise exception:
                ####################################
                # TODO: We may want to introduce a --force option to
                # overwrite the target.
                # TODO: Currently assuming if `existed` and there is a
                # GitCommandError means that these both things are connected.
                # Need newer GitPython to get stderr from GitCommandError
                # (already fixed within GitPython.)
                if existed:
                    # rudimentary check for an installed dataset at target:
                    # (TODO: eventually check for being the one, that this
                    # is about)
                    dest_ds = Dataset(dest)
                    if dest_ds.is_installed():
                        lgr.info("{0} appears to be installed already."
                                 "".format(dest_ds))
                        break
                    else:
                        lgr.warning("Target {0} already exists and is not "
                                    "an installed dataset. Skipped."
                                    "".format(dest))
                        # Keep original in debug output:
                        lgr.debug("Original failure:{0}"
                                  "{1}".format(linesep, exc_str(e)))
                        return None
                ##################

                # Re-raise if failed even with the last candidate
                lgr.debug("Unable to establish repository instance at "
                          "{0} from {1}"
                          "".format(dest, sources))
                raise
def test_GitRepo_gitpy_injection(path, path2):

    gr = GitRepo(path, create=True)
    gr._GIT_COMMON_OPTIONS.extend(['test-option'])

    with assert_raises(GitCommandError) as cme:
        gr.repo.git.unknown_git_command()
    assert_in('test-option', exc_str(cme.exception))

    # once set, these options should be persistent across git calls:
    with assert_raises(GitCommandError) as cme:
        gr.repo.git.another_unknown_git_command()
    assert_in('test-option', exc_str(cme.exception))

    # but other repos should not be affected:
    gr2 = GitRepo(path2, create=True)
    with assert_raises(GitCommandError) as cme:
        gr2.repo.git.unknown_git_command()
    assert_not_in('test-option', exc_str(cme.exception))
def test_wtf(path):
    # smoke test for now
    with swallow_outputs() as cmo:
        wtf(dataset=path)
        assert_not_in('## dataset', cmo.out)
        assert_in('## configuration', cmo.out)
        # Those sections get censored out by default now
        assert_not_in('user.name: ', cmo.out)
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf()
            assert_not_in('## dataset', cmo.out)
            assert_in('## configuration', cmo.out)
    # now with a dataset
    ds = create(path)
    with swallow_outputs() as cmo:
        wtf(dataset=ds.path)
        assert_in('## configuration', cmo.out)
        assert_in('## dataset', cmo.out)
        assert_in('path: {}'.format(ds.path), cmo.out)

    # and if we run with all sensitive
    for sensitive in ('some', True):
        with swallow_outputs() as cmo:
            wtf(dataset=ds.path, sensitive=sensitive)
            # we fake those for tests anyways, but we do show cfg in this mode
            # and explicitly not showing them
            assert_in('user.name: %s' % _HIDDEN, cmo.out)

    with swallow_outputs() as cmo:
        wtf(dataset=ds.path, sensitive='all')
        assert_not_in(_HIDDEN, cmo.out)  # all is shown
        assert_in('user.name: ', cmo.out)

    skip_if_no_module('pyperclip')

    # verify that it works correctly in the env/platform
    import pyperclip
    with swallow_outputs() as cmo:
        try:
            pyperclip.copy("xxx")
            pyperclip_works = pyperclip.paste().strip() == "xxx"
            wtf(dataset=ds.path, clipboard=True)
        except (AttributeError, pyperclip.PyperclipException) as exc:
            # AttributeError could come from pyperclip if no DISPLAY
            raise SkipTest(exc_str(exc))
        assert_in("WTF information of length", cmo.out)
        assert_not_in('user.name', cmo.out)
        if not pyperclip_works:
            # Sometimes it does not throw but just fails to work
            raise SkipTest(
                "Pyperclip seems to be not functioning here correctly")
        assert_not_in('user.name', pyperclip.paste())
        assert_in(_HIDDEN, pyperclip.paste())  # by default no sensitive info
        assert_in("cmd:annex:", pyperclip.paste())  # but the content is there
def _get_github_entity(gh, cred, github_user, github_passwd, github_organization):
    # figure out authentication
    if not (github_user and github_passwd):
        # access to the system secrets
        if github_user:
            # check that the keystore knows about this user
            if github_user != cred.get('user', github_user):
                # there is a mismatch, we need to ask
                creds = cred.enter_new()
                github_user = creds['user']
                github_passwd = creds['password']

        # if a user is provided, go with it, don't even ask any store
        if github_user is None and not cred.is_known:
            # let's figure out authentication
            if github_user is None:
                # check if there is an oauth token from
                # https://github.com/sociomantic/git-hub
                github_user = cfg.get('hub.oauthtoken', None)

        if github_user is None:
            # still nothing, ask if necessary
            creds = cred()
            github_user = creds['user']
            github_passwd = creds['password']

    if not github_user:
        raise gh.BadCredentialsException(403, 'no user specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # XXX make sure to wipe out known credentials if that happens
    authed_gh = gh.Github(
        github_user,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                    github_organization,
                    exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
def get_git_version(self):
    key = 'cmd:git'
    if key in self._remote_props:
        return self._remote_props[key]
    git_version = None
    try:
        git_version = self('git version')[0].split()[2]
    except CommandError as e:
        lgr.debug('Failed to determine Git version: %s',
                  exc_str(e))
    self._remote_props[key] = git_version
    return git_version
def assure_initialized(self):
    """Assures that the manager is initialized - knows socket_dir, previous connections
    """
    if self._socket_dir is not None:
        return
    from ..config import ConfigManager
    from os import chmod
    cfg = ConfigManager()
    self._socket_dir = opj(cfg.obtain('datalad.locations.cache'), 'sockets')
    assure_dir(self._socket_dir)
    try:
        chmod(self._socket_dir, 0o700)
    except OSError as exc:
        lgr.warning(
            "Failed to (re)set permissions on the %s. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc)
        )

    from os import listdir
    from os.path import isdir
    try:
        self._prev_connections = [opj(self.socket_dir, p)
                                  for p in listdir(self.socket_dir)
                                  if not isdir(opj(self.socket_dir, p))]
    except OSError as exc:
        self._prev_connections = []
        lgr.warning(
            "Failed to list %s for existing sockets. "
            "Most likely future communications would be impaired or fail. "
            "Original exception: %s",
            self._socket_dir, exc_str(exc)
        )

    lgr.log(5, "Found %d previous connections", len(self._prev_connections))
def _get_github_entity(gh, cred, github_login, github_passwd, github_organization):
    if github_login == 'disabledloginfortesting':
        raise gh.BadCredentialsException(403, 'no login specified')

    if not (github_login and github_passwd):
        # we don't have both
        # check if there is an oauth token from
        # https://github.com/sociomantic/git-hub
        token = False
        if not cred.is_known:
            if not github_login:
                # try find a token as login
                github_login = cfg.get('hub.oauthtoken', None)
                token = True
        if not (github_login and (github_passwd or token)):
            # still at least one missing, utilize the credential store
            # to get auth info, pass potential passwd value along
            cred.enter_new(
                user=github_login,
                password=github_passwd)
        # now we should really have it
        creds = cred()
        github_login = creds['user']
        github_passwd = creds['password']

    if not github_login:
        raise gh.BadCredentialsException(403, 'no login specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # and this case, known credentials are wiped out again below
    authed_gh = gh.Github(
        github_login,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                    github_organization,
                    exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
def _recursive_install_subds_underneath(ds, recursion_limit, reckless,
                                        start=None, refds_path=None,
                                        description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # this one we can ignore, not underneath the start path
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr,
                    refds=refds_path,
                    message=("Installed subdataset %s", subds),
                    parentds=ds.path)
            except Exception as e:
                # skip all of downstairs, if we didn't manage to install subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr,
                    refds=refds_path,
                    message=("Installation of subdataset %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1
                if isinstance(recursion_limit, int)
                else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
def init_remote_repo(path, ssh, shared, dataset, description=None):
    cmd = ["git", "-C", path, "init"]
    if shared:
        cmd.append("--shared=%s" % shared)
    try:
        ssh(cmd)
    except CommandError as e:
        lgr.error("Initialization of remote git repository failed at %s."
                  "\nError: %s\nSkipping ..." % (path, exc_str(e)))
        return False

    if isinstance(dataset.repo, AnnexRepo):
        # init remote git annex repo (part fix of #463)
        try:
            ssh(
                ["git", "-C", path, "annex", "init"] +
                ([description] if description else [])
            )
        except CommandError as e:
            lgr.error("Initialization of remote git annex repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False
    return True
def get_bucket(conn, bucket_name):
    """A helper to get a bucket

    Parameters
    ----------
    bucket_name: str
        Name of the bucket to connect to
    """
    bucket = None
    try:
        bucket = conn.get_bucket(bucket_name)
    except S3ResponseError as e:
        # can initially deny or error to connect to the specific bucket by name,
        # and we would need to list which buckets are available under following
        # credentials:
        lgr.debug("Cannot access bucket %s by name: %s",
                  bucket_name, exc_str(e))
        if conn.anon:
            raise AnonymousAccessDeniedError(
                "Access to the bucket %s did not succeed. Requesting "
                "'all buckets' for anonymous S3 connection makes "
                "little sense and thus not supported." % bucket_name,
                supported_types=['aws-s3']
            )
        all_buckets = []
        try:
            all_buckets = conn.get_all_buckets()
        except S3ResponseError as e2:
            lgr.debug("Cannot access all buckets: %s", exc_str(e2))
            _handle_exception(e, 'any (originally requested %s)' % bucket_name)
        all_bucket_names = [b.name for b in all_buckets]
        lgr.debug("Found following buckets %s", ', '.join(all_bucket_names))
        if bucket_name in all_bucket_names:
            bucket = all_buckets[all_bucket_names.index(bucket_name)]
        else:
            _handle_exception(e, bucket_name)
    return bucket
def _revs_as_results(dset, revs):
    for rev in revs:
        res = get_status_dict("run", ds=dset, commit=rev)
        full_msg = dset.repo.format_commit("%B", rev)
        try:
            msg, info = get_run_info(dset, full_msg)
        except ValueError as exc:
            # Recast the error so the message includes the revision.
            raise ValueError(
                "Error on {}'s message: {}".format(rev, exc_str(exc)))

        if info is not None:
            res["run_info"] = info
            res["run_message"] = msg
        yield dict(res, status="ok")
def get_remote_git_version(ssh):
    try:
        # options to disable all auto so we don't trigger them while testing
        # for absent changes
        out, err = ssh(["git"] + ["version"])
        assert out.strip().startswith("git version")
        git_version = out.strip().split()[2]
        lgr.debug("Detected git version on server: %s" % git_version)
        return LooseVersion(git_version)
    except CommandError as e:
        lgr.warning(
            "Failed to determine git version on remote.\n"
            "Error: {0}\nTrying to configure anyway "
            "...".format(exc_str(e)))
    return None
def __call__(paths, reference_date="@1514764800", revs=None,
             annex="all", no_tags=False, older=False):
    from datalad.support.repodates import check_dates

    which = "older" if older else "newer"

    try:
        ref_ts = _parse_date(reference_date)
    except ValueError as exc:
        lgr.error("Could not parse '%s' as a date", reference_date)
        yield get_status_dict("check_dates",
                              status="error",
                              message=exc_str(exc))
        return

    lgr.info("Searching for dates %s than %s",
             which,
             time.strftime("%d %b %Y %H:%M:%S +0000", time.gmtime(ref_ts)))

    for repo in _git_repos(paths or ["."]):
        fullpath = os.path.abspath(repo)
        lgr.debug("Checking %s", fullpath)

        try:
            report = check_dates(repo,
                                 ref_ts,
                                 which=which,
                                 revs=revs or ["--all"],
                                 annex={"all": True,
                                        "none": False,
                                        "tree": "tree"}[annex],
                                 tags=not no_tags)
        except InvalidGitRepositoryError as exc:
            lgr.warning("Skipping invalid Git repo: %s", repo)
            continue

        yield get_status_dict(
            "check_dates",
            status="ok",
            path=fullpath,
            message=("Found {} dates" if report["objects"]
                     else "No {} dates found").format(which),
            report=report)
def get_run_info(dset, message):
    """Extract run information from `message`

    Parameters
    ----------
    message : str
        A commit message.

    Returns
    -------
    A tuple with the command's message and a dict with run information. Both
    these values are None if `message` doesn't have a run command.

    Raises
    ------
    A ValueError if the information in `message` is invalid.
    """
    cmdrun_regex = r'\[DATALAD RUNCMD\] (.*)=== Do not change lines below ' \
                   r'===\n(.*)\n\^\^\^ Do not change lines above \^\^\^'
    runinfo = re.match(cmdrun_regex, message, re.MULTILINE | re.DOTALL)
    if not runinfo:
        return None, None

    rec_msg, runinfo = runinfo.groups()

    try:
        runinfo = json.loads(runinfo)
    except Exception as e:
        raise ValueError(
            'cannot rerun command, command specification is not valid JSON: '
            '%s' % exc_str(e)
        )
    if not isinstance(runinfo, (list, dict)):
        # this is a run record ID -> load the beast
        record_dir = dset.config.get(
            'datalad.run.record-directory',
            default=op.join('.datalad', 'runinfo'))
        record_path = op.join(dset.path, record_dir, runinfo)
        if not op.lexists(record_path):
            raise ValueError(
                "Run record sidecar file not found: {}".format(record_path))
        # TODO `get` the file
        recs = load_stream(record_path, compressed=True)
        # TODO check if there is a record
        runinfo = next(recs)
    if 'cmd' not in runinfo:
        raise ValueError("Looks like a run commit but does not have a command")
    return rec_msg.rstrip(), runinfo
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about http headers etc which could provide
    information for cache/proxy servers for how long to retain etc

    TODO: theoretically it is not network specific at all -- and just a
    memoize pattern, but may be some time we would make it treat headers
    etc correctly. And ATM would support any URL we support via
    providers/downloaders

    Parameters
    ----------
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for. <0 - would retain forever.
       If None - would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:
        fage = (time.time() - os.stat(doc_fname).st_mtime) / (24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s",
                          url, doc_fname)
                doc = pickle.load(open(doc_fname, 'rb'))
            except Exception as e:
                # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        pickle.dump(doc, open(doc_fname, 'wb'))
        lgr.debug("stored result of request to '{}' in {}".format(
            url, doc_fname))
    return doc
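# A minimal usage sketch for get_cached_url_content() above (an added
# illustration; the URL, cache name, and stub fetcher are placeholders):
# supplying a fetcher avoids any provider setup, and maxage=1 keeps the
# cached pickle valid for one day before a re-download is forced.
doc = get_cached_url_content(
    'https://example.com/listing.html',
    name='example-listing',
    fetcher=lambda url: '<html>stub content</html>',
    maxage=1)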
def get_native_metadata(ds, guess_type=False, ds_identifier=None):
    """Parse a dataset to gather its native metadata

    Returns
    -------
    List
        Each item in the list is a metadata dictionary (JSON-LD compliant).
        The first item corresponds to the annex-based metadata of the
        dataset. The last item contains the native metadata of the dataset
        content. Any additional items correspond to subdataset metadata sets.
    """
    if ds_identifier is None:
        ds_identifier = ds.id
    # using a list, because we could get multiple sets of meta data per
    # dataset, and we want to quickly collect them without having to do
    # potentially complex graph merges
    meta = []
    # get native metadata
    nativetypes = get_metadata_type(ds, guess=guess_type)
    if not nativetypes:
        return meta

    # keep local, who knows what some parsers might pull in
    from . import parsers
    for nativetype in nativetypes:
        if nativetype == 'aggregate':
            # this is special and needs to be ignored here, even if it was
            # configured. reason: this parser runs anyway in get_metadata()
            continue
        pmod = import_module('.{}'.format(nativetype),
                             package=parsers.__package__)
        try:
            native_meta = pmod.MetadataParser(ds).get_metadata(ds_identifier)
        except Exception as e:
            lgr.error('failed to get native metadata ({}): {}'.format(
                nativetype, exc_str(e)))
            continue
        if native_meta:
            # TODO here we could apply a "patch" to the native metadata,
            # if desired

            # try hard to keep things a simple non-nested list
            if isinstance(native_meta, list):
                meta.extend(native_meta)
            else:
                meta.append(native_meta)

    return meta