def download(self, from_infos, to_infos, no_progress_bar=False, names=None):
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info['scheme'] != self.scheme:
            raise NotImplementedError

        if to_info['scheme'] != 'local':
            raise NotImplementedError

        bucket = from_info['bucket']
        path = from_info['path']

        logger.debug("Downloading '{}/{}' to '{}'".format(
            bucket, path, to_info['path']))

        tmp_file = self.tmp_file(to_info['path'])
        if not name:
            name = os.path.basename(to_info['path'])

        cb = None if no_progress_bar else Callback(name)

        self._makedirs(to_info['path'])

        try:
            self.blob_service.get_blob_to_path(
                bucket, path, tmp_file, progress_callback=cb)
        except Exception:
            msg = "failed to download '{}/{}'".format(bucket, path)
            logger.warning(msg)
        else:
            os.rename(tmp_file, to_info['path'])

            if not no_progress_bar:
                progress.finish_target(name)

def reproduce(self, force=False, dry=False, interactive=False):
    if not self.changed() and not force:
        return None

    if (self.cmd or self.is_import) and not self.locked and not dry:
        # Removing outputs only if we actually have command to reproduce
        self.remove_outs(ignore_remove=False)

    msg = ("Going to reproduce '{stage}'. "
           "Are you sure you want to continue?".format(stage=self.relpath))

    if interactive and not prompt.confirm(msg):
        raise DvcException("reproduction aborted by the user")

    logger.info("Reproducing '{stage}'".format(stage=self.relpath))

    self.run(dry=dry)

    logger.debug("'{stage}' was reproduced".format(stage=self.relpath))

    return self

def daemon(args):
    """Launch a `dvc daemon` command in a detached process.

    Args:
        args (list): list of arguments to append to `dvc daemon` command.
    """
    cmd = [sys.executable]
    if not is_binary():
        cmd += ['-m', 'dvc']
    cmd += ['daemon', '-q'] + args

    logger.debug("Trying to spawn '{}'".format(cmd))

    if os.name == 'nt':
        _spawn_windows(cmd)
    elif os.name == 'posix':
        _spawn_posix(cmd)
    else:
        raise NotImplementedError

    logger.debug("Spawned '{}'".format(cmd))

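# For context, a minimal self-contained sketch of how a POSIX spawn helper
# such as `_spawn_posix` above could detach the child: the classic double
# fork plus `os.setsid()` severs the controlling terminal so the daemon
# outlives the parent process. This is an illustrative assumption, not
# DVC's actual helper.
import os


def _spawn_posix_sketch(cmd):
    if os.fork() != 0:
        # Parent returns to the caller right away.
        return

    # Become a session leader with no controlling terminal.
    os.setsid()

    if os.fork() != 0:
        # The session leader exits so the grandchild can never
        # reacquire a terminal.
        os._exit(0)

    # Replace the grandchild with the daemon command itself.
    os.execv(cmd[0], cmd)
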
def reproduce(self, force=False, dry=False, interactive=False,
              no_commit=False):
    if not self.changed() and not force:
        return None

    msg = ("Going to reproduce '{stage}'. "
           "Are you sure you want to continue?".format(stage=self.relpath))

    if interactive and not prompt.confirm(msg):
        raise DvcException("reproduction aborted by the user")

    logger.info("Reproducing '{stage}'".format(stage=self.relpath))

    self.run(dry=dry, no_commit=no_commit, force=force)

    logger.debug("'{stage}' was reproduced".format(stage=self.relpath))

    return self

def checkout(self, output, force=False, progress_callback=None):
    scheme = output.path_info["scheme"]
    if scheme not in ["", "local"] and scheme != self.scheme:
        raise NotImplementedError

    checksum = output.info.get(self.PARAM_CHECKSUM)
    if not checksum:
        msg = "No checksum info for '{}'."
        logger.debug(msg.format(str(output.path_info)))
        return

    if not self.changed(output.path_info, output.info):
        msg = "Data '{}' didn't change."
        logger.debug(msg.format(str(output.path_info)))
        return

    if self.changed_cache(checksum):
        msg = "Cache '{}' not found. File '{}' won't be created."
        logger.warning(msg.format(checksum, str(output.path_info)))
        self.safe_remove(output.path_info, force=force)
        return

    msg = "Checking out '{}' with cache '{}'."
    logger.debug(msg.format(str(output.path_info), checksum))

    self.do_checkout(
        output, force=force, progress_callback=progress_callback)

def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    ssh = self.ssh(host=to_infos[0]['host'],
                   user=to_infos[0]['user'],
                   port=to_infos[0]['port'])

    sftp = ssh.open_sftp()

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info['scheme'] != 'ssh':
            raise NotImplementedError

        if from_info['scheme'] != 'local':
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}/{}'".format(
            from_info['path'], to_info['host'], to_info['path']))

        if not name:
            name = os.path.basename(from_info['path'])

        dname = posixpath.dirname(to_info['path'])
        self._exec(ssh, 'mkdir -p {}'.format(dname))

        try:
            sftp.put(from_info['path'],
                     to_info['path'],
                     callback=create_cb(name))
        except Exception:
            msg = "failed to upload '{}' to '{}/{}'"
            logger.error(msg.format(from_info['path'],
                                    to_info['host'],
                                    to_info['path']))
            continue

        progress.finish_target(name)

    sftp.close()
    ssh.close()

def _unprotect_file(self, path):
    import stat
    import uuid
    from dvc.system import System
    from dvc.utils import copyfile, move, remove

    if System.is_symlink(path) or System.is_hardlink(path):
        logger.debug("Unprotecting '{}'".format(path))

        tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))
        move(path, tmp)

        copyfile(tmp, path)

        remove(tmp)
    else:
        logger.debug(
            "Skipping copying for '{}', since it is not "
            "a symlink or a hardlink.".format(path)
        )

    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

def download(
    self,
    from_infos,
    to_infos,
    no_progress_bar=False,
    names=None,
    resume=False,
):
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info["scheme"] != "local":
            raise NotImplementedError

        if to_info["scheme"] != "local":
            raise NotImplementedError

        logger.debug("Downloading '{}' to '{}'".format(
            from_info["path"], to_info["path"]))

        if not name:
            name = os.path.basename(to_info["path"])

        makedirs(os.path.dirname(to_info["path"]), exist_ok=True)
        tmp_file = tmp_fname(to_info["path"])

        try:
            copyfile(
                from_info["path"],
                tmp_file,
                no_progress_bar=no_progress_bar,
                name=name,
            )

            os.rename(tmp_file, to_info["path"])
        except Exception:
            logger.error("failed to download '{}' to '{}'".format(
                from_info["path"], to_info["path"]))

            continue

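# The tmp-file-then-rename dance above is what keeps the destination
# consistent: `os.rename` is atomic within one filesystem, so readers see
# either the previous file or the complete new one, never a partial copy.
# A standalone sketch of the pattern, with a hypothetical tmp-name scheme
# standing in for `tmp_fname`:
import os
import shutil
import uuid


def atomic_copy(src, dst):
    tmp = "{}.{}".format(dst, uuid.uuid4())  # unique temporary name
    shutil.copyfile(src, tmp)
    os.rename(tmp, dst)  # atomic replace on the same filesystem
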
def test_exc_info_on_other_record_types(self, caplog, dt):
    with caplog.at_level(logging.DEBUG, logger="dvc"):
        try:
            raise Exception("description")
        except Exception:
            stack_trace = traceback.format_exc()
            logger.debug("", exc_info=True)

        expected = (
            "{green}{datetime}{nc} "
            "{blue}DEBUG{nc}: description\n"
            "{red}{line}{nc}\n"
            "{stack_trace}"
            "{red}{line}{nc}".format(
                line="-" * 60,
                stack_trace=stack_trace,
                datetime=dt,
                **colors,
            )
        )

        assert expected == formatter.format(caplog.records[0])

def _compute_md5(self):
    from dvc.output.local import OutputLOCAL

    d = self.dumpd()

    # NOTE: removing md5 manually in order to not affect md5s in deps/outs
    if self.PARAM_MD5 in d.keys():
        del d[self.PARAM_MD5]

    # Ignore the wdir default value. In this case stage file w/o
    # wdir has the same md5 as a file with the default value specified.
    # It's important for backward compatibility with pipelines that
    # didn't have WDIR in their stage files.
    if d.get(self.PARAM_WDIR) == ".":
        del d[self.PARAM_WDIR]

    # NOTE: excluding parameters that don't affect the state of the
    # pipeline. Not excluding `OutputLOCAL.PARAM_CACHE`, because if
    # it has changed, we might not have that output in our cache.
    m = dict_md5(d, exclude=[self.PARAM_LOCKED, OutputLOCAL.PARAM_METRIC])

    logger.debug("Computed stage '{}' md5: '{}'".format(self.relpath, m))

    return m

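# `dict_md5` is defined elsewhere in the repo; a minimal sketch of what
# such a helper could look like, assuming it hashes a canonical JSON
# serialization of the dict with the excluded keys dropped (the real
# helper may also recurse into nested structures):
import hashlib
import json


def dict_md5_sketch(d, exclude=()):
    filtered = {k: v for k, v in d.items() if k not in exclude}
    # Sorting the keys makes the serialization, and thus the md5,
    # deterministic across runs.
    serialized = json.dumps(filtered, sort_keys=True).encode("utf-8")
    return hashlib.md5(serialized).hexdigest()
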
def _collect_dir_cache(self, out, branch=None, remote=None, force=False,
                       jobs=None):
    info = out.dumpd()
    ret = [info]
    r = out.remote
    md5 = info[r.PARAM_CHECKSUM]

    if self.cache.local.changed_cache_file(md5):
        try:
            self.cloud.pull(ret,
                            jobs=jobs,
                            remote=remote,
                            show_checksums=False)
        except DvcException as exc:
            msg = "Failed to pull cache for '{}': {}"
            logger.debug(msg.format(out, exc))

    if self.cache.local.changed_cache_file(md5):
        msg = ("Missing cache for directory '{}'. "
               "Cache for files inside will be lost. "
               "Would you like to continue? Use '-f' to force. ")
        if not force and not prompt.confirm(msg.format(out)):
            raise DvcException(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(out))
        else:
            return ret

    for i in self.cache.local.load_dir_cache(md5):
        i["branch"] = branch
        i[r.PARAM_PATH] = os.path.join(info[r.PARAM_PATH],
                                       i[r.PARAM_RELPATH])
        ret.append(i)

    return ret

def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info["scheme"] != "local":
            raise NotImplementedError

        if from_info["scheme"] != "local":
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(
            from_info["path"], to_info["path"]))

        if not name:
            name = os.path.basename(from_info["path"])

        makedirs(os.path.dirname(to_info["path"]), exist_ok=True)

        try:
            copyfile(from_info["path"], to_info["path"], name=name)
        except Exception:
            logger.error("failed to upload '{}' to '{}'".format(
                from_info["path"], to_info["path"]))

def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info['scheme'] != 'local':
            raise NotImplementedError

        if from_info['scheme'] != 'local':
            raise NotImplementedError

        logger.debug("Uploading '{}' to '{}'".format(
            from_info['path'], to_info['path']))

        if not name:
            name = os.path.basename(from_info['path'])

        self._makedirs(to_info['path'])

        try:
            copyfile(from_info['path'], to_info['path'], name=name)
        except Exception:
            logger.error("failed to upload '{}' to '{}'".format(
                from_info['path'], to_info['path']))

def ssh(self, host=None, user=None, port=None):
    msg = ("Establishing ssh connection with '{}' "
           "through port '{}' as user '{}'")
    logger.debug(msg.format(host, port, user))

    ssh = paramiko.SSHClient()

    ssh.load_system_host_keys()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    if self.ask_password and self.password is None:
        msg = ("Enter a private key passphrase or a password for "
               "host '{}' port '{}' user '{}'").format(host, port, user)
        self.password = prompt.password(msg)

    ssh.connect(host,
                username=user,
                port=port,
                key_filename=self.keyfile,
                timeout=self.timeout,
                password=self.password)

    return ssh

def _unprotect_file(path):
    if System.is_symlink(path) or System.is_hardlink(path):
        logger.debug("Unprotecting '{}'".format(path))
        tmp = os.path.join(os.path.dirname(path), "." + str(uuid.uuid4()))

        # The order of operations is important here - if some application
        # accessed the file while copyfile is running, it would see only
        # part of the file. So the file is first copied under a temporary
        # name, and then the original is replaced by the new copy.
        copyfile(
            path,
            tmp,
            name="Unprotecting '{}'".format(os.path.relpath(path)),
        )
        remove(path)
        os.rename(tmp, path)

    else:
        logger.debug("Skipping copying for '{}', since it is not "
                     "a symlink or a hardlink.".format(path))

    os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    s3 = self.s3

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info["scheme"] != "s3":
            raise NotImplementedError

        if from_info["scheme"] != "local":
            raise NotImplementedError

        logger.debug(
            "Uploading '{}' to '{}/{}'".format(
                from_info["path"], to_info["bucket"], to_info["path"]
            )
        )

        if not name:
            name = os.path.basename(from_info["path"])

        total = os.path.getsize(from_info["path"])
        cb = Callback(name, total)

        try:
            s3.upload_file(
                from_info["path"],
                to_info["bucket"],
                to_info["path"],
                Callback=cb,
            )
        except Exception:
            msg = "failed to upload '{}'".format(from_info["path"])
            logger.error(msg)

            continue

        progress.finish_target(name)

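# Note on the `Callback` used above: boto3's `upload_file` invokes the
# callable with the number of bytes transferred since the previous call,
# so a progress callback has to accumulate. A self-contained sketch of
# that accumulation (the class name and print-based reporting are
# illustrative, not DVC's actual `Callback`):
class ProgressCallbackSketch(object):
    def __init__(self, name, total):
        self.name = name
        self.total = total
        self.current = 0

    def __call__(self, bytes_amount):
        # boto3 reports incremental byte counts; keep a running total.
        self.current += bytes_amount
        print("{}: {}/{} bytes".format(self.name, self.current, self.total))
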
def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    gs = self.gs

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info["scheme"] != "gs":
            raise NotImplementedError

        if from_info["scheme"] != "local":
            raise NotImplementedError

        logger.debug(
            "Uploading '{}' to '{}/{}'".format(
                from_info["path"], to_info["bucket"], to_info["path"]
            )
        )

        if not name:
            name = os.path.basename(from_info["path"])

        progress.update_target(name, 0, None)

        try:
            bucket = gs.bucket(to_info["bucket"])
            blob = bucket.blob(to_info["path"])
            blob.upload_from_filename(from_info["path"])
        except Exception:
            msg = "failed to upload '{}' to '{}/{}'"
            logger.error(
                msg.format(
                    from_info["path"], to_info["bucket"], to_info["path"]
                )
            )

            continue

        progress.finish_target(name)

def blob_service(self):
    if self.__blob_service is None:
        logger.debug('URL {}'.format(self.url))
        logger.debug('Connection string {}'.format(self.connection_string))
        self.__blob_service = BlockBlobService(
            connection_string=self.connection_string)
        logger.debug('Container name {}'.format(self.bucket))
        self.__blob_service.create_container(self.bucket)
    return self.__blob_service

def blob_service(self):
    if self.__blob_service is None:
        logger.debug("URL {}".format(self.url))
        logger.debug("Connection string {}".format(self.connection_string))
        self.__blob_service = BlockBlobService(
            connection_string=self.connection_string
        )
        logger.debug("Container name {}".format(self.bucket))
        try:  # verify that container exists
            self.__blob_service.list_blobs(
                self.bucket, delimiter="/", num_results=1
            )
        except AzureMissingResourceHttpError:
            self.__blob_service.create_container(self.bucket)
    return self.__blob_service

def link(self, cache, path):
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    if not os.path.exists(dname):
        os.makedirs(dname)

    # NOTE: just create an empty file for an empty cache
    if os.path.getsize(cache) == 0:
        open(path, "w+").close()

        msg = "Created empty file: {} -> {}".format(cache, path)
        logger.debug(msg)
        return

    i = len(self.cache_types)
    while i > 0:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)

            if self.protected:
                os.chmod(path, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)

            msg = "Created {}'{}': {} -> {}".format(
                "protected " if self.protected else "",
                self.cache_types[0],
                cache,
                path,
            )

            logger.debug(msg)
            return

        except DvcException as exc:
            msg = "Cache type '{}' is not supported: {}"
            logger.debug(msg.format(self.cache_types[0], str(exc)))
            del self.cache_types[0]
            i -= 1

    raise DvcException("no possible cache types left to try out.")

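# A standalone sketch of the same fallback idea used by `link` above: try
# each strategy in order of preference and drop to the next one when the
# filesystem refuses (e.g. hardlinks across devices). The strategy list
# here is illustrative; DVC's actual mapping lives in `CACHE_TYPE_MAP`.
import os
import shutil


def link_with_fallback(cache, path):
    strategies = [os.link, os.symlink, shutil.copyfile]
    for strategy in strategies:
        try:
            strategy(cache, path)
            return strategy.__name__
        except OSError:
            continue
    raise RuntimeError("no possible link strategies left to try out")
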
def test_debug(self):
    with logger.verbose():
        logger.debug("message")

    self.assertEqual(self.stdout, "Debug: message\n")

def percent_cb(name, complete, total):
    """ Callback for updating target progress """
    logger.debug("{}: {} transferred out of {}".format(
        name, sizeof_fmt(complete), sizeof_fmt(total)))
    progress.update_target(name, complete, total)

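# `sizeof_fmt` above is assumed to be a human-readable size formatter
# defined elsewhere in the repo; a conventional sketch of such a helper:
def sizeof_fmt_sketch(num, suffix="B"):
    for unit in ["", "K", "M", "G", "T", "P"]:
        if abs(num) < 1024.0:
            return "{:3.1f}{}{}".format(num, unit, suffix)
        num /= 1024.0
    return "{:.1f}E{}".format(num, suffix)
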
def _is_outdated_file(self):
    mtime = os.path.getmtime(self.updater_file)
    outdated = (time.time() - mtime >= self.TIMEOUT)
    if outdated:
        logger.debug("'{}' is outdated.".format(self.updater_file))
    return outdated

def _do_update(self, path):
    """
    Make sure the stored info for the given path is up to date.
    """
    if not os.path.exists(path):
        return (None, None)

    actual_mtime, actual_size = self._mtime_and_size(path)
    actual_inode = self._inode(path)

    cmd = "SELECT * from {} WHERE inode={}".format(
        self.STATE_TABLE, self._to_sqlite(actual_inode)
    )

    self._execute(cmd)
    ret = self._fetchall()

    if not ret:
        md5, info = self._collect(path)
        cmd = (
            "INSERT INTO {}(inode, mtime, size, md5, timestamp) "
            'VALUES ({}, "{}", "{}", "{}", "{}")'
        )
        self._execute(
            cmd.format(
                self.STATE_TABLE,
                self._to_sqlite(actual_inode),
                actual_mtime,
                actual_size,
                md5,
                int(nanotime.timestamp(time.time())),
            )
        )
        self.inserts += 1
    else:
        assert len(ret) == 1
        assert len(ret[0]) == 5
        inode, mtime, size, md5, _ = ret[0]
        inode = self._from_sqlite(inode)
        assert inode == actual_inode
        logger.debug(
            "Inode '{}', mtime '{}', actual mtime '{}', size '{}', "
            "actual size '{}'.".format(
                inode, mtime, actual_mtime, size, actual_size
            )
        )
        if actual_mtime != mtime or actual_size != size:
            md5, info = self._collect(path)
            cmd = (
                "UPDATE {} SET "
                'mtime = "{}", size = "{}", '
                'md5 = "{}", timestamp = "{}" '
                "WHERE inode = {}"
            )
            self._execute(
                cmd.format(
                    self.STATE_TABLE,
                    actual_mtime,
                    actual_size,
                    md5,
                    int(nanotime.timestamp(time.time())),
                    self._to_sqlite(actual_inode),
                )
            )
        else:
            info = None
            cmd = 'UPDATE {} SET timestamp = "{}" WHERE inode = {}'
            self._execute(
                cmd.format(
                    self.STATE_TABLE,
                    int(nanotime.timestamp(time.time())),
                    self._to_sqlite(actual_inode),
                )
            )

    return (md5, info)

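# A self-contained sketch of the idea behind `_do_update`: cache a file's
# md5 keyed by inode and only re-hash when mtime or size changed. The
# table name and schema here are illustrative, not DVC's actual state db;
# note that unlike the string-formatted SQL above, parameterized queries
# sidestep quoting issues.
import hashlib
import os
import sqlite3


def cached_md5(db_path, path):
    conn = sqlite3.connect(db_path)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS state "
        "(inode INTEGER PRIMARY KEY, mtime TEXT, size TEXT, md5 TEXT)"
    )

    st = os.stat(path)
    inode, mtime, size = st.st_ino, str(st.st_mtime), str(st.st_size)

    row = conn.execute(
        "SELECT mtime, size, md5 FROM state WHERE inode = ?", (inode,)
    ).fetchone()

    if row and row[0] == mtime and row[1] == size:
        md5 = row[2]  # metadata unchanged, reuse the cached checksum
    else:
        with open(path, "rb") as fobj:
            md5 = hashlib.md5(fobj.read()).hexdigest()
        conn.execute(
            "INSERT OR REPLACE INTO state VALUES (?, ?, ?, ?)",
            (inode, mtime, size, md5),
        )
        conn.commit()

    conn.close()
    return md5
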
def _inode(path):
    inode = System.inode(path)
    logger.debug("Path {} inode {}".format(path, inode))
    return inode

def _fetchall(self):
    ret = self.cursor.fetchall()
    logger.debug("fetched: {}".format(ret))
    return ret

def _execute(self, cmd):
    logger.debug(cmd)
    return self.cursor.execute(cmd)

def changed(self, path_info, checksum_info):
    """Checks if data has changed.

    A file is considered changed if:
        - It doesn't exist on the working directory (was unlinked)
        - Checksum is not computed (saving a new file)
        - The checksum stored in the State is different from the given one
        - There's no file in the cache

    Args:
        path_info: dict with path information.
        checksum_info: expected checksum info for this data.

    Returns:
        bool: True if data has changed, False otherwise.
    """
    logger.debug("checking if '{}'('{}') has changed.".format(
        path_info, checksum_info))

    if not self.exists(path_info):
        logger.debug("'{}' doesn't exist.".format(path_info))
        return True

    checksum = checksum_info.get(self.PARAM_CHECKSUM)
    if checksum is None:
        logger.debug("checksum for '{}' is missing.".format(path_info))
        return True

    if self.changed_cache(checksum):
        logger.debug("cache for '{}'('{}') has changed.".format(
            path_info, checksum))
        return True

    actual = self.save_info(path_info)[self.PARAM_CHECKSUM]
    if checksum != actual:
        logger.debug(
            "checksum '{}'(actual '{}') for '{}' has changed.".format(
                checksum, actual, path_info))
        return True

    logger.debug("'{}' hasn't changed.".format(path_info))
    return False

def inode(path):
    ino = System.inode(path)
    logger.debug('Path {} inode {}'.format(path, ino))
    return ino

def install_or_update(self, parent_repo, pkg_params):
    from git.cmd import Git

    if not self.is_in_root():
        raise DvcException(
            "This command can be run only from a repository root"
        )

    if not os.path.exists(self.MODULES_DIR):
        logger.debug("Creating modules dir {}".format(self.MODULES_DIR))
        os.makedirs(self.MODULES_DIR)
        parent_repo.scm.ignore(os.path.abspath(self.MODULES_DIR))

    module_name = (
        Git.polish_url(pkg_params.address).strip("/").split("/")[-1]
    )
    if not module_name:
        raise DvcException(
            "Package address error: unable to extract package name"
        )

    with TempGitRepo(
        pkg_params.address, module_name, Package.MODULES_DIR
    ) as tmp_repo:
        outputs_to_copy = tmp_repo.outs

        if pkg_params.select:
            outputs_to_copy = list(
                filter(
                    lambda out: out.dvc_path in pkg_params.select,
                    outputs_to_copy,
                )
            )

        fetched_stage_files = set(
            map(lambda o: o.stage.path, outputs_to_copy)
        )
        tmp_repo.fetch(fetched_stage_files)

        module_dir = self.create_module_dir(module_name)
        tmp_repo.persist_to(module_dir, parent_repo)

        dvc_file = self.get_dvc_file_name(
            pkg_params.file, pkg_params.target_dir, module_name
        )

        try:
            self.persist_stage_and_scm_state(
                parent_repo,
                outputs_to_copy,
                pkg_params.target_dir,
                dvc_file,
            )
        except Exception as ex:
            raise DvcException(
                "Package '{}' was installed "
                "but stage file '{}' "
                "was not created properly: {}".format(
                    pkg_params.address, dvc_file, ex
                )
            )

        parent_repo.checkout(dvc_file)
