def collect_dir_cache(self, dname):
    dir_info = []
    db = self.state.load()
    bar = False

    for root, dirs, files in os.walk(dname):
        if len(files) > LARGE_DIR_SIZE:
            msg = "Computing md5 for a large directory {}. " \
                  "This is only done once."
            Logger.info(msg.format(os.path.relpath(dname)))
            bar = True
            title = os.path.relpath(dname)
            processed = 0
            total = len(files)
            progress.update_target(title, 0, total)

        for fname in files:
            path = os.path.join(root, fname)
            relpath = self.unixpath(os.path.relpath(path, dname))

            if bar:
                progress.update_target(title, processed, total)
                processed += 1

            md5 = self.state.update(path, use_db=db)
            dir_info.append({self.PARAM_RELPATH: relpath,
                             self.PARAM_MD5: md5})

    db.commit()
    db.close()

    if bar:
        progress.finish_target(title)

    # NOTE: sorting the list by path to ensure reproducibility
    dir_info = sorted(dir_info, key=itemgetter(self.PARAM_RELPATH))

    md5 = dict_md5(dir_info) + self.MD5_DIR_SUFFIX
    if self.changed_cache(md5):
        self.dump_dir_cache(md5, dir_info)

    return (md5, dir_info)
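# NOTE: a minimal sketch of what the dict_md5() helper used above could look
# like. It is an assumption (not confirmed by this code) that the real helper
# hashes a canonical JSON serialization of the sorted dir_info list; the
# sketch only illustrates why sorting makes the checksum reproducible.
import hashlib
import json

def dict_md5_sketch(d):
    # sort_keys=True makes the serialization, and therefore the md5,
    # independent of dict key order.
    return hashlib.md5(json.dumps(d, sort_keys=True).encode('utf-8')).hexdigest()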
def run(self):
    if not self.no_git_actions and not self.git.is_ready_to_go():
        return 1

    if os.path.realpath(os.path.curdir) != self.settings.git.git_dir_abs:
        Logger.error('DVC error: initialization can only be done from the '
                     'git root directory {}'.format(self.settings.git.git_dir_abs))
        return 1

    config_dir_path = self.get_not_existing_path(Config.CONFIG_DIR)
    data_dir_path = self.get_not_existing_path(self.parsed_args.data_dir)
    cache_dir_path = self.get_not_existing_path(Config.CONFIG_DIR, Config.CACHE_DIR)
    state_dir_path = self.get_not_existing_path(Config.CONFIG_DIR, Config.STATE_DIR)

    self.settings.config.set(self.parsed_args.data_dir)

    conf_file_name = self.get_not_existing_conf_file_name()

    config_dir_path.mkdir()
    data_dir_path.mkdir()
    cache_dir_path.mkdir()
    state_dir_path.mkdir()
    Logger.info('Directories {}/, {}/, {}/, {}/ were created'.format(
        config_dir_path.name,
        data_dir_path.name,
        cache_dir_path.name,
        state_dir_path.name))

    self.create_empty_file()

    with open(conf_file_name, 'wt') as conf_file:
        conf_file.write(self.CONFIG_TEMPLATE.format(data_dir_path.name))

    message = 'DVC init. data dir {}, cache dir {}, state dir {}'.format(
        data_dir_path.name,
        cache_dir_path.name,
        state_dir_path.name)
    if self.commit_if_needed(message) == 1:
        return 1

    self.modify_gitignore(config_dir_path.name, cache_dir_path.name)
    return self.commit_if_needed('DVC init. Commit .gitignore file')
def link(self, cache, path):
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    if not os.path.exists(dname):
        os.makedirs(dname)

    # Try each cache type in order; drop a type that fails so it is not
    # retried for subsequent links.
    while self.cache_types:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)
            return
        except Exception:
            msg = 'Cache type \'{}\' is not supported'.format(self.cache_types[0])
            Logger.debug(msg)
            del self.cache_types[0]

    raise DvcException('No possible cache types left to try out.')
def _resume_multipart(self, key, fname):
    """ Try resuming multipart upload. """
    try:
        with open(self._upload_tracker(fname), 'r') as fd:
            mp_id = fd.read()
    except Exception as exc:
        Logger.debug("Failed to read upload tracker file for {}: {}".format(fname, exc))
        return None

    for part in key.bucket.get_all_multipart_uploads():
        if part.id != mp_id:
            continue

        Logger.debug("Found existing multipart {}".format(mp_id))
        return part

    return None
def check_opt(self):
    _section, _opt = self.parsed_args.name.strip().split('.', 1)
    add = (self.parsed_args.value is not None and not self.parsed_args.unset)

    section = self._get_key(self.configobj, _section, add)
    if not section:
        Logger.error('Invalid option name {}'.format(_section))
        return 1

    opt = self._get_key(self.configobj[section], _opt, add)
    if not opt:
        Logger.error('Invalid option name {}'.format(_opt))
        return 1

    self.section = section
    self.opt = opt

    return 0
def run(self):
    with DvcLock(self.is_locker, self.git):
        name = self.parsed_args.name
        cloud = self.parsed_args.cloud or self.settings.config.cloud
        if not name:
            Logger.error('Instance name is not defined')
            return 1
        try:
            InstanceManager().create(name, cloud, self.parsed_args, self.settings.config)
        except DvcException as ex:
            Logger.error('Instance creation error: {}'.format(ex))
            return 1
        return 0
def load_dir_cache(path):
    if os.path.isabs(path):
        relpath = os.path.relpath(path)
    else:
        relpath = path

    try:
        with open(path, 'r') as fd:
            d = json.load(fd)
    except Exception as exc:
        msg = u'Failed to load dir cache \'{}\''
        Logger.error(msg.format(relpath), exc)
        return []

    if not isinstance(d, list):
        msg = u'Dir cache file format error \'{}\': skipping the file'
        Logger.error(msg.format(relpath))
        return []

    return d
def upload(self, paths, path_infos, names=None):
    names = self._verify_path_args(path_infos, paths, names)

    for path, path_info, name in zip(paths, path_infos, names):
        if path_info['scheme'] != 'local':
            raise NotImplementedError

        Logger.debug("Uploading '{}' to '{}'".format(path, path_info['path']))

        if not name:
            name = os.path.basename(path)

        self._makedirs(path_info['path'])

        try:
            copyfile(path, path_info['path'], name=name)
        except Exception as exc:
            Logger.error("Failed to upload '{}' to '{}'".format(path, path_info['path']), exc)
def link(self, src, link):
    dname = os.path.dirname(link)
    if not os.path.exists(dname):
        os.makedirs(dname)

    if self.cache_type is not None:
        types = [self.cache_type]
    else:
        types = self.CACHE_TYPES

    for typ in types:
        try:
            self.CACHE_TYPE_MAP[typ](src, link)
            self.link_state.update(link)
            return
        except Exception as exc:
            msg = 'Cache type \'{}\' is not supported'.format(typ)
            Logger.debug(msg)
            if typ == types[-1]:
                raise DvcException(msg, cause=exc)
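# NOTE: a hypothetical illustration of the shape of CACHE_TYPE_MAP that both
# link() methods above iterate over: each cache type name maps to a callable
# taking (src, link). The concrete names and callables are assumptions here,
# not taken from this code.
import os
import shutil

CACHE_TYPE_MAP_SKETCH = {
    'hardlink': os.link,      # fails across filesystems, hence the fallback loop
    'symlink': os.symlink,
    'copy': shutil.copyfile,  # always works, most expensive
}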
def ignore(self, path):
    entry, gitignore = self._get_gitignore(path)

    ignore_list = []
    if os.path.exists(gitignore):
        with open(gitignore, 'r') as fd:
            ignore_list = fd.readlines()
        filtered = list(filter(lambda x: x.strip() == entry.strip(), ignore_list))
        if len(filtered) != 0:
            return

    msg = "Adding '{}' to '{}'.".format(os.path.relpath(path),
                                        os.path.relpath(gitignore))
    Logger.info(msg)

    content = entry
    if len(ignore_list) > 0:
        content = '\n' + content

    with open(gitignore, 'a') as fd:
        fd.write(content)
def __init__(self, root_dir):
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)
    self.link_state = LinkState(self)

    self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
        Config.SECTION_CORE_LOGLEVEL, None))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
def download(self, path_infos, fnames, no_progress_bar=False, names=None):
    names = self._verify_path_args(path_infos, fnames, names)

    session = boto3.session.Session()
    s3 = session.client('s3')

    for fname, path_info, name in zip(fnames, path_infos, names):
        if path_info['scheme'] != 's3':
            raise NotImplementedError

        Logger.debug("Downloading '{}/{}' to '{}'".format(path_info['bucket'],
                                                          path_info['key'],
                                                          fname))

        tmp_file = self.tmp_file(fname)
        if not name:
            name = os.path.basename(fname)

        if no_progress_bar:
            cb = None
        else:
            total = s3.head_object(Bucket=path_info['bucket'],
                                   Key=path_info['key'])['ContentLength']
            cb = Callback(name, total)

        self._makedirs(fname)

        try:
            s3.download_file(path_info['bucket'], path_info['key'], tmp_file, Callback=cb)
        except Exception as exc:
            Logger.error("Failed to download '{}/{}'".format(path_info['bucket'],
                                                             path_info['key']), exc)
            return

        os.rename(tmp_file, fname)

        if not no_progress_bar:
            progress.finish_target(name)
def download(self, from_infos, to_infos, no_progress_bar=False, names=None):
    names = self._verify_path_args(from_infos, to_infos, names)

    for to_info, from_info, name in zip(to_infos, from_infos, names):
        if from_info['scheme'] != self.scheme:
            raise NotImplementedError

        if to_info['scheme'] != 'local':
            raise NotImplementedError

        bucket = from_info['bucket']
        key = from_info['key']

        Logger.debug("Downloading '{}/{}' to '{}'".format(bucket, key, to_info['path']))

        tmp_file = self.tmp_file(to_info['path'])
        if not name:
            name = os.path.basename(to_info['path'])

        cb = None if no_progress_bar else Callback(name)

        self._makedirs(to_info['path'])

        try:
            self.blob_service.get_blob_to_path(bucket, key, tmp_file,
                                               progress_callback=cb)
        except Exception as exc:
            msg = "Failed to download '{}/{}'".format(bucket, key)
            Logger.warn(msg, exc)
        else:
            os.rename(tmp_file, to_info['path'])

            if not no_progress_bar:
                progress.finish_target(name)
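# NOTE: both download() methods above rely on the same download-to-temp-then-
# rename pattern; a minimal standalone sketch (the helper name is hypothetical):
import os

def atomic_write_sketch(data, dest):
    tmp = dest + '.part'
    with open(tmp, 'wb') as fd:
        fd.write(data)
    # os.rename() is atomic on POSIX when both paths are on the same
    # filesystem, so readers never observe a partially downloaded file.
    os.rename(tmp, dest)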
def reproduce(self, changed_files):
    Logger.debug('Reproduce data item {}. recursive={}, force={}'.format(
        self._data_item.data.relative, self._recursive, self._force))

    if self.state.locked:
        Logger.debug('Data item {} is not reproducible'.format(
            self._data_item.data.relative))
        return False

    if self.is_repro_required(changed_files, self._data_item):
        if self._data_item.data.dvc not in changed_files:
            Logger.debug('Data item {} is going to be reproduced'.format(
                self._data_item.data.relative))
            self.reproduce_data_item(changed_files)
            changed_files.add(self._data_item.data.dvc)
            return True
        else:
            msg = 'Data item {} is not going to be reproduced because it has already been reproduced'
            Logger.debug(msg.format(self._data_item.data.relative))
    else:
        Logger.debug('Data item {} is up to date'.format(
            self._data_item.data.relative))

    return False
def _import(self, target):
    url, item = target

    o = urlparse(url)

    typ = self.SCHEME_MAP.get(o.scheme, None)
    if typ is None:
        Logger.error('Unsupported scheme \'{}\''.format(o.scheme))
        return None

    # To handle the ConfigI case
    if not hasattr(self._settings.config, '_config'):
        self._config = None
        cloud_settings = None
    else:
        self._config = self._settings.config._config
        cloud_settings = self.get_cloud_settings(self._config,
                                                 typ,
                                                 self._settings.path_factory)

    cloud = self.CLOUD_MAP[typ](cloud_settings)
    return cloud.import_data(url, item)
def _sync_to_cloud_aws(self, data_item):
    """ sync_to_cloud, aws version """

    aws_key = self.cache_file_key(data_item.resolved_cache.dvc)
    bucket = self._get_bucket_aws()

    key = bucket.get_key(aws_key)
    if key:
        Logger.debug('File already uploaded to the cloud. Checksum validation...')

        md5_cloud = key.etag[1:-1]
        md5_local = file_md5(data_item.resolved_cache.dvc)[0]
        if md5_cloud == md5_local:
            Logger.debug('File checksum matches. No uploading is needed.')
            return

        Logger.debug('Checksum mismatch. Re-uploading is required.')

    Logger.info('Uploading cache file "{}" to S3 "{}"'.format(
        data_item.resolved_cache.relative, aws_key))
    key = bucket.new_key(aws_key)
    key.set_contents_from_filename(data_item.resolved_cache.relative, cb=percent_cb)
    Logger.info('Uploading completed')
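# NOTE: a caveat on the etag comparison in _sync_to_cloud_aws() above: an S3
# etag equals the object's md5 only for single-part uploads. Objects uploaded
# via multipart have etags of the form '<digest>-<part count>', which would
# never match a local md5 and would force a re-upload every time.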
def _read_metric_from_state_file(self, hash, target, settings):
    try:
        data_item = settings.path_factory.data_item(target)
    except DataItemError as ex:
        Logger.warn('Target file {} is not a data item: {}'.format(target, ex))
        return None

    try:
        cmd_corresponded_state_file = ['git', 'show',
                                       '{}:{}'.format(hash, data_item.state.relative)]
        state_file_content = Executor.exec_cmd_only_success(cmd_corresponded_state_file)
    except ExecutorError as ex:
        msg = '[dvc-git] Cannot obtain content of target symbolic file {} with hash {}: {}'
        Logger.warn(msg.format(target, hash, ex))
        return None

    state_file = StateFile.loads(state_file_content, settings)
    return state_file.single_target_metric
def run_command(self, cmd_args, data_items_from_args, not_data_items_from_args,
                stdout=None, stderr=None, shell=False):
    Logger.debug('Run command with args: {}. Data items from args: {}. '
                 'stdout={}, stderr={}, shell={}'.format(
                     ' '.join(cmd_args),
                     ', '.join([x.data.dvc for x in data_items_from_args]),
                     stdout, stderr, shell))

    repo_change = RepositoryChange(cmd_args, self.settings, stdout, stderr,
                                   shell=shell)

    if not self.no_git_actions and not self._validate_file_states(repo_change):
        self.remove_new_files(repo_change)
        raise RunError('Errors occurred.')

    output_set = set(self.declaration_output_data_items +
                     repo_change.changed_data_items)
    output_files_dvc = [x.data.dvc for x in output_set]

    input_set = set(data_items_from_args +
                    self.declaration_input_data_items) - output_set
    input_files_dvc = [x.data.dvc for x in input_set]

    code_dependencies_dvc = self.git.abs_paths_to_dvc(self.code_dependencies +
                                                      not_data_items_from_args)

    result = []
    for data_item in repo_change.changed_data_items:
        Logger.debug('Move output file "{}" to cache dir "{}" and create a symlink'.format(
            data_item.data.relative, data_item.cache.relative))
        data_item.move_data_to_cache()

        Logger.debug('Create state file "{}"'.format(data_item.state.relative))

        state_file = StateFile(StateFile.COMMAND_RUN,
                               data_item.state.relative,
                               self.settings,
                               input_files_dvc,
                               output_files_dvc,
                               code_dependencies_dvc,
                               argv=cmd_args,
                               lock=self.lock,
                               stdout=self._stdout_to_dvc(stdout),
                               stderr=self._stdout_to_dvc(stderr),
                               shell=shell)
        state_file.save()
        result.append(state_file)

    return result
def checkout(self, path_info, checksum_info):
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5, None)
    cache = self.get(md5)

    if not cache:
        Logger.warn('No cache info for \'{}\'. Skipping checkout.'.format(
            os.path.relpath(path)))
        return

    if os.path.exists(path):
        msg = u'Data \'{}\' exists. Removing before checkout'
        Logger.debug(msg.format(os.path.relpath(path)))
        remove(path)

    msg = u'Checking out \'{}\' with cache \'{}\''
    Logger.debug(msg.format(os.path.relpath(path), md5))

    if not self.is_dir_cache(cache):
        self.link(md5, path, dump=True)
        return

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    for entry in self.load_dir_cache(cache):
        md5 = entry[self.PARAM_MD5]
        relpath = entry[self.PARAM_RELPATH]
        p = os.path.join(path, relpath)
        self.link(md5, p, dump=False)

    self.link_state.dump()
def create_state_files(self, targets, lock):
    """ Create state files for all targets. """
    for t in targets:
        orig_target, processed_data_item = t
        input, data_item = orig_target
        output = data_item.data.relative

        if processed_data_item is None:
            Logger.debug('Skipping creating state file for failed import {}'.format(
                data_item.state.relative))
            continue

        Logger.debug('Creating symlink {} --> {}'.format(data_item.symlink_file,
                                                         data_item.data.relative))
        System.symlink(data_item.symlink_file, data_item.data.relative)

        state_file = StateFile(StateFile.COMMAND_IMPORT_FILE,
                               data_item,
                               self.settings,
                               argv=[input, output],
                               input_files=[],
                               output_files=[output],
                               lock=lock)
        state_file.save()
        Logger.debug('State file "{}" was created'.format(data_item.state.relative))
def terminate_instances(self, instance):
    instance = self.get_instance_id(instance)
    if not instance:
        Logger.error('Instance Id is not specified')
        return

    (active, not_active, rest_inst) = self.all_instances()

    if instance == 'all':
        target_in_active = active
        target_in_not_active = not_active
    else:
        target_in_active = list(filter(lambda inst: inst.id == instance, active))
        target_in_not_active = list(filter(lambda inst: inst.id == instance, not_active))

    if target_in_not_active:
        self._conn.terminate_instances(
            instance_ids=[inst.id for inst in target_in_not_active])

    for inst in target_in_active:
        inst.remove_tag(self.INSTANCE_STATE_TAG, 'True')
        inst.add_tag(self.INSTANCE_STATE_TAG, 'False')
        self._conn.terminate_instances(instance_ids=[inst.id])

    if instance != 'all' and len(target_in_active) > 0 and len(not_active) > 0:
        new_active_inst = not_active[0]
        new_active_inst.remove_tag(self.INSTANCE_STATE_TAG, 'False')
        new_active_inst.add_tag(self.INSTANCE_STATE_TAG, 'True')

        randomly = ''
        if len(not_active) > 1:
            randomly = 'randomly '
        Logger.error('{} instance {} was {}selected as active because an '
                     'active instance was terminated'.format(
                         new_active_inst.instance_type,
                         new_active_inst.id,
                         randomly))
def _checkout(self, path, md5):
    cache = self.get(md5)

    if not cache or not os.path.exists(cache) or self._changed(md5):
        if cache:
            Logger.warn(u'\'{}({})\': cache file not found'.format(
                os.path.relpath(cache), os.path.relpath(path)))
        remove(path)
        return

    if os.path.exists(path):
        msg = u'Data \'{}\' exists. Removing before checkout'
        Logger.debug(msg.format(os.path.relpath(path)))
        remove(path)

    msg = u'Checking out \'{}\' with cache \'{}\''
    Logger.debug(msg.format(os.path.relpath(path), os.path.relpath(cache)))

    if not self.is_dir_cache(cache):
        self.link(cache, path)
        return

    dir_cache = self.dir_cache(cache)
    for relpath, c in dir_cache.items():
        p = os.path.join(path, relpath)
        self.link(c, p)
def read_metrics(self, fname, branch):
    try:
        lines = self.git.get_file_content(fname, branch).split('\n')
    except ExecutorError:
        msg = 'Unable to read metrics file from branch {}: {}'
        data_item = self.settings.path_factory.data_item(fname)
        try:
            self.git.get_file_content(data_item.state.relative, branch)
            Logger.error(msg.format(branch, 'this is a data file, not a metrics file'))
        except ExecutorError:
            Logger.error(msg.format(branch, 'file does not exist in this branch'))
        return None

    metric = utils.parse_target_metric(lines)
    if not metric:
        msg = 'Unable to parse metrics from the first line of file {} in branch {}'
        Logger.error(msg.format(fname, branch))
        return None

    return metric
def check(self):
    current = VERSION_BASE

    if os.getenv('CI'):
        return

    if os.path.isfile(self.updater_file):
        ctime = os.path.getctime(self.updater_file)
        if time.time() - ctime < self.TIMEOUT:
            msg = '{} is not old enough to check for updates'
            Logger.debug(msg.format(self.UPDATER_FILE))
            return
        os.unlink(self.updater_file)

    try:
        r = requests.get(self.URL)
        j = r.json()
        latest = j['version']
        open(self.updater_file, 'w+').close()
    except Exception as exc:
        Logger.debug('Failed to obtain latest version: {}'.format(str(exc)))
        return

    # NOTE: compare version components as integer tuples; comparing them as
    # strings is lexicographic and breaks once a component reaches two digits
    # (e.g. '10' < '9').
    l_ver = tuple(int(x) for x in latest.split('.'))
    c_ver = tuple(int(x) for x in current.split('.'))

    if l_ver <= c_ver:
        return

    msg = 'You are using dvc version {}, however version {} is available. Consider upgrading.'
    Logger.warn(msg.format(current, latest))
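# NOTE: a small demonstration of the pitfall that check() avoids by comparing
# integer tuples rather than raw version strings:
assert '0.10.0' < '0.9.0'   # lexicographic string comparison: wrong order
assert tuple(int(x) for x in '0.10.0'.split('.')) > \
       tuple(int(x) for x in '0.9.0'.split('.'))   # integer tuples: correct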
def push(self, data_item):
    """ push, gcp version """

    bucket = self._get_bucket_gc(self.storage_bucket)
    blob_name = self.cache_file_key(data_item.resolved_cache.dvc)
    name = os.path.basename(data_item.resolved_cache.dvc)

    blob = bucket.get_blob(blob_name)
    if blob is not None and blob.exists():
        if self._cmp_checksum(blob, data_item.resolved_cache.dvc):
            Logger.debug('checksum %s matches. Skipping upload' % data_item.cache.relative)
            return data_item
        Logger.debug('checksum %s mismatch. re-uploading' % data_item.cache.relative)

    # same as in _import
    progress.update_target(name, 0, None)

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(data_item.resolved_cache.relative)

    progress.finish_target(name)
    Logger.debug('uploading %s completed' % data_item.resolved_cache.relative)

    return data_item
def run(self):
    if self.is_locker:
        lock = fasteners.InterProcessLock(self.git.lock_file)
        gotten = lock.acquire(timeout=5)
        if not gotten:
            Logger.info('Cannot perform the command: DVC is busy and locked. Please retry later.')
            return 1

    good = self.config.sanity_check()
    if not good[0]:
        Logger.error('config \'%s\' is not set up correctly. Please fix:' % Runtime.CONFIG)
        for e in good[1]:
            Logger.error('    ' + e)
        return 1

    try:
        for target in self.parsed_args.targets:
            data_item = self.settings.path_factory.data_item(target)

            if System.islink(target):
                self.sync_symlink(data_item)
            elif os.path.isdir(target):
                self.sync_dir(target)
            else:
                raise DataSyncError('File "{}" does not exist'.format(target))
    finally:
        if self.is_locker:
            lock.release()
def checkout(self, path_info, checksum_info, dump=True):
    path = path_info['path']
    md5 = checksum_info.get(self.PARAM_MD5, None)
    cache = self.get(md5)

    if not cache or not os.path.exists(cache) or self.changed(md5):
        if cache:
            Logger.warn(u'\'{}({})\': cache file not found'.format(
                os.path.relpath(cache), os.path.relpath(path)))
        remove(path)
        return

    if os.path.exists(path):
        msg = u'Data \'{}\' exists. Removing before checkout'
        Logger.debug(msg.format(os.path.relpath(path)))
        remove(path)

    if not self.is_dir_cache(cache):
        self.link(cache, path, dump=dump)
        return

    msg = u'Checking out directory \'{}\' with cache \'{}\''
    Logger.debug(msg.format(os.path.relpath(path), os.path.relpath(cache)))

    # Create dir separately so that dir is created
    # even if there are no files in it
    if not os.path.exists(path):
        os.makedirs(path)

    dir_cache = self.dir_cache(cache)
    for relpath, c in dir_cache.items():
        p = os.path.join(path, relpath)
        self.link(c, p, dump=dump)
def upload(self, from_infos, to_infos, names=None):
    names = self._verify_path_args(to_infos, from_infos, names)

    ssh = self.ssh(host=to_infos[0]['host'],
                   user=to_infos[0]['user'],
                   port=to_infos[0]['port'])
    sftp = ssh.open_sftp()

    for from_info, to_info, name in zip(from_infos, to_infos, names):
        if to_info['scheme'] != 'ssh':
            raise NotImplementedError

        if from_info['scheme'] != 'local':
            raise NotImplementedError

        Logger.debug("Uploading '{}' to '{}/{}'".format(from_info['path'],
                                                        to_info['host'],
                                                        to_info['path']))

        if not name:
            name = os.path.basename(from_info['path'])

        dname = posixpath.dirname(to_info['path'])
        self._exec(ssh, 'mkdir -p {}'.format(dname))

        try:
            sftp.put(from_info['path'], to_info['path'], callback=create_cb(name))
        except Exception as exc:
            msg = "Failed to upload '{}' to '{}/{}': {}"
            Logger.error(msg.format(from_info['path'],
                                    to_info['host'],
                                    to_info['path'],
                                    exc))
            continue

        progress.finish_target(name)

    sftp.close()
    ssh.close()
def load_dir_cache(self, md5):
    path = self.get(md5)

    assert self.is_dir_cache(path)

    try:
        with open(path, 'r') as fd:
            d = json.load(fd)
    except Exception as exc:
        msg = u'Failed to load dir cache \'{}\''
        Logger.error(msg.format(os.path.relpath(path)), exc)
        return []

    if not isinstance(d, list):
        msg = u'Dir cache file format error \'{}\': skipping the file'
        Logger.error(msg.format(os.path.relpath(path)))
        return []

    for info in d:
        info['relpath'] = self.ospath(info['relpath'])

    return d
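# NOTE: an illustrative (made-up) example of the on-disk format that both
# load_dir_cache() variants above expect: a JSON list of entries keyed by
# relpath and md5.
#
#   [
#       {"relpath": "images/0001.jpg", "md5": "8c7dd922ad47494fc02c388e12c00eac"},
#       {"relpath": "images/0002.jpg", "md5": "2e7d2c03a9507ae265ecf5b5356885a5"}
#   ]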
def _pull_key(self, key, fname, no_progress_bar=False):
    Logger.debug("Pulling key '{}' from bucket '{}' to file '{}'".format(
        key.name, key.bucket.name, fname))

    self._makedirs(fname)

    tmp_file = self.tmp_file(fname)
    name = os.path.relpath(fname, self._cloud_settings.cache.cache_dir)

    if self._cmp_checksum(key, fname):
        Logger.debug('File "{}" matches with "{}".'.format(fname, key.name))
        return fname

    Logger.debug('Downloading cache file from S3 "{}/{}" to "{}"'.format(
        key.bucket.name, key.name, fname))

    if no_progress_bar:
        cb = None
    else:
        cb = create_cb(name)

    res_h = ResumableDownloadHandler(
        tracker_file_name=self._download_tracker(tmp_file),
        num_retries=10)

    try:
        key.get_contents_to_filename(tmp_file, cb=cb, res_download_handler=res_h)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key.name, exc))
        return None

    os.rename(tmp_file, fname)

    if not no_progress_bar:
        progress.finish_target(name)

    Logger.debug('Downloading completed')

    return fname