def is_repro_required(self, changed_files, data_item):
    state_file = StateFile.load(data_item.state.relative, self._settings)
    if state_file.locked:
        Logger.debug(
            u'Repro is not required for locked data item {}'.format(
                data_item.data.relative))
        return False

    is_dependency_check_required = self._recursive

    if not is_dependency_check_required and not self.is_cache_exists():
        is_dependency_check_required = True
        Logger.info(
            u'Reproducing "{}". Forcing dependency check since the cache '
            u'file is missing.'.format(self._data_item.data.relative))

    if is_dependency_check_required:
        if self.were_dependencies_changed(changed_files, data_item.data.dvc):
            self.log_repro_reason(u'input dependencies were changed')
            return True

    if self._force:
        self.log_repro_reason(u'it was forced')
        return True

    if not self.is_cache_exists():
        self.log_repro_reason(u'cache file is missing')
        return True

    if self.were_sources_changed(self._globally_changed_files):
        self.log_repro_reason(u'sources were changed')
        return True

    return False
def status(self, checksum_infos, remote, jobs=None, show_checksums=False):
    Logger.info("Preparing to pull data from {}".format(remote.url))

    title = "Collecting information"
    progress.set_n_total(1)
    progress.update_target(title, 0, 100)

    checksum_infos, missing = self._collect(checksum_infos)
    checksum_infos += missing
    progress.update_target(title, 10, 100)

    md5s, names = self._group(checksum_infos, show_checksums=show_checksums)
    progress.update_target(title, 20, 100)

    path_infos = remote.md5s_to_path_infos(md5s)
    progress.update_target(title, 30, 100)

    remote_exists = remote.exists(path_infos)
    progress.update_target(title, 90, 100)

    local_exists = [not self.changed_cache_file(md5) for md5 in md5s]
    progress.finish_target(title)

    return [(name, STATUS_MAP[l, r])
            for name, l, r in zip(names, local_exists, remote_exists)]
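
# A minimal sketch of the (local, remote) -> status lookup used above. The
# STATUS_* constants and STATUS_MAP layout here are assumptions for
# illustration, not the tool's actual definitions; the dict is keyed by the
# (local_exists, remote_exists) pairs produced in status().
STATUS_OK = 1        # present locally and remotely
STATUS_NEW = 2       # present locally only (needs push)
STATUS_DELETED = 3   # present remotely only (needs pull)
STATUS_MISSING = 4   # present nowhere

STATUS_MAP = {
    (True, True): STATUS_OK,
    (True, False): STATUS_NEW,
    (False, True): STATUS_DELETED,
    (False, False): STATUS_MISSING,
}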
def run(self):
    if self.is_locker:
        lock = fasteners.InterProcessLock(self.git.lock_file)
        gotten = lock.acquire(timeout=5)
        if not gotten:
            Logger.info('Cannot perform the command since DVC is busy and '
                        'locked. Please retry the command later.')
            return 1

    good = self.config.sanity_check()
    if not good[0]:
        Logger.error("config '%s' is not set up correctly. Please fix:"
                     % Runtime.CONFIG)
        for e in good[1]:
            Logger.error(' ' + e)
        return 1

    try:
        for target in self.parsed_args.targets:
            data_item = self.settings.path_factory.data_item(target)

            if System.islink(target):
                self.sync_symlink(data_item)
            elif os.path.isdir(target):
                self.sync_dir(target)
            else:
                raise DataSyncError('File "{}" does not exist'.format(target))
    finally:
        if self.is_locker:
            lock.release()
def sanity_check(self):
    creds = self.get_aws_credentials()
    if creds is None:
        Logger.info("can't find AWS credentials, assuming environment "
                    "variables or IAM role")
    self._aws_creds = creds
def _sync_to_cloud_gcp(self, data_item):
    """ sync_to_cloud, gcp version """
    bucket = self._get_bucket_gc()
    blob_name = self.cache_file_key(data_item.resolved_cache.dvc)

    blob = bucket.get_blob(blob_name)
    if blob is not None and blob.exists():
        b64_encoded_md5 = base64.b64encode(
            file_md5(data_item.resolved_cache.dvc)[1])
        if blob.md5_hash == b64_encoded_md5:
            Logger.debug('checksum %s matches. Skipping upload'
                         % data_item.cache.relative)
            return
        Logger.debug('checksum %s mismatch. re-uploading'
                     % data_item.cache.relative)

    Logger.info('uploading cache file "{}" to gc "{}"'.format(
        data_item.cache.relative, blob_name))

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(data_item.resolved_cache.relative)
    Logger.info('uploading %s completed' % data_item.resolved_cache.relative)
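
# Hedged sketch of the checksum comparison above: GCS reports blob.md5_hash
# as the base64-encoded *binary* MD5 digest, so the local file must be
# hashed the same way before comparing. This file_md5 is a stand-in for the
# helper assumed above (returning a (hexdigest, binary digest) pair).
import base64
import hashlib

def file_md5(fname):
    md5 = hashlib.md5()
    with open(fname, 'rb') as fd:
        for chunk in iter(lambda: fd.read(1024 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest(), md5.digest()

def matches_gcs_blob(fname, blob_md5_hash):
    # blob_md5_hash is a str; b64encode returns bytes, hence the decode.
    return base64.b64encode(file_md5(fname)[1]).decode('utf-8') == blob_md5_hash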
def repro_data_items(self, data_item_list):
    error = False
    changed = False

    for data_item in data_item_list:
        try:
            repro_change = ReproChange(data_item, self)
            if repro_change.reproduce():
                changed = True
                Logger.info(u'Data item "{}" was reproduced.'.format(
                    data_item.data.relative))
            else:
                Logger.info(
                    u'Reproduction is not required for data item "{}".'.format(
                        data_item.data.relative))
        except ReproError as err:
            Logger.error('Error in reproducing data item {}: {}'.format(
                data_item.data.relative, str(err)))
            error = True
            break

    if error and not self.skip_git_actions:
        Logger.error('Errors occurred. One or more repro commands did not '
                     'succeed.')
        self.not_committed_changes_warning()

    return changed and not error
def sync_from_cloud(self, item):
    """ sync from cloud, aws version """
    bucket = self._get_bucket_aws()

    key_name = self.cache_file_key(item.resolved_cache.dvc)
    key = bucket.get_key(key_name)
    if not key:
        Logger.error('File "{}" does not exist in the cloud'.format(key_name))
        return

    Logger.info('Downloading cache file from S3 "{}/{}"'.format(
        bucket.name, key_name))

    # NOTE: initialize temp_file so the except block can't hit an unbound
    # name if NamedTemporaryFile itself raises.
    temp_file = None
    try:
        temp_file = tempfile.NamedTemporaryFile(
            dir=item.resolved_cache.dirname, delete=False)
        key.get_contents_to_filename(
            temp_file.name, cb=create_cb(item.resolved_cache.relative))
        os.rename(temp_file.name, item.resolved_cache.relative)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key_name, exc))
        if temp_file and os.path.exists(temp_file.name):
            os.remove(temp_file.name)
        return

    progress.finish_target(os.path.basename(item.resolved_cache.relative))
    Logger.info('Downloading completed')
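
# The download above uses the temp-file-then-rename pattern: write into a
# temporary file in the destination directory, then os.rename() it into
# place, so a partially downloaded file never appears under the final name.
# A minimal self-contained sketch of the same idea (atomic_write is a
# hypothetical helper, not part of the codebase):
import os
import tempfile

def atomic_write(dest, data):
    tmp = tempfile.NamedTemporaryFile(dir=os.path.dirname(dest) or '.',
                                      delete=False)
    try:
        tmp.write(data)
        tmp.close()
        # A same-filesystem rename is atomic on POSIX systems.
        os.rename(tmp.name, dest)
    except Exception:
        tmp.close()
        if os.path.exists(tmp.name):
            os.remove(tmp.name)
        raise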
def repro_stages(self, stages, recursive, force):
    error = False
    changed = False

    for stage in stages:
        try:
            change = ReproStage(self.settings, stage, recursive, force)
            if change.reproduce():
                changed = True
                Logger.info(u'Stage "{}" was reproduced.'.format(stage.path))
            else:
                Logger.info(
                    u'Reproduction is not required for stage "{}".'.format(
                        stage.path))
        except ReproError as err:
            Logger.error('Error in reproducing stage {}: {}'.format(
                stage.path, str(err)))
            error = True
            break

    if error and not self.no_git_actions:
        Logger.error('Errors occurred. One or more repro commands did not '
                     'succeed.')
        self.not_committed_changes_warning()

    return changed and not error
def lock_files(self, files, target):
    cmd = 'lock' if target else 'unlock'
    error = 0

    for fname in files:
        try:
            data_item = self.settings.path_factory.existing_data_item(fname)
            state = StateFile.load(data_item.state.relative, self.settings)

            if state.locked and target:
                Logger.warn('Data item {} is already locked'.format(
                    data_item.data.relative))
            elif not state.locked and not target:
                Logger.warn('Data item {} is already unlocked'.format(
                    data_item.data.relative))
            else:
                state.locked = target
                Logger.debug('Saving state file for data item {}'.format(
                    data_item.data.relative))
                state.save()
                Logger.info('Data item {} was {}ed'.format(
                    data_item.data.relative, cmd))
        except Exception as ex:
            error += 1
            Logger.error('Unable to {} {}: {}'.format(cmd, fname, ex))

    if error > 0 and not self.no_git_actions:
        Logger.error('Errors occurred. One or more {} commands did not '
                     'succeed.'.format(cmd))
        self.not_committed_changes_warning()
    else:
        self.commit_if_needed('DVC lock: {}'.format(' '.join(self.args)))

    return 0
def run(self):
    data_dir_path = self.get_not_existing_dir(self.parsed_args.data_dir)
    cache_dir_path = self.get_not_existing_dir(self.parsed_args.cache_dir)
    state_dir_path = self.get_not_existing_dir(self.parsed_args.state_dir)

    conf_file_name = self.get_not_existing_conf_file_name()

    data_dir_path.mkdir()
    cache_dir_path.mkdir()
    state_dir_path.mkdir()
    Logger.info('Directories {}/, {}/ and {}/ were created'.format(
        data_dir_path.name, cache_dir_path.name, state_dir_path.name))

    with open(conf_file_name, 'wt') as conf_file:
        conf_file.write(self.CONFIG_TEMPLATE.format(data_dir_path.name,
                                                    cache_dir_path.name,
                                                    state_dir_path.name))

    self.modify_gitignore(cache_dir_path.name)

    message = 'DVC init. data dir {}, cache dir {}, state dir {}'.format(
        data_dir_path.name, cache_dir_path.name, state_dir_path.name)
    return self.commit_if_needed(message)
def run_instance(self):
    if not self._image:
        raise InstanceError(
            'Cannot run EC2 instance: image (AMI) is not defined')

    # Deactivate previously active instances by flipping the state tag.
    active_filter = {
        'tag-key': self.INSTANCE_STATE_TAG,
        'tag-value': 'True'
    }
    active_reserv = self._conn.get_all_instances(filters=active_filter)
    active_instances = [i for r in active_reserv for i in r.instances]

    if len(active_instances) > 0:
        if len(active_instances) > 1:
            Logger.error('EC2 instances consistency error - more than one '
                         'active EC2 instance')

        for inst in active_instances:
            inst.remove_tag(self.INSTANCE_STATE_TAG, 'True')
            inst.add_tag(self.INSTANCE_STATE_TAG, 'False')
            if inst.state != self.TERMINATED_STATE:
                Logger.info('{} instance {} is no longer active'.format(
                    inst.instance_type, inst.id))

    self._run_instance()
def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 'hdfs':
        raise NotImplementedError

    assert path_info.get('url')

    checksum = checksum_info.get(self.PARAM_CHECKSUM, None)
    if not checksum:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        Logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(checksum):
        msg = "Cache '{}' not found. File '{}' won't be created."
        Logger.warn(msg.format(checksum, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        Logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    Logger.info(msg.format(self.to_string(path_info), checksum))

    src = path_info.copy()
    src['url'] = posixpath.join(self.url, checksum[0:2], checksum[2:])

    self.cp(src, path_info)
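
# The cache layout used by this checkout (and the s3/ssh variants below) is
# content-addressed: the first two hex characters of the checksum become a
# subdirectory and the rest the file name, which keeps any single directory
# from growing too large. A tiny illustration with a hypothetical base URL:
import posixpath

def cache_path(base, checksum):
    return posixpath.join(base, checksum[:2], checksum[2:])

# cache_path('hdfs://nn/cache', 'd3b07384d113edec49eaa6238ad5ff00')
# -> 'hdfs://nn/cache/d3/b07384d113edec49eaa6238ad5ff00'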
def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 's3':
        raise NotImplementedError

    etag = checksum_info.get(self.PARAM_ETAG, None)
    if not etag:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        Logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(etag):
        msg = "Cache '{}' not found. File '{}' won't be created."
        Logger.warn(msg.format(etag, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        Logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    Logger.info(msg.format(self.to_string(path_info), etag))

    key = posixpath.join(self.prefix, etag[0:2], etag[2:])
    from_info = {'scheme': 's3', 'bucket': self.bucket, 'key': key}

    self._copy(from_info, path_info)
def create_spot_instance(self):
    # Create a spot instance request.
    req = self._conn.request_spot_instances(
        price=self._args.spot_price,
        image_id=self._args.image,
        key_name=self.get_key_name(),
        instance_type=self._args.instance_type,
        security_groups=self.get_security_groups(),
        monitoring_enabled=self.toBool(self._args.monitoring),
        #subnet_id = self._args.subnet_id,
        placement=self._args.zone,
        ebs_optimized=self.toBool(self._args.ebs_optimized))

    job_instance_id = self._wait_for_an_instance(req)
    if not job_instance_id:
        self._conn.cancel_spot_instance_requests(req[0].id)
        # NOTE: the arguments must be passed individually, not as a tuple,
        # or format() raises at runtime.
        raise InstanceError(
            u'Unable to obtain {} spot instance in region {} for price ${}: {}'
            .format(self._args.instance_type, self._args.region,
                    self._args.spot_price, 'the request was canceled'))

    Logger.info(u'{} spot instance was created: {}'.format(
        self._type, job_instance_id))

    reservations = self._conn.get_all_instances(instance_ids=job_instance_id)
    instance = reservations[0].instances[0]
    return instance
def push(self, data_item):
    """ push, gcp version """
    bucket = self._get_bucket_gc(self.storage_bucket)
    blob_name = self.cache_file_key(data_item.resolved_cache.dvc)
    name = os.path.basename(data_item.resolved_cache.dvc)

    blob = bucket.get_blob(blob_name)
    if blob is not None and blob.exists():
        if self._cmp_checksum(blob, data_item.resolved_cache.dvc):
            Logger.debug('checksum %s matches. Skipping upload'
                         % data_item.cache.relative)
            return data_item
        Logger.debug('checksum %s mismatch. re-uploading'
                     % data_item.cache.relative)

    # Same as in _import: no progress callback is available for the
    # transfer, so update progress at key points only.
    progress.update_target(name, 0, None)

    blob = bucket.blob(blob_name)
    blob.upload_from_filename(data_item.resolved_cache.relative)
    progress.finish_target(name)
    Logger.info('uploading %s completed' % data_item.resolved_cache.relative)

    return data_item
def _import(self, bucket_name, key, fname, data_item):
    bucket = self._get_bucket_gc(bucket_name)
    name = os.path.basename(fname)
    tmp_file = self.tmp_file(fname)

    blob = bucket.get_blob(key)
    if not blob:
        Logger.error('File "{}" does not exist in the cloud'.format(key))
        return None

    Logger.info('Downloading cache file from gc "{}/{}"'.format(
        bucket.name, key))

    # percent_cb is not available for download_to_filename, so
    # let's at least update progress at key points (start, finish).
    progress.update_target(name, 0, None)
    try:
        blob.download_to_filename(tmp_file)
        os.rename(tmp_file, fname)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key, exc))
        return None

    progress.finish_target(name)
    Logger.info('Downloading completed')

    return data_item
def loads(project=None,
          cmd=None,
          deps=[],
          outs=[],
          outs_no_cache=[],
          metrics_no_cache=[],
          fname=None,
          cwd=os.curdir,
          locked=False,
          add=False,
          overwrite=True,
          ignore_build_cache=False,
          remove_outs=False):
    stage = Stage(project=project, cwd=cwd, cmd=cmd, locked=locked)

    stage.outs = output.loads_from(stage, outs, use_cache=True)
    stage.outs += output.loads_from(stage, outs_no_cache, use_cache=False)
    stage.outs += output.loads_from(stage, metrics_no_cache,
                                    use_cache=False, metric=True)
    stage.deps = dependency.loads_from(stage, deps)

    if fname is not None and os.path.basename(fname) != fname:
        msg = "Stage file name '{}' should not contain subdirectories. " \
              "Use '-c|--cwd' to change location of the stage file."
        raise StageFileBadNameError(msg.format(fname))

    fname, cwd = Stage._stage_fname_cwd(fname, cwd, stage.outs, add=add)

    Stage._check_inside_project(project, cwd)

    cwd = os.path.abspath(cwd)
    path = os.path.join(cwd, fname)

    stage.cwd = cwd
    stage.path = path

    # NOTE: remove outs before we check build cache
    if remove_outs:
        stage.remove_outs(ignore_remove=False)
        project.logger.warn("Build cache is ignored when using "
                            "--remove-outs.")
        ignore_build_cache = True
    else:
        stage.unprotect_outs()

    if os.path.exists(path):
        if not ignore_build_cache and stage.is_cached():
            Logger.info('Stage is cached, skipping.')
            return None

        msg = "'{}' already exists. Do you wish to run the command and " \
              "overwrite it?".format(stage.relpath)
        if not overwrite and not project.prompt.prompt(msg, False):
            raise StageFileAlreadyExistsError(stage.relpath)

    return stage
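
# Hedged usage sketch (the surrounding wiring is an assumption, not the
# actual CLI plumbing): a command like `dvc run -d data.csv -o model.pkl
# python train.py` would roughly translate to:
#
#   stage = Stage.loads(project=project,
#                       cmd='python train.py',
#                       deps=['data.csv'],
#                       outs=['model.pkl'])
#   if stage is not None:   # None means the build cache was hit
#       stage.run()
#       stage.dump()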
def checkout(self, path_info, checksum_info):
    if path_info['scheme'] != 'ssh':
        raise NotImplementedError

    md5 = checksum_info.get(self.PARAM_MD5, None)
    if not md5:
        return

    if not self.changed(path_info, checksum_info):
        msg = "Data '{}' didn't change."
        Logger.info(msg.format(self.to_string(path_info)))
        return

    if self.changed_cache(md5):
        msg = "Cache '{}' not found. File '{}' won't be created."
        Logger.warn(msg.format(md5, self.to_string(path_info)))
        return

    if self.exists([path_info])[0]:
        msg = "Data '{}' exists. Removing before checkout."
        Logger.warn(msg.format(self.to_string(path_info)))
        self.remove(path_info)
        return

    msg = "Checking out '{}' with cache '{}'."
    Logger.info(msg.format(self.to_string(path_info), md5))

    src = path_info.copy()
    src['path'] = posixpath.join(self.prefix, md5[0:2], md5[2:])

    self.cp(src, path_info)
def ignore(self, path):
    entry, hgignore = self._get_hgignore(path)

    ignore_list = []
    if os.path.exists(hgignore):
        with open(hgignore, 'r') as fd:
            ignore_list = fd.readlines()
        filtered = list(filter(lambda x: x.strip() == entry.strip(),
                               ignore_list))
        if len(filtered) != 0:
            return

    msg = "Adding '{}' to '{}'.".format(os.path.relpath(path),
                                        os.path.relpath(hgignore))
    Logger.info(msg)

    content = entry
    if len(ignore_list) > 0:
        content = '\n' + content

    with open(hgignore, 'a') as fd:
        fd.write(content)

    if self.project is not None:
        # NOTE: Can _files_to_git_add be changed to something more generic?
        self.project._files_to_git_add.append(os.path.relpath(hgignore))
def test_stdout(self, mock_stdout, mock_stderr):
    non_error_message = 'non-error message'

    Logger.init()
    Logger.info(non_error_message)

    self.assertEqual('', mock_stderr.getvalue())
    self.assertEqual('{}\n'.format(non_error_message),
                     mock_stdout.getvalue())
def _save_dir(self, path_info):
    path = path_info['path']
    md5, dir_info = self.state.update_info(path)

    dir_relpath = os.path.relpath(path)
    dir_size = len(dir_info)
    bar = dir_size > LARGE_DIR_SIZE

    Logger.info("Linking directory '{}'.".format(dir_relpath))

    for processed, entry in enumerate(dir_info):
        relpath = entry[self.PARAM_RELPATH]
        m = entry[self.PARAM_MD5]
        p = os.path.join(path, relpath)
        c = self.get(m)

        if self.changed_cache(m):
            self._move(p, c)
        else:
            remove(p)

        self.link(c, p)

        if bar:
            progress.update_target(dir_relpath, processed, dir_size)

    self.state.update_link(path)

    if bar:
        progress.finish_target(dir_relpath)

    return {self.PARAM_MD5: md5}
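
# For reference, a hedged sketch of the dir_info structure iterated above:
# a list of per-file entries keyed by the PARAM_RELPATH/PARAM_MD5 constants
# (assumed here to be 'relpath' and 'md5'), e.g.:
#
#   dir_info = [
#       {'relpath': 'images/0001.jpg', 'md5': '3863c0f...'},
#       {'relpath': 'images/0002.jpg', 'md5': 'd3b0738...'},
#   ]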
def remove(self, path):
    aws_file_name = self.cache_file_key(path)

    Logger.debug(u'[Cmd-Remove] Remove from cloud {}.'.format(aws_file_name))

    if not self._aws_creds.access_key_id or \
       not self._aws_creds.secret_access_key:
        Logger.debug('[Cmd-Remove] Unable to check cache file in the cloud')
        return

    conn = S3Connection(self._aws_creds.access_key_id,
                        self._aws_creds.secret_access_key)
    bucket_name = self.storage_bucket
    bucket = conn.lookup(bucket_name)
    if bucket:
        key = bucket.get_key(aws_file_name)
        if not key:
            Logger.warn('[Cmd-Remove] S3 remove warning: '
                        'file "{}" does not exist in S3'.format(aws_file_name))
        else:
            key.delete()
            Logger.info('[Cmd-Remove] File "{}" was removed from S3'.format(
                aws_file_name))
def run(self):
    if not self.no_git_actions and not self.git.is_ready_to_go():
        return 1

    if os.path.realpath(os.path.curdir) != self.settings.git.git_dir_abs:
        Logger.error('DVC error: initialization can only be done from the '
                     'git root directory {}'.format(
                         self.settings.git.git_dir_abs))
        return 1

    config_dir_path = self.get_not_existing_path(Config.CONFIG_DIR)
    cache_dir_path = self.get_not_existing_path(Config.CONFIG_DIR,
                                                Config.CACHE_DIR_NAME)

    conf_file_name = self.get_not_existing_conf_file_name()

    config_dir_path.mkdir()
    cache_dir_path.mkdir()
    Logger.info('Directories {}/ and {}/ were created'.format(
        config_dir_path.name,
        os.path.join(config_dir_path.name, cache_dir_path.name)))

    with open(conf_file_name, 'wt') as conf_file:
        conf_file.write(self.CONFIG_TEMPLATE)

    self.git.modify_gitignore([
        os.path.join(config_dir_path.name, cache_dir_path.name),
        os.path.join(config_dir_path.name,
                     os.path.basename(self.git.lock_file)),
        '*' + DataItem.LOCAL_STATE_FILE_SUFFIX
    ])

    message = 'DVC init. cache dir {}'.format(cache_dir_path.name)
    return self.commit_if_needed(message)
def commit_all_changes_and_log_status(self, message):
    statuses = self.commit_all_changes(message)
    Logger.info('[Git] A new commit {} was made in the current branch. '
                'Added files:'.format(self.curr_commit))
    for status, fname in statuses:
        Logger.info('[Git]\t{} {}'.format(status, fname))
def show(self, config, section, opt):
    if section not in config.keys():
        raise ConfigError("Section '{}' doesn't exist".format(section))

    if opt not in config[section].keys():
        raise ConfigError("Option '{}.{}' doesn't exist".format(section, opt))

    Logger.info(config[section][opt])
def run(self):
    for section in self.configobj.keys():
        r = re.match(Config.SECTION_REMOTE_REGEX, section)
        if r:
            name = r.group('name')
            url = self.configobj[section].get(Config.SECTION_REMOTE_URL, '')
            Logger.info('{}\t{}'.format(name, url))
    return 0
def dump(self, fname=None):
    if not fname:
        fname = self.path

    msg = "Saving information to '{}'.".format(os.path.relpath(fname))
    Logger.info(msg)

    with open(fname, 'w') as fd:
        yaml.safe_dump(self.dumpd(), fd, default_flow_style=False)
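
# For illustration, a hedged example of what the dumped YAML might look
# like for a stage; the field names are assumptions based on the outputs
# and dependencies built in loads() above, not a confirmed schema:
#
#   cmd: python train.py
#   deps:
#   - md5: d3b07384d113edec49eaa6238ad5ff00
#     path: data.csv
#   outs:
#   - cache: true
#     md5: c157a79031e1c40f85931829bc5fc552
#     path: model.pkl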
def reproduce_data_item(self, changed_files):
    Logger.debug('Reproducing data item {}.'.format(self._data_item.data.dvc))

    for output_dvc in self._state.output_files:
        Logger.debug('Removing output file {} before reproduction.'.format(
            output_dvc))

        try:
            data_item = self.cmd_obj.settings.path_factory. \
                existing_data_item_from_dvc_path(output_dvc)
            os.remove(data_item.data.relative)
        except Exception as ex:
            msg = 'Data item {} cannot be removed before reproduction: {}'
            Logger.error(msg.format(output_dvc, ex))

        changed_files.add(output_dvc)

    if self.state.is_import_file:
        Logger.debug('Reproducing data item {}. Re-import cmd: {}'.format(
            self._data_item.data.relative, ' '.join(self.state.argv)))

        if len(self.state.argv) != 2:
            msg = 'Data item "{}" cannot be re-imported because the number ' \
                  'of arguments ({}) is incorrect. Argv: {}'
            raise ReproError(msg.format(self._data_item.data.relative,
                                        len(self.state.argv),
                                        self.state.argv))

        input_file = self.state.argv[0]
        output_file = self.state.argv[1]

        cmd = CmdImportFile(self._settings)
        cmd.set_git_action(True)
        cmd.set_locker(False)

        Logger.info(u'Reproducing import command: {}'.format(output_file))
        if cmd.import_and_commit_if_needed(input_file, output_file,
                                           lock=True,
                                           check_if_ready=False) != 0:
            raise ReproError('Import command reproduction failed')
        return True
    elif self.state.is_run:
        cmd = CmdRun(self._settings)
        cmd.set_git_action(True)
        cmd.set_locker(False)

        Logger.info('Reproducing run command for data item {}. '
                    'Args: {}'.format(self._data_item.data.relative,
                                      ' '.join(self.state.argv)))

        data_items_from_args, not_data_items_from_args = \
            self.cmd_obj.argv_files_by_type(self.state.argv)
        if cmd.run_and_commit_if_needed(self.state.argv,
                                        data_items_from_args,
                                        not_data_items_from_args,
                                        self.state.stdout,
                                        self.state.stderr,
                                        self.state.shell,
                                        check_if_ready=False) != 0:
            raise ReproError('Run command reproduction failed')
        return True
    else:
        # Ignore the EMPTY_FILE command.
        pass
def modify_gitignore(self, config_dir_name, cache_dir_name):
    gitignore_file = os.path.join(self.git.git_dir, '.gitignore')
    if not os.path.exists(gitignore_file):
        open(gitignore_file, 'a').close()
        Logger.info('File .gitignore was created')

    with open(gitignore_file, 'a') as fd:
        fd.write('\n{}'.format(os.path.join(config_dir_name,
                                            cache_dir_name)))
        fd.write('\n{}'.format(os.path.join(
            config_dir_name, os.path.basename(self.git.lock_file))))

    Logger.info('Directory {} was added to .gitignore file'.format(
        cache_dir_name))
def checkout(items):
    for item in items:
        if CmdCheckout.cache_ok(item):
            continue

        if os.path.isfile(item.data.relative):
            os.remove(item.data.relative)

        System.hardlink(item.cache.relative, item.data.relative)
        Logger.info("Checkout '{}'".format(item.data.relative))
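
# Hedged sketch of the cache_ok check assumed above: since checkout creates
# hardlinks, a data file can be considered up to date when it already exists
# and shares an inode (on the same device) with its cache entry. This helper
# is an illustration, not the actual implementation.
import os

def cache_ok(data_path, cache_path):
    if not os.path.isfile(data_path) or not os.path.isfile(cache_path):
        return False
    data_st, cache_st = os.stat(data_path), os.stat(cache_path)
    return (data_st.st_ino, data_st.st_dev) == \
           (cache_st.st_ino, cache_st.st_dev)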