def checksum(self):
    """Return the file's checksum, computed remotely, as raw bytes.

    Runs ``{cypher_command} {rpath}`` on the remote host over ssh,
    keeps only the first whitespace-separated column of the output
    (the hex digest) via awk, and decodes it to ``bytes``.
    """
    # NOTE(review): cypher_command is presumably something like sha256sum
    # — confirm against the class that defines it.
    cmd = f'{self.cypher_command} {self.rpath} | ' + "awk '{ print $1 }';"
    digest = self._ssh(cmd).decode(self.encoding)
    log.debug(digest)
    # bytes.fromhex tolerates the trailing newline from the shell
    return bytes.fromhex(digest)
def data(self, generator):
    """Write the byte chunks produced by *generator* to this path.

    The first chunk is pulled before the file is opened, so a generator
    that raises immediately never truncates an existing file.  If a
    cache object is attached, its metadata is snapshotted first and
    restored afterwards, because rewriting the file can wipe the xattrs
    that persist it.
    """
    cache = self.cache
    if cache is not None:
        # snapshot meta now; the write below may blast the xattrs
        cmeta = cache.meta
    else:
        assert self.cache is None

    # FIXME do we touch a file, write the meta
    # and then write the data?
    # do we touch a temporary file, write the meta
    # unlink the symlink, and move the temp file in, and then write the data?
    # the order that we do this in is very important for robustness to failure
    # especially when updating a file ...
    # storing history in the symlink cache also an option?
    log.debug(f'writing to {self}')
    chunk1 = next(generator)  # if an error occurs don't open the file
    with open(self, 'wb') as f:
        f.write(chunk1)
        for chunk in generator:
            #log.debug(chunk)
            f.write(chunk)

    if cache is not None:  # FIXME cache
        if not cache.meta:
            # restore the snapshot taken before the write
            cache.meta = cmeta  # glories of persisting xattrs :/

        # yep sometimes the xattrs get blasted >_<
        assert cache.meta
        assert self.cache.meta
def _data_setter(self, generator):
    """Write the stream to this path while passing every chunk downstream.

    A data setter usable in a chain of generators: each chunk is written
    to the file and then yielded unchanged to the next consumer.  The
    first chunk is pulled before the file is opened, so a generator that
    raises immediately never truncates an existing file.
    """
    log.debug(f'writing to {self}')
    first = next(generator)  # if an error occurs don't open the file
    with open(self, 'wb') as f:
        f.write(first)
        yield first
        for more in generator:
            f.write(more)
            yield more
def copy_to(self, target, force=False, copy_cache_meta=False):
    """Copy the current path object's data to *target*.

    :param target: destination path; coerced to ``type(self)`` if needed
    :param force: overwrite an existing target instead of raising
    :param copy_cache_meta: also initialize the target's cache with this
        path's cache metadata
    :raises exc.PathExistsError: target exists (or is a symlink) and
        ``force`` is not set
    """
    if type(target) is not type(self):  # identity check, not == (idiom fix)
        target = self.__class__(target)

    # parenthesized for clarity: force always wins
    if (not target.exists() and not target.is_symlink()) or force:
        target.data = self.data
    else:
        raise exc.PathExistsError(f'{target}')

    if copy_cache_meta:
        log.debug(f'copying cache meta {self.cache.meta}')
        target.cache_init(self.cache.meta)
def update_cache(self, cache=None, fetch=True):
    """ Update a cache object using the metadata attached to this remote.

        This is different from _cache_setter in that it runs update_meta
        by default, handles many more edge cases, and checks for
        consistency. _cache_setter is usually invoked internally by a
        CachePath method that wants to register itself with a remote as
        an implementation detail.

        Returns whatever cache._meta_updater reports, i.e. whether the
        file content is different from what the cache recorded. """
    if cache is not None and self.cache is not None:
        # TODO see if there are any exceptions to this behavior
        raise TypeError(
            'cannot accept cache kwarg when self.cache not None')
    elif cache is None:
        cache = self.cache

    parent_changed = self._parent_changed(cache)

    if self.cache is None:
        # HACK test if cache is not None before it may have been reassigned
        # refuse to update name/parent and content at the same time:
        # that almost certainly means the caller passed the wrong cache
        if cache.name != self.name:
            msg = ('Cannot update the name and content of a file at the '
                   'same time.\nAre you sure you have passed the right '
                   f'cache object?\n{cache.name} != {self.name}')
            raise ValueError(msg)
        elif parent_changed:
            msg = ('Cannot update the parent and content of a file at the '
                   'same time.\nAre you sure you have passed the right '
                   f'cache object?\n{cache.parent.id} != {self.parent_id}')
            raise ValueError(msg)

    log.debug(f'maybe updating cache for {self.name}')
    file_is_different = cache._meta_updater(self.meta, fetch=fetch)
    # update the cache first  # FIXME this may be out of order ...
    # then move to the new name if relevant
    # prevents moving partial metadata onto existing files
    if cache.name != self.name or parent_changed:
        # this is localy correct
        # the issue is that move is now smarter
        # and will detect if a parent path has changed
        try:
            cache.move(remote=self)
        except exc.WhyDidntThisGetMovedBeforeError as e:
            # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
            # deal with the sadness that is non-unique filenames
            # I am 99.999999999999999% certain that users do not
            # expect this behavior ...
            log.error(e)
            self._on_cache_move_error(e, cache)

    return file_is_different
def _update_meta(old, new):
    """Merge remote metadata *new* into cached metadata *old*.

    Returns ``(file_is_different, merged_meta)``.  If either side is
    missing the other is returned unchanged and the file is reported as
    not different.  Fields that are None or empty iterables in *new*
    never overwrite existing values; a change to any content-bearing
    field (created/updated/size/checksum/file_id) marks the file as
    different.

    Fixes: the original indexed ``kwargs[k]`` and ``kwargs.pop(k)``
    unconditionally, raising KeyError for any field present only in
    *new*; both lookups are now tolerant.
    """
    if not old:
        # if there is no file it is both different and not different
        return False, new

    if not new:
        return False, old

    file_is_different = False
    kwargs = {k: v for k, v in old.items()}
    if old.id != new.id:
        # remember the previous identity across an id change
        kwargs['old_id'] = old.id

    for k, vnew in new.items():
        vold = kwargs.get(k)  # tolerate fields present only in new
        if vnew is None or hasattr(vnew, '__iter__') and not vnew:
            # don't update with None or empty iterables
            continue

        if vold is not None and vold != vnew:
            log.info(f'{old.id} field {k} changed from {vold} -> {vnew}')
            if k in ('created', 'updated', 'size', 'checksum', 'file_id'):
                file_is_different = True

        kwargs[k] = vnew

    if file_is_different:
        # strip fields missing from new in the case where
        # we aren't merging metadata from two different sources
        for k, vnew in new.items():
            if k == 'old_id':
                continue

            if vnew is None and k in kwargs:  # guard: k may already be gone
                log.debug(kwargs.pop(k))

    return file_is_different, PathMeta(**kwargs)
def inner(child):
    # Match *child* against self by name/stem and record near misses.
    # NOTE(review): closure over the enclosing scope — isd/isf look like
    # "self is a dir / is a file" flags and `candidates` collects fuzzy
    # matches; confirm against the enclosing function.
    if child.is_dir() and isd:
        if child.name == self.name:
            # exact directory match: adopt its metadata
            self.meta = child.meta

        return
    elif child.is_file() and isf:
        log.debug(f'{child.name} {child.stem}, {child.suffix!r}')
        log.debug(f'{self.name} {self.stem}, {self.suffix!r}')
        if child.name == self.name:
            # exact file match: adopt its metadata
            self.meta = child.meta
        elif child.name == self.stem:
            # child lost our suffix
            candidates.append(child)
        elif child.stem == self.name:
            # child gained a suffix
            candidates.append(child)
        elif child.stem == self.stem:
            # worst cases: only the bare stems agree
            candidates.append(child)
        else:
            #log.critical('file type mismatch')
            pass
def setxattr(self, key, value, namespace=XATTR_DEFAULT_NS):
    """Persist one extended-attribute value for this path.

    *value* must already be encoded to bytes (e.g. checksums); *key* may
    be str or bytes.  The attribute name is derived via the class's key
    convention and written to the corresponding backing stream.
    """
    if not isinstance(value, bytes):  # checksums
        raise TypeError(
            f'setxattr only accepts values already encoded to bytes!\n{value!r}')

    payload = value
    attr_key = key.decode() if isinstance(key, bytes) else key
    attr_name = self._key_convention(attr_key, namespace)
    target = self._stream(attr_name)
    log.debug(attr_name)
    log.debug(target)
    log.debug(payload)
    with open(target, 'wb') as f:
        f.write(payload)
def _bootstrap_recursive(self, only=tuple(), skip=tuple(), sparse=False):
    """Yield cache objects for every remote child, creating them as needed.

    Walks the remote's recursive children, reconciles them with what is
    already on disk (refreshing stale local directories first), and
    yields an existing or newly initialized cache for each child.
    """
    # TODO if rchildren looks like it could be bad
    # go back up to dataset level?
    #sname = lambda gen: sorted(gen, key=lambda c: c.name)  # c.name doesn't work for remotes
    #rcs = sname(self.remote._rchildren(create_cache=False, sparse=sparse))
    rcs = self.remote._rchildren(create_cache=False, sparse=sparse)
    local_paths = list(self.local.rchildren)
    local_files = set(p for p in local_paths
                      if p.is_file() or p.is_broken_symlink())
    file_index = {f.cache_id: f for f in local_files}  # FIXME WARNING can get big
    # FIXME have to compute file_index here because for some reason
    # computing local_dirs will remove folders entirely !??
    local_dirs = set(p.relative_to(self.anchor)
                     for p in local_paths if p.is_dir())
    if local_dirs:
        rcs = list(rcs)  # sigh; must materialize to reuse below
        remote_dirs = set(c for c in rcs if c.is_dir())
        rd = set(d.as_path()
                 for d in remote_dirs)  # FIXME as_path => lots of network calls
        # local directories with no remote counterpart need a refresh
        old_local = local_dirs - rd
        while old_local:
            # shortest paths first so parents are handled before children
            thisl = sorted(old_local, key=lambda d: len(d.as_posix()))
            for d in thisl:
                ad = self.anchor.local / d
                if ad.cache is None:
                    log.critical(f'would you fix the nullability already?\n{d}')
                    continue

                new = ad.cache.refresh()
                #log.info(f'{new}')
                # drop this dir and everything under it, then recompute
                local_dirs = set(
                    ld for ld in local_dirs
                    if not ld.as_posix().startswith(d.as_posix()))
                old_local = local_dirs - rd

    if sparse:
        #if local_dirs:
            #gen = (c for c in _local_remotes if c.is_dir() or (c.is_file() and c._sparse_include()))
        #else:
        gen = (c for c in rcs
               if c.is_dir() or (c.is_file() and c._sparse_include()))
        # FIXME rcs still takes too long, though using the generator
        # does get some useful work done first
    else:
        # FIXME horrid performance on remotes with loads of files
        gen = sorted(rcs, key=lambda c: len(c.as_path().as_posix()))

    for child in gen:
        # use the remote's recursive implementation
        # not the local implementation, since the
        # remote may have additional requirements
        #child.bootstrap(only=only, skip=skip)
        # because of how remote works now we don't even have to
        # bootstrap this
        cc = child.cache
        if cc is None:
            if child.is_file() and child.id in file_index:
                # a local file already tracks this remote id
                _cache = file_index[child.id].cache
                cmeta = _cache.meta
                rmeta = child.meta
                file_is_different, nmeta = self._update_meta(cmeta, rmeta)
                if file_is_different:
                    log.critical(f'WAT {_cache}')
                else:
                    yield _cache  # yield the old cache if it exists
                    # otherwise consumers of bootstrap will
                    # think the file may have been deleted
                    continue

            cc = child.cache_init()

        log.debug(cc)
        yield cc
def meta(self, pathmeta):
    """Persist *pathmeta* for a not-yet-fetched path as a meta symlink.

    Only valid while the real file does not exist.  If a symlink with
    metadata is already present: identical meta is a no-op; an id
    mismatch is resolved by comparing updated/created timestamps (newer
    existing meta wins and blocks the update); matching ids replace the
    old symlink after trashing the previous version.

    :raises exc.MetadataIdMismatchError: incoming meta has a created
        time the existing meta lacks, under an id mismatch
    :raises exc.PathExistsError: the path already exists on disk
    """
    if not self.exists():
        # if the path does not exist write even temporary to disk
        if self.is_symlink():
            meta = self.meta
            if meta == pathmeta:
                log.debug(
                    f'Metadata unchanged for {meta.id}. Not updating.')
                return

            if meta.id != pathmeta.id:
                msg = ('Existing cache id does not match new id!\n'
                       f'{self!r}\n'
                       f'{meta.id} != {pathmeta.id}\n'
                       f'{meta.as_pretty()}\n'
                       f'{pathmeta.as_pretty()}')
                log.critical(msg)
                meta_newer = 'Meta newer. Not updating.'
                pathmeta_newer = 'Other meta newer.'
                msg = '{}'  # apparently I was out of my mind when I wrote this originally ...
                if meta.updated is None and pathmeta.updated is None:
                    log.warning(
                        'no change since either has an updated value (wat)'
                    )
                    return  #FIXME

                if meta.updated > pathmeta.updated:
                    log.info(msg.format(meta_newer))
                    return  # this is the right thing to do for a sane filesystem
                elif meta.updated < pathmeta.updated:
                    log.info(msg.format(pathmeta_newer))
                    # THIS IS EXPLICITLY ALLOWED
                else:  # they are equal
                    # updated times tie: fall back to created times
                    extra = 'Both updated at the same time '
                    if meta.created is not None and pathmeta.created is not None:
                        if meta.created > pathmeta.created:
                            log.info(msg.format(extra + meta_newer))
                            return
                        elif meta.created < pathmeta.created:
                            log.info(msg.format(extra + pathmeta_newer))
                            # THIS IS EXPLICITLY ALLOWED
                        else:  # same created
                            log.info(
                                msg.format(
                                    'Identical timestamps. Not updating.'))
                            return
                    elif meta.created is not None:
                        log.info(
                            msg.format(
                                extra +
                                'Meta has datetime other does not. Not updating.'
                            ))
                        return
                    elif pathmeta.created is not None:
                        msg = msg.format(
                            extra + 'Meta has no datetime other does.')
                        log.info(msg)
                        raise exc.MetadataIdMismatchError(msg)
                    else:  # both none
                        log.info(
                            msg.format(extra + (
                                'Identical update time both missing created time. '
                                'Not updating.')))
                        return
                # equality
                # id mismatch all cases above should return or raise
                # except for other metadata newer

            if meta.size is not None and pathmeta.size is None:
                log.error('new meta has no size so will not overwrite')
                return

            # FIXME do the timestamp dance above here
            log.debug('Metadata exists, but ids match so will update')

            # trash old versions instead of just unlinking
            pc = self.local.cache
            trash = pc.trash
            self.rename(trash / f'{pc.parent.id}-{meta.id}-{self.name}')
            #self.unlink()

        # FIXME if an id starts with / then the local name is overwritten due to pathlib logic
        # we need to error if that happens
        #symlink = pathlib.PurePosixPath(self.local.name, pathmeta.as_symlink().as_posix().strip('/'))
        symlink = pathlib.PurePosixPath(
            self.local.name) / pathmeta.as_symlink()
        self.local.symlink_to(symlink)
    else:
        raise exc.PathExistsError(f'Path exists {self}')