class FileDataMappingProxy(MultiOpenMixin, RunStateMixin): ''' Mapping-like class to cache data chunks to bypass gdbm indices and the like. Data are saved immediately into an in memory cache and an asynchronous worker copies new data into a cache file and also to the backend storage. ''' @pfx_method def __init__( self, backend, *, dirpath=None, max_cachefile_size=None, max_cachefiles=None, runstate=None, ): ''' Initialise the cache. Parameters: * `backend`: mapping underlying us * `dirpath`: directory to store cache files * `max_cachefile_size`: maximum cache file size; a new cache file is created if this is exceeded; default: DEFAULT_CACHEFILE_HIGHWATER * `max_cachefiles`: number of cache files to keep around; no more than this many cache files are kept at a time; default: DEFAULT_MAX_CACHEFILES ''' RunStateMixin.__init__(self, runstate=runstate) if max_cachefile_size is None: max_cachefile_size = DEFAULT_CACHEFILE_HIGHWATER if max_cachefiles is None: max_cachefiles = DEFAULT_MAX_CACHEFILES self.backend = backend if not isdirpath(dirpath): raise ValueError("dirpath=%r: not a directory" % (dirpath, )) self.dirpath = dirpath self.max_cachefile_size = max_cachefile_size self.max_cachefiles = max_cachefiles self.cached = {} # map h => data self.saved = {} # map h => _CachedData(cachefile, offset, length) self._lock = Lock() self.cachefiles = [] self._add_cachefile() self._workQ = None self._worker = None self.runstate.notify_cancel.add(lambda rs: self.close()) def startup(self): ''' Startup the proxy. ''' self._workQ = IterableQueue() self._worker = Thread(name="%s WORKER" % (self, ), target=self._work) self._worker.start() @pfx_method def shutdown(self): ''' Shut down the cache. Stop the worker, close the file cache. ''' self._workQ.close() self._worker.join() if self.cached: error("blocks still in memory cache: %r", self.cached) for cachefile in self.cachefiles: cachefile.close() def _add_cachefile(self): cachefile = RWFileBlockCache(dirpath=self.dirpath) self.cachefiles.insert(0, cachefile) if len(self.cachefiles) > self.max_cachefiles: old_cachefile = self.cachefiles.pop() old_cachefile.close() def _getref(self, h): ''' Fetch a cache reference from self.saved, return None if missing. Automatically prune stale saved entries if the cachefile is closed. ''' saved = self.saved ref = saved.get(h) if ref is not None: if ref.cachefile.closed: ref = None del saved[h] return ref def __contains__(self, h): ''' Mapping method supporting "in". ''' with self._lock: if h in self.cached: return True if self._getref(h) is not None: return True backend = self.backend if backend: return h in backend return False def keys(self): ''' Mapping method for .keys. ''' seen = set() for h in list(self.cached.keys()): yield h seen.add(h) saved = self.saved with self._lock: saved_keys = list(saved.keys()) for h in saved_keys: if h not in seen and self._getref(h): yield h seen.add(h) backend = self.backend if backend: for h in backend.keys(): if h not in seen: yield h def __getitem__(self, h): ''' Fetch the data with key `h`. Raise KeyError if missing. 
''' with self._lock: # fetch from memory try: data = self.cached[h] except KeyError: # fetch from file ref = self._getref(h) if ref is not None: return ref.fetch() else: # straight from memory cache return data # not in memory or file cache: fetch from backend, queue store into cache backend = self.backend if not backend: raise KeyError('no backend: h=%s' % (h, )) data = backend[h] with self._lock: self.cached[h] = data self._workQ.put((h, data, False)) return data def __setitem__(self, h, data): ''' Store `data` against key `h`. ''' with self._lock: if h in self.cached: # in memory cache, do not save return if self._getref(h): # in file cache, do not save return # save in memory cache self.cached[h] = data # queue for file cache and backend self._workQ.put((h, data, True)) def _work(self): for h, data, in_backend in self._workQ: with self._lock: if self._getref(h): # already in file cache, therefore already sent to backend continue cachefile = self.cachefiles[0] offset = cachefile.put(data) with self._lock: self.saved[h] = CachedData(cachefile, offset, len(data)) # release memory cache entry try: del self.cached[h] except KeyError: pass if offset + len(data) >= self.max_cachefile_size: # roll over to new cache file self._add_cachefile() # store into the backend if not in_backend: backend = self.backend if backend: self.backend[h] = data
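
# The following is an illustrative usage sketch, not part of the module API:
# it shows how a FileDataMappingProxy is intended to sit in front of a backend
# mapping as a write-behind cache.  The dict standing in for a real backend
# store and the temporary cache directory are assumptions for the example only.
def _example_file_data_mapping_proxy():
  from tempfile import TemporaryDirectory
  backend = {}  # any mapping of hashcode -> data bytes will do for the sketch
  with TemporaryDirectory() as cache_dirpath:
    proxy = FileDataMappingProxy(backend, dirpath=cache_dirpath)
    proxy.open()  # MultiOpenMixin: first open runs startup(), starting the worker
    try:
      h = b'example-hashcode'
      proxy[h] = b'example data'  # memory cache now, cache file and backend later
      assert proxy[h] == b'example data'
    finally:
      proxy.close()  # final close runs shutdown(), draining the worker queue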
class FilesDir(SingletonMixin, HashCodeUtilsMixin, MultiOpenMixin, RunStateMixin, FlaggedMixin, Mapping): ''' Base class indexing locally stored data in files for a specific hashclass. There are two main subclasses of this at present: * `DataDir`: the data are kept in a subdirectory of UUID-named files, supporting easy merging and updating. * `PlatonicDataDir`: the data are present in a normal file tree, such as a preexisting media server directory or the like. ''' STATE_FILENAME_FORMAT = 'index-{hashname}-state.sqlite' INDEX_FILENAME_BASE_FORMAT = 'index-{hashname}' DATA_ROLLOVER = DEFAULT_ROLLOVER _FD_Singleton_Key_Tuple = namedtuple( 'FilesDir_FD_Singleton_Key_Tuple', 'cls realdirpath hashclass indexclass rollover flags_id' ) @classmethod def _resolve(cls, *, hashclass, indexclass, rollover, flags, flags_prefix): ''' Resolve the `__init__()` arguments, shared by `__init__` and `_singleton_key`. ''' if indexclass is None: indexclass = choose_indexclass( cls.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashclass.HASHNAME) ) if rollover is None: rollover = cls.DATA_ROLLOVER elif rollover < 1024: raise ValueError( "rollover < 1024" " (a more normal size would be in megabytes or gigabytes): %r" % (rollover,) ) if flags is None: if flags_prefix is None: flags = DummyFlags() flags_prefix = 'DUMMY' else: if flags_prefix is None: raise ValueError("flags provided but no flags_prefix") return SimpleNamespace( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) @classmethod def _singleton_key( cls, topdirpath, *, hashclass, indexclass=None, rollover=None, flags=None, flags_prefix=None, **_, ): resolved = cls._resolve( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) return cls._FD_Singleton_Key_Tuple( cls=cls, realdirpath=realpath(topdirpath), hashclass=resolved.hashclass, indexclass=resolved.indexclass, rollover=resolved.rollover, flags_id=id(resolved.flags) ) @require(lambda topdirpath: isinstance(topdirpath, str)) @require(lambda hashclass: issubclass(hashclass, HashCode)) def __init__( self, topdirpath, *, hashclass, indexclass=None, rollover=None, flags=None, flags_prefix=None, ): ''' Initialise the `DataDir` with `topdirpath`. Parameters: * `topdirpath`: a directory containing state information about the `DataFile`s; this contains the index-state.csv file and the associated index dbm-ish files. * `hashclass`: the hashclass used for indexing * `indexclass`: the `IndexClass` providing the index to chunks in the `DataFile`s. If not specified, a supported index class with an existing index file will be chosen, otherwise the most favoured indexclass available will be chosen. * `rollover`: data file roll over size; if a data file grows beyond this a new datafile is commenced for new blocks. Default: `self.DATA_ROLLOVER`. * `flags`: optional `Flags` object for control; if specified then `flags_prefix` is also required. * `flags_prefix`: prefix for control flag names. Note that `__init__` only saves the settings such as the `indexclass` and ensures that requisite directories exist. The monitor thread and runtime state are set up by the `startup` method and closed down by the `shutdown` method. 
''' if hasattr(self, '_filemap'): return resolved = self._resolve( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) RunStateMixin.__init__(self) MultiOpenMixin.__init__(self) FlaggedMixin.__init__( self, flags=resolved.flags, prefix=resolved.flags_prefix ) self.indexclass = resolved.indexclass self.rollover = resolved.rollover self.hashclass = hashclass self.hashname = hashclass.HASHNAME self.topdirpath = topdirpath self.statefilepath = joinpath( topdirpath, self.STATE_FILENAME_FORMAT.format(hashname=self.hashname) ) self.index = None self._filemap = None self._unindexed = None self._cache = None self._data_proxy = None self._dataQ = None self._data_progress = None self._monitor_Thread = None self._WDFstate = None self._lock = RLock() def __str__(self): return '%s(%s)' % (self.__class__.__name__, shortpath(self.topdirpath)) def __repr__(self): return ( '%s(topdirpath=%r,indexclass=%s)' % (self.__class__.__name__, self.topdirpath, self.indexclass) ) def initdir(self): ''' Init a directory and its "data" subdirectory. ''' topdirpath = self.topdirpath if not isdirpath(topdirpath): info("mkdir %r", topdirpath) with Pfx("mkdir(%r)", topdirpath): os.mkdir(topdirpath) datasubdirpath = joinpath(topdirpath, 'data') if not isdirpath(datasubdirpath): info("mkdir %r", datasubdirpath) with Pfx("mkdir(%r)", datasubdirpath): os.mkdir(datasubdirpath) @contextmanager def startup_shutdown(self): ''' Start up and shut down the `FilesDir`: take locks, start worker threads etc. ''' self.initdir() self._rfds = {} self._unindexed = {} self._filemap = SqliteFilemap(self, self.statefilepath) hashname = self.hashname self.index = self.indexclass( self.pathto(self.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashname)) ) self.index.open() self.runstate.start() # cache of open DataFiles self._cache = LRU_Cache( maxsize=4, on_remove=lambda k, datafile: datafile.close() ) # Set up data queue. # The .add() method adds the data to self._unindexed, puts the # data onto the data queue, and returns. # The data queue worker saves the data to backing files and # updates the indices. self._data_progress = Progress( name=str(self) + " data queue ", total=0, units_scale=BINARY_BYTES_SCALE, ) if defaults.show_progress: proxy_cmgr = upd_state.upd.insert(1) else: proxy_cmgr = nullcontext() with proxy_cmgr as data_proxy: self._data_proxy = data_proxy self._dataQ = IterableQueue(65536) self._data_Thread = bg_thread( self._data_queue, name="%s._data_queue" % (self,), ) self._monitor_Thread = bg_thread( self._monitor_datafiles, name="%s-datafile-monitor" % (self,), ) yield self.runstate.cancel() self.flush() # shut down the monitor Thread mon_thread = self._monitor_Thread if mon_thread is not None: mon_thread.join() self._monitor_Thread = None # drain the data queue self._dataQ.close() self._data_Thread.join() self._dataQ = None self._data_thread = None # update state to substrate self._cache = None self._filemap.close() self._filemap = None self.index.close() # close the read file descriptors for rfd in self._rfds.values(): with Pfx("os.close(rfd:%d)", rfd): os.close(rfd) del self._rfds self.runstate.stop() def pathto(self, rpath): ''' Return the path to `rpath`, which is relative to the `topdirpath`. ''' return joinpath(self.topdirpath, rpath) def datapathto(self, rpath): ''' Return the path to `rpath`, which is relative to the `datadirpath`. ''' return self.pathto(joinpath('data', rpath)) @typechecked def new_datafile(self) -> DataFileState: ''' Create a new datafile. 
Return its `DataFileState`.
    '''
    while True:
      filename = str(uuid4()) + self.DATA_DOT_EXT
      pathname = self.datapathto(filename)
      if existspath(pathname):
        error("new datafile path already exists, retrying: %r", pathname)
        continue
      with Pfx(pathname):
        try:
          createpath(pathname)
        except OSError as e:
          if e.errno == errno.EEXIST:
            error("new datafile path already exists")
            continue
          raise
      break
    return self._filemap.add_path(filename)

  def add(self, data):
    ''' Add `data` to the cache, queue data for indexing, return the hashcode.
    '''
    hashcode = self.hashclass.from_chunk(data)
    if hashcode not in self._unindexed:
      self._unindexed[hashcode] = data
      self._data_progress.total += len(data)
      self._dataQ.put(data)
    return hashcode

  def _data_queue(self):
    wf = None
    DFstate = None
    filenum = None
    index = self.index
    unindexed = self._unindexed
    dataQ = self._dataQ
    progress = self._data_progress
    hashchunk = self.hashclass.from_chunk
    batch_size = 128

    def data_batches(dataQ, batch_size):
      for data in dataQ:
        # assemble up to batch_size chunks at a time
        data_batch = [data]
        while not dataQ.empty() and len(data_batch) < batch_size:
          data_batch.append(next(dataQ))
        yield data_batch
        data_batch = None

    batches = data_batches(dataQ, batch_size)
    if defaults.show_progress:
      batches = progress.iterbar(
          batches,
          itemlenfunc=lambda batch: sum(map(len, batch)),
          proxy=self._data_proxy
      )
    for data_batch in batches:
      batch_length = len(data_batch)
      ##print("data batch of", batch_length)
      # FileDataIndexEntry by hashcode for batch update of index after flush
      entry_bs_by_hashcode = {}
      for data in data_batch:
        hashcode = hashchunk(data)
        if hashcode not in index:
          # new data, save to a datafile and update the index
          # pretranscribe the in-file data record
          # save the data record to the current file
          if wf is None:
            DFstate = self.new_datafile()
            filenum = DFstate.filenum
            wf = open(DFstate.pathname, 'ab')
            self._WDFstate = DFstate
          bs, data_offset, data_length, flags = self.data_save_information(
              data
          )
          offset = wf.tell()
          wf.write(bs)
          length = len(bs)
          post_offset = offset + length
          # make a record for this chunk
          entry_bs_by_hashcode[hashcode] = bytes(
              FileDataIndexEntry(
                  filenum=filenum,
                  data_offset=offset + data_offset,
                  data_length=data_length,
                  flags=flags,
              )
          )
      # after the batch, flush and roll over if beyond the high water mark
      if wf is not None:
        wf.flush()
        with self._lock:
          for hashcode, entry_bs in entry_bs_by_hashcode.items():
            index[hashcode] = entry_bs
            try:
              del unindexed[hashcode]
            except KeyError:
              # this can happen when the same key is indexed twice
              # entirely plausible if a new datafile is added to the datadir
              pass
        # note that the index is up to post_offset
        DFstate.indexed_to = post_offset
        rollover = self.rollover
        if rollover is not None and wf.tell() >= rollover:
          # file now full, close it so as to start a new one on next write
          wf.close()
          wf = None
          self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to)
          DFstate = None
      if batch_length < batch_size:
        sleep(0.2)
    if wf is not None:
      wf.close()
      wf = None
    if DFstate is not None:
      self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to)

  def get_Archive(self, name=None, **kw):
    ''' Return the Archive named `name`.

        If `name` is omitted or `None` the Archive path is
        the `topdirpath` plus the extension `'.vt'`.
        Otherwise it is the `topdirpath` plus a dash plus the `name`
        plus the extension `'.vt'`.
        The `name` may not be empty or contain a dot or a slash.
    '''
    with Pfx("%s.get_Archive", self):
      if name is None or not name:
        archivepath = self.topdirpath + '.vt'
      else:
        if '.' in name or '/' in name:
          raise ValueError("invalid name: %r" % (name,))
        archivepath = self.topdirpath + '-' + name + '.vt'
      return Archive(archivepath, **kw)

  @locked
  def flush(self):
    ''' Flush all the components.
    '''
    self._cache.flush()
    self.index.flush()

  def __setitem__(self, hashcode, data):
    h = self.add(data)
    if hashcode != h:
      raise ValueError(
          'supplied hashcode %s does not match data, data added under %s instead'
          % (hashcode, h)
      )

  def __len__(self):
    return len(self.index)

  @pfx_method
  def hashcodes_from(self, *, start_hashcode=None):
    ''' Generator yielding the hashcodes from the database in order
        starting with optional `start_hashcode`.

        Parameters:
        * `start_hashcode`: the first hashcode; if missing or `None`,
          iteration starts with the first key in the index
    '''
    # important: consult this BEFORE self.index.keys otherwise items might
    # flow from unindexed to the index unseen
    with self._lock:
      unindexed = list(self._unindexed)
    if start_hashcode is not None and unindexed:
      unindexed = filter(lambda h: h >= start_hashcode, unindexed)
    hs = map(
        self.hashclass,
        self.index.sorted_keys(start_hashcode=start_hashcode),
    )
    unindexed = set(unindexed)
    if unindexed:
      hs = filter(lambda h: h not in unindexed, hs)
    return imerge(hs, sorted(unindexed))

  def __iter__(self):
    return self.hashcodes_from()

  # without this "in" tries to iterate over the mapping with int indices
  def __contains__(self, hashcode):
    return hashcode in self._unindexed or hashcode in self.index

  def __getitem__(self, hashcode):
    ''' Return the decompressed data associated with the supplied `hashcode`.
    '''
    unindexed = self._unindexed
    try:
      return unindexed[hashcode]
    except KeyError:
      index = self.index
      try:
        with self._lock:
          entry_bs = index[hashcode]
      except KeyError:
        raise KeyError("%s[%s]: hash not in index" % (self, hashcode))
      entry = FileDataIndexEntry.from_bytes(entry_bs)
      filenum = entry.filenum
      try:
        try:
          rfd = self._rfds[filenum]
        except KeyError:
          # TODO: shove this sideways to self.open_datafile
          # which releases an existing datafile if too many are open
          DFstate = self._filemap[filenum]
          rfd = self._rfds[filenum] = openfd_read(DFstate.pathname)
        return entry.fetch_fd(rfd)
      except Exception as e:
        exception("%s[%s]:%s not available: %s", self, hashcode, entry, e)
        raise KeyError(str(hashcode)) from e
def _monitor_datafiles(self): ''' Thread body to poll the ideal tree for new or changed files. ''' proxy = upd_state.proxy proxy.prefix = str(self) + " monitor " meta_store = self.meta_store filemap = self._filemap datadirpath = self.pathto('data') if meta_store is not None: topdir = self.topdir else: warning("%s: no meta_store!", self) updated = False disabled = False while not self.cancelled: sleep(self.DELAY_INTERSCAN) if self.flag_scan_disable: if not disabled: info("scan %r DISABLED", shortpath(datadirpath)) disabled = True continue if disabled: info("scan %r ENABLED", shortpath(datadirpath)) disabled = False # scan for new datafiles with Pfx("%r", datadirpath): seen = set() info("scan tree...") with proxy.extend_prefix(" scan"): for dirpath, dirnames, filenames in os.walk(datadirpath, followlinks=True): dirnames[:] = sorted(dirnames) filenames = sorted(filenames) sleep(self.DELAY_INTRASCAN) if self.cancelled or self.flag_scan_disable: break rdirpath = relpath(dirpath, datadirpath) with Pfx(rdirpath): with (proxy.extend_prefix(" " + rdirpath) if filenames else nullcontext()): # this will be the subdirectories into which to recurse pruned_dirnames = [] for dname in dirnames: if self.exclude_dir(joinpath(rdirpath, dname)): # unwanted continue subdirpath = joinpath(dirpath, dname) try: S = os.stat(subdirpath) except OSError as e: # inaccessable warning("stat(%r): %s, skipping", subdirpath, e) continue ino = S.st_dev, S.st_ino if ino in seen: # we have seen this subdir before, probably via a symlink # TODO: preserve symlinks? attach alter ego directly as a Dir? debug( "seen %r (dev=%s,ino=%s), skipping", subdirpath, ino[0], ino[1] ) continue seen.add(ino) pruned_dirnames.append(dname) dirnames[:] = pruned_dirnames if meta_store is None: warning("no meta_store") D = None else: with meta_store: D = topdir.makedirs(rdirpath, force=True) # prune removed names names = list(D.keys()) for name in names: if name not in dirnames and name not in filenames: info("del %r", name) del D[name] for filename in filenames: with Pfx(filename): if self.cancelled or self.flag_scan_disable: break rfilepath = joinpath(rdirpath, filename) if self.exclude_file(rfilepath): continue filepath = joinpath(dirpath, filename) if not isfilepath(filepath): continue # look up this file in our file state index DFstate = filemap.get(rfilepath) if (DFstate is not None and D is not None and filename not in D): # in filemap, but not in dir: start again warning("in filemap but not in Dir, rescanning") filemap.del_path(rfilepath) DFstate = None if DFstate is None: DFstate = filemap.add_path(rfilepath) try: new_size = DFstate.stat_size(self.follow_symlinks) except OSError as e: if e.errno == errno.ENOENT: warning("forgetting missing file") self._del_datafilestate(DFstate) else: warning("stat: %s", e) continue if new_size is None: # skip non files debug("SKIP non-file") continue if meta_store: try: E = D[filename] except KeyError: E = FileDirent(filename) D[filename] = E else: if not E.isfile: info( "new FileDirent replacing previous nonfile: %s", E ) E = FileDirent(filename) D[filename] = E if new_size > DFstate.scanned_to: with proxy.extend_prefix( " scan %s[%d:%d]" % (filename, DFstate.scanned_to, new_size)): if DFstate.scanned_to > 0: info("scan from %d", DFstate.scanned_to) if meta_store is not None: blockQ = IterableQueue() R = meta_store._defer( lambda B, Q: top_block_for(spliced_blocks(B, Q)), E.block, blockQ ) scan_from = DFstate.scanned_to scan_start = time() scanner = DFstate.scanfrom(offset=DFstate.scanned_to) if 
defaults.show_progress: scanner = progressbar( DFstate.scanfrom(offset=DFstate.scanned_to), "scan " + rfilepath, position=DFstate.scanned_to, total=new_size, units_scale=BINARY_BYTES_SCALE, itemlenfunc=lambda t3: t3[2] - t3[0], update_frequency=128, ) for pre_offset, data, post_offset in scanner: hashcode = self.hashclass.from_chunk(data) entry = FileDataIndexEntry( filenum=DFstate.filenum, data_offset=pre_offset, data_length=len(data), flags=0, ) entry_bs = bytes(entry) with self._lock: self.index[hashcode] = entry_bs if meta_store is not None: B = Block(data=data, hashcode=hashcode, added=True) blockQ.put((pre_offset, B)) DFstate.scanned_to = post_offset if self.cancelled or self.flag_scan_disable: break if meta_store is not None: blockQ.close() try: top_block = R() except MissingHashcodeError as e: error("missing data, forcing rescan: %s", e) DFstate.scanned_to = 0 else: E.block = top_block D.changed = True updated = True elapsed = time() - scan_start scanned = DFstate.scanned_to - scan_from if elapsed > 0: scan_rate = scanned / elapsed else: scan_rate = None if scan_rate is None: info( "scanned to %d: %s", DFstate.scanned_to, transcribe_bytes_geek(scanned) ) else: info( "scanned to %d: %s at %s/s", DFstate.scanned_to, transcribe_bytes_geek(scanned), transcribe_bytes_geek(scan_rate) ) # stall after a file scan, briefly, to limit impact if elapsed > 0: sleep(min(elapsed, self.DELAY_INTRASCAN)) # update the archive after updating from a directory if updated and meta_store is not None: self.sync_meta() updated = False self.flush()
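
# Illustrative sketch of the content addressed round trip through a FilesDir:
# `DataDir` is the concrete subclass named in the FilesDir docstring above and
# is assumed here to keep the FilesDir.__init__ signature; the hashclass is
# supplied by the caller.  This is an example, not part of the module proper.
def _example_filesdir_round_trip(topdirpath, hashclass):
  datadir = DataDir(topdirpath, hashclass=hashclass)
  with datadir:  # MultiOpenMixin context: runs startup_shutdown()
    data = b'some chunk of data'
    h = datadir.add(data)  # queued for indexing, hashcode returned immediately
    assert h in datadir  # satisfied from _unindexed or the index
    assert datadir[h] == data  # from the unindexed cache or a datafile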
class POP3(MultiOpenMixin):
  ''' Simple POP3 class with support for streaming use.
  '''

  def __init__(self, conn_spec):
    if isinstance(conn_spec, str):
      conn_spec = ConnectionSpec.from_spec(conn_spec)
    self.conn_spec = conn_spec
    self._result_queue = None
    self._client_worker = None
    self._sock = None
    self.recvf = None
    self.sendf = None
    self._lock = RLock()

  @pfx
  def startup(self):
    ''' Connect to the server and log in.
    '''
    self._sock = self.conn_spec.connect()
    self.recvf = self._sock.makefile('r', encoding='iso8859-1')
    self.sendf = self._sock.makefile('w', encoding='ascii')
    self.client_begin()
    self.client_auth(self.conn_spec.user, self.conn_spec.password)
    self._result_queue = IterableQueue()
    self._client_worker = bg_thread(
        self._client_response_worker, args=(self._result_queue,)
    )
    return self

  @pfx
  def shutdown(self):
    ''' Quit and disconnect.
    '''
    logmsg = debug
    logmsg("send client QUIT")
    try:
      quitR = self.client_quit_bg()
      logmsg("flush QUIT")
      self.flush()
      logmsg("join QUIT")
      quitR.join()
    except Exception as e:
      exception("client quit: %s", e)
      logmsg = warning
    if self._result_queue:
      logmsg("close result queue")
      self._result_queue.close()
      self._result_queue = None
    if self._client_worker:
      logmsg("join client worker")
      self._client_worker.join()
      self._client_worker = None
    logmsg("close sendf")
    self.sendf.close()
    self.sendf = None
    logmsg("check for uncollected server responses")
    bs = self.recvf.read()
    if bs:
      warning("received %d bytes from the server at shutdown", len(bs))
    logmsg("close recvf")
    self.recvf.close()
    self.recvf = None
    logmsg("close socket")
    self._sock.close()
    self._sock = None
    logmsg("shutdown complete")

  def readline(self):
    ''' Read a CRLF terminated line from `self.recvf`.
        Return the text preceding the CRLF.
        Return `None` at EOF.
    '''
    line0 = self.recvf.readline()
    if not line0:
      return None
    line = cutsuffix(line0, '\n')
    assert line is not line0, "missing LF: %r" % (line0,)
    line = cutsuffix(line, '\r')
    return line

  def readlines(self):
    ''' Generator yielding lines from `self.recvf`.
    '''
    while True:
      line = self.readline()
      if line is None:
        break
      yield line

  def get_response(self):
    ''' Read a server response.
        Return `(ok,status,etc)`
        where `ok` is true if `status` is `'+OK'`, false otherwise,
        `status` is the status word
        and `etc` is the following text.
        Return `(None,None,None)` on EOF from the receive stream.
    '''
    line = self.readline()
    if line is None:
      return None, None, None
    try:
      status, etc = line.split(None, 1)
    except ValueError:
      status = line
      etc = ''
    return status == '+OK', status, etc

  def get_ok(self):
    ''' Read a server response, require it to be `'+OK'`.
        Returns the `etc` part.
    '''
    ok, status, etc = self.get_response()
    if not ok:
      raise ValueError("no ok from server: %r %r" % (status, etc))
    return etc

  def get_multiline(self):
    ''' Generator yielding unstuffed lines from a multiline response.
    '''
    for line in self.readlines():
      if line == '.':
        break
      if line.startswith('.'):
        line = line[1:]
      yield line

  def flush(self):
    ''' Flush the send stream.
    '''
    self.sendf.flush()

  def sendline(self, line, do_flush=False):
    ''' Send a line (excluding its terminating CRLF).
        If `do_flush` is true (default `False`)
        also flush the sending stream.
    '''
    assert '\r' not in line and '\n' not in line
    self.sendf.write(line)
    self.sendf.write('\r\n')
    if do_flush:
      self.flush()

  def _client_response_worker(self, result_queue):
    ''' Worker to process queued request responses.

        Each completed response assigns `(etc,lines)` to the `Result`
        where `etc` is the additional text from the server ok response
        and `lines` is a list of the multiline part of the response
        or `None` if the response is not multiline.
    '''
    for R, is_multiline in result_queue:
      try:
        etc = self.get_ok()
        if is_multiline:
          lines = list(self.get_multiline())
        else:
          lines = None
      except Exception as e:  # pylint: disable=broad-except
        warning("%s: %s", R, e)
        R.exc_info = sys.exc_info()
      else:
        # save a list so that we can erase it in a handler to release memory
        R.result = [etc, lines]

  def client_begin(self):
    ''' Read the opening server response.
    '''
    etc = self.get_ok()
    print(etc)

  def client_auth(self, user, password):
    ''' Perform a client authentication.
    '''
    self.sendline(f'USER {user}', do_flush=True)
    print('USER', user, self.get_ok())
    self.sendline(f'PASS {password}', do_flush=True)
    print('PASS', '****', self.get_ok())

  def client_uidl(self):
    ''' Generator yielding `(msg_n,msg_uid)` pairs from the server's
        UIDL listing, mapping message number to message UID string.
    '''
    self.sendline('UIDL', do_flush=True)
    self.get_ok()
    for line in self.get_multiline():
      n, msg_uid = line.split(None, 1)
      n = int(n)
      yield n, msg_uid

  def client_bg(self, rq_line, is_multiline=False, notify=None):
    ''' Dispatch a request `rq_line` in the background.
        Return a `Result` to collect the request result.

        Parameters:
        * `rq_line`: POP3 request text, without any terminating CRLF
        * `is_multiline`: true if a multiline response is expected,
          default `False`
        * `notify`: an optional handler for `Result.notify`,
          applied if not `None`

        *Note*: DOES NOT flush the send stream.
        Call `self.flush()` when a batch of requests has been submitted,
        before trying to collect the `Result`s.

        The `Result` will receive `[etc,lines]` on success where:
        * `etc` is the trailing portion of an ok response line
        * `lines` is a list of unstuffed text lines from the response
          if `is_multiline` is true, `None` otherwise
        The `Result` gets a list instead of a tuple
        so that a handler may clear it in order to release memory.

        Example:

            R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify)
    '''
    with self._lock:
      self.sendline(rq_line)
      R = Result(rq_line)
      self._result_queue.put((R, is_multiline))
    R.extra.update(rq_line=rq_line)
    if notify is not None:
      R.notify(notify)
    return R

  def client_dele_bg(self, msg_n):
    ''' Queue a delete request for message `msg_n`,
        return a `Result` for collection.
    '''
    R = self.client_bg(f'DELE {msg_n}')
    R.extra.update(msg_n=msg_n)
    return R

  def client_quit_bg(self):
    ''' Queue a QUIT request,
        return a `Result` for collection.
    '''
    R = self.client_bg('QUIT')
    return R

  def client_retr_bg(self, msg_n, notify=None):
    ''' Queue a retrieve request for message `msg_n`,
        return a `Result` for collection.

        If `notify` is not `None`, apply it to the `Result`.
    '''
    R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify)
    R.extra.update(msg_n=msg_n)
    return R

  def dl_bg(self, msg_n, maildir, deleRs):
    ''' Download message `msg_n` to Maildir `maildir`.
        Return the `Result` for the `RETR` request.

        After a successful save, queue a `DELE` for the message
        and add its `Result` to `deleRs`.
    '''

    def dl_bg_save_result(R):
      _, lines = R.result
      R.result[1] = None  # release lines
      msg_bs = b''.join(
          map(lambda line: line.encode('iso8859-1') + b'\r\n', lines)
      )
      msg = BytesParser().parsebytes(msg_bs)
      with self._lock:
        Mkey = maildir.add(msg)
        deleRs.add(self.client_dele_bg(msg_n))
      print(f'msg {msg_n}: {len(msg_bs)} octets, saved as {Mkey}, deleted.')

    R = self.client_retr_bg(msg_n, notify=dl_bg_save_result)
    return R
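
# Illustrative sketch of batched background requests with the POP3 class above.
# The connection spec string is whatever ConnectionSpec.from_spec() accepts in
# your deployment; its format is not assumed here.  MultiOpenMixin is assumed
# to drive startup()/shutdown() via the context manager.
def _example_pop3_batch_fetch(conn_spec, msg_numbers):
  with POP3(conn_spec) as pop3:
    retrRs = [pop3.client_retr_bg(msg_n) for msg_n in msg_numbers]
    pop3.flush()  # client_bg() does not flush the send stream for us
    for R in retrRs:
      etc, lines = R()  # block for the [etc, lines] result from the worker
      print(R.extra['msg_n'], etc, len(lines), 'lines')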
class SubLater(object):
  ''' A class for managing a group of deferred tasks
      using an existing `Later`.
  '''

  def __init__(self, L):
    ''' Initialise the `SubLater` with its parent `Later`.

        TODO: accept `discard=False` param to suppress the queue
        and associated checks.
    '''
    self._later = L
    self._later.open()
    self._lock = Lock()
    self._deferred = 0
    self._queued = 0
    self._queue = IterableQueue()
    self.closed = False

  def __str__(self):
    return "%s(%s%s,deferred=%d,completed=%d)" % (
        type(self),
        self._later,
        "[CLOSED]" if self.closed else "",
        self._deferred,
        self._queued,
    )

  def __iter__(self):
    ''' Iteration over the `SubLater`
        iterates over the queue of completed `LateFunction`s.
    '''
    return iter(self._queue)

  def close(self):
    ''' Close the `SubLater`.

        This prevents further deferrals.
    '''
    with self._lock:
      closed = self.closed
      if closed:
        self._later.warning("repeated close of %s", self)
      else:
        self.closed = True
        self._queue.close()
        self._later.close()

  def defer(self, func, *a, **kw):
    ''' Defer a function, return its `LateFunction`.

        The resulting `LateFunction` will queue itself for collection
        on completion.
    '''
    with self._lock:
      LF = self._later.defer(func, *a, **kw)
      self._deferred += 1

      def on_complete(R):
        with self._lock:
          self._queue.put(R)
          self._queued += 1
          if self.closed and self._queued >= self._deferred:
            self._queue.close()

    LF.notify(on_complete)
    return LF

  def reaper(self, handler=None):
    ''' Dispatch a `Thread` to collect completed `LateFunction`s.
        Return the `Thread`.

        `handler`: an optional callable to be passed each `LateFunction`
        as it completes.
    '''

    @logexc
    def reap(Q):
      for LF in Q:
        if handler:
          try:
            handler(LF)
          except Exception as e:  # pylint: disable=broad-except
            exception("%s: reap %s: %s", self, LF, e)

    T = Thread(name="reaper(%s)" % (self,), target=reap, args=(self._queue,))
    T.start()
    return T
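
# Illustrative sketch of fanning work out through a SubLater and reaping the
# completed LateFunctions.  The parent Later `L` and the work function
# `process_file` are assumptions supplied by the caller for this example.
def _example_sublater(L, filenames, process_file):
  sub = SubLater(L)
  T = sub.reaper(lambda LF: print("completed:", LF))
  for filename in filenames:
    sub.defer(process_file, filename)
  sub.close()  # no more deferrals; the queue closes once all jobs complete
  T.join()  # wait for the reaper Thread to drain the completion queue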
class PacketConnection(object): ''' A bidirectional binary connection for exchanging requests and responses. ''' # special packet indicating end of stream EOF_Packet = Packet(is_request=True, channel=0, tag=0, flags=0, rq_type=0, payload=b'') # pylint: disable=too-many-arguments def __init__(self, recv, send, request_handler=None, name=None, packet_grace=None, tick=None): ''' Initialise the PacketConnection. Parameters: * `recv`: inbound binary stream. If this is an `int` it is taken to be an OS file descriptor, otherwise it should be a `cs.buffer.CornuCopyBuffer` or a file like object with a `read1` or `read` method. * `send`: outbound binary stream. If this is an `int` it is taken to be an OS file descriptor, otherwise it should be a file like object with `.write(bytes)` and `.flush()` methods. For a file descriptor sending is done via an os.dup() of the supplied descriptor, so the caller remains responsible for closing the original descriptor. * `packet_grace`: default pause in the packet sending worker to allow another packet to be queued before flushing the output stream. Default: `DEFAULT_PACKET_GRACE`s. A value of `0` will flush immediately if the queue is empty. * `request_handler`: an optional callable accepting (`rq_type`, `flags`, `payload`). The request_handler may return one of 5 values on success: * `None`: response will be 0 flags and an empty payload. * `int`: flags only. Response will be the flags and an empty payload. * `bytes`: payload only. Response will be 0 flags and the payload. * `str`: payload only. Response will be 0 flags and the str encoded as bytes using UTF-8. * `(int, bytes)`: Specify flags and payload for response. An unsuccessful request should raise an exception, which will cause a failure response packet. * `tick`: optional tick parameter, default `None`. If `None`, do nothing. If a Boolean, call `tick_fd_2` if true, otherwise do nothing. Otherwise `tick` should be a callable accepting a byteslike value. 
''' if name is None: name = str(seq()) self.name = name if isinstance(recv, int): self._recv = CornuCopyBuffer.from_fd(recv) elif isinstance(recv, CornuCopyBuffer): self._recv = recv else: self._recv = CornuCopyBuffer.from_file(recv) if isinstance(send, int): self._send = os.fdopen(os.dup(send), 'wb') else: self._send = send if packet_grace is None: packet_grace = DEFAULT_PACKET_GRACE if tick is None: tick = lambda bs: None elif isinstance(tick, bool): if tick: tick = tick_fd_2 else: tick = lambda bs: None self.packet_grace = packet_grace self.request_handler = request_handler self.tick = tick # tags of requests in play against the local system self._channel_request_tags = {0: set()} self.notify_recv_eof = set() self.notify_send_eof = set() # LateFunctions for the requests we are performing for the remote system self._running = set() # requests we have outstanding against the remote system self._pending = {0: {}} # sequence of tag numbers # TODO: later, reuse old tags to prevent monotonic growth of tag field self._tag_seq = Seq(1) # work queue for local requests self._later = Later(4, name="%s:Later" % (self, )) self._later.open() # dispatch queue of Packets to send self._sendQ = IterableQueue(16) self._lock = Lock() self.closed = False # debugging: check for reuse of (channel,tag) etc self.__sent = set() self.__send_queued = set() # dispatch Thread to process received packets self._recv_thread = bg_thread(self._receive_loop, name="%s[_receive_loop]" % (self.name, )) # dispatch Thread to send data # primary purpose is to bundle output by deferring flushes self._send_thread = bg_thread(self._send_loop, name="%s[_send]" % (self.name, )) def __str__(self): return "PacketConnection[%s]" % (self.name, ) @pfx_method def shutdown(self, block=False): ''' Shut down the PacketConnection, optionally blocking for outstanding requests. Parameters: `block`: block for outstanding requests, default False. ''' with self._lock: if self.closed: # shutdown already called from another thread return # prevent further request submission either local or remote self.closed = True ps = self._pending_states() if ps: warning("PENDING STATES AT SHUTDOWN: %r", ps) # wait for completion of requests we're performing for LF in list(self._running): LF.join() # shut down sender, should trigger shutdown of remote receiver self._sendQ.close(enforce_final_close=True) self._send_thread.join() # we do not wait for the receiver - anyone hanging on outstaning # requests will get them as they come in, and in theory a network # disconnect might leave the receiver hanging anyway self._later.close() if block: self._later.wait() def join(self): ''' Wait for the receive side of the connection to terminate. ''' self._recv_thread.join() def _new_tag(self): return next(self._tag_seq) def _pending_states(self): ''' Return a list of ( (channel, tag), Request_State ) for the currently pending requests. ''' states = [] pending = self._pending for channel, channel_states in sorted(pending.items()): for tag, channel_state in sorted(channel_states.items()): states.append(((channel, tag), channel_state)) return states @locked def _pending_add(self, channel, tag, state): ''' Record some state against a (channel, tag). 
''' pending = self._pending if channel not in pending: raise ValueError("unknown channel %d" % (channel, )) channel_info = pending[channel] if tag in channel_info: raise ValueError("tag %d already pending in channel %d" % (tag, channel)) self._pending[channel][tag] = state @locked def _pending_pop(self, channel, tag): ''' Retrieve and remove the state associated with (channel, tag). ''' pending = self._pending if channel not in pending: raise ValueError("unknown channel %d" % (channel, )) channel_info = pending[channel] if tag not in channel_info: raise ValueError("tag %d unknown in channel %d" % (tag, channel)) if False and tag == 15: raise RuntimeError("BANG") return channel_info.pop(tag) def _pending_cancel(self): ''' Cancel all the pending requests. ''' for chtag, _ in self._pending_states(): channel, tag = chtag warning("%s: cancel pending request %d:%s", self, channel, tag) _, result = self._pending_pop(channel, tag) result.cancel() def _queue_packet(self, P): sig = (P.channel, P.tag, P.is_request) if sig in self.__send_queued: raise RuntimeError("requeue of %s: %s" % (sig, P)) self.__send_queued.add(sig) try: self._sendQ.put(P) except ClosedError as e: warning("%s: packet not sent: %s (P=%s)", self._sendQ, e, P) def _reject(self, channel, tag, payload=bytes(())): ''' Issue a rejection of the specified request. ''' error("rejecting request: " + str(payload)) if isinstance(payload, str): payload = payload.encode('utf-8') self._queue_packet( Packet(is_request=False, channel=channel, tag=tag, flags=0, payload=payload)) def _respond(self, channel, tag, flags, payload): ''' Issue a valid response. Tack a 1 (ok) flag onto the flags and dispatch. ''' assert isinstance(channel, int) assert isinstance(tag, int) assert isinstance(flags, int) assert isinstance(payload, bytes) flags = (flags << 1) | 1 self._queue_packet( Packet(is_request=False, channel=channel, tag=tag, flags=flags, payload=payload)) @not_closed # pylint: disable=too-many-arguments def request(self, rq_type, flags=0, payload=b'', decode_response=None, channel=0): ''' Compose and dispatch a new request, returns a `Result`. Allocates a new tag, a Result to deliver the response, and records the response decode function for use when the response arrives. Parameters: * `rq_type`: request type code, an int * `flags`: optional flags to accompany the request, an int; default `0`. * `payload`: optional bytes-like object to accompany the request; default `b''` * `decode_response`: optional callable accepting (response_flags, response_payload_bytes) and returning the decoded response payload value; if unspecified, the response payload bytes are used The Result will yield an `(ok, flags, payload)` tuple, where: * `ok`: whether the request was successful * `flags`: the response flags * `payload`: the response payload, decoded by decode_response if specified ''' if rq_type < 0: raise ValueError("rq_type may not be negative (%s)" % (rq_type, )) # reserve type 0 for end-of-requests rq_type += 1 tag = self._new_tag() R = Result() self._pending_add(channel, tag, Request_State(decode_response, R)) self._queue_packet( Packet(is_request=True, channel=channel, tag=tag, flags=flags, rq_type=rq_type, payload=payload)) return R @not_closed def do(self, *a, **kw): ''' Synchronous request. Submits the request, then calls the `Result` returned from the request. 
''' return self.request(*a, **kw)() @logexc # pylint: disable=too-many-arguments def _run_request(self, channel, tag, handler, rq_type, flags, payload): ''' Run a request and queue a response packet. ''' with Pfx( "_run_request[channel=%d,tag=%d,rq_type=%d,flags=0x%02x,payload=%s", channel, tag, rq_type, flags, repr(payload) if len(payload) <= 32 else repr(payload[:32]) + '...'): result_flags = 0 result_payload = b'' try: result = handler(rq_type, flags, payload) if result is not None: if isinstance(result, int): result_flags = result elif isinstance(result, bytes): result_payload = result elif isinstance(result, str): result_payload = result.encode( encoding='utf-8', errors='xmlcharrefreplace') else: result_flags, result_payload = result except Exception as e: # pylint: disable=broad-except exception("exception: %s", e) self._reject(channel, tag, "exception during handler") else: self._respond(channel, tag, result_flags, result_payload) self._channel_request_tags[channel].remove(tag) # pylint: disable=too-many-branches,too-many-statements,too-many-locals def _receive_loop(self): ''' Receive packets from upstream, decode into requests and responses. ''' XX = self.tick with PrePfx("_RECEIVE [%s]", self): with post_condition(("_recv is None", lambda: self._recv is None)): while True: try: XX(b'<') packet = Packet.parse(self._recv) except EOFError: break if packet == self.EOF_Packet: break channel = packet.channel tag = packet.tag flags = packet.flags payload = packet.payload if packet.is_request: # request from upstream client with Pfx("request[%d:%d]", channel, tag): if self.closed: debug("rejecting request: closed") # NB: no rejection packet sent since sender also closed elif self.request_handler is None: self._reject(channel, tag, "no request handler") else: requests = self._channel_request_tags if channel not in requests: # unknown channel self._reject(channel, tag, "unknown channel %d") elif tag in self._channel_request_tags[ channel]: self._reject( channel, tag, "channel %d: tag already in use: %d" % (channel, tag)) else: # payload for requests is the request enum and data rq_type = packet.rq_type if rq_type == 0: # magic EOF rq_type - must be malformed (!=EOF_Packet) error( "malformed EOF packet received: %s", packet) break # normalise rq_type rq_type -= 1 requests[channel].add(tag) # queue the work function and track it LF = self._later.defer( self._run_request, channel, tag, self.request_handler, rq_type, flags, payload) self._running.add(LF) LF.notify(self._running.remove) else: with Pfx("response[%d:%d]", channel, tag): # response: get state of matching pending request, remove state try: rq_state = self._pending_pop(channel, tag) except ValueError as e: # no such pending pair - response to unknown request error("%d.%d: response to unknown request: %s", channel, tag, e) else: decode_response, R = rq_state # first flag is "ok" ok = (flags & 0x01) != 0 flags >>= 1 payload = packet.payload if ok: # successful reply # return (True, flags, decoded-response) if decode_response is None: # return payload bytes unchanged R.result = (True, flags, payload) else: # decode payload try: result = decode_response( flags, payload) except Exception: # pylint: disable=broad-except R.exc_info = sys.exc_info() else: R.result = (True, flags, result) else: # unsuccessful: return (False, other-flags, payload-bytes) R.result = (False, flags, payload) # end of received packets: cancel any outstanding requests self._pending_cancel() # alert any listeners of receive EOF for notify in self.notify_recv_eof: 
notify(self) self._recv = None self.shutdown() # pylint: disable=too-many-branches def _send_loop(self): ''' Send packets upstream. Write every packet directly to self._send. Flush whenever the queue is empty. ''' XX = self.tick ##with Pfx("%s._send", self): with PrePfx("_SEND [%s]", self): with post_condition(("_send is None", lambda: self._send is None)): fp = self._send Q = self._sendQ grace = self.packet_grace for P in Q: sig = (P.channel, P.tag, P.is_request) if sig in self.__sent: raise RuntimeError("second send of %s" % (P, )) self.__sent.add(sig) try: XX(b'>') for bs in P.transcribe_flat(): fp.write(bs) if Q.empty(): # no immediately ready further packets: flush the output buffer if grace > 0: # allow a little time for further Packets to queue XX(b'Sg') sleep(grace) if Q.empty(): # still nothing XX(b'F') fp.flush() else: XX(b'F') fp.flush() except OSError as e: if e.errno == errno.EPIPE: warning("remote end closed") break raise try: XX(b'>EOF') for bs in self.EOF_Packet.transcribe_flat(): fp.write(bs) fp.close() except (OSError, IOError) as e: if e.errno == errno.EPIPE: debug("remote end closed: %s", e) elif e.errno == errno.EBADF: warning("local end closed: %s", e) else: raise except Exception as e: error("(_SEND) UNEXPECTED EXCEPTION: %s %s", e, e.__class__) raise self._send = None
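
# Illustrative sketch of a pair of PacketConnections exchanging a request over
# a local socketpair (POSIX assumed, since raw file descriptors are used).
# The request type value and payload are arbitrary choices for the example.
def _example_packet_connection_echo():
  import socket
  sock_a, sock_b = socket.socketpair()

  def echo_handler(rq_type, flags, payload):
    return payload  # bytes return => flags 0 and this payload in the response

  server = PacketConnection(
      sock_a.fileno(), sock_a.fileno(),
      request_handler=echo_handler, name="server"
  )
  client = PacketConnection(sock_b.fileno(), sock_b.fileno(), name="client")
  ok, flags, payload = client.do(0, payload=b'ping')  # synchronous round trip
  print(ok, flags, payload)  # expect: True 0 b'ping'
  client.shutdown()
  server.shutdown()
  sock_a.close()
  sock_b.close()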