def pushto(self, dstS, *, capacity=64, progress=None):
  ''' Allocate a Queue for Blocks to push from this Store to another Store `dstS`.
      Return `(Q,T)` where `Q` is the new Queue and `T` is the Thread
      processing the Queue.

      Parameters:
      * `dstS`: the secondary Store to receive Blocks.
      * `capacity`: the Queue capacity, arbitrary default `64`.
      * `progress`: an optional `Progress` counting submitted and completed data bytes.

      Once called, the caller can then `.put` Blocks onto the Queue.
      When finished, call `Q.close()` to indicate end of Blocks
      and `T.join()` to wait for the processing completion.
  '''
  sem = Semaphore(capacity)
  ##sem = Semaphore(1)
  name = "%s.pushto(%s)" % (self.name, dstS.name)
  with Pfx(name):
    Q = IterableQueue(capacity=capacity, name=name)
    srcS = self
    srcS.open()
    dstS.open()
    T = bg_thread(
        lambda: (
            self.push_blocks(name, Q, srcS, dstS, sem, progress),
            srcS.close(),
            dstS.close(),
        )
    )
    return Q, T
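
# Sketch of driving pushto() through the protocol described in its docstring:
# put Blocks on the returned queue, close it, then join the worker Thread.
# `srcS`, `dstS` and `blocks` are assumed to exist.
def copy_blocks(srcS, dstS, blocks):
  Q, T = srcS.pushto(dstS, capacity=64)
  for block in blocks:
    Q.put(block)   # enqueue each Block for the push worker
  Q.close()        # no more Blocks
  T.join()         # wait for the push Thread to finish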
def report_offsets(bfr, run_parser):
  ''' Dispatch a parser in a separate Thread,
      return an `IterableQueue` yielding offsets.

      Parameters:
      * `bfr`: a `CornuCopyBuffer` providing data to parse
      * `run_parser`: a callable which runs the parser;
        it should accept a `CornuCopyBuffer` as its sole argument.

      This function allocates an `IterableQueue` to receive the parser
      offset reports and sets the `CornuCopyBuffer`'s `copy_offsets`
      so that `bfr.report_offset` copies offsets to the queue.

      It is the task of the parser to call `bfr.report_offset` as
      necessary to indicate suitable offsets.
  '''
  with Pfx("report_offsets(bfr,run_parser=%s)", run_parser):
    offsetQ = IterableQueue()
    if bfr.copy_offsets is not None:
      warning("bfr %s already has copy_offsets, replacing", bfr)
    bfr.copy_offsets = offsetQ.put

    def thread_body():
      with Pfx("parser-thread"):
        try:
          run_parser(bfr)
        except Exception as e:
          exception("exception: %s", e)
          raise
        finally:
          offsetQ.close()

    T = PfxThread(target=thread_body)
    T.start()
    return offsetQ
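
# Sketch of consuming the offsets reported via report_offsets() above;
# `my_parser` is a hypothetical callable which calls bfr.report_offset()
# while it scans the buffer.
def print_parse_offsets(bfr, my_parser):
  for offset in report_offsets(bfr, my_parser):
    # the queue closes when the parser thread finishes, ending this loop
    print("boundary at offset", offset)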
def keys(self):
  ''' Generator yielding the keys from each of the read Stores in `self.read`,
      omitting duplicates.
  '''
  seen = set()
  Q = IterableQueue()

  def keys_from(S):
    # copy keys from S to Q, then a None sentinel to mark completion
    for h in S.keys():
      Q.put(h)
    Q.put(None)

  busy = 0
  for S in self.read:
    bg_thread(partial(keys_from, S))
    busy += 1
  for h in Q:
    if h is None:
      # one keys_from worker has finished
      busy -= 1
      if not busy:
        Q.close()
    elif h not in seen:
      yield h
      seen.add(h)
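
# The None-sentinel fan-in used by keys() above generalises to merging any
# iterables via an IterableQueue: each producer thread marks completion with
# a sentinel and the consumer closes the queue after the last one reports in.
# A standalone sketch assuming only IterableQueue, bg_thread and partial as
# used above (and that the iterables never yield None themselves):
def merge_iterables(iterables):
  iterables = list(iterables)
  if not iterables:
    return
  Q = IterableQueue()
  busy = len(iterables)

  def producer(it):
    for item in it:
      Q.put(item)
    Q.put(None)  # sentinel: this producer is finished

  for it in iterables:
    bg_thread(partial(producer, it))
  for item in Q:
    if item is None:
      busy -= 1
      if not busy:
        Q.close()  # all producers done: end the iteration
    else:
      yield item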
def pipeline(later, actions, inputs=None, outQ=None, name=None):
  ''' Construct a function pipeline to be mediated by this Later queue.
      Return: `input, output`
      where `input` is a closeable queue on which more data items can be put
      and `output` is an iterable from which results can be collected.

      Parameters:
      * `actions`: an iterable of filter functions accepting single items
        from the iterable `inputs`, returning an iterable output.
      * `inputs`: the initial iterable inputs; this may be `None`.
        If missing or `None`, it is expected that the caller will be
        supplying input items via `input.put()`.
      * `outQ`: the optional output queue; if `None`, an `IterableQueue()`
        will be allocated.
      * `name`: name for the `PushQueue` implementing this pipeline.

      If `inputs` is `None`, the returned `input` requires a call to
      `input.close()` when no further inputs are to be supplied.

      Example use with presupplied Later `L`:

          input, output = L.pipeline(
              [
                  ls,
                  filter_ls,
                  (FUNC_MANY_TO_MANY, lambda items: sorted(list(items))),
              ],
              ('.', '..', '../..'),
          )
          for item in output:
            print(item)
  '''
  filter_funcs = list(actions)
  if not filter_funcs:
    raise ValueError("no actions")
  if outQ is None:
    outQ = IterableQueue(name="pipelineIQ")
  if name is None:
    name = "pipelinePQ"
  pipeline = Pipeline(name, later, filter_funcs, outQ)
  inQ = pipeline.inQ
  if inputs is not None:
    later.defer_iterable(inputs, inQ)
  else:
    debug(
        "%s._pipeline: no inputs, NOT setting up _defer_iterable(inputs, inQ=%r)",
        later, inQ
    )
  return pipeline
def dispatch(self, func, retq=None, deliver=None, pfx=None, daemon=None):
  ''' Dispatch the callable `func` in a separate thread.

      On completion the result is the sequence
      `func_result, None, None, None`.

      On an exception the result is the sequence
      `None, exc_type, exc_value, exc_traceback`.

      If `retq` is not `None`, the result is `.put()` on `retq`.
      If `deliver` is not `None`, `deliver(result)` is called.
      If the parameter `pfx` is not `None`, submit `pfx.partial(func)`;
      see the `cs.logutils.Pfx.partial` method for details.
      If `daemon` is not `None`, set the `.daemon` attribute of the
      `Thread` to `daemon`.

      TODO: high water mark for idle Threads.
  '''
  if self.closed:
    raise ValueError("%s: closed, but dispatch() called" % (self,))
  if pfx is not None:
    func = pfx.partial(func)
  if daemon is None:
    daemon = current_thread().daemon
  idle = self.idle_daemon if daemon else self.idle_fg
  with self._lock:
    debug("dispatch: idle = %s", idle)
    if idle:
      # use an idle thread
      entry = idle.pop()
      debug("dispatch: reuse %s", entry)
    else:
      debug("dispatch: need new thread")
      # no available threads - make one
      Targs = []
      T = Thread(
          target=self._handler,
          args=Targs,
          name=("%s:worker" % (self.name,))
      )
      T.daemon = daemon
      Q = IterableQueue(name="%s:IQ%d" % (self.name, seq()))
      entry = WTPoolEntry(T, Q)
      self.all.add(entry)
      Targs.append(entry)
      debug("%s: start new worker thread (daemon=%s)", self, T.daemon)
      T.start()
    entry.queue.put((func, retq, deliver))
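
# Illustrative use of the dispatch() result protocol documented above, via
# the deliver callback; `pool` is assumed to be an instance of the worker
# pool class providing this dispatch method.
def run_and_report(pool, func):
  def deliver(result):
    func_result, exc_type, exc_value, exc_traceback = result
    if exc_type is None:
      print("ok:", func_result)
    else:
      print("failed:", exc_type, exc_value)
  pool.dispatch(func, deliver=deliver)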
def s3scrape(bucket_pool, srcurl, doit=False, do_delete=False, do_upload=False): ''' Sync website to S3 directory tree. ''' global UPD ok = True L = Later(4, name="s3scrape(%r, %r)" % (bucket_pool.bucket_name, srcurl)) with L: if do_upload: Q = IterableQueue() def dispatch(): for LF in s3scrape_async(L, bucket_pool, srcurl, doit=doit, do_delete=do_delete): Q.put(LF) Q.close() Thread(target=dispatch).start() for LF in Q: diff, ctype, srcU, dstpath, e, error_msg = LF() with Pfx(srcU): if e: error(error_msg) ok = False else: line = "%s %-25s %s" % (diff.summary(), ctype, dstpath) if diff.unchanged: UPD.out(line) ##UPD.nl(line) else: if diff.changed_fields() == ['time']: # be quiet about time changes UPD.out(line) else: UPD.nl(line) ##UPD.nl(" %r", diff.metadata) if do_delete: # now process deletions with bucket_pool.instance() as B: ##if dstdir: ## dstdir_prefix = dstdir + RSEP ##else: ## dstdir_prefix = '' dstdir_prefix = RSEP with Pfx("S3.filter(Prefix=%r)", dstdir_prefix): dstdelpaths = [] for s3obj in B.objects.filter(Prefix=dstdir_prefix): dstpath = s3obj.key with Pfx(dstpath): if not dstpath.startswith(dstdir_prefix): error("unexpected dstpath, not in subdir") continue dstrpath = dstpath[len(dstdir_prefix):] if dstrpath.startswith(RSEP): error("unexpected dstpath, extra %r", RSEP) continue raise RuntimeError("DELETION UNIMPLEMENTED") srcpath = joinpath(srcdir, s32path(dstrpath, unpercent)) if os.path.exists(srcpath): ##info("src exists, not deleting (src=%r)", srcpath) continue ## uncomment if new %hh omissions surface ##UPD.nl("MISSING local %r", srcpath) if dstrpath.endswith(RSEP): # a folder UPD.nl("d DEL %s", dstpath) else: UPD.nl("* DEL %s", dstpath) dstdelpaths.append(dstpath) if dstdelpaths: dstdelpaths = sorted(dstdelpaths, reverse=True) while dstdelpaths: delpaths = dstdelpaths[:S3_MAX_DELETE_OBJECTS] if doit: result = B.delete_objects( Delete={ 'Objects': [{ 'Key': dstpath } for dstpath in delpaths] }) errs = result.get('Errors') if errs: ok = False for err in errors: error("delete: %s: %r", err['Message'], err['Key']) dstdelpaths[:len(delpaths)] = [] L.wait() return ok
def __init__(self):
  self.Q = IterableQueue(1024)
  self.bfr = CornuCopyBuffer(self.Q)
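
# The pairing above - an IterableQueue feeding a CornuCopyBuffer - lets one
# thread supply byte chunks while another performs buffered reads. A small
# standalone sketch using only the two classes and the takev() call seen
# elsewhere in this document:
def queue_fed_buffer_demo():
  Q = IterableQueue(1024)
  bfr = CornuCopyBuffer(Q)
  Q.put(b'hello ')
  Q.put(b'world')
  Q.close()                # end of input for the buffer
  chunks = bfr.takev(11)   # exactly 11 bytes, as a list of chunks
  return b''.join(chunks)  # b'hello world'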
def blocked_chunks_of2( chunks, *, scanner=None, min_block=None, max_block=None, ): ''' Generator which connects to a scanner of a chunk stream in order to emit low level edge aligned data chunks. Parameters: * `chunks`: a source iterable of data chunks, handed to `scanner` * `scanner`: optional callable accepting a `CornuCopyBuffer` and returning an iterable of `int`s, such as a generator. `scanner` may be `None`, in which case only the rolling hash is used to locate boundaries. * `min_block`: the smallest amount of data that will be used to create a Block, default from `MIN_BLOCKSIZE` (`{MIN_BLOCKSIZE}`) * `max_block`: the largest amount of data that will be used to create a Block, default from `MAX_BLOCKSIZE` (`{MAX_BLOCKSIZE}`) The iterable returned from `scanner(chunks)` yields `int`s which are considered desirable block boundaries. ''' if min_block is None: min_block = MIN_BLOCKSIZE elif min_block < 8: raise ValueError("rejecting min_block < 8: %s" % (min_block, )) if max_block is None: max_block = MAX_BLOCKSIZE elif max_block >= 1024 * 1024: raise ValueError("rejecting max_block >= 1024*1024: %s" % (max_block, )) if min_block >= max_block: raise ValueError("rejecting min_block:%d >= max_block:%d" % (min_block, max_block)) # source data for aligned chunk construction dataQ = IterableQueue() # queue of offsets from the parser offsetQ = IterableQueue() # copy chunks to the parser and also to the post-parser chunk assembler tee_chunks = tee(chunks, dataQ) parse_bfr = CornuCopyBuffer(tee_chunks) runstate = defaults.runstate def run_parser(runstate, bfr, min_block, max_block, offsetQ): ''' Thread body to scan `chunks` for offsets. The chunks are copied to `parseQ`, then their boundary offsets. If thwere is a scanner we scan the input data with it first. When it terminates (including from some exception), we scan the remaining chunks with scanbuf. The main function processes `parseQ` and uses its chunks and offsets to assemble aligned chunks of data. ''' try: offset = 0 if scanner: # Consume the chunks and offsets via a queue. # The scanner puts offsets onto the queue. # When the scanner fetches from the chunks, those chunks are copied to the queue. # Accordingly, chunks _should_ arrive before offsets within them. # pylint: disable=broad-except try: for offset in scanner(bfr): if runstate.cancelled: break # the scanner should yield only offsets, not chunks and offsets if not isinstance(offset, int): warning("discarding non-int from scanner %s: %s", scanner, offset) else: offsetQ.put(offset) except Exception as e: warning("exception from scanner %s: %s", scanner, e) # Consume the remainder of chunk_iter; the tee() will copy it to parseQ. # This is important to ensure that no chunk is missed. # We run these blocks through scanbuf() to find offsets. 
cso = bfr.offset # offset after all the chunks so far assert offset <= cso sofar = cso - offset if sofar >= max_block: offsetQ.put(cso) sofar = 0 for offset in scan(bfr, sofar=sofar, min_block=min_block, max_block=max_block): if runstate.cancelled: break offsetQ.put(cso + offset) finally: # end of offsets and chunks offsetQ.close() dataQ.close() # dispatch the parser bg_thread(run_parser, args=(runstate, parse_bfr, min_block, max_block, offsetQ), daemon=True) # data source for assembling aligned chunks data_bfr = CornuCopyBuffer(dataQ) sofar = 0 offset = None for offset in offsetQ: assert offset >= sofar block_size = offset - sofar assert block_size >= 0, ("block_size:%d <= 0 -- sofar=%d, offset=%d" % (block_size, sofar, offset)) if block_size < min_block: # skip over small edges assert scanner is not None, ( "scanner=None but still got an overly near offset" " (sofar=%d, offset=%d => block_size=%d < min_block:%d)" % (sofar, offset, block_size, min_block)) continue subchunks = data_bfr.takev(block_size) assert sum(map(len, subchunks)) == block_size if block_size > max_block: # break up overly long blocks without a parser assert scanner is not None, ( "scanner=None but still got an overly distant offset" " (sofar=%d, offset=%d => block_size=%d > max_block:%d)" % (sofar, offset, block_size, max_block)) yield from blocked_chunks_of2(subchunks, min_block=min_block, max_block=max_block) else: yield b''.join(subchunks) sofar += block_size bs = b''.join(data_bfr) if bs: assert len(bs) <= max_block yield bs
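
# Sketch of driving blocked_chunks_of2 from a file, with no scanner so that
# only the rolling hash picks the boundaries. This assumes the surrounding
# module's defaults.runstate is available, since the generator consults it.
def file_blocks(path):
  with open(path, 'rb') as f:
    chunks = iter(lambda: f.read(65536), b'')
    for block in blocked_chunks_of2(chunks):
      yield block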
def blocked_chunks_of( chunks, *, scanner=None, min_block=None, max_block=None, histogram=None, ): ''' Generator which connects to a scanner of a chunk stream in order to emit low level edge aligned data chunks. *OBSOLETE*: we now use the simpler and faster `blocked_chunks_of2`. Parameters: * `chunks`: a source iterable of data chunks, handed to `scanner` * `scanner`: optional callable accepting a `CornuCopyBuffer` and returning an iterable of `int`s, such as a generator. `scanner` may be `None`, in which case only the rolling hash is used to locate boundaries. * `min_block`: the smallest amount of data that will be used to create a Block, default from `MIN_BLOCKSIZE` (`{MIN_BLOCKSIZE}`) * `max_block`: the largest amount of data that will be used to create a Block, default from `MAX_BLOCKSIZE` (`{MAX_BLOCKSIZE}`) * `histogram`: if not `None`, a `defaultdict(int)` to collate counts. Integer indices count block sizes and string indices are used for `'bytes_total'` and `'bytes_hash_scanned'`. The iterable returned from `scanner(chunks)` yields `int`s which are considered desirable block boundaries. ''' # pylint: disable=too-many-nested-blocks,too-many-statements # pylint: disable=too-many-branches,too-many-locals with Pfx("blocked_chunks_of"): if min_block is None: min_block = MIN_BLOCKSIZE elif min_block < 8: raise ValueError("rejecting min_block < 8: %s" % (min_block, )) if max_block is None: max_block = MAX_BLOCKSIZE elif max_block >= 1024 * 1024: raise ValueError("rejecting max_block >= 1024*1024: %s" % (max_block, )) if min_block >= max_block: raise ValueError("rejecting min_block:%d >= max_block:%d" % (min_block, max_block)) # obtain iterator of chunks; this avoids accidentally reusing the chunks # if for example chunks is a sequence chunk_iter = iter(chunks) # Set up parseQ, an iterable yielding a mix of source data and # offsets representing desirable block boundaries. # If there is no scanner, this is just chunk_iter. # If there is a scanner we dispatch the scanner in a separate # Thread and feed it a tee() of chunk_iter, which copies chunks # to the parseQ when chunks are obtained by the scanner. The # Thread runs the scanner and copies its output offsets to the # parseQ. # The tee() arranges that chunks arrive before any offsets within them. if scanner is None: # No scanner, consume the chunks directly. parseQ = chunk_iter else: # Consume the chunks and offsets via a queue. # The scanner puts offsets onto the queue. # When the scanner fetches from the chunks, those chunks are copied to the queue. # When the scanner terminates, any remaining chunks are also copied to the queue. parseQ = IterableQueue() chunk_iter = tee(chunk_iter, parseQ) def run_parser(): ''' Thread body to run the supplied scanner against the input data. ''' bfr = CornuCopyBuffer(chunk_iter) # pylint: disable=broad-except try: for offset in scanner(bfr): # the scanner should yield only offsets, not chunks and offsets if not isinstance(offset, int): warning("discarding non-int from scanner %s: %s", scanner, offset) else: parseQ.put(offset) except Exception as e: exception("exception from scanner %s: %s", scanner, e) # Consume the remainder of chunk_iter; the tee() will copy it to parseQ. for _ in chunk_iter: pass # end of offsets and chunks parseQ.close() bg_thread(run_parser) # inbound chunks and offsets in_offsets = [] # heap of unprocessed edge offsets # prime `available_chunk` with the first data chunk, ready for get_next_chunk try: available_chunk = next(parseQ) except StopIteration: # no data! 
just return return def get_next_chunk(): ''' Fetch and return the next data chunk from the `parseQ`. Return None at end of input. Also gather all the following offsets from the queue before return. Because this inherently means collecting the chunk beyond these offsets, we keep that in `available_chunk` for the next call. Sets parseQ to None if the end of the iterable is reached. ''' nonlocal parseQ, in_offsets, hash_value, available_chunk if parseQ is None: assert available_chunk is None return None next_chunk = available_chunk available_chunk = None assert not isinstance(next_chunk, int) # scan the new chunk and load potential edges into the offset heap hash_value, chunk_scan_offsets = scanbuf(hash_value, next_chunk) for cso in chunk_scan_offsets: heappush(in_offsets, offset + cso) # gather items from the parseQ until the following chunk # or end of input while True: try: item = next(parseQ) except StopIteration: parseQ = None break else: if isinstance(item, int): heappush(in_offsets, item) else: available_chunk = item break return next_chunk last_offset = None first_possible_point = None max_possible_point = None def recompute_offsets(): ''' Recompute relevant offsets from the block parameters. The first_possible_point is last_offset+min_block, the earliest point at which we will accept a block boundary. The max_possible_point is last_offset+max_block, the latest point at which we will accept a block boundary; we will choose this if no next_offset or hash offset is found earlier. ''' nonlocal last_offset, first_possible_point, max_possible_point first_possible_point = last_offset + min_block max_possible_point = last_offset + max_block # prepare initial state last_offset = 0 # latest released boundary recompute_offsets( ) # compute first_possible_point and max_possible_point hash_value = 0 offset = 0 chunk0 = None offset0 = None # unblocked outbound data pending = _PendingBuffer(max_block) # Read data chunks and locate desired boundaries. 
while True: chunk = get_next_chunk() if chunk is None: break # verify current chunk start offset against end of previous chunk assert chunk0 is None or offset == offset0 + len(chunk0), \ "offset0=%s, len(chunk0)=%d: sum(%d) != current offset %d" \ % (offset0, len(chunk0), offset0 + len(chunk0), offset) chunk0 = chunk offset0 = offset chunk = memoryview(chunk) chunk_end_offset = offset + len(chunk) # process current chunk advance_by = 0 # how much data to add to the pending buffer release = False # whether we hit a boundary ==> flush the buffer while chunk: if advance_by > 0: # advance through this chunk # buffer the advance # release ==> flush the buffer and update last_offset assert advance_by is not None assert advance_by >= 0 assert advance_by <= len(chunk) # save the advance bytes and yield any overflow for out_chunk in pending.append(chunk[:advance_by]): yield out_chunk if histogram is not None: out_chunk_size = len(out_chunk) histogram['bytes_total'] += out_chunk_size histogram[out_chunk_size] += 1 histogram['buffer_overflow_chunks'] += 1 offset += advance_by chunk = chunk[advance_by:] if last_offset != pending.offset: # if the flush discarded a full buffer we need to adjust our boundaries last_offset = pending.offset recompute_offsets() if release: release = False # becomes true if we should flush after taking data # yield the current pending data for out_chunk in pending.flush(): yield out_chunk if histogram is not None: out_chunk_size = len(out_chunk) histogram['bytes_total'] += out_chunk_size histogram[out_chunk_size] += 1 last_offset = pending.offset recompute_offsets() if not chunk: # consumed the end of the chunk, need a new one break advance_by = None # fetch the next available edge, None if nothing available or suitable while True: if in_offsets: next_offset = heappop(in_offsets) if next_offset > offset and next_offset >= first_possible_point: break else: next_offset = None break if next_offset is None or next_offset > chunk_end_offset: # no suitable edge: consume the chunk and advance take_to = chunk_end_offset else: # edge before end of chunk: use it take_to = next_offset release = True advance_by = take_to - offset assert advance_by > 0 # yield any left over data for out_chunk in pending.flush(): yield out_chunk if histogram is not None: out_chunk_size = len(out_chunk) histogram['bytes_total'] += out_chunk_size histogram[out_chunk_size] += 1
def greedy(g=None, queue_depth=0): ''' A decorator or function for greedy computation of iterables. If `g` is omitted or callable this is a decorator for a generator function causing it to compute greedily, capacity limited by `queue_depth`. If `g` is iterable this function dispatches it in a `Thread` to compute greedily, capacity limited by `queue_depth`. Example with an iterable: for packet in greedy(parse_data_stream(stream)): ... process packet ... which does some readahead of the stream. Example as a function decorator: @greedy def g(n): for item in range(n): yield n This can also be used directly on an existing iterable: for item in greedy(range(n)): yield n Normally a generator runs on demand. This function dispatches a `Thread` to run the iterable (typically a generator) putting yielded values to a queue and returns a new generator yielding from the queue. The `queue_depth` parameter specifies the depth of the queue and therefore how many values the original generator can compute before blocking at the queue's capacity. The default `queue_depth` is `0` which creates a `Channel` as the queue - a zero storage buffer - which lets the generator compute only a single value ahead of time. A larger `queue_depth` allocates a `Queue` with that much storage allowing the generator to compute as many as `queue_depth+1` values ahead of time. Here's a comparison of the behaviour: Example without `@greedy` where the "yield 1" step does not occur until after the "got 0": >>> from time import sleep >>> def g(): ... for i in range(2): ... print("yield", i) ... yield i ... print("g done") ... >>> G = g(); sleep(0.1) >>> for i in G: ... print("got", i) ... sleep(0.1) ... yield 0 got 0 yield 1 got 1 g done Example with `@greedy` where the "yield 1" step computes before the "got 0": >>> from time import sleep >>> @greedy ... def g(): ... for i in range(2): ... print("yield", i) ... yield i ... print("g done") ... >>> G = g(); sleep(0.1) yield 0 >>> for i in G: ... print("got", repr(i)) ... sleep(0.1) ... yield 1 got 0 g done got 1 Example with `@greedy(queue_depth=1)` where the "yield 1" step computes before the "got 0": >>> from cs.x import X >>> from time import sleep >>> @greedy ... def g(): ... for i in range(3): ... X("Y") ... print("yield", i) ... yield i ... print("g done") ... >>> G = g(); sleep(2) yield 0 yield 1 >>> for i in G: ... print("got", repr(i)) ... sleep(0.1) ... yield 2 got 0 yield 3 got 1 g done got 2 ''' assert queue_depth >= 0 if g is None: # the parameterised @greedy(queue_depth=n) form # pylint: disable=no-value-for-parameter return _greedy_decorator(queue_depth=queue_depth) if callable(g): # the direct @greedy form return _greedy_decorator(g, queue_depth=queue_depth) # presumably an iterator - dispatch it in a Thread try: it = iter(g) except TypeError as e: # pylint: disable=raise-missing-from raise TypeError("g=%r: neither callable nor iterable: %s" % (g, e)) # pylint: disable=import-outside-toplevel from cs.queues import Channel, IterableQueue if queue_depth == 0: q = Channel() else: q = IterableQueue(queue_depth) def run_generator(): ''' Thread body for greedy generator. ''' try: for item in it: q.put(item) finally: q.close() Thread(target=run_generator).start() return iter(q)
class FilesDir(SingletonMixin, HashCodeUtilsMixin, MultiOpenMixin, RunStateMixin, FlaggedMixin, Mapping): ''' Base class indexing locally stored data in files for a specific hashclass. There are two main subclasses of this at present: * `DataDir`: the data are kept in a subdirectory of UUID-named files, supporting easy merging and updating. * `PlatonicDataDir`: the data are present in a normal file tree, such as a preexisting media server directory or the like. ''' STATE_FILENAME_FORMAT = 'index-{hashname}-state.sqlite' INDEX_FILENAME_BASE_FORMAT = 'index-{hashname}' DATA_ROLLOVER = DEFAULT_ROLLOVER _FD_Singleton_Key_Tuple = namedtuple( 'FilesDir_FD_Singleton_Key_Tuple', 'cls realdirpath hashclass indexclass rollover flags_id' ) @classmethod def _resolve(cls, *, hashclass, indexclass, rollover, flags, flags_prefix): ''' Resolve the `__init__()` arguments, shared by `__init__` and `_singleton_key`. ''' if indexclass is None: indexclass = choose_indexclass( cls.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashclass.HASHNAME) ) if rollover is None: rollover = cls.DATA_ROLLOVER elif rollover < 1024: raise ValueError( "rollover < 1024" " (a more normal size would be in megabytes or gigabytes): %r" % (rollover,) ) if flags is None: if flags_prefix is None: flags = DummyFlags() flags_prefix = 'DUMMY' else: if flags_prefix is None: raise ValueError("flags provided but no flags_prefix") return SimpleNamespace( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) @classmethod def _singleton_key( cls, topdirpath, *, hashclass, indexclass=None, rollover=None, flags=None, flags_prefix=None, **_, ): resolved = cls._resolve( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) return cls._FD_Singleton_Key_Tuple( cls=cls, realdirpath=realpath(topdirpath), hashclass=resolved.hashclass, indexclass=resolved.indexclass, rollover=resolved.rollover, flags_id=id(resolved.flags) ) @require(lambda topdirpath: isinstance(topdirpath, str)) @require(lambda hashclass: issubclass(hashclass, HashCode)) def __init__( self, topdirpath, *, hashclass, indexclass=None, rollover=None, flags=None, flags_prefix=None, ): ''' Initialise the `DataDir` with `topdirpath`. Parameters: * `topdirpath`: a directory containing state information about the `DataFile`s; this contains the index-state.csv file and the associated index dbm-ish files. * `hashclass`: the hashclass used for indexing * `indexclass`: the `IndexClass` providing the index to chunks in the `DataFile`s. If not specified, a supported index class with an existing index file will be chosen, otherwise the most favoured indexclass available will be chosen. * `rollover`: data file roll over size; if a data file grows beyond this a new datafile is commenced for new blocks. Default: `self.DATA_ROLLOVER`. * `flags`: optional `Flags` object for control; if specified then `flags_prefix` is also required. * `flags_prefix`: prefix for control flag names. Note that `__init__` only saves the settings such as the `indexclass` and ensures that requisite directories exist. The monitor thread and runtime state are set up by the `startup` method and closed down by the `shutdown` method. 
''' if hasattr(self, '_filemap'): return resolved = self._resolve( hashclass=hashclass, indexclass=indexclass, rollover=rollover, flags=flags, flags_prefix=flags_prefix ) RunStateMixin.__init__(self) MultiOpenMixin.__init__(self) FlaggedMixin.__init__( self, flags=resolved.flags, prefix=resolved.flags_prefix ) self.indexclass = resolved.indexclass self.rollover = resolved.rollover self.hashclass = hashclass self.hashname = hashclass.HASHNAME self.topdirpath = topdirpath self.statefilepath = joinpath( topdirpath, self.STATE_FILENAME_FORMAT.format(hashname=self.hashname) ) self.index = None self._filemap = None self._unindexed = None self._cache = None self._data_proxy = None self._dataQ = None self._data_progress = None self._monitor_Thread = None self._WDFstate = None self._lock = RLock() def __str__(self): return '%s(%s)' % (self.__class__.__name__, shortpath(self.topdirpath)) def __repr__(self): return ( '%s(topdirpath=%r,indexclass=%s)' % (self.__class__.__name__, self.topdirpath, self.indexclass) ) def initdir(self): ''' Init a directory and its "data" subdirectory. ''' topdirpath = self.topdirpath if not isdirpath(topdirpath): info("mkdir %r", topdirpath) with Pfx("mkdir(%r)", topdirpath): os.mkdir(topdirpath) datasubdirpath = joinpath(topdirpath, 'data') if not isdirpath(datasubdirpath): info("mkdir %r", datasubdirpath) with Pfx("mkdir(%r)", datasubdirpath): os.mkdir(datasubdirpath) @contextmanager def startup_shutdown(self): ''' Start up and shut down the `FilesDir`: take locks, start worker threads etc. ''' self.initdir() self._rfds = {} self._unindexed = {} self._filemap = SqliteFilemap(self, self.statefilepath) hashname = self.hashname self.index = self.indexclass( self.pathto(self.INDEX_FILENAME_BASE_FORMAT.format(hashname=hashname)) ) self.index.open() self.runstate.start() # cache of open DataFiles self._cache = LRU_Cache( maxsize=4, on_remove=lambda k, datafile: datafile.close() ) # Set up data queue. # The .add() method adds the data to self._unindexed, puts the # data onto the data queue, and returns. # The data queue worker saves the data to backing files and # updates the indices. self._data_progress = Progress( name=str(self) + " data queue ", total=0, units_scale=BINARY_BYTES_SCALE, ) if defaults.show_progress: proxy_cmgr = upd_state.upd.insert(1) else: proxy_cmgr = nullcontext() with proxy_cmgr as data_proxy: self._data_proxy = data_proxy self._dataQ = IterableQueue(65536) self._data_Thread = bg_thread( self._data_queue, name="%s._data_queue" % (self,), ) self._monitor_Thread = bg_thread( self._monitor_datafiles, name="%s-datafile-monitor" % (self,), ) yield self.runstate.cancel() self.flush() # shut down the monitor Thread mon_thread = self._monitor_Thread if mon_thread is not None: mon_thread.join() self._monitor_Thread = None # drain the data queue self._dataQ.close() self._data_Thread.join() self._dataQ = None self._data_thread = None # update state to substrate self._cache = None self._filemap.close() self._filemap = None self.index.close() # close the read file descriptors for rfd in self._rfds.values(): with Pfx("os.close(rfd:%d)", rfd): os.close(rfd) del self._rfds self.runstate.stop() def pathto(self, rpath): ''' Return the path to `rpath`, which is relative to the `topdirpath`. ''' return joinpath(self.topdirpath, rpath) def datapathto(self, rpath): ''' Return the path to `rpath`, which is relative to the `datadirpath`. ''' return self.pathto(joinpath('data', rpath)) @typechecked def new_datafile(self) -> DataFileState: ''' Create a new datafile. 
Return its `DataFileState`. ''' while True: filename = str(uuid4()) + self.DATA_DOT_EXT pathname = self.datapathto(filename) if existspath(pathname): error("new datafile path already exists, retrying: %r", pathname) continue with Pfx(pathname): try: createpath(pathname) except OSError as e: if e.errno == errno.EEXIST: error("new datafile path already exists") continue raise break return self._filemap.add_path(filename) def add(self, data): ''' Add `data` to the cache, queue data for indexing, return hashcode. ''' hashcode = self.hashclass.from_chunk(data) if hashcode not in self._unindexed: self._unindexed[hashcode] = data self._data_progress.total += len(data) self._dataQ.put(data) return hashcode def _data_queue(self): wf = None DFstate = None filenum = None index = self.index unindexed = self._unindexed dataQ = self._dataQ progress = self._data_progress hashchunk = self.hashclass.from_chunk batch_size = 128 def data_batches(dataQ, batch_size): for data in dataQ: # assemble up to 64 chunks at a time data_batch = [data] while not dataQ.empty() and len(data_batch) < batch_size: data_batch.append(next(dataQ)) yield data_batch data_batch = None batches = data_batches(dataQ, batch_size) if defaults.show_progress: batches = progress.iterbar( batches, itemlenfunc=lambda batch: sum(map(len, batch)), proxy=self._data_proxy ) for data_batch in batches: batch_length = len(data_batch) ##print("data batch of", batch_length) # FileDataIndexEntry by hashcode for batch update of index after flush entry_bs_by_hashcode = {} for data in data_batch: hashcode = hashchunk(data) if hashcode not in index: # new data, save to a datafile and update the index # pretranscribe the in-file data record # save the data record to the current file if wf is None: DFstate = self.new_datafile() filenum = DFstate.filenum wf = open(DFstate.pathname, 'ab') self._WDFstate = DFstate bs, data_offset, data_length, flags = self.data_save_information( data ) offset = wf.tell() wf.write(bs) length = len(bs) post_offset = offset + length # make a record for this chunk entry_bs_by_hashcode[hashcode] = bytes( FileDataIndexEntry( filenum=filenum, data_offset=offset + data_offset, data_length=data_length, flags=flags, ) ) # after the batch, flush and roll over if beyond the high water mark if wf is not None: wf.flush() with self._lock: for hashcode, entry_bs in entry_bs_by_hashcode.items(): index[hashcode] = entry_bs try: del unindexed[hashcode] except KeyError: # this can happen when the same key is indexed twice # entirely plausible if a new datafile is added to the datadir pass # note that the index is up to post_offset DFstate.indexed_to = post_offset rollover = self.rollover if rollover is not None and wf.tell() >= rollover: # file now full, close it so as to start a new one on next write os.close(wfd) wfd = None self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to) DFstate = None if batch_length < batch_size: sleep(0.2) if wf is not None: wf.close() wf = None if DFstate is not None: self._filemap.set_indexed_to(DFstate.filenum, DFstate.indexed_to) def get_Archive(self, name=None, **kw): ''' Return the Archive named `name`. If `name` is omitted or `None` the Archive path is the `topdirpath` plus the extension `'.vt'`. Otherwise it is the `topdirpath` plus a dash plus the `name` plus the extension `'.vt'`. The `name` may not be empty or contain a dot or a dash. ''' with Pfx("%s.get_Archive", self): if name is None or not name: archivepath = self.topdirpath + '.vt' else: if '.' 
in name or '/' in name: raise ValueError("invalid name: %r" % (name,)) archivepath = self.topdirpath + '-' + name + '.vt' return Archive(archivepath, **kw) @locked def flush(self): ''' Flush all the components. ''' self._cache.flush() self.index.flush() def __setitem__(self, hashcode, data): h = self.add(data) if hashcode != h: raise ValueError( 'supplied hashcode %s does not match data, data added under %s instead' % (hashcode, h) ) def __len__(self): return len(self.index) @pfx_method def hashcodes_from(self, *, start_hashcode=None): ''' Generator yielding the hashcodes from the database in order starting with optional `start_hashcode`. Parameters: * `start_hashcode`: the first hashcode; if missing or `None`, iteration starts with the first key in the index ''' # important: consult this BEFORE self.index.keys otherwise items might # flow from unindexed to the index unseen with self._lock: unindexed = list(self._unindexed) if start_hashcode is not None and unindexed: unindexed = filter(lambda h: h >= start_hashcode, unindexed) hs = map( self.hashclass, self.index.sorted_keys(start_hashcode=start_hashcode), ) unindexed = set(unindexed) if unindexed: hs = filter(lambda h: h not in unindexed, hs) return imerge(hs, sorted(unindexed)) def __iter__(self): return self.hashcodes_from() # without this "in" tries to iterate over the mapping with int indices def __contains__(self, hashcode): return hashcode in self._unindexed or hashcode in self.index def __getitem__(self, hashcode): ''' Return the decompressed data associated with the supplied `hashcode`. ''' unindexed = self._unindexed try: return unindexed[hashcode] except KeyError: index = self.index try: with self._lock: entry_bs = index[hashcode] except KeyError: raise KeyError("%s[%s]: hash not in index" % (self, hashcode)) entry = FileDataIndexEntry.from_bytes(entry_bs) filenum = entry.filenum try: try: rfd = self._rfds[filenum] except KeyError: # TODO: shove this sideways to self.open_datafile # which releases an existing datafile if too many are open DFstate = self._filemap[filenum] rfd = self._rfds[filenum] = openfd_read(DFstate.pathname) return entry.fetch_fd(rfd) except Exception as e: exception("%s[%s]:%s not available: %s", self, hashcode, entry, e) raise KeyError(str(hashcode)) from e
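
# A sketch of the round trip through FilesDir.add() and __getitem__ above.
# `filesdir` is assumed to be an already opened FilesDir subclass instance
# (e.g. a DataDir); the open/close lifecycle itself is managed by
# startup_shutdown() and is not shown here.
def store_and_fetch(filesdir, data):
  # add() caches the data as unindexed, queues it for the data-queue worker
  # and returns the hashcode immediately
  hashcode = filesdir.add(data)
  # the data is retrievable at once: first from the unindexed cache,
  # later from the indexed data files
  assert filesdir[hashcode] == data
  return hashcode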
def _monitor_datafiles(self): ''' Thread body to poll the ideal tree for new or changed files. ''' proxy = upd_state.proxy proxy.prefix = str(self) + " monitor " meta_store = self.meta_store filemap = self._filemap datadirpath = self.pathto('data') if meta_store is not None: topdir = self.topdir else: warning("%s: no meta_store!", self) updated = False disabled = False while not self.cancelled: sleep(self.DELAY_INTERSCAN) if self.flag_scan_disable: if not disabled: info("scan %r DISABLED", shortpath(datadirpath)) disabled = True continue if disabled: info("scan %r ENABLED", shortpath(datadirpath)) disabled = False # scan for new datafiles with Pfx("%r", datadirpath): seen = set() info("scan tree...") with proxy.extend_prefix(" scan"): for dirpath, dirnames, filenames in os.walk(datadirpath, followlinks=True): dirnames[:] = sorted(dirnames) filenames = sorted(filenames) sleep(self.DELAY_INTRASCAN) if self.cancelled or self.flag_scan_disable: break rdirpath = relpath(dirpath, datadirpath) with Pfx(rdirpath): with (proxy.extend_prefix(" " + rdirpath) if filenames else nullcontext()): # this will be the subdirectories into which to recurse pruned_dirnames = [] for dname in dirnames: if self.exclude_dir(joinpath(rdirpath, dname)): # unwanted continue subdirpath = joinpath(dirpath, dname) try: S = os.stat(subdirpath) except OSError as e: # inaccessable warning("stat(%r): %s, skipping", subdirpath, e) continue ino = S.st_dev, S.st_ino if ino in seen: # we have seen this subdir before, probably via a symlink # TODO: preserve symlinks? attach alter ego directly as a Dir? debug( "seen %r (dev=%s,ino=%s), skipping", subdirpath, ino[0], ino[1] ) continue seen.add(ino) pruned_dirnames.append(dname) dirnames[:] = pruned_dirnames if meta_store is None: warning("no meta_store") D = None else: with meta_store: D = topdir.makedirs(rdirpath, force=True) # prune removed names names = list(D.keys()) for name in names: if name not in dirnames and name not in filenames: info("del %r", name) del D[name] for filename in filenames: with Pfx(filename): if self.cancelled or self.flag_scan_disable: break rfilepath = joinpath(rdirpath, filename) if self.exclude_file(rfilepath): continue filepath = joinpath(dirpath, filename) if not isfilepath(filepath): continue # look up this file in our file state index DFstate = filemap.get(rfilepath) if (DFstate is not None and D is not None and filename not in D): # in filemap, but not in dir: start again warning("in filemap but not in Dir, rescanning") filemap.del_path(rfilepath) DFstate = None if DFstate is None: DFstate = filemap.add_path(rfilepath) try: new_size = DFstate.stat_size(self.follow_symlinks) except OSError as e: if e.errno == errno.ENOENT: warning("forgetting missing file") self._del_datafilestate(DFstate) else: warning("stat: %s", e) continue if new_size is None: # skip non files debug("SKIP non-file") continue if meta_store: try: E = D[filename] except KeyError: E = FileDirent(filename) D[filename] = E else: if not E.isfile: info( "new FileDirent replacing previous nonfile: %s", E ) E = FileDirent(filename) D[filename] = E if new_size > DFstate.scanned_to: with proxy.extend_prefix( " scan %s[%d:%d]" % (filename, DFstate.scanned_to, new_size)): if DFstate.scanned_to > 0: info("scan from %d", DFstate.scanned_to) if meta_store is not None: blockQ = IterableQueue() R = meta_store._defer( lambda B, Q: top_block_for(spliced_blocks(B, Q)), E.block, blockQ ) scan_from = DFstate.scanned_to scan_start = time() scanner = DFstate.scanfrom(offset=DFstate.scanned_to) if 
defaults.show_progress: scanner = progressbar( DFstate.scanfrom(offset=DFstate.scanned_to), "scan " + rfilepath, position=DFstate.scanned_to, total=new_size, units_scale=BINARY_BYTES_SCALE, itemlenfunc=lambda t3: t3[2] - t3[0], update_frequency=128, ) for pre_offset, data, post_offset in scanner: hashcode = self.hashclass.from_chunk(data) entry = FileDataIndexEntry( filenum=DFstate.filenum, data_offset=pre_offset, data_length=len(data), flags=0, ) entry_bs = bytes(entry) with self._lock: index[hashcode] = entry_bs if meta_store is not None: B = Block(data=data, hashcode=hashcode, added=True) blockQ.put((pre_offset, B)) DFstate.scanned_to = post_offset if self.cancelled or self.flag_scan_disable: break if meta_store is not None: blockQ.close() try: top_block = R() except MissingHashcodeError as e: error("missing data, forcing rescan: %s", e) DFstate.scanned_to = 0 else: E.block = top_block D.changed = True updated = True elapsed = time() - scan_start scanned = DFstate.scanned_to - scan_from if elapsed > 0: scan_rate = scanned / elapsed else: scan_rate = None if scan_rate is None: info( "scanned to %d: %s", DFstate.scanned_to, transcribe_bytes_geek(scanned) ) else: info( "scanned to %d: %s at %s/s", DFstate.scanned_to, transcribe_bytes_geek(scanned), transcribe_bytes_geek(scan_rate) ) # stall after a file scan, briefly, to limit impact if elapsed > 0: sleep(min(elapsed, self.DELAY_INTRASCAN)) # update the archive after updating from a directory if updated and meta_store is not None: self.sync_meta() updated = False self.flush()
def __init__(self, recv, send, request_handler=None, name=None, packet_grace=None, tick=None): ''' Initialise the PacketConnection. Parameters: * `recv`: inbound binary stream. If this is an `int` it is taken to be an OS file descriptor, otherwise it should be a `cs.buffer.CornuCopyBuffer` or a file like object with a `read1` or `read` method. * `send`: outbound binary stream. If this is an `int` it is taken to be an OS file descriptor, otherwise it should be a file like object with `.write(bytes)` and `.flush()` methods. For a file descriptor sending is done via an os.dup() of the supplied descriptor, so the caller remains responsible for closing the original descriptor. * `packet_grace`: default pause in the packet sending worker to allow another packet to be queued before flushing the output stream. Default: `DEFAULT_PACKET_GRACE`s. A value of `0` will flush immediately if the queue is empty. * `request_handler`: an optional callable accepting (`rq_type`, `flags`, `payload`). The request_handler may return one of 5 values on success: * `None`: response will be 0 flags and an empty payload. * `int`: flags only. Response will be the flags and an empty payload. * `bytes`: payload only. Response will be 0 flags and the payload. * `str`: payload only. Response will be 0 flags and the str encoded as bytes using UTF-8. * `(int, bytes)`: Specify flags and payload for response. An unsuccessful request should raise an exception, which will cause a failure response packet. * `tick`: optional tick parameter, default `None`. If `None`, do nothing. If a Boolean, call `tick_fd_2` if true, otherwise do nothing. Otherwise `tick` should be a callable accepting a byteslike value. ''' if name is None: name = str(seq()) self.name = name if isinstance(recv, int): self._recv = CornuCopyBuffer.from_fd(recv) elif isinstance(recv, CornuCopyBuffer): self._recv = recv else: self._recv = CornuCopyBuffer.from_file(recv) if isinstance(send, int): self._send = os.fdopen(os.dup(send), 'wb') else: self._send = send if packet_grace is None: packet_grace = DEFAULT_PACKET_GRACE if tick is None: tick = lambda bs: None elif isinstance(tick, bool): if tick: tick = tick_fd_2 else: tick = lambda bs: None self.packet_grace = packet_grace self.request_handler = request_handler self.tick = tick # tags of requests in play against the local system self._channel_request_tags = {0: set()} self.notify_recv_eof = set() self.notify_send_eof = set() # LateFunctions for the requests we are performing for the remote system self._running = set() # requests we have outstanding against the remote system self._pending = {0: {}} # sequence of tag numbers # TODO: later, reuse old tags to prevent monotonic growth of tag field self._tag_seq = Seq(1) # work queue for local requests self._later = Later(4, name="%s:Later" % (self, )) self._later.open() # dispatch queue of Packets to send self._sendQ = IterableQueue(16) self._lock = Lock() self.closed = False # debugging: check for reuse of (channel,tag) etc self.__sent = set() self.__send_queued = set() # dispatch Thread to process received packets self._recv_thread = bg_thread(self._receive_loop, name="%s[_receive_loop]" % (self.name, )) # dispatch Thread to send data # primary purpose is to bundle output by deferring flushes self._send_thread = bg_thread(self._send_loop, name="%s[_send]" % (self.name, ))
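
# Minimal construction sketch for a pair of PacketConnections talking over a
# Unix socketpair, based on the recv/send parameter descriptions above. The
# echo request handler is illustrative only; request submission and shutdown
# are not shown.
import socket

def echo_handler(rq_type, flags, payload):
  # reply with the same flags and payload
  return flags, payload

def loopback_connections():
  a, b = socket.socketpair()
  # integer file descriptors are accepted for both recv and send
  server = PacketConnection(
      a.fileno(), a.fileno(), request_handler=echo_handler, name="server"
  )
  client = PacketConnection(b.fileno(), b.fileno(), name="client")
  return client, server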
class FileDataMappingProxy(MultiOpenMixin, RunStateMixin): ''' Mapping-like class to cache data chunks to bypass gdbm indices and the like. Data are saved immediately into an in memory cache and an asynchronous worker copies new data into a cache file and also to the backend storage. ''' @pfx_method def __init__( self, backend, *, dirpath=None, max_cachefile_size=None, max_cachefiles=None, runstate=None, ): ''' Initialise the cache. Parameters: * `backend`: mapping underlying us * `dirpath`: directory to store cache files * `max_cachefile_size`: maximum cache file size; a new cache file is created if this is exceeded; default: DEFAULT_CACHEFILE_HIGHWATER * `max_cachefiles`: number of cache files to keep around; no more than this many cache files are kept at a time; default: DEFAULT_MAX_CACHEFILES ''' RunStateMixin.__init__(self, runstate=runstate) if max_cachefile_size is None: max_cachefile_size = DEFAULT_CACHEFILE_HIGHWATER if max_cachefiles is None: max_cachefiles = DEFAULT_MAX_CACHEFILES self.backend = backend if not isdirpath(dirpath): raise ValueError("dirpath=%r: not a directory" % (dirpath, )) self.dirpath = dirpath self.max_cachefile_size = max_cachefile_size self.max_cachefiles = max_cachefiles self.cached = {} # map h => data self.saved = {} # map h => _CachedData(cachefile, offset, length) self._lock = Lock() self.cachefiles = [] self._add_cachefile() self._workQ = None self._worker = None self.runstate.notify_cancel.add(lambda rs: self.close()) def startup(self): ''' Startup the proxy. ''' self._workQ = IterableQueue() self._worker = Thread(name="%s WORKER" % (self, ), target=self._work) self._worker.start() @pfx_method def shutdown(self): ''' Shut down the cache. Stop the worker, close the file cache. ''' self._workQ.close() self._worker.join() if self.cached: error("blocks still in memory cache: %r", self.cached) for cachefile in self.cachefiles: cachefile.close() def _add_cachefile(self): cachefile = RWFileBlockCache(dirpath=self.dirpath) self.cachefiles.insert(0, cachefile) if len(self.cachefiles) > self.max_cachefiles: old_cachefile = self.cachefiles.pop() old_cachefile.close() def _getref(self, h): ''' Fetch a cache reference from self.saved, return None if missing. Automatically prune stale saved entries if the cachefile is closed. ''' saved = self.saved ref = saved.get(h) if ref is not None: if ref.cachefile.closed: ref = None del saved[h] return ref def __contains__(self, h): ''' Mapping method supporting "in". ''' with self._lock: if h in self.cached: return True if self._getref(h) is not None: return True backend = self.backend if backend: return h in backend return False def keys(self): ''' Mapping method for .keys. ''' seen = set() for h in list(self.cached.keys()): yield h seen.add(h) saved = self.saved with self._lock: saved_keys = list(saved.keys()) for h in saved_keys: if h not in seen and self._getref(h): yield h seen.add(h) backend = self.backend if backend: for h in backend.keys(): if h not in seen: yield h def __getitem__(self, h): ''' Fetch the data with key `h`. Raise KeyError if missing. 
''' with self._lock: # fetch from memory try: data = self.cached[h] except KeyError: # fetch from file ref = self._getref(h) if ref is not None: return ref.fetch() else: # straight from memory cache return data # not in memory or file cache: fetch from backend, queue store into cache backend = self.backend if not backend: raise KeyError('no backend: h=%s' % (h, )) data = backend[h] with self._lock: self.cached[h] = data self._workQ.put((h, data, False)) return data def __setitem__(self, h, data): ''' Store `data` against key `h`. ''' with self._lock: if h in self.cached: # in memory cache, do not save return if self._getref(h): # in file cache, do not save return # save in memory cache self.cached[h] = data # queue for file cache and backend self._workQ.put((h, data, True)) def _work(self): for h, data, in_backend in self._workQ: with self._lock: if self._getref(h): # already in file cache, therefore already sent to backend continue cachefile = self.cachefiles[0] offset = cachefile.put(data) with self._lock: self.saved[h] = CachedData(cachefile, offset, len(data)) # release memory cache entry try: del self.cached[h] except KeyError: pass if offset + len(data) >= self.max_cachefile_size: # roll over to new cache file self._add_cachefile() # store into the backend if not in_backend: backend = self.backend if backend: self.backend[h] = data
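
# Sketch of FileDataMappingProxy as a write-through cache in front of a
# slower mapping. The plain dict backend, the sha1-based key and the direct
# startup()/shutdown() calls are illustrative assumptions; normally the
# MultiOpenMixin open()/close() protocol would drive startup and shutdown.
import hashlib

def cache_demo(tmpdirpath):
  backend = {}  # stand-in for a gdbm-style mapping
  proxy = FileDataMappingProxy(backend, dirpath=tmpdirpath)
  proxy.startup()
  try:
    data = b'some chunk of data'
    h = hashlib.sha1(data).digest()  # placeholder key scheme
    proxy[h] = data                  # memory cache now, file + backend via the worker
    assert proxy[h] == data
  finally:
    proxy.shutdown()                 # close the work queue and join the worker
  return backend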
class POP3(MultiOpenMixin): ''' Simple POP3 class with support for streaming use. ''' def __init__(self, conn_spec): if isinstance(conn_spec, str): conn_spec = ConnectionSpec.from_spec(conn_spec) self.conn_spec = conn_spec self._result_queue = None self._client_worker = None self._sock = None self.recvf = None self.sendf = None self._lock = RLock() @pfx def startup(self): ''' Connect to the server and log in. ''' self._sock = self.conn_spec.connect() self.recvf = self._sock.makefile('r', encoding='iso8859-1') self.sendf = self._sock.makefile('w', encoding='ascii') self.client_begin() self.client_auth(self.conn_spec.user, self.conn_spec.password) self._result_queue = IterableQueue() self._client_worker = bg_thread( self._client_response_worker, args=(self._result_queue,) ) return self @pfx def shutdown(self): ''' Quit and disconnect. ''' logmsg = debug logmsg("send client QUIT") try: quitR = self.client_quit_bg() logmsg("flush QUIT") self.flush() logmsg("join QUIT") quitR.join() except Exception as e: exception("client quit: %s", e) logmsg = warning if self._result_queue: logmsg("close result queue") self._result_queue.close() self._result_queue = None if self._client_worker: logmsg("join client worker") self._client_worker.join() self._client_worker = None logmsg("close sendf") self.sendf.close() self.sendf = None logmsg("check for uncollected server responses") bs = self.recvf.read() if bs: warning("received %d bytes from the server at shutdown", len(bs)) logmsg("close recvf") self.recvf.close() self.recvf = None logmsg("close socket") self._sock.close() self._sock = None logmsg("shutdown complete") def readline(self): ''' Read a CRLF terminated line from `self.recvf`. Return the text preceeding the CRLF. Return `None` at EOF. ''' line0 = self.recvf.readline() if not line0: return None line = cutsuffix(line0, '\n') assert line is not line0, "missing LF: %r" % (line0,) line = cutsuffix(line, '\r') return line def readlines(self): ''' Generator yielding lines from `self.recf`. ''' while True: line = self.readline() if line is None: break yield line def get_response(self): ''' Read a server response. Return `(ok,status,etc)` where `ok` is true if `status` is `'+OK'`, false otherwise; `status` is the status word and `etc` is the following text. Return `(None,None,None)` on EOF from the receive stream. ''' line = self.readline() if line is None: return None, None, None try: status, etc = line.split(None, 1) except ValueError: status = line etc = '' return status == '+OK', status, etc def get_ok(self): ''' Read server response, require it to be `'OK+'`. Returns the `etc` part. ''' ok, status, etc = self.get_response() if not ok: raise ValueError("no ok from server: %r %r" % (status, etc)) return etc def get_multiline(self): ''' Generator yielding unstuffed lines from a multiline response. ''' for line in self.readlines(): if line == '.': break if line.startswith('.'): line = line[1:] yield line def flush(self): ''' Flush the send stream. ''' self.sendf.flush() def sendline(self, line, do_flush=False): ''' Send a line (excluding its terminating CRLF). If `do_flush` is true (default `False`) also flush the sending stream. ''' assert '\r' not in line and '\n' not in line self.sendf.write(line) self.sendf.write('\r\n') if do_flush: self.flush() def _client_response_worker(self, result_queue): ''' Worker to process queued request responses. 
Each completed response assigns `(etc,lines)` to the `Result` where `etc` is the addition text from the server ok response and `lines` is a list of the multiline part of the response or `None` if the response is not multiline. ''' for R, is_multiline in result_queue: try: etc = self.get_ok() if is_multiline: lines = list(self.get_multiline()) else: lines = None except Exception as e: # pylint: disable=broad-except warning("%s: %s", R, e) R.exc_info = sys.exc_info else: # save a list so that we can erase it in a handler to release memory R.result = [etc, lines] def client_begin(self): ''' Read the opening server response. ''' etc = self.get_ok() print(etc) def client_auth(self, user, password): ''' Perform a client authentication. ''' self.sendline(f'USER {user}', do_flush=True) print('USER', user, self.get_ok()) self.sendline(f'PASS {password}', do_flush=True) print('PASS', '****', self.get_ok()) def client_uidl(self): ''' Return a mapping of message number to message UID string. ''' self.sendline('UIDL', do_flush=True) self.get_ok() for line in self.get_multiline(): n, msg_uid = line.split(None, 1) n = int(n) yield n, msg_uid def client_bg(self, rq_line, is_multiline=False, notify=None): ''' Dispatch a request `rq_line` in the background. Return a `Result` to collect the request result. Parameters: * `rq_line`: POP3 request text, without any terminating CRLF * `is_multiline`: true if a multiline response is expected, default `False` * `notify`: a optional handler for `Result.notify`, applied if not `None` *Note*: DOES NOT flush the send stream. Call `self.flush()` when a batch of requests has been submitted, before trying to collect the `Result`s. The `Result` will receive `[etc,lines]` on success where: * `etc` is the trailing portion of an ok response line * `lines` is a list of unstuffed text lines from the response if `is_multiline` is true, `None` otherwise The `Result` gets a list instead of a tuple so that a handler may clear it in order to release memory. Example: R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify) ''' with self._lock: self.sendline(rq_line) R = Result(rq_line) self._result_queue.put((R, is_multiline)) R.extra.update(rq_line=rq_line) if notify is not None: R.notify(notify) return R def client_dele_bg(self, msg_n): ''' Queue a delete request for message `msg_n`, return ` Result` for collection. ''' R = self.client_bg(f'DELE {msg_n}') R.extra.update(msg_n=msg_n) return R def client_quit_bg(self): ''' Queue a QUIT request. return ` Result` for collection. ''' R = self.client_bg('QUIT') return R def client_retr_bg(self, msg_n, notify=None): ''' Queue a retrieve request for message `msg_n`, return ` Result` for collection. If `notify` is not `None`, apply it to the `Result`. ''' R = self.client_bg(f'RETR {msg_n}', is_multiline=True, notify=notify) R.extra.update(msg_n=msg_n) return R def dl_bg(self, msg_n, maildir, deleRs): ''' Download message `msg_n` to Maildir `maildir`. Return the `Result` for the `RETR` request. After a successful save, queue a `DELE` for the message and add its `Result` to `deleRs`. ''' def dl_bg_save_result(R): _, lines = R.result R.result[1] = None # release lines msg_bs = b''.join( map(lambda line: line.encode('iso8859-1') + b'\r\n', lines) ) msg = BytesParser().parsebytes(msg_bs) with self._lock: Mkey = maildir.add(msg) deleRs.add(self.client_dele_bg(msg_n)) print(f'msg {msg_n}: {len(msg_bs)} octets, saved as {Mkey}, deleted.') R = self.client_retr_bg(msg_n, notify=dl_bg_save_result) return R
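
# Sketch of batched use of the POP3 client above: queue several RETR requests,
# flush the send stream once, then collect each Result. The connection spec
# string is a placeholder, and the `with pop3:` form assumes the usual
# MultiOpenMixin context manager protocol drives startup/shutdown.
def fetch_all(conn_spec='user@pop.example.com'):
  pop3 = POP3(conn_spec)
  with pop3:
    uid_map = dict(pop3.client_uidl())
    retrRs = [pop3.client_retr_bg(msg_n) for msg_n in uid_map]
    pop3.flush()    # client_bg does not flush; flush once per batch
    for R in retrRs:
      R.join()      # wait for the response worker to fill in the Result
      etc, lines = R.result
      print(R.extra['msg_n'], len(lines), "lines")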
class SubLater(object):
  ''' A class for managing a group of deferred tasks using an existing `Later`.
  '''

  def __init__(self, L):
    ''' Initialise the `SubLater` with its parent `Later`.

        TODO: accept `discard=False` param to suppress the queue
        and associated checks.
    '''
    self._later = L
    self._later.open()
    self._lock = Lock()
    self._deferred = 0
    self._queued = 0
    self._queue = IterableQueue()
    self.closed = False

  def __str__(self):
    return "%s(%s%s,deferred=%d,completed=%d)" % (
        type(self),
        self._later,
        "[CLOSED]" if self.closed else "",
        self._deferred,
        self._queued,
    )

  def __iter__(self):
    ''' Iteration over the `SubLater`
        iterates over the queue of completed `LateFunction`s.
    '''
    return iter(self._queue)

  def close(self):
    ''' Close the `SubLater`.

        This prevents further deferrals.
    '''
    with self._lock:
      closed = self.closed
      if closed:
        self._later.warning("repeated close of %s", self)
      else:
        self.closed = True
        self._queue.close()
        self._later.close()

  def defer(self, func, *a, **kw):
    ''' Defer a function, return its `LateFunction`.

        The resulting `LateFunction` will queue itself for collection
        on completion.
    '''
    with self._lock:
      LF = self._later.defer(func, *a, **kw)
      self._deferred += 1

      def on_complete(R):
        with self._lock:
          self._queue.put(R)
          self._queued += 1
          if self.closed and self._queued >= self._deferred:
            self._queue.close()

    LF.notify(on_complete)
    return LF

  def reaper(self, handler=None):
    ''' Dispatch a `Thread` to collect completed `LateFunction`s.
        Return the `Thread`.

        `handler`: an optional callable to be passed each `LateFunction`
        as it completes.
    '''

    @logexc
    def reap(Q):
      for LF in Q:
        if handler:
          try:
            handler(LF)
          except Exception as e:  # pylint: disable=broad-except
            exception("%s: reap %s: %s", self, LF, e)

    T = Thread(name="reaper(%s)" % (self,), target=reap, args=(self._queue,))
    T.start()
    return T
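A `SubLater` groups a batch of deferrals against a shared `Later` and lets the completed `LateFunction`s be collected independently of other users of that `Later`. Below is a minimal usage sketch, not from the source; the `Later` capacity, the context-manager use of the `Later`, and the squaring job are illustrative assumptions.

# Hedged usage sketch (not from the source): defer a batch of jobs via a
# SubLater and collect them with a reaper Thread.
with Later(4) as L:
  S = SubLater(L)
  T = S.reaper(lambda LF: print("done:", LF.result))
  for i in range(10):
    S.defer(lambda i=i: i * i)
  S.close()   # no more deferrals; the queue closes once all jobs complete
  T.join()    # wait for the reaper to drain the queue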
def startup(self):
  ''' Start up the proxy.
  '''
  self._workQ = IterableQueue()
  self._worker = Thread(name="%s WORKER" % (self,), target=self._work)
  self._worker.start()
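This startup pairs an `IterableQueue` with a single worker `Thread`. The snippet below sketches the complementary `_work` and `shutdown` methods such a proxy might use; the method names, the `(func, args)` job format and the shutdown behaviour are assumptions, since the rest of the class is not shown here.

def _work(self):
  # drain the work queue; iteration ends when the queue is closed
  for func, args in self._workQ:
    func(*args)

def shutdown(self):
  # closing the IterableQueue ends the worker's for-loop
  self._workQ.close()
  self._worker.join()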
class PacketConnection(object):
  ''' A bidirectional binary connection for exchanging requests and responses.
  '''

  # special packet indicating end of stream
  EOF_Packet = Packet(
      is_request=True, channel=0, tag=0, flags=0, rq_type=0, payload=b''
  )

  # pylint: disable=too-many-arguments
  def __init__(
      self, recv, send, request_handler=None, name=None, packet_grace=None,
      tick=None
  ):
    ''' Initialise the `PacketConnection`.

        Parameters:
        * `recv`: inbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a `cs.buffer.CornuCopyBuffer`
          or a file like object with a `read1` or `read` method.
        * `send`: outbound binary stream.
          If this is an `int` it is taken to be an OS file descriptor,
          otherwise it should be a file like object
          with `.write(bytes)` and `.flush()` methods.
          For a file descriptor sending is done via an os.dup()
          of the supplied descriptor,
          so the caller remains responsible for closing the original descriptor.
        * `packet_grace`:
          default pause in the packet sending worker
          to allow another packet to be queued
          before flushing the output stream.
          Default: `DEFAULT_PACKET_GRACE`s.
          A value of `0` will flush immediately if the queue is empty.
        * `request_handler`: an optional callable accepting
          `(rq_type, flags, payload)`.
          The request_handler may return one of 5 values on success:
          * `None`: response will be 0 flags and an empty payload.
          * `int`: flags only. Response will be the flags and an empty payload.
          * `bytes`: payload only. Response will be 0 flags and the payload.
          * `str`: payload only. Response will be 0 flags
            and the str encoded as bytes using UTF-8.
          * `(int, bytes)`: specify flags and payload for response.
          An unsuccessful request should raise an exception,
          which will cause a failure response packet.
        * `tick`: optional tick parameter, default `None`.
          If `None`, do nothing.
          If a Boolean, call `tick_fd_2` if true, otherwise do nothing.
          Otherwise `tick` should be a callable accepting a byteslike value.
    '''
    if name is None:
      name = str(seq())
    self.name = name
    if isinstance(recv, int):
      self._recv = CornuCopyBuffer.from_fd(recv)
    elif isinstance(recv, CornuCopyBuffer):
      self._recv = recv
    else:
      self._recv = CornuCopyBuffer.from_file(recv)
    if isinstance(send, int):
      self._send = os.fdopen(os.dup(send), 'wb')
    else:
      self._send = send
    if packet_grace is None:
      packet_grace = DEFAULT_PACKET_GRACE
    if tick is None:
      tick = lambda bs: None
    elif isinstance(tick, bool):
      if tick:
        tick = tick_fd_2
      else:
        tick = lambda bs: None
    self.packet_grace = packet_grace
    self.request_handler = request_handler
    self.tick = tick
    # tags of requests in play against the local system
    self._channel_request_tags = {0: set()}
    self.notify_recv_eof = set()
    self.notify_send_eof = set()
    # LateFunctions for the requests we are performing for the remote system
    self._running = set()
    # requests we have outstanding against the remote system
    self._pending = {0: {}}
    # sequence of tag numbers
    # TODO: later, reuse old tags to prevent monotonic growth of tag field
    self._tag_seq = Seq(1)
    # work queue for local requests
    self._later = Later(4, name="%s:Later" % (self,))
    self._later.open()
    # dispatch queue of Packets to send
    self._sendQ = IterableQueue(16)
    self._lock = Lock()
    self.closed = False
    # debugging: check for reuse of (channel,tag) etc
    self.__sent = set()
    self.__send_queued = set()
    # dispatch Thread to process received packets
    self._recv_thread = bg_thread(
        self._receive_loop, name="%s[_receive_loop]" % (self.name,)
    )
    # dispatch Thread to send data
    # primary purpose is to bundle output by deferring flushes
    self._send_thread = bg_thread(
        self._send_loop, name="%s[_send]" % (self.name,)
    )

  def __str__(self):
    return "PacketConnection[%s]" % (self.name,)

  @pfx_method
  def shutdown(self, block=False):
    ''' Shut down the PacketConnection,
        optionally blocking for outstanding requests.

        Parameters:
        * `block`: block for outstanding requests, default `False`.
    '''
    with self._lock:
      if self.closed:
        # shutdown already called from another thread
        return
      # prevent further request submission either local or remote
      self.closed = True
    ps = self._pending_states()
    if ps:
      warning("PENDING STATES AT SHUTDOWN: %r", ps)
    # wait for completion of requests we're performing
    for LF in list(self._running):
      LF.join()
    # shut down sender, should trigger shutdown of remote receiver
    self._sendQ.close(enforce_final_close=True)
    self._send_thread.join()
    # we do not wait for the receiver - anyone hanging on outstanding
    # requests will get them as they come in, and in theory a network
    # disconnect might leave the receiver hanging anyway
    self._later.close()
    if block:
      self._later.wait()

  def join(self):
    ''' Wait for the receive side of the connection to terminate.
    '''
    self._recv_thread.join()

  def _new_tag(self):
    return next(self._tag_seq)

  def _pending_states(self):
    ''' Return a list of `( (channel, tag), Request_State )`
        for the currently pending requests.
    '''
    states = []
    pending = self._pending
    for channel, channel_states in sorted(pending.items()):
      for tag, channel_state in sorted(channel_states.items()):
        states.append(((channel, tag), channel_state))
    return states

  @locked
  def _pending_add(self, channel, tag, state):
    ''' Record some state against a (channel, tag).
    '''
    pending = self._pending
    if channel not in pending:
      raise ValueError("unknown channel %d" % (channel,))
    channel_info = pending[channel]
    if tag in channel_info:
      raise ValueError("tag %d already pending in channel %d" % (tag, channel))
    self._pending[channel][tag] = state

  @locked
  def _pending_pop(self, channel, tag):
    ''' Retrieve and remove the state associated with (channel, tag).
    '''
    pending = self._pending
    if channel not in pending:
      raise ValueError("unknown channel %d" % (channel,))
    channel_info = pending[channel]
    if tag not in channel_info:
      raise ValueError("tag %d unknown in channel %d" % (tag, channel))
    if False and tag == 15:
      # disabled debugging probe
      raise RuntimeError("BANG")
    return channel_info.pop(tag)

  def _pending_cancel(self):
    ''' Cancel all the pending requests.
    '''
    for chtag, _ in self._pending_states():
      channel, tag = chtag
      warning("%s: cancel pending request %d:%s", self, channel, tag)
      _, result = self._pending_pop(channel, tag)
      result.cancel()

  def _queue_packet(self, P):
    sig = (P.channel, P.tag, P.is_request)
    if sig in self.__send_queued:
      raise RuntimeError("requeue of %s: %s" % (sig, P))
    self.__send_queued.add(sig)
    try:
      self._sendQ.put(P)
    except ClosedError as e:
      warning("%s: packet not sent: %s (P=%s)", self._sendQ, e, P)

  def _reject(self, channel, tag, payload=bytes(())):
    ''' Issue a rejection of the specified request.
    '''
    error("rejecting request: " + str(payload))
    if isinstance(payload, str):
      payload = payload.encode('utf-8')
    self._queue_packet(
        Packet(
            is_request=False, channel=channel, tag=tag, flags=0,
            payload=payload
        )
    )

  def _respond(self, channel, tag, flags, payload):
    ''' Issue a valid response.
        Tack a 1 (ok) flag onto the flags and dispatch.
    '''
    assert isinstance(channel, int)
    assert isinstance(tag, int)
    assert isinstance(flags, int)
    assert isinstance(payload, bytes)
    flags = (flags << 1) | 1
    self._queue_packet(
        Packet(
            is_request=False, channel=channel, tag=tag, flags=flags,
            payload=payload
        )
    )

  @not_closed
  # pylint: disable=too-many-arguments
  def request(
      self, rq_type, flags=0, payload=b'', decode_response=None, channel=0
  ):
    ''' Compose and dispatch a new request, return a `Result`.

        Allocates a new tag, a `Result` to deliver the response,
        and records the response decode function
        for use when the response arrives.

        Parameters:
        * `rq_type`: request type code, an int
        * `flags`: optional flags to accompany the request, an int;
          default `0`
        * `payload`: optional bytes-like object to accompany the request;
          default `b''`
        * `decode_response`: optional callable accepting
          `(response_flags, response_payload_bytes)`
          and returning the decoded response payload value;
          if unspecified, the response payload bytes are used

        The `Result` will yield an `(ok, flags, payload)` tuple, where:
        * `ok`: whether the request was successful
        * `flags`: the response flags
        * `payload`: the response payload,
          decoded by `decode_response` if specified
    '''
    if rq_type < 0:
      raise ValueError("rq_type may not be negative (%s)" % (rq_type,))
    # reserve type 0 for end-of-requests
    rq_type += 1
    tag = self._new_tag()
    R = Result()
    self._pending_add(channel, tag, Request_State(decode_response, R))
    self._queue_packet(
        Packet(
            is_request=True, channel=channel, tag=tag, flags=flags,
            rq_type=rq_type, payload=payload
        )
    )
    return R

  @not_closed
  def do(self, *a, **kw):
    ''' Synchronous request.
        Submits the request, then calls the `Result` returned from the request.
    '''
    return self.request(*a, **kw)()

  @logexc
  # pylint: disable=too-many-arguments
  def _run_request(self, channel, tag, handler, rq_type, flags, payload):
    ''' Run a request and queue a response packet.
    '''
    with Pfx(
        "_run_request[channel=%d,tag=%d,rq_type=%d,flags=0x%02x,payload=%s]",
        channel, tag, rq_type, flags,
        repr(payload) if len(payload) <= 32 else repr(payload[:32]) + '...'):
      result_flags = 0
      result_payload = b''
      try:
        result = handler(rq_type, flags, payload)
        if result is not None:
          if isinstance(result, int):
            result_flags = result
          elif isinstance(result, bytes):
            result_payload = result
          elif isinstance(result, str):
            result_payload = result.encode(
                encoding='utf-8', errors='xmlcharrefreplace'
            )
          else:
            result_flags, result_payload = result
      except Exception as e:  # pylint: disable=broad-except
        exception("exception: %s", e)
        self._reject(channel, tag, "exception during handler")
      else:
        self._respond(channel, tag, result_flags, result_payload)
      self._channel_request_tags[channel].remove(tag)

  # pylint: disable=too-many-branches,too-many-statements,too-many-locals
  def _receive_loop(self):
    ''' Receive packets from upstream, decode into requests and responses.
    '''
    XX = self.tick
    with PrePfx("_RECEIVE [%s]", self):
      with post_condition(("_recv is None", lambda: self._recv is None)):
        while True:
          try:
            XX(b'<')
            packet = Packet.parse(self._recv)
          except EOFError:
            break
          if packet == self.EOF_Packet:
            break
          channel = packet.channel
          tag = packet.tag
          flags = packet.flags
          payload = packet.payload
          if packet.is_request:
            # request from upstream client
            with Pfx("request[%d:%d]", channel, tag):
              if self.closed:
                debug("rejecting request: closed")
                # NB: no rejection packet sent since sender also closed
              elif self.request_handler is None:
                self._reject(channel, tag, "no request handler")
              else:
                requests = self._channel_request_tags
                if channel not in requests:
                  # unknown channel
                  self._reject(channel, tag, "unknown channel %d" % (channel,))
                elif tag in self._channel_request_tags[channel]:
                  self._reject(
                      channel, tag,
                      "channel %d: tag already in use: %d" % (channel, tag)
                  )
                else:
                  # payload for requests is the request enum and data
                  rq_type = packet.rq_type
                  if rq_type == 0:
                    # magic EOF rq_type - must be malformed (!=EOF_Packet)
                    error("malformed EOF packet received: %s", packet)
                    break
                  # normalise rq_type
                  rq_type -= 1
                  requests[channel].add(tag)
                  # queue the work function and track it
                  LF = self._later.defer(
                      self._run_request, channel, tag, self.request_handler,
                      rq_type, flags, payload
                  )
                  self._running.add(LF)
                  LF.notify(self._running.remove)
          else:
            with Pfx("response[%d:%d]", channel, tag):
              # response: get state of matching pending request, remove state
              try:
                rq_state = self._pending_pop(channel, tag)
              except ValueError as e:
                # no such pending pair - response to unknown request
                error(
                    "%d.%d: response to unknown request: %s", channel, tag, e
                )
              else:
                decode_response, R = rq_state
                # first flag is "ok"
                ok = (flags & 0x01) != 0
                flags >>= 1
                payload = packet.payload
                if ok:
                  # successful reply
                  # return (True, flags, decoded-response)
                  if decode_response is None:
                    # return payload bytes unchanged
                    R.result = (True, flags, payload)
                  else:
                    # decode payload
                    try:
                      result = decode_response(flags, payload)
                    except Exception:  # pylint: disable=broad-except
                      R.exc_info = sys.exc_info()
                    else:
                      R.result = (True, flags, result)
                else:
                  # unsuccessful: return (False, other-flags, payload-bytes)
                  R.result = (False, flags, payload)
        # end of received packets: cancel any outstanding requests
        self._pending_cancel()
        # alert any listeners of receive EOF
        for notify in self.notify_recv_eof:
          notify(self)
        self._recv = None
    self.shutdown()

  # pylint: disable=too-many-branches
  def _send_loop(self):
    ''' Send packets upstream.
        Write every packet directly to self._send.
        Flush whenever the queue is empty.
    '''
    XX = self.tick
    ##with Pfx("%s._send", self):
    with PrePfx("_SEND [%s]", self):
      with post_condition(("_send is None", lambda: self._send is None)):
        fp = self._send
        Q = self._sendQ
        grace = self.packet_grace
        for P in Q:
          sig = (P.channel, P.tag, P.is_request)
          if sig in self.__sent:
            raise RuntimeError("second send of %s" % (P,))
          self.__sent.add(sig)
          try:
            XX(b'>')
            for bs in P.transcribe_flat():
              fp.write(bs)
            if Q.empty():
              # no immediately ready further packets: flush the output buffer
              if grace > 0:
                # allow a little time for further Packets to queue
                XX(b'Sg')
                sleep(grace)
                if Q.empty():
                  # still nothing
                  XX(b'F')
                  fp.flush()
              else:
                XX(b'F')
                fp.flush()
          except OSError as e:
            if e.errno == errno.EPIPE:
              warning("remote end closed")
              break
            raise
        try:
          XX(b'>EOF')
          for bs in self.EOF_Packet.transcribe_flat():
            fp.write(bs)
          fp.close()
        except (OSError, IOError) as e:
          if e.errno == errno.EPIPE:
            debug("remote end closed: %s", e)
          elif e.errno == errno.EBADF:
            warning("local end closed: %s", e)
          else:
            raise
        except Exception as e:
          error("(_SEND) UNEXPECTED EXCEPTION: %s %s", e, e.__class__)
          raise
        self._send = None
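A quick way to exercise a `PacketConnection` is to wire two instances back to back over OS pipes, since integer file descriptors are accepted for both `recv` and `send`. The sketch below does that with a trivial request handler; the handler behaviour and the request type value are illustrative assumptions, not part of the source.

# Hedged usage sketch (not from the source): connect two PacketConnections
# over OS pipes and issue a synchronous request with do().
import os

def handler(rq_type, flags, payload):
  # echo the payload back, upper-cased, keeping the request flags
  return flags, payload.upper()

a_rd, b_wr = os.pipe()   # B writes -> A reads
b_rd, a_wr = os.pipe()   # A writes -> B reads
A = PacketConnection(a_rd, a_wr, name="A")                           # client side
B = PacketConnection(b_rd, b_wr, request_handler=handler, name="B")  # server side
ok, flags, payload = A.do(0, 0, b'hello')
print(ok, flags, payload)   # expected: True 0 b'HELLO'
A.shutdown()
B.shutdown()
# the send side dups its descriptor, so the originals remain ours to close
os.close(a_wr)
os.close(b_wr)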