from itertools import count, ifilter, imap
from json import dumps

from py._path.local import LocalPath

# ``first``, ``ichunks``, ``jsonstream``, ``merge`` and ``ProgressBar`` are
# assumed to be provided by the surrounding package's utility modules.


def mergesort(filename, output=None, key=None, maxitems=1e6, progress=True):
    """Given an input file, sort it by performing a merge sort on disk.

    :param filename: Either a filename as a ``str`` or a
        ``py._path.local.LocalPath`` instance.
    :type filename: ``str`` or ``py._path.local.LocalPath``

    :param output: An optional output filename as a ``str`` or a
        ``py._path.local.LocalPath`` instance.
    :type output: ``str`` or ``py._path.local.LocalPath`` or ``None``

    :param key: An optional key to sort the data on.
    :type key: ``function`` or ``None``

    :param maxitems: Maximum number of items to hold in memory at a time.
    :type maxitems: ``int``

    :param progress: Whether or not to display a progress bar.
    :type progress: ``bool``

    This uses ``py._path.local.LocalPath.make_numbered_dir`` to create
    temporary scratch space to work with when splitting the input file into
    sorted chunks. The merge sort is processed iteratively in memory using
    the ``~merge`` function, which is almost identical to ``~heapq.merge``
    but adds support for an optional key function.
    """
    p = filename if isinstance(filename, LocalPath) else LocalPath(filename)
    output = p if output is None else (
        output if isinstance(output, LocalPath) else LocalPath(output))
    key = key if key is not None else lambda x: x

    scratch = LocalPath.make_numbered_dir(prefix="mergesort-")

    nlines = sum(1 for line in p.open("r"))

    # Compute a reasonable chunksize < maxitems by repeatedly halving nlines.
    chunksize = first(ifilter(
        lambda x: x < maxitems, imap(lambda x: nlines / (2 ** x), count(1))))

    # Split the file up into n sorted chunk files.
    if progress:
        bar = ProgressBar("Split/Sorting Data", max=(nlines / chunksize))
    for i, items in enumerate(ichunks(chunksize, jsonstream(p))):
        with scratch.ensure("{0:d}.json".format(i)).open("w") as f:
            f.write("\n".join(map(dumps, sorted(items, key=key))))
        if progress:
            bar.next()
    if progress:
        bar.finish()

    q = scratch.listdir("*.json")

    with output.open("w") as f:
        if progress:
            bar = ProgressBar("Merge/Sorting Data", max=nlines)
        # Merge the sorted chunk files, passing the same key so the final
        # ordering matches the per-chunk sort.
        for item in merge(*imap(jsonstream, q), key=key):
            f.write("{0:s}\n".format(dumps(item)))
            if progress:
                bar.next()
        if progress:
            bar.finish()
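# A minimal usage sketch of ``mergesort`` above. The module name
# (``bigsort``), the file names and the ``"ts"`` field are assumptions for
# illustration, not taken from the code itself; the input is expected to
# hold one JSON document per line.
from bigsort import mergesort

mergesort(
    "records.json",                    # one JSON object per line
    output="records-sorted.json",
    key=lambda record: record["ts"],   # order records by timestamp
    maxitems=1e5,                      # hold at most ~100k records in memory
    progress=False,
)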
from itertools import ifilter

# ``constantly``, ``is_node`` and ``iflatten`` are helpers from the
# surrounding module.


def walk_down(root, skip=constantly(False), include_self=True):
    """Yield each node from here downward, myself included, in depth-first
    pre-order.

    :arg skip: A predicate describing nodes not to descend into. We always
        return ourselves, even if the predicate says to skip us.
    :arg include_self: A flag for including the root in the walk down.

    The AST we get from Reflect.parse is somewhat unsatisfying. It's not a
    uniform tree shape; it seems to have already been turned into more
    specialized objects. Thus, we have to traverse into different fields
    depending on node type.
    """
    if include_self:
        yield root
    for child in ifilter(is_node, iflatten(root.itervalues())):
        if skip(child):
            yield child
            continue
        # Just a "yield from":
        for ret in walk_down(child, skip=skip):
            yield ret
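# A usage sketch for ``walk_down``, assuming ``ast`` is a dict-shaped tree
# as returned by Reflect.parse (the node type names are standard
# SpiderMonkey AST types, used here for illustration).
call_sites = [node for node in walk_down(ast)
              if node.get("type") == "CallExpression"]

# Skip descending into function bodies while still yielding the
# FunctionDeclaration nodes themselves:
shallow = list(walk_down(
    ast, skip=lambda node: node.get("type") == "FunctionDeclaration"))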
def search(self, query, treat_as_regex=True):
    # Match each raw task line against ``query``: as a regular expression
    # via ``select`` by default, or as a plain substring otherwise, and
    # wrap every hit in a ``Task``.
    if treat_as_regex:
        return map(Task, select(query, self.iterate(raw=True)))
    return map(Task, ifilter(lambda t: query in t, self.iterate(raw=True)))
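# A usage sketch for ``search``; ``store`` stands in for an instance of the
# class this method belongs to, and the query strings are illustrative.
by_pattern = store.search(r"@home\b")                     # treated as a regex
by_text = store.search("pay rent", treat_as_regex=False)  # plain substring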
def delete(self, task):
    # Keep every stored task that differs from the resolved one, then
    # rewrite the store atomically.
    tasks = ifilter(partial(operator.ne, self.task(task)), self)
    self._atomic_write(tasks)
def get(self, task_uuid):
    # Return the first stored task equal to the one ``task_uuid`` resolves to.
    return first(ifilter(partial(operator.eq, self.task(task_uuid)), self))
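# The ``partial(operator.ne, ...)`` / ``partial(operator.eq, ...)`` calls in
# ``delete`` and ``get`` are point-free equality predicates; a standalone
# illustration of the idiom:
from functools import partial
from itertools import ifilter
import operator

items = [1, 2, 3, 2]
# partial(operator.ne, 2) behaves like lambda x: 2 != x, so ifilter keeps
# every item that differs from 2.
assert list(ifilter(partial(operator.ne, 2), items)) == [1, 3]
assert next(ifilter(partial(operator.eq, 3), items)) == 3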
def process_function(props):
    # Compute FuncSig based on args:
    input_args = tuple(
        ifilter(bool, imap(str.lstrip, props['args'][1:-1].split(","))))
    props['type'] = c_type_sig(input_args, props['type'])
    return props
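# A self-contained illustration of the arg-splitting step in
# ``process_function``; ``c_type_sig`` is left out because its definition is
# not shown here. Note that the ``bool`` filter drops the empty string
# produced by splitting a zero-argument list such as "()".
from itertools import ifilter, imap

raw = "(int a, char *s)"[1:-1].split(",")   # ['int a', ' char *s']
input_args = tuple(ifilter(bool, imap(str.lstrip, raw)))
assert input_args == ('int a', 'char *s')
assert tuple(ifilter(bool, "()"[1:-1].split(","))) == ()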