def stream2stream(stream, serializer=_PICKLE_IO, jobs=1, **kwargs):

    """
    Convert an unsorted stream of data into a sorted stream.

    In-memory containers (``list``, ``tuple``, ``dict``) are sorted
    directly.  Anything else is chunked into tempfiles with
    `_stream2tempfiles()` and merged back together with
    `_mergefiles2stream()`.  Intermediary tempfiles are written and read
    with `serializer` and are deleted automatically.

    Parameters
    ----------
    stream : iter
        Sort this stream of data.
    serializer : tinysort.io.BaseSerializer, optional
        Instance of the class to use for writing and reading intermediary
        tempfiles.
    jobs : int, optional
        Process data in parallel with a pool of N workers.  Passed to
        `_mergefiles2stream()`.
    kwargs : **kwargs, optional
        Keyword arguments for `_stream2tempfiles()`.  The `key` and
        `reverse` values are extracted for `_mergefiles2stream()` as well.

    Yields
    ------
    object
        Sorted objects.
    """

    sort_key = kwargs.get('key')
    descending = kwargs.get('reverse', False)

    # Data already held in memory can be sorted outright, which is almost
    # certainly faster than spilling it to tempfiles.
    if isinstance(stream, (list, tuple, dict)):
        for obj in sorted(stream, key=sort_key, reverse=descending):
            yield obj
        return

    # `_stream2tempfiles()` calls the serializer a 'writer'; translate the
    # public-facing name so callers never see that mismatch as an error.
    kwargs['writer'] = serializer
    chunk_paths = _stream2tempfiles(stream, jobs=jobs, **kwargs)

    with tools.delete_files(*chunk_paths) as paths:
        merged = _mergefiles2stream(
            *paths,
            reader=serializer,
            key=sort_key,
            reverse=descending)
        for obj in merged:
            yield obj
def test_delete_files(tmpdir):

    """`tools.delete_files()` removes files on exit and tolerates reruns."""

    workdir = tmpdir.mkdir('test_delete_files')
    targets = [str(workdir.join(str(n))) for n in range(5)]

    # Create every target file, verifying it did not exist beforehand.
    for path in targets:
        assert not os.path.exists(path)
        with open(path, 'w'):
            pass
        assert os.path.exists(path)

    # Files survive for the duration of the context ...
    with tools.delete_files(*targets) as managed:
        for path in managed:
            assert os.path.exists(path)

    # ... and are gone once it exits.
    for path in managed:
        assert not os.path.exists(path)

    # A second pass over already-deleted paths must not raise.
    with tools.delete_files(*targets):
        pass
def test_sort_into_files():

    """
    `_stream2tempfiles()` splits 9 reversed values into 5 sorted chunks.

    Each tempfile holds one sorted chunk: four files with two consecutive
    values and one file with the single leftover value.
    """

    # Use an odd number so one chunk only has 1 value
    values = tuple(range(9))
    results = _sort._stream2tempfiles(reversed(values), chunksize=2)
    assert len(results) == 5
    with tools.delete_files(*results) as paths:
        for p in paths:
            with tinysort.io.Pickle().open(p) as f:
                # Iterate the reader directly - wrapping it in list() just
                # materialized a throwaway copy.
                lines = [int(line) for line in f]
            # Input values are reversed, so the odd chunk is 0, not 9
            if len(lines) == 1:
                assert lines[0] == 0
            elif len(lines) == 2:
                # Two-value chunks must hold consecutive, ascending values.
                assert lines[0] + 1 == lines[1]
            else:
                raise ValueError("Unexpected condition")
def files2stream(*infiles, **kwargs):

    """
    Sort a batch of files into a single stream.

    Parameters
    ----------
    infiles : *str
        Input files to sort.
    reader : tinysort.io.BaseSerializer
        Instance of the serializer for reading each input file.  Required
        keyword argument.
    kwargs : **kwargs, optional
        Keyword arguments for `file2stream()`.  The `key` and `reverse`
        values are extracted for `_mergefiles2stream()` as well.

    Yields
    ------
    object
        Sorted objects.

    Raises
    ------
    TypeError
        If the required `reader` keyword argument is missing.
    """

    # EAFP: pop the required serializer in one step, raising the same
    # TypeError a real keyword-only parameter would.
    try:
        reader = kwargs.pop('reader')
    except KeyError:
        raise TypeError("reader parameter is required") from None

    tfiles = []
    try:
        # Sort every input file into one or more sorted tempfiles.
        srt = functools.partial(_file2tempfiles, reader=reader, **kwargs)
        tfiles += list(it.chain(*map(srt, infiles)))
    finally:
        # Merge whatever tempfiles were produced and guarantee their
        # deletion, even if sorting one of the inputs failed part-way.
        with tools.delete_files(*tfiles) as merge:
            for item in _mergefiles2stream(
                    *merge,
                    reader=reader,
                    key=kwargs.get('key'),
                    reverse=kwargs.get('reverse', False)):
                yield item