Example 1
def stream2stream(stream, serializer=_PICKLE_IO, jobs=1, **kwargs):

    """
    Take an unsorted stream of data and turn it into a sorted stream of data.
    Data is chunked into tempfiles with `_stream2tempfiles()`, and then merged
    with `_mergefiles2stream()`.  Intermediary tempfiles are written and read
    with `serializer` and are deleted automatically.

    Parameters
    ----------
    stream : iter
        Sort this stream of data.
    serializer : tinysort.io.BaseSerializer, optional
        Instance of the class to use for writing and reading intermediary
        tempfiles.
    jobs : int, optional
        Process data in parallel with a pool of N workers.  Passed to
        `_stream2tempfiles()`.
    kwargs : **kwargs, optional
        Keyword arguments for `_stream2tempfiles()`.  The `key` and
        `reverse` values are also extracted for `_mergefiles2stream()`.

    Yields
    ------
    object
        Sorted objects.
    """

    # We know we already have the data in-memory, so just doing a straight up
    # sort is almost certainly faster
    if isinstance(stream, (list, tuple, dict)):
        for item in sorted(
                stream,
                key=kwargs.get('key'),
                reverse=kwargs.get('reverse', False)):
            yield item

    else:

        # The public API calls this object a "serializer", while the internal
        # helpers name it a "reader" or "writer" depending on direction, so
        # pass it to `_stream2tempfiles()` under the keyword it expects.
        kwargs.update(writer=serializer)

        chunk_paths = _stream2tempfiles(
            stream,
            jobs=jobs,
            **kwargs)

        with tools.delete_files(*chunk_paths) as paths:
            for item in _mergefiles2stream(
                    *paths,
                    reader=serializer,
                    key=kwargs.get('key'),
                    reverse=kwargs.get('reverse', False)):
                yield item
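
A minimal usage sketch for the function above.  The import path and the
`chunksize` keyword are assumptions inferred from the other examples on this
page (`_sort._stream2tempfiles()` accepts `chunksize`, and keyword arguments
are forwarded to it), not confirmed API:

import tinysort.io
from tinysort import _sort  # hypothetical import path

unsorted = (x for x in (3, 1, 4, 1, 5, 9, 2, 6))  # any one-shot iterable

# Chunk the stream into pickled tempfiles, then merge them back into a
# single sorted stream.
for item in _sort.stream2stream(
        unsorted,
        serializer=tinysort.io.Pickle(),
        chunksize=3):
    print(item)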
Example 2
def test_delete_files(tmpdir):

    base = tmpdir.mkdir('test_delete_files')
    paths = [str(base.join(str(i))) for i in range(5)]

    for p in paths:
        assert not os.path.exists(p)
        # Create an empty file so there is something to delete
        with open(p, 'w'):
            pass
        assert os.path.exists(p)

    with tools.delete_files(*paths) as pths:
        for p in pths:
            assert os.path.exists(p)

    for p in pths:
        assert not os.path.exists(p)

    # Run it again to make sure deleting already-deleted files doesn't raise
    with tools.delete_files(*paths):
        pass
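
The test above pins down the contract of `tools.delete_files()` rather than
its implementation: it yields the paths it was given, removes them on exit,
and tolerates paths that are already gone.  A minimal sketch of a context
manager with that behavior (an illustration, not the library's actual code):

import contextlib
import os

@contextlib.contextmanager
def delete_files_sketch(*paths):
    """Yield the paths, then delete them, ignoring files that no longer exist."""
    try:
        yield paths
    finally:
        for path in paths:
            try:
                os.remove(path)
            except OSError:
                # Already deleted or never created - nothing to do
                pass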
Example 3
def test_sort_into_files():

    # Use an odd number so one chunk only has 1 value
    values = tuple(range(9))

    results = _sort._stream2tempfiles(reversed(values), chunksize=2)
    assert len(results) == 5

    with tools.delete_files(*results) as paths:
        for p in paths:
            with tinysort.io.Pickle().open(p) as f:
                lines = [int(line) for line in f]

                # Input values are reversed, so the odd single-value chunk
                # holds 0, not 8
                if len(lines) == 1:
                    assert lines[0] == 0
                elif len(lines) == 2:
                    assert lines[0] + 1 == lines[1]
                else:
                    raise ValueError("Unexpected condition")
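
The assertions rely on simple chunking arithmetic: nine values split into
chunks of two produce ceil(9 / 2) == 5 chunks, and because the input is
reversed (8, 7, ..., 0) the leftover single-value chunk is the last one and
holds 0.  A quick illustration in plain Python, independent of tinysort:

import math

values = tuple(range(9))
stream = list(reversed(values))                      # [8, 7, ..., 1, 0]
chunks = [sorted(stream[i:i + 2]) for i in range(0, len(stream), 2)]

assert len(chunks) == math.ceil(len(values) / 2) == 5
assert chunks[-1] == [0]                             # the odd chunk holds 0
assert all(c[0] + 1 == c[1] for c in chunks if len(c) == 2)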
Example 4
def files2stream(*infiles, **kwargs):

    """
    Sort a batch of files into a single stream.

    Parameters
    ----------
    infiles : *str
        Input files to sort.
    reader : tinysort.io.BaseSerializer
        Instance of the serializer for reading the input files.  Required.
    kwargs : **kwargs, optional
        Keyword arguments for `_file2tempfiles()`.  The `key` and `reverse`
        values are also extracted for `_mergefiles2stream()`.

    Yields
    ------
    object
        Sorted objects.
    """

    if 'reader' not in kwargs:
        raise TypeError("reader parameter is required")
    reader = kwargs.pop('reader')

    tfiles = []
    try:
        # Split each input file into sorted tempfile chunks
        srt = functools.partial(_file2tempfiles, reader=reader, **kwargs)
        tfiles += list(it.chain(*map(srt, infiles)))
    finally:
        # Merge the tempfile chunks into one sorted stream and delete them
        # once the stream is exhausted
        with tools.delete_files(*tfiles) as merge:
            for item in _mergefiles2stream(
                    *merge,
                    reader=reader,
                    key=kwargs.get('key'),
                    reverse=kwargs.get('reverse', False)):
                yield item
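
A minimal usage sketch for `files2stream()`.  The import path and file names
are hypothetical, and the input files are assumed to have been written with
the same serializer that is used to read them:

import tinysort.io
from tinysort import _sort  # hypothetical import path

reader = tinysort.io.Pickle()

# Merge several pre-existing pickled files into one sorted stream.  The
# intermediary tempfiles are cleaned up automatically.
for item in _sort.files2stream('chunk-a.pkl', 'chunk-b.pkl', reader=reader):
    print(item)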