Example #1
    def value(self):
        """
        Return the parsed byte string as a DataFrame.
        """
        try:
            return self._value
        except AttributeError:
            from io import BytesIO
            from dask.bytes.utils import read_block
            # read_csv is assumed to be pandas.read_csv, which the
            # make_partitions docstring below names explicitly
            from pandas import read_csv

            # read the relevant bytes
            with open(self.filename, 'rb') as f:
                block = read_block(f, self.offset, self.blocksize,
                                   self.delimiter)

            # parse the byte string
            b = BytesIO()
            b.write(block)
            b.seek(0)
            self._value = read_csv(b, **self.config)

            return self._value
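
The pattern above (read a delimited byte range, wrap it in BytesIO, and hand it to read_csv) can be tried on its own. A minimal standalone sketch, assuming a local file sample.csv and the same dask.bytes.utils.read_block import used in the example; the file name and the 64 kB block size are illustrative, not part of the original:

from io import BytesIO

import pandas as pd
from dask.bytes.utils import read_block

# read roughly the first 64 kB of sample.csv, extended to the next newline
# (sample.csv is a hypothetical input file)
with open('sample.csv', 'rb') as f:
    block = read_block(f, 0, 64 * 1024, b'\n')

# parse the byte string into a DataFrame, as the value method above does
df = pd.read_csv(BytesIO(block))
print(df.head())

Note that only a block starting at offset 0 contains the header row, so for later blocks the read_csv options carried in the config dict (for example names or header) would have to account for that.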
Example #2
import io

# assumed import: the other examples on this page take read_block
# from dask.bytes.utils
from dask.bytes.utils import read_block


def test_read_block():
    delimiter = b'\n'
    data = delimiter.join([b'123', b'456', b'789'])
    f = io.BytesIO(data)

    assert read_block(f, 1, 2) == b'23'
    assert read_block(f, 0, 1, delimiter=b'\n') == b'123\n'
    assert read_block(f, 0, 2, delimiter=b'\n') == b'123\n'
    assert read_block(f, 0, 3, delimiter=b'\n') == b'123\n'
    assert read_block(f, 0, 5, delimiter=b'\n') == b'123\n456\n'
    assert read_block(f, 0, 8, delimiter=b'\n') == b'123\n456\n789'
    assert read_block(f, 0, 100, delimiter=b'\n') == b'123\n456\n789'
    assert read_block(f, 1, 1, delimiter=b'\n') == b''
    assert read_block(f, 1, 5, delimiter=b'\n') == b'456\n'
    assert read_block(f, 1, 8, delimiter=b'\n') == b'456\n789'

    for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)],
                [(0, 4), (4, 4), (8, 4)]]:
        out = [read_block(f, o, l, b'\n') for o, l in ols]
        assert b"".join(filter(None, out)) == data
Example #3
def make_partitions(filename, blocksize, config, delimiter="\n"):
    """
    Partition a CSV file into blocks, using the preferred blocksize
    in bytes, and return the partitions and the number of rows in
    each partition.

    This divides the input file into partitions of size roughly
    equal to blocksize, reads the bytes, and counts the number of
    delimiters to compute the number of rows in each block.

    Parameters
    ----------
    filename : str
        the name of the CSV file to load
    blocksize : int
        the desired number of bytes per block
    config : dict
        any keyword options to pass to :func:`pandas.read_csv`
    delimiter : str, optional
        the character separating lines; the default is
        the newline character

    Returns
    -------
    partitions : list of CSVPartition
        list of objects storing the data content of each file
        partition as a byte string
    sizes : list of int
        the list of the number of rows in each partition
    """
    import os

    from dask.bytes.utils import read_block

    config = config.copy()

    # search for lines separated by this character
    delimiter = delimiter.encode()

    # size in bytes and byte offsets of each partition
    size = os.path.getsize(filename)
    offsets = list(range(0, size, int(blocksize)))

    # skip blank lines
    skip_blank_lines = config.get('skip_blank_lines', True)

    # number of rows to read
    nrows = config.pop('nrows', None)

    sizes = []
    partitions = []
    with open(filename, 'rb') as f:
        for i, offset in enumerate(offsets):

            # skiprows only valid for first block
            if i > 0 and 'skiprows' in config:
                config.pop('skiprows')

            # set nrows for this block
            config['nrows'] = nrows

            block = read_block(f, offset, blocksize, delimiter)
            partitions.append(
                CSVPartition(filename, offset, blocksize, delimiter, **config))

            # count delimiters to get the row count of this block
            size = block.count(delimiter)

            # account for blank lines
            if skip_blank_lines:
                size -= block.count(delimiter + delimiter)
                if i == 0 and block.startswith(delimiter):
                    size -= 1

            # account for skiprows
            skiprows = config.get('skiprows', 0)
            size -= skiprows

            # account for nrows
            if nrows is not None and nrows > 0:
                if nrows < size:
                    sizes.append(nrows)
                    break
                else:
                    nrows -= size  # update for next block

            # manually increase size if at end of the file and no newline
            if i == len(offsets) - 1 and not block.endswith(delimiter):
                size += 1

            sizes.append(size)

    return partitions, sizes
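
Assuming CSVPartition is the class whose value method appears in Example #1, a short sketch of how the pieces might fit together; the file name, blocksize, and read_csv options below are illustrative, not part of the original code:

import pandas as pd

# hypothetical input file and pandas.read_csv options
config = {'names': ['ts', 'user', 'amount'], 'header': None}
partitions, sizes = make_partitions('events.csv', blocksize=256 * 1024,
                                    config=config)

# each partition lazily re-reads and parses only its own byte range
frames = [p.value() for p in partitions]
df = pd.concat(frames, ignore_index=True)

# sizes holds the row count computed for each partition
print(sum(sizes), len(df))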