def value(self):
    """
    Return the parsed byte string as a DataFrame.

    The result is computed lazily on first access and cached on the
    instance as ``self._value``; later calls return the cached frame.
    """
    try:
        # EAFP: if the partition was already parsed, return the cache.
        return self._value
    except AttributeError:
        from io import BytesIO
        try:
            from dask.bytes.core import read_block
        except ImportError:
            # older dask releases keep read_block in a different module
            from dask.bytes.utils import read_block

        # read the relevant bytes of this partition from the file
        with open(self.filename, 'rb') as f:
            block = read_block(f, self.offset, self.blocksize, self.delimiter)

        # parse the byte string via an in-memory file-like buffer;
        # BytesIO(block) replaces the verbose write()+seek(0) dance
        b = BytesIO(block)
        self._value = read_csv(b, **self.config)
        return self._value
def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block.

    Reads a delimited byte range from *fobj* and decodes the avro
    records it contains.

    Parameters
    ----------
    fobj : file-like
        open binary file object (consumed and closed here)
    off : int
        byte offset at which the block starts
    l : int
        number of bytes to read
    head : dict
        avro header info; must provide ``'sync'`` (block delimiter)
        and ``'head_bytes'`` (raw header to prepend to non-initial
        blocks so fastavro can parse them)
    """
    import fastavro
    from dask.bytes.core import read_block
    # fastavro removed iter_avro in newer releases in favour of
    # reader; support both APIs (matches the other read_chunk variant).
    if hasattr(fastavro, "iter_avro"):
        reader = fastavro.iter_avro
    else:
        reader = fastavro.reader
    with fobj as f:
        chunk = read_block(f, off, l, head['sync'])
    head_bytes = head['head_bytes']
    # a non-initial block lacks the file header; re-attach it
    if not chunk.startswith(MAGIC):
        chunk = head_bytes + chunk
    i = io.BytesIO(chunk)
    return list(reader(i))
def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block"""
    import fastavro
    from dask.bytes.core import read_block

    # Older fastavro exposes iter_avro; newer versions call it reader.
    parse = fastavro.iter_avro if hasattr(fastavro, "iter_avro") else fastavro.reader

    with fobj as f:
        raw = read_block(f, off, l, head["sync"])

    # A block taken from the middle of the file lacks the avro header;
    # prepend the stored header bytes so the parser accepts it.
    if not raw.startswith(MAGIC):
        raw = head["head_bytes"] + raw

    buf = io.BytesIO(raw)
    return list(parse(buf))
def make_partitions(filename, blocksize, config, delimiter="\n"):
    """ Partition a CSV file into blocks, using the preferred blocksize
    in bytes, returning the partitions and number of rows in each
    partition

    This divides the input file into partitions with size roughly equal
    to blocksize, reads the bytes, and counts the number of delimiters
    to compute the size of each block

    Parameters
    ----------
    filename : str
        the name of the CSV file to load
    blocksize : int
        the desired number of bytes per block
    delimiter : str, optional
        the character separating lines; default is the newline character
    config : dict
        any keyword options to pass to :func:`pandas.read_csv`

    Returns
    -------
    partitions : list of CSVPartition
        list of objects storing the data content of each file partition,
        stored as a bytestring
    sizes : list of int
        the list of the number of rows in each partition
    """
    try:
        from dask.bytes.core import read_block
    except ImportError:
        # older dask releases keep read_block in a different module
        from dask.bytes.utils import read_block

    # copy so popped/overwritten keys do not mutate the caller's dict
    config = config.copy()

    # search for lines separated by this character
    delimiter = delimiter.encode()

    # size in bytes and byte offsets of each partition
    size = os.path.getsize(filename)
    offsets = list(range(0, size, int(blocksize)))

    # skip blank lines
    skip_blank_lines = config.get('skip_blank_lines', True)

    # number of rows to read
    nrows = config.pop('nrows', None)

    sizes = []
    partitions = []
    with open(filename, 'rb') as f:
        for i, offset in enumerate(offsets):
            # skiprows only valid for first block
            if i > 0 and 'skiprows' in config:
                config.pop('skiprows')

            # set nrows for this block
            config['nrows'] = nrows

            block = read_block(f, offset, blocksize, delimiter)
            partitions.append(
                CSVPartition(filename, offset, blocksize, delimiter, **config))

            # count delimiter to get size
            size = block.count(delimiter)

            # account for blank lines
            if skip_blank_lines:
                size -= block.count(delimiter + delimiter)
                # NOTE(review): this nesting (first-block leading delimiter
                # only adjusted when skipping blank lines) is inferred from
                # the original layout — confirm against upstream history
                if i == 0 and block.startswith(delimiter):
                    size -= 1

            # account for skiprows
            skiprows = config.get('skiprows', 0)
            size -= skiprows

            # account for nrows
            if nrows is not None and nrows > 0:
                if nrows < size:
                    # remaining row budget is exhausted inside this block
                    sizes.append(nrows)
                    break
                else:
                    nrows -= size  # update for next block

            # manually increase size if at end of the file and no newline
            if i == len(offsets) - 1 and not block.endswith(delimiter):
                size += 1

            sizes.append(size)

    return partitions, sizes
def test_read_block():
    """read_block honours offsets, lengths and the line delimiter."""
    delim = b"\n"
    payload = delim.join([b"123", b"456", b"789"])
    stream = io.BytesIO(payload)

    # plain byte-range read, no delimiter snapping
    assert read_block(stream, 1, 2) == b"23"

    # with a delimiter the block expands/shrinks to whole lines
    cases = [
        (0, 1, b"123\n"),
        (0, 2, b"123\n"),
        (0, 3, b"123\n"),
        (0, 5, b"123\n456\n"),
        (0, 8, b"123\n456\n789"),
        (0, 100, b"123\n456\n789"),
        (1, 1, b""),
        (1, 5, b"456\n"),
        (1, 8, b"456\n789"),
    ]
    for off, length, want in cases:
        assert read_block(stream, off, length, delimiter=b"\n") == want

    # any exhaustive partitioning into (offset, length) pairs
    # reassembles the original data exactly
    for pairs in [[(0, 3), (3, 3), (6, 3), (9, 2)],
                  [(0, 4), (4, 4), (8, 4)]]:
        chunks = [read_block(stream, o, l, b"\n") for o, l in pairs]
        assert b"".join(filter(None, chunks)) == payload