# Imports assumed for this excerpt (the snippets in this listing come from
# vineyard's fsspec-based IO drivers; exact module paths may differ by version).
import json
from typing import Dict

import fsspec
import pyarrow as pa
import vineyard
from fsspec.utils import read_block
from vineyard.io.byte import ByteStreamBuilder


def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            builder[k] = "1" if v else "0"
        elif k == "delimiter":
            builder[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            builder[k] = v

    offset = 0
    chunk_size = 1024 * 1024 * 4  # read in 4MB blocks

    of = fsspec.open(path, mode="rb", **storage_options)
    with of as f:
        # read_block extends the 1-byte read to the next '\n', i.e. it
        # returns the whole first line, including the trailing delimiter.
        header_line = read_block(f, 0, 1, b'\n')
        builder["header_line"] = header_line.decode("unicode_escape")
        if header_row:
            offset = len(header_line)
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size

        # Split the file (minus the header) into `proc_num` byte ranges and
        # read only the range owned by this process.
        part_size = (total_size - offset) // proc_num
        begin = part_size * proc_index + offset
        end = min(begin + part_size, total_size)
        if proc_index == 0:
            begin -= int(header_row)
        while begin < end:
            buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
            size = len(buf)
            if not size:
                break
            # Advance past the bytes just consumed; the -1 keeps the trailing
            # delimiter in range, since read_block seeks past the next
            # delimiter before reading.
            begin += size - 1
            chunk = writer.next(size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()
        writer.finish()
# Assumed imports; `CHUNK_SIZE` and `report_exception` are helpers defined in
# the surrounding module of this snippet.
import logging
import sys

import vineyard
from fsspec import AbstractFileSystem
from fsspec.utils import read_block
from vineyard.io.byte import ByteStream

logger = logging.getLogger(__name__)


def read_byte_stream(
    client,
    fs: AbstractFileSystem,
    stream: ByteStream,
    path: str,
    chunk_size: int = CHUNK_SIZE,
):
    logger.info('start reading blob at %s', path)
    with fs.open(path, mode="rb") as f:
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size
        writer = stream.open_writer(client)
        try:
            begin, end = 0, total_size
            while begin < end:
                # No delimiter here: blobs are copied verbatim in fixed-size blocks.
                buffer = read_block(f, begin, min(chunk_size, end - begin))
                chunk = writer.next(len(buffer))
                vineyard.memory_copy(chunk, 0, buffer)
                begin += len(buffer)
        except Exception:
            report_exception()
            writer.fail()
            sys.exit(-1)
        writer.finish()
        return total_size
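# A minimal usage sketch for the snippet above, assuming a running vineyard
# instance; the socket path and file name are hypothetical, and
# `ByteStream.new(client, params)` follows the constructor used in the last
# snippet of this listing.
import fsspec
import vineyard
from vineyard.io.byte import ByteStream

client = vineyard.connect('/tmp/vineyard.sock')   # hypothetical socket path
fs = fsspec.filesystem('file')
stream = ByteStream.new(client, {})
total = read_byte_stream(client, fs, stream, '/tmp/blob.bin', chunk_size=1 << 20)
print('copied %d bytes into stream %r' % (total, stream.id))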
from fsspec.implementations.ftp import FTPFileSystem


def test_read_block(ftp_writable):
    # not the same as test_read_block in test_utils, this depends on the
    # behaviour of the bytes caching
    from fsspec.utils import read_block

    host, port, user, pw = ftp_writable
    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
    fn = "/myfile"
    with fs.open(fn, 'wb') as f:
        f.write(b'a,b\n1,2')
    f = fs.open(fn, 'rb', cache_type='bytes')
    assert read_block(f, 0, 6400, b'\n') == b'a,b\n1,2'
import io

from fsspec.utils import read_block

# Avro object-container magic bytes; defined at module level in the file this
# snippet comes from.
MAGIC = b"Obj\x01"


def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block"""
    import fastavro

    # Older fastavro releases exposed `iter_avro`; it is `reader` in newer ones.
    if hasattr(fastavro, "iter_avro"):
        reader = fastavro.iter_avro
    else:
        reader = fastavro.reader

    with fobj as f:
        # Use the file's Avro sync marker as the block delimiter.
        chunk = read_block(f, off, l, head["sync"])

    head_bytes = head["head_bytes"]
    if not chunk.startswith(MAGIC):
        # Prepend the container header so fastavro can parse a mid-file block.
        chunk = head_bytes + chunk
    i = io.BytesIO(chunk)
    return list(reader(i))
import io

from fsspec.utils import read_block


def test_read_block():
    delimiter = b"\n"
    data = delimiter.join([b"123", b"456", b"789"])
    f = io.BytesIO(data)

    assert read_block(f, 1, 2) == b"23"
    assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
    assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 1, 1, delimiter=b"\n") == b""
    assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
    assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"

    # Adjacent (offset, length) windows cover the data exactly once,
    # regardless of how the boundaries fall relative to the delimiters.
    for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
        out = [read_block(f, o, l, b"\n") for o, l in ols]
        assert b"".join(filter(None, out)) == data
import io

from fsspec.utils import read_block


def test_read_block_split_before():
    """Test start/middle/end cases of split_before."""  # noqa: I
    d = (
        "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
    ).encode()

    # Read single record at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
        == b"#header>foo0"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n"
    )

    # Read multiple records at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )

    # Read with offset spanning into next record, splits on either side of delimiter.
    # Read not spanning the full record returns nothing.
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
    assert (
        read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0"
    )
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""

    # Read with offset spanning multiple records, splits on either side of delimiter
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
        == b"FOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
        == b">foo1\nFOOBAR1\n"
    )

    # Read record at end, all records read to end
    tlen = len(d)
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
        == b">foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
        == b"\n>foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
        == b"foo99999\nFOOBAR99999\n"
    )
    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
        == b">foo99999\nFOOBAR99999\n"
    )
# Assumed imports, as in the first snippet of this listing.
import json
from typing import Dict
from urllib.parse import urlparse

import fsspec
import pyarrow as pa
import vineyard
from fsspec.utils import read_block
from vineyard.io.byte import ByteStreamBuilder


def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        # Used for reading the bytes of a serialized graph.
        parsed = urlparse(path)
        fs = fsspec.filesystem(parsed.scheme)
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            ret = {
                "type": "error",
                "content": "Some serialization file cannot be found. "
                           "Expected: {} and {}".format(meta_file, blob_file)
            }
            print(json.dumps(ret), flush=True)
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))

        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
            writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4
        of = fsspec.open(path, mode="rb", **storage_options)
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)
            while begin < end:
                buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
            writer.finish()
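# A minimal sketch (assumed layout, illustrative file names) of the two files
# the serialization_mode branch above consumes: "<path>_<proc_index>" stores
# the raw chunks back to back, and "<path>_<proc_index>.meta" is a JSON object
# whose "lengths" entry lists each chunk's size; the remaining keys are copied
# onto the stream builder.
import json

chunks = [b"chunk-0", b"chunk-0001"]
with open("graph.ser_0", "wb") as f:        # hypothetical blob file
    for c in chunks:
        f.write(c)
with open("graph.ser_0.meta", "w") as f:    # hypothetical meta file
    json.dump({"lengths": [len(c) for c in chunks]}, f)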
import copy

from fsspec.utils import read_block


def read_block_from_file(lazy_file, off, bs, delimiter):
    with copy.copy(lazy_file) as f:
        if off == 0 and bs is None:
            # Whole-file read: no blocking or delimiter handling needed.
            return f.read()
        return read_block(f, off, bs, delimiter)
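# A small usage sketch: fsspec's OpenFile objects are the "lazy files" this
# helper expects; they can be shallow-copied and re-entered, each `with`
# producing a fresh file object. The file name is hypothetical.
import fsspec

of = fsspec.open('data.csv', mode='rb')
first_line = read_block_from_file(of, 0, 1, b'\n')    # first full line
everything = read_block_from_file(of, 0, None, None)  # whole file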
# Assumed imports; `report_status` is a helper from the surrounding module.
import json
from typing import Dict
from urllib.parse import urlparse

import fsspec
import pyarrow as pa
import vineyard
from fsspec.core import split_protocol
from fsspec.utils import read_block
from vineyard.io.byte import ByteStreamBuilder


def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. "
                f"Expected: {meta_file} and {blob_file}"
            )
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))

        # Used for reading the bytes of a serialized graph.
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                buf_writer.write(buf)
                buf_writer.close()
            writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        try:
            protocol = split_protocol(path)[0]
            fs = fsspec.filesystem(protocol, **storage_options)
        except Exception:
            report_status("error", f"Cannot initialize such filesystem for '{path}'")
            raise
        if fs.isfile(path):
            files = [path]
        else:
            try:
                files = fs.glob(path + '*')
                assert files, f"Cannot find such files: {path}"
            except Exception:
                report_status("error", f"Cannot find such files for '{path}'")
                raise

        ''' Note [Semantic of read_block with delimiter]:

        read_block(fp, begin, size, delimiter) will:

            - find the first `delimiter` from `begin`, then start reading;
            - after `size` bytes, continue until the next `delimiter` or EOF,
              then finish reading.

        Note that the size of the returned block may exceed `size`.
        '''
        chunk_size = 1024 * 1024 * 4
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line, and only open the writer,
                # when processing the first file.
                if index == 0:
                    header_line = read_block(f, 0, 1, b'\n')
                    builder["header_line"] = header_line.decode("unicode_escape")
                    if header_row:
                        offset = len(header_line)
                    stream = builder.seal(client)
                    client.persist(stream)
                    ret = {"type": "return", "content": repr(stream.id)}
                    print(json.dumps(ret), flush=True)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
                    size = len(buf)
                    if not size:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                    buf_writer.write(buf)
                    buf_writer.close()
        writer.finish()
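# A quick, self-contained illustration of the Note above, using an in-memory
# file: with a non-zero offset read_block first seeks past the next delimiter,
# and the block it returns runs through the delimiter following `size`, so the
# result may be longer than `size` bytes (and, as the earlier tests show, can
# even be empty).
import io

from fsspec.utils import read_block

f = io.BytesIO(b"aa\nbbbb\ncc\n")
assert read_block(f, 0, 1, delimiter=b"\n") == b"aa\n"        # extended to the '\n'
assert read_block(f, 1, 4, delimiter=b"\n") == b"bbbb\n"      # starts after the first '\n'
assert read_block(f, 0, 5, delimiter=b"\n") == b"aa\nbbbb\n"  # 5 bytes, then through '\n'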
# Assumed imports; `report_error`, `report_success`, `report_exception` and
# `parse_readable_size` are helpers from the surrounding module.
import sys
import traceback
from typing import Dict

import fsspec
import vineyard
from fsspec.core import split_protocol
from fsspec.utils import read_block
from vineyard.io.byte import ByteStream


def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket path.
        path (str): External storage path to read from.
        storage_options (dict): Configurations of the external storage.
        read_options (dict): Additional options that control the read behavior.
        proc_num (int): Total number of processes.
        proc_index (int): Index of this process.

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)

    ''' Note [Semantic of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading;
        - after `size` bytes, continue until the next `delimiter` or EOF,
          then finish reading.

    Note that the size of the returned block may exceed `size`.
    '''
    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line, and only open the writer,
                # when processing the first file.
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
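# A worked illustration (hypothetical numbers) of the byte-range partitioning
# used throughout these snippets: each process derives its own [begin, end)
# range from proc_num/proc_index, and the read_block delimiter semantics mend
# records that straddle range boundaries.
total_size, offset, proc_num = 100, 8, 4   # 8-byte header line, 4 workers
part_size = (total_size - offset) // proc_num   # 23 bytes per worker
for proc_index in range(proc_num):
    begin = part_size * proc_index + offset
    end = min(begin + part_size, total_size)
    print(proc_index, (begin, end))
# 0 (8, 31)     (worker 0 also rewinds begin by 1, onto the header's '\n')
# 1 (31, 54)
# 2 (54, 77)
# 3 (77, 100)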