Example #1
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            builder[k] = "1" if v else "0"
        elif k == "delimiter":
            builder[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            builder[k] = v

    offset = 0
    chunk_size = 1024 * 1024 * 4
    of = fsspec.open(path, mode="rb", **storage_options)
    with of as f:
        header_line = read_block(f, 0, 1, b'\n')
        builder["header_line"] = header_line.decode("unicode_escape")
        if header_row:
            offset = len(header_line)
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)

        writer = stream.open_writer(client)
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size
        part_size = (total_size - offset) // proc_num
        begin = part_size * proc_index + offset
        end = min(begin + part_size, total_size)
        if proc_index == 0:
            begin -= int(header_row)

        while begin < end:
            buf = read_block(f,
                             begin,
                             min(chunk_size, end - begin),
                             delimiter=b"\n")
            size = len(buf)
            if not size:
                break
            begin += size - 1
            chunk = writer.next(size)
            buf_writer = pa.FixedSizeBufferWriter(chunk)
            buf_writer.write(buf)
            buf_writer.close()

        writer.finish()
Example #2
def read_byte_stream(
    client,
    fs: AbstractFileSystem,
    stream: ByteStream,
    path: str,
    chunk_size: int = CHUNK_SIZE,
):
    logger.info('start reading blob at %s', path)
    with fs.open(path, mode="rb") as f:
        try:
            total_size = f.size()
        except TypeError:
            total_size = f.size

        writer = stream.open_writer(client)
        try:
            begin, end = 0, total_size
            while begin < end:
                buffer = read_block(f, begin, min(chunk_size, end - begin))
                chunk = writer.next(len(buffer))
                vineyard.memory_copy(chunk, 0, buffer)
                begin += len(buffer)
        except Exception:
            report_exception()
            writer.fail()
            sys.exit(-1)

        writer.finish()
        return total_size
Example #3
def test_read_block(ftp_writable):
    # not the same as test_read_block in test_utils; this depends on the
    # behaviour of the bytes caching
    from fsspec.utils import read_block
    host, port, user, pw = ftp_writable
    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
    fn = "/myfile"
    with fs.open(fn, 'wb') as f:
        f.write(b'a,b\n1,2')
    f = fs.open(fn, 'rb', cache_type='bytes')
    assert read_block(f, 0, 6400, b'\n') == b'a,b\n1,2'
Example #4
def read_chunk(fobj, off, l, head):
    """Get rows from raw bytes block"""
    import fastavro

    if hasattr(fastavro, "iter_avro"):
        reader = fastavro.iter_avro
    else:
        reader = fastavro.reader

    with fobj as f:
        chunk = read_block(f, off, l, head["sync"])
    head_bytes = head["head_bytes"]
    if not chunk.startswith(MAGIC):
        chunk = head_bytes + chunk
    i = io.BytesIO(chunk)
    return list(reader(i))
Example #5
def test_read_block():
    delimiter = b"\n"
    data = delimiter.join([b"123", b"456", b"789"])
    f = io.BytesIO(data)

    assert read_block(f, 1, 2) == b"23"
    assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
    assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
    assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
    assert read_block(f, 1, 1, delimiter=b"\n") == b""
    assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
    assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"

    for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
        out = [read_block(f, o, l, b"\n") for o, l in ols]
        assert b"".join(filter(None, out)) == data
Example #6
def test_read_block_split_before():
    """Test start/middle/end cases of split_before."""  # noqa: I
    d = (
        "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
    ).encode()

    # Read single record at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
        == b"#header>foo0"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n"
    )

    # Read multiple records at beginning.
    # All reads include beginning of file and read through termination of
    # delimited record.
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">")
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
    )
    assert (
        read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
    )

    # Read with offset spanning into next record, splits on either side of delimiter.
    # Read not spanning the full record returns nothing.
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
    assert (
        read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0"
    )
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""

    # Read with offset spanning multiple records, splits on either side of delimiter
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
        == b"FOOBAR0\n>foo1\nFOOBAR1\n"
    )
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
        == b"\nFOOBAR0\n>foo1\nFOOBAR1"
    )
    assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
    assert (
        read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
        == b">foo1\nFOOBAR1\n"
    )

    # Read record at end, all records read to end

    tlen = len(d)

    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
        == b">foo99999\nFOOBAR99999\n"
    )

    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
        == b"\n>foo99999\nFOOBAR99999\n"
    )

    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
        == b"foo99999\nFOOBAR99999\n"
    )

    assert (
        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
        == b">foo99999\nFOOBAR99999\n"
    )
Example #7
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket of the vineyard server
        path (str): External storage path to read from
        storage_options (dict): Configurations of the external storage
        read_options (dict): Additional options that control the behavior of the read
        proc_num (int): Total number of processes
        proc_index (int): Index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        fs = fsspec.filesystem(parsed.scheme)
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            ret = {
                "type": "error",
                "content": "Some serialization file cannot be found. "
                           "Expected: {} and {}".format(meta_file, blob_file),
            }
            print(json.dumps(ret), flush=True)
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))
        # Used for reading the bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)
        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage.
        # Usually for loading a property graph
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4
        of = fsspec.open(path, mode="rb", **storage_options)
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)

            while begin < end:
                buf = read_block(f,
                                 begin,
                                 min(chunk_size, end - begin),
                                 delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()

            writer.finish()
Example #8
def read_block_from_file(lazy_file, off, bs, delimiter):
    with copy.copy(lazy_file) as f:
        if off == 0 and bs is None:
            return f.read()
        return read_block(f, off, bs, delimiter)
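Example #8 takes a lazy fsspec OpenFile rather than an already opened handle, and copies it before entering the context manager so that several delayed tasks can share one description of the file without sharing a single open handle. A small usage sketch under that assumption (the file name is hypothetical, and the helper is repeated so the snippet is self-contained):

import copy
import fsspec
from fsspec.utils import read_block

def read_block_from_file(lazy_file, off, bs, delimiter):
    # same helper as in Example #8 above
    with copy.copy(lazy_file) as f:
        if off == 0 and bs is None:
            return f.read()
        return read_block(f, off, bs, delimiter)

# fsspec.open() does not open the file yet; it returns a lazy OpenFile that
# is only opened inside the helper's `with` block.
lazy = fsspec.open("data.csv", mode="rb")   # hypothetical local file
first_records = read_block_from_file(lazy, 0, 1024, b"\n")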
Example #9
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket of the vineyard server
        path (str): External storage path to read from
        storage_options (dict): Configurations of the external storage
        read_options (dict): Additional options that control the behavior of the read
        proc_num (int): Total number of processes
        proc_index (int): Index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. Expected: {meta_file} and {blob_file}"
            )
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))
        # Used for reading the bytes of a serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)
        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage.
        # Usually for loading a property graph
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        try:
            protocol = split_protocol(path)[0]
            fs = fsspec.filesystem(protocol, **storage_options)
        except Exception as e:
            report_status("error",
                          f"Cannot initialize such filesystem for '{path}'")
            raise

        if fs.isfile(path):
            files = [path]
        else:
            try:
                files = fs.glob(path + '*')
                assert files, f"Cannot find such files: {path}"
            except Exception:
                report_status("error", f"Cannot find such files for '{path}'")
                raise
        ''' Note [Semantic of read_block with delimiter]:

        read_block(fp, begin, size, delimiter) will:

            - find the first `delimiter` from `begin`, then start reading
            - after `size` bytes, continue until the next `delimiter` or EOF, then finish reading.
              Note that the returned size may exceed `size`.
        '''

        chunk_size = 1024 * 1024 * 4
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line and open the writer
                # when handling the first file
                if index == 0:
                    header_line = read_block(f, 0, 1, b'\n')
                    builder["header_line"] = header_line.decode(
                        "unicode_escape")
                    if header_row:
                        offset = len(header_line)
                    stream = builder.seal(client)
                    client.persist(stream)
                    ret = {"type": "return", "content": repr(stream.id)}
                    print(json.dumps(ret), flush=True)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buf = read_block(f,
                                     begin,
                                     min(chunk_size, end - begin),
                                     delimiter=b"\n")
                    size = len(buf)
                    if not size:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                    buf_writer.write(buf)
                    buf_writer.close()
        writer.finish()
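The Note [Semantic of read_block with delimiter] above is what the `begin += size - 1` stepping in these examples relies on: every block is extended to a record boundary, and the next read re-locates that boundary before continuing. A minimal, self-contained illustration of those semantics on an in-memory buffer (the byte values are invented for the example):

import io
from fsspec.utils import read_block

data = b"id,name\n1,alice\n2,bob\n3,carol\n"
f = io.BytesIO(data)

# From offset 0 the block starts at the very beginning and is extended
# forward to the next newline, so the header row comes back whole.
assert read_block(f, 0, 1, delimiter=b"\n") == b"id,name\n"

# A non-zero offset first skips to the byte after the next newline, then
# reads through to another newline, so only whole records are returned --
# the result may be longer or shorter than the requested size.
assert read_block(f, 10, 10, delimiter=b"\n") == b"2,bob\n"

# For comparison, a plain byte slice of the same range cuts records in half.
assert data[10:20] == b"alice\n2,bo"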
Example #10
def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket of the vineyard server
        path (str): External storage path to read from
        storage_options (dict): Configurations of the external storage
        read_options (dict): Additional options that control the behavior of the read
        proc_num (int): Total number of processes
        proc_index (int): Index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage.
    # Usually for loading a property graph
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)
    ''' Note [Semantic of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading
        - after `size` bytes, continue until the next `delimiter` or EOF, then finish reading.
          Note that the returned size may exceed `size`.
    '''

    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process the header line and open the writer
                # when handling the first file
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
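All read_bytes variants above are meant to be launched once per worker process: each process writes only its own slice of the input into its ByteStream and reports the resulting stream id back to the caller (the JSON "return" message, or report_success in Example #10). A hedged driver sketch for the Example #10 variant, assuming a running vineyard instance and that read_bytes is importable; the socket path, file path, and options are illustrative only:

from multiprocessing import Process

# Hypothetical values -- adjust to the actual deployment.
SOCKET = "/var/run/vineyard.sock"
PATH = "file:///tmp/dataset.csv"
PROC_NUM = 4

if __name__ == "__main__":
    workers = [
        Process(
            target=read_bytes,   # the Example #10 function, assumed importable
            args=(SOCKET, PATH, {}, {"header_row": True, "delimiter": ","},
                  PROC_NUM, proc_index),
        )
        for proc_index in range(PROC_NUM)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()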