Example 1
def put_df(df):
    id_num = df['rIDs'].values[0]
    id_num = str(id_num)
    if id_num == '23':
        id_num = 'X'
    elif id_num == '24':
        id_num = 'Y'
    elif id_num == '25':
        id_num = 'M'

    record_batch = pa.RecordBatch.from_pandas(df)
    record_batch_rows = record_batch.num_rows
    record_batch_rows_actual = record_batch_rows
    index = 0
    limit = 5714285
    check = False
    print(record_batch_rows_actual)
    i = 0
    while record_batch_rows > limit:

        check = True
        record_batch_rows = record_batch_rows - limit
        record_batch_slice = record_batch.slice(index, limit)
        index = index + limit

        # Get size of record batch and schema
        mock_sink = pa.MockOutputStream()
        stream_writer = pa.RecordBatchStreamWriter(mock_sink,
                                                   record_batch_slice.schema)
        stream_writer.write_batch(record_batch_slice)
        stream_writer.close()
        data_size = mock_sink.size()

        # Generate an ID and allocate a buffer in the object store for the
        # serialized DataFrame
        object_id = plasma.ObjectID(''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(20)).encode())
        buf = client.create(object_id, data_size)

        # Write the serialized DataFrame to the object store
        sink = pa.FixedSizeBufferWriter(buf)
        stream_writer = pa.RecordBatchStreamWriter(sink,
                                                   record_batch_slice.schema)
        stream_writer.write_batch(record_batch_slice)
        stream_writer.close()

        # Seal the object
        client.seal(object_id)

        # Append the chromosome tag and object ID so readers can look it up later
        with open("/home/tahmad/bulk/apps/objIDsPy.txt", "a") as f:
            f.write('Chr' + id_num + '_' + str(i) + '\t' +
                    object_id.binary().decode() + '\n')
        i += 1

    if check:
        record_batch = record_batch.slice(index, record_batch_rows)

    # Get size of record batch and schema
    mock_sink = pa.MockOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    stream_writer.close()
    data_size = mock_sink.size()

    # Generate an ID and allocate a buffer in the object store for the
    # serialized DataFrame
    object_id = plasma.ObjectID(''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(20)).encode())
    buf = client.create(object_id, data_size)

    # Write the serialized DataFrame to the object store
    sink = pa.FixedSizeBufferWriter(buf)
    stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    stream_writer.close()

    # Seal the object
    client.seal(object_id)

    #get_df(object_id) #Loopback

    return object_id, id_num
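
put_df above relies on module-level state (the pyarrow, plasma, random and string imports and a connected plasma client) that is not shown in this excerpt. A minimal, self-contained sketch of the same measure-then-write pattern, assuming a plasma store listening on a placeholder socket path and placeholder DataFrame contents:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.plasma as plasma

client = plasma.connect("/tmp/plasma")            # assumes a running plasma store
df = pd.DataFrame({"rIDs": [1], "pos": [42]})     # placeholder data
record_batch = pa.RecordBatch.from_pandas(df)

# Dry run against a MockOutputStream to learn the exact stream size
writer_sink = pa.MockOutputStream()
writer = pa.RecordBatchStreamWriter(writer_sink, record_batch.schema)
writer.write_batch(record_batch)
writer.close()
data_size = writer_sink.size()

# Allocate a buffer of that size in the object store, write the stream, seal it
object_id = plasma.ObjectID(np.random.bytes(20))
buf = client.create(object_id, data_size)
writer = pa.RecordBatchStreamWriter(pa.FixedSizeBufferWriter(buf),
                                    record_batch.schema)
writer.write_batch(record_batch)
writer.close()
client.seal(object_id)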
Example 2
    def __init__(self,
                 vineyard_client,
                 session_id,
                 data_key,
                 data_id,
                 mode='w',
                 nbytes=None,
                 packed=False,
                 compress=None,
                 auto_register=True,
                 pin_token=None,
                 handler=None):
        from .objectholder import SharedHolderActor

        logger.debug(
            'create vineyard bytes IO: mode = %s, packed = %s, compress = %r',
            mode, packed, compress)

        super().__init__(session_id, data_key, mode=mode, handler=handler)
        self._client = vineyard_client
        self._data_id = data_id
        self._buffer = None
        self._offset = 0
        self._nbytes = nbytes
        self._holder_ref = self._storage_ctx.actor_ctx.actor_ref(
            SharedHolderActor.default_uid())
        self._compress = compress or dataserializer.CompressType.NONE
        self._packed = packed
        self._auto_register = auto_register
        self._pin_token = pin_token

        block_size = options.worker.copy_block_size

        if self.is_writable:
            logger.debug(
                'bytes io write: session_id = %s, data_key = %s, size = %d',
                session_id, data_key, nbytes)
            self._buffer = pyarrow.allocate_buffer(nbytes, resizable=False)
            if packed:
                self._buf = ArrowBufferIO(self._buffer,
                                          'w',
                                          block_size=block_size)
            else:
                self._buf = pyarrow.FixedSizeBufferWriter(self._buffer)
                self._buf.set_memcopy_threads(6)
        elif self.is_readable:
            logger.debug(
                'bytes io get: session_id = %s, data_key = %s, data_id = %r',
                session_id, data_key, data_id)
            data = self._client.get(data_id)

            self._buffer = pyarrow.serialize(
                data, dataserializer.mars_serialize_context()).to_buffer()
            if packed:
                self._buf = ArrowBufferIO(self._buffer,
                                          'r',
                                          compress_out=compress,
                                          block_size=block_size)
                self._nbytes = len(self._buffer)
            else:
                self._mv = memoryview(self._buffer)
                self._nbytes = len(self._buffer)
        else:
            raise NotImplementedError
Example 3
    def _write_init(self):
        self._buffer = buf = self._plasma_client.create(
            self._object_id, self._size)
        file = self._file = pa.FixedSizeBufferWriter(buf)
        file.set_memcopy_threads(6)
Example 4
def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)

    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    start = time.perf_counter()

    print("data loaded")

    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")

    print(2)

    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys

authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
    """ if rand else """
import os
import pyarrow
import sys

ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
    """

    tables = []

    for i in range(len(batches)):
        id_ = randString()

        strId = makeID(id_)

        mock_sink = pyarrow.MockOutputStream()  #find data size
        stream_writer = pyarrow.RecordBatchStreamWriter(
            mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()
        buf = client.create(strId, data_size)

        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(
            stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()

        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)

        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            datatable = pyarrow.concat_tables(
                [datatable, reader.read_all()])

    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.clock() - start))

    return html
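
The randString and makeID helpers used above are not part of this excerpt. A purely hypothetical pair, assuming Plasma's 20-byte ASCII object IDs, could look like this:

import random
import string
from pyarrow import plasma

def randString(length=20):
    # Hypothetical helper: random 20-character ASCII identifier
    return ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(length))

def makeID(id_str):
    # Hypothetical helper: wrap the 20-byte string into a plasma.ObjectID
    return plasma.ObjectID(id_str.encode())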
Example 5
    def testPlasmaSharedStore(self):
        import pyarrow
        from pyarrow import plasma

        store_size = 10 * 1024**2
        test_addr = f'127.0.0.1:{get_next_port()}'
        options.worker.plasma_dir = '/dev/shm' if os.path.exists(
            '/dev/shm') else '/tmp'
        with plasma.start_plasma_store(store_size) as (sckt, _), \
                create_actor_pool(n_process=1, address=test_addr) as pool:
            km_ref = pool.create_actor(PlasmaKeyMapActor,
                                       uid=PlasmaKeyMapActor.default_uid())
            try:
                plasma_client = plasma.connect(sckt)
            except TypeError:
                plasma_client = plasma.connect(sckt, '', 0)
            store = PlasmaSharedStore(plasma_client, km_ref)

            self.assertGreater(store.get_actual_capacity(store_size),
                               store_size / 2)

            session_id = str(uuid.uuid4())
            data_list = [
                np.random.randint(0, 32767, (655360, ), np.int16)
                for _ in range(20)
            ]
            key_list = [str(uuid.uuid4()) for _ in range(20)]

            self.assertFalse(store.contains(session_id, str(uuid.uuid4())))
            with self.assertRaises(KeyError):
                store.get(session_id, str(uuid.uuid4()))
            with self.assertRaises(KeyError):
                store.get_actual_size(session_id, str(uuid.uuid4()))
            with self.assertRaises(KeyError):
                store.seal(session_id, str(uuid.uuid4()))

            fake_data_key = str(uuid.uuid4())
            km_ref.put(session_id, fake_data_key,
                       plasma.ObjectID.from_random())
            self.assertFalse(store.contains(session_id, fake_data_key))
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(KeyError):
                km_ref.put(session_id, fake_data_key,
                           plasma.ObjectID.from_random())
                store.get(session_id, fake_data_key)
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(KeyError):
                km_ref.put(session_id, fake_data_key,
                           plasma.ObjectID.from_random())
                store.seal(session_id, fake_data_key)
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(KeyError):
                km_ref.put(session_id, fake_data_key,
                           plasma.ObjectID.from_random())
                store.get_actual_size(session_id, fake_data_key)
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(KeyError):
                km_ref.put(session_id, fake_data_key,
                           plasma.ObjectID.from_random())
                store.get_buffer(session_id, fake_data_key)
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            store.delete(session_id, fake_data_key)

            with self.assertRaises(SerializationFailed):
                non_serial = type('non_serial', (object, ), dict(nbytes=10))
                store.put(session_id, fake_data_key, non_serial())
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(Exception):
                store.create(session_id, fake_data_key, 'abcd')
            self.assertIsNone(km_ref.get(session_id, fake_data_key))
            with self.assertRaises(StorageFull):
                store.create(session_id, fake_data_key, store_size * 2)
            self.assertIsNone(km_ref.get(session_id, fake_data_key))

            arrow_ser = pyarrow.serialize(data_list[0])
            buf = store.create(session_id, key_list[0], arrow_ser.total_bytes)
            writer = pyarrow.FixedSizeBufferWriter(buf)
            arrow_ser.write_to(writer)
            writer.close()
            store.seal(session_id, key_list[0])

            self.assertTrue(store.contains(session_id, key_list[0]))
            self.assertEqual(store.get_actual_size(session_id, key_list[0]),
                             arrow_ser.total_bytes)
            assert_allclose(store.get(session_id, key_list[0]), data_list[0])
            assert_allclose(
                pyarrow.deserialize(store.get_buffer(session_id, key_list[0])),
                data_list[0])

            with self.assertRaises(StorageDataExists):
                store.create(session_id, key_list[0], arrow_ser.total_bytes)
            self.assertIsNotNone(km_ref.get(session_id, key_list[0]))
            store.delete(session_id, key_list[0])
            del buf

            bufs = []
            for key, data in zip(key_list, data_list):
                try:
                    bufs.append(store.put(session_id, key, data))
                except StorageFull:
                    break
            del bufs

            store._plasma_limit = 0
            with self.assertRaises(StorageFull):
                store.create(session_id, fake_data_key, store_size * 2)
Example 6
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): Ipc socket
        path (str): External storage path to write to
        storage_options (dict): Configurations of external storage
        read_options (dict): Additional options that could control the behavior of read
        proc_num (int): Total amount of process
        proc_index (int): The sequence of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status("error", f"Some serialization file cannot be found. Expected: {meta_file} and {blob_file}")
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))
        # Used for read bytes of serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)
        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage.
        # Usually for load a property graph
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        offset = 0
        chunk_size = 1024 * 1024 * 4
        try:
            of = fsspec.open(path, mode="rb", **storage_options)
        except Exception as e:
            report_status("error", str(e))
            raise
        with of as f:
            header_line = read_block(f, 0, 1, b'\n')
            builder["header_line"] = header_line.decode("unicode_escape")
            if header_row:
                offset = len(header_line)
            stream = builder.seal(client)
            client.persist(stream)
            ret = {"type": "return", "content": repr(stream.id)}
            print(json.dumps(ret), flush=True)

            writer = stream.open_writer(client)
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            part_size = (total_size - offset) // proc_num
            begin = part_size * proc_index + offset
            end = min(begin + part_size, total_size)
            if proc_index == 0:
                begin -= int(header_row)

            while begin < end:
                buf = read_block(f, begin, min(chunk_size, end - begin), delimiter=b"\n")
                size = len(buf)
                if not size:
                    break
                begin += size - 1
                chunk = writer.next(size)
                buf_writer = pa.FixedSizeBufferWriter(chunk)
                buf_writer.write(buf)
                buf_writer.close()

            writer.finish()
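
A hypothetical invocation of read_bytes is shown below; the socket path, storage path, and option values are placeholders, and the real driver that parses these arguments is not part of this excerpt:

read_bytes(
    vineyard_socket="/tmp/vineyard.sock",   # placeholder IPC socket
    path="hdfs:///datasets/edges.csv",      # placeholder storage path
    storage_options={},                     # e.g. fsspec credentials
    read_options={"header_row": True, "delimiter": ","},
    proc_num=4,                             # total number of worker processes
    proc_index=0,                           # index of this worker
)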
Example 7
    def _write_init(self):
        self._buffer = buf = self._client.create_blob(self._size)
        self._object_id = buf.id
        file = self._file = pa.FixedSizeBufferWriter(buf.buffer)
        file.set_memcopy_threads(6)
Example 8
def _output_to_memory(
    obj: pa.Buffer,
    client: plasma.PlasmaClient,
    obj_id: Optional[plasma.ObjectID] = None,
    metadata: Optional[bytes] = None,
    memcopy_threads: int = 6,
) -> plasma.ObjectID:
    """Outputs an object to memory.

    Args:
        obj: Object to output to memory.
        client: A PlasmaClient to interface with the in-memory object
            store.
        obj_id: The ID to assign to the `obj` inside the plasma store.
            If ``None`` then one is randomly generated.
        metadata: Metadata to add to the `obj` inside the store.
        memcopy_threads: The number of threads to use to write the
            `obj` into the object store for large objects.

    Returns:
        The ID of the object inside the store. Either the given `obj_id`
        or a randomly generated one.

    Raises:
        MemoryError: If the `obj` does not fit in memory.
    """
    # Check whether the object to be passed in memory actually fits in
    # memory. We check explicitly instead of just trying to insert it,
    # because inserting into an already full Plasma store starts evicting
    # objects to free up space, and we want to keep control over which
    # objects get evicted.
    # obj.size -> "The buffer size in bytes."
    total_size = obj.size
    if metadata is not None:
        total_size += len(metadata)

    occupied_size = sum(info["data_size"] + info["metadata_size"]
                        for info in client.list().values())
    # Take a percentage of the maximum capacity such that the message
    # for object eviction always fits inside the store.
    store_capacity = (Config.MAX_RELATIVE_STORE_CAPACITY *
                      client.store_capacity())
    available_size = store_capacity - occupied_size

    if total_size > available_size:
        raise MemoryError("Object does not fit in memory")

    # In case no `obj_id` is specified, one has to be generated because
    # an ID is required for an object to be inserted in the store.
    if obj_id is None:
        obj_id = plasma.ObjectID.from_random()

    # Write the object to the plasma store. If the obj_id already
    # exists, then it first has to be deleted. Essentially we are
    # overwriting the data (just like we do for disk)
    try:
        buffer = client.create(obj_id, obj.size, metadata=metadata)
    except plasma.PlasmaObjectExists:
        client.delete([obj_id])
        buffer = client.create(obj_id, obj.size, metadata=metadata)

    stream = pa.FixedSizeBufferWriter(buffer)
    stream.set_memcopy_threads(memcopy_threads)

    stream.write(obj)
    client.seal(obj_id)

    return obj_id
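
A sketch of how _output_to_memory might be called, assuming a running Plasma store at a placeholder socket path; any pa.Buffer (here built with pa.py_buffer) can be passed as obj:

import pyarrow as pa
from pyarrow import plasma

client = plasma.connect("/tmp/plasma")            # placeholder socket path
payload = pa.py_buffer(b"example payload bytes")  # any pa.Buffer works
obj_id = _output_to_memory(payload, client)
print(obj_id)  # the plasma.ObjectID under which the buffer was sealed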
Example 9
    def __init__(self, buf):
        import pyarrow
        self._buf = buf
        self._writer = pyarrow.FixedSizeBufferWriter(buf)
        self._writer.set_memcopy_threads(6)
        self._decompressor = None
Example 10
def read_hdfs_bytes(vineyard_socket, path, proc_num, proc_index):
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    host, port = urlparse(path).netloc.split(':')
    hdfs = HDFileSystem(host=host,
                        port=int(port),
                        pars={"dfs.client.read.shortcircuit": "false"})

    header_row = False
    fragments = urlparse(path).fragment.split('&')
    path = urlparse(path).path

    for frag in fragments:
        try:
            k, v = frag.split('=')
        except ValueError:
            # Ignore fragments without a '=' separator
            pass
        else:
            if k == 'header_row':
                header_row = (v.upper() == 'TRUE')
                if header_row:
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            elif k == 'delimiter':
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            elif k == 'include_all_columns':
                if v.upper() == 'TRUE':
                    builder[k] = '1'
                else:
                    builder[k] = '0'
            else:
                builder[k] = v

    offset = 0
    chunk_size = 1024 * 1024 * 4

    header_line = hdfs.read_block(path, 0, 1, b'\n')
    builder['header_line'] = header_line.decode('unicode_escape')
    if header_row:
        offset = len(header_line)

    stream = builder.seal(client)
    client.persist(stream)

    ret = {'type': 'return', 'content': repr(stream.id)}
    print(json.dumps(ret), flush=True)

    writer = stream.open_writer(client)

    total_size = hdfs.info(path)['size']
    begin = (total_size - offset) // proc_num * proc_index + offset
    end = (total_size - offset) // proc_num + begin
    if proc_index + 1 == proc_num:
        end = total_size
    if proc_index:
        begin = next_delimiter(hdfs, path, begin, end, b'\n')
    else:
        begin -= int(header_row)

    offset = begin
    while offset < end:
        buf = hdfs.read_block(path, offset, min(chunk_size, end - offset),
                              b'\n')
        size = len(buf)
        if not size:
            break
        offset += size - 1
        chunk = writer.next(size)
        buf_writer = pa.FixedSizeBufferWriter(chunk)
        buf_writer.write(buf)
        buf_writer.close()

    writer.finish()