def dump_stream(self, iterator, stream):
    """Write every record batch from *iterator* to *stream* via an Arrow stream writer.

    The writer is created lazily from the schema of the first batch. When
    *iterator* yields nothing, a single empty batch with one string column
    named ``value`` is written instead, so the stream still carries a schema.
    The writer, if one was created, is always closed on exit.

    Args:
        iterator: iterable of record batches (each must expose ``schema``
            and, in dev mode, ``to_pandas()``).
        stream: sink handed to ``pa.RecordBatchStreamWriter``.
    """
    import pyarrow as pa
    import pyjava.utils as utils

    verbose = utils.is_dev()
    if verbose:
        print("----pyarrow version---")
        print(pa.__version__)

    out = None
    try:
        for record_batch in iterator:
            if verbose:
                print(record_batch.to_pandas())
            # The schema is only known once the first batch arrives.
            if out is None:
                out = pa.RecordBatchStreamWriter(stream, record_batch.schema)
            out.write_batch(record_batch)

        # Nothing was produced: emit a default-schema empty batch.
        if out is None:
            if verbose:
                print("----dump empty arrow---")
            fallback_schema = pa.schema([('value', pa.string())])
            empty_batch = pa.RecordBatch.from_arrays([[]], schema=fallback_schema)
            out = pa.RecordBatchStreamWriter(stream, empty_batch.schema)
            out.write_batch(empty_batch)
    finally:
        if out is not None:
            out.close()
def __init__(self, host, port, timezone):
    """Record the endpoint and prepare an unbound TCP socket with a serializer.

    Args:
        host: address stored on ``self.host``.
        port: port stored on ``self.port``.
        timezone: passed as the first argument to ``ArrowStreamPandasSerializer``.
    """
    timeout_seconds = 5 * 60  # socket operations give up after five minutes

    self.host, self.port = host, port

    conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.socket = conn
    conn.settimeout(timeout_seconds)

    # NOTE(review): the meaning of the trailing (False, None) arguments is
    # defined by ArrowStreamPandasSerializer, which is not visible here.
    self.out_ser = ArrowStreamPandasSerializer(timezone, False, None)
    self.is_bind = False  # starts out as "not bound"
    self.is_dev = utils.is_dev()
def __init__(self, python_context):
    """Initialise the context and register one data server per fetched row.

    Each row returned by ``python_context.fetch_once_as_rows()`` must provide
    ``"host"``, ``"port"`` and ``"timezone"`` entries. For every row a fresh
    UUID string is appended to ``server_ids_in_ray`` and a matching
    ``DataServer`` is appended to ``servers``, keeping both lists aligned
    by index.

    Args:
        python_context: object stored on ``self.python_context`` and queried
            for the server rows.
    """
    self.python_context = python_context
    self.servers = []
    self.server_ids_in_ray = []
    self.is_setup = False
    self.is_dev = utils.is_dev()
    self.is_in_mlsql = True
    self.mock_data = []

    rows = self.python_context.fetch_once_as_rows()
    for row in rows:
        ray_server_id = str(uuid.uuid4())
        self.server_ids_in_ray.append(ray_server_id)

        data_server = DataServer(
            row["host"], int(row["port"]), row["timezone"])
        self.servers.append(data_server)
def __init__(self, server_id, java_server, port=0, timezone="Asia/Harbin"):
    """Create and bind a ``OnceServer``, then record its resolved endpoint.

    Args:
        server_id: identifier stored on ``self.server_id``.
        java_server: peer descriptor; its ``timezone`` attribute is passed
            to ``OnceServer`` and the object is kept on ``self.java_server``.
        port: port passed to ``OnceServer`` (default ``0``).
        timezone: value stored on ``self.timezone``.

    Raises:
        Exception: whatever ``self.server.bind()`` raises. The traceback is
            printed first and the original exception is then propagated.
    """
    # NOTE(review): the server is built with java_server.timezone while
    # self.timezone keeps the `timezone` argument; confirm this is intended.
    self.server = OnceServer(
        self.get_address(), port, java_server.timezone)
    try:
        (rel_host, rel_port) = self.server.bind()
    except Exception:
        print(traceback.format_exc())
        # Re-raise: without a successful bind there is no host/port to record,
        # and falling through would fail with an UnboundLocalError that hides
        # the real cause.
        raise
    self.host = rel_host
    self.port = rel_port
    self.timezone = timezone
    self.server_id = server_id
    self.java_server = java_server
    self.is_dev = utils.is_dev()