def get_pid(self) -> typing.Optional[int]:
    """Return the pid recorded in the remote pid file, or None.

    None is returned when the pid file cannot be read or when no
    process with that pid exists on the remote host.
    """
    pid_path = self._path.joinpath('pid')

    # read the pid file over SSH
    cat_output: typing.List[bytes] = []
    read_failed = connection.run_ssh(
        self._ssh_client,
        ['cat', str(pid_path)],
        iteration.empty_in(),
        iteration.collect_out(cat_output),
        iteration.ignore_out()
    )()

    if read_failed:
        return None

    pid = int(b''.join(cat_output).decode().strip())

    # probe the process: `kill -0` fails iff the pid does not exist
    probe_failed = connection.run_ssh(
        self._ssh_client,
        ['kill', '-0', str(pid)],
        iteration.empty_in(),
        iteration.empty_out(),
        iteration.ignore_out()
    )()

    if probe_failed:
        return None

    return pid
def _require_ssh(self) -> None:
    """Lazily open the SSH connection and cache remote lookup results.

    Connects on first use, then runs the remote `ck.clickhouse.lookup`
    helper to discover the default data directory and binary path.
    Raises ShellError when the remote helper fails.
    """
    # connect only once
    if self._ssh_client is None:
        self._ssh_client = connection.connect_ssh(
            self._host,
            self._ssh_port,
            self._ssh_username,
            self._ssh_password,
            self._ssh_public_key
        )

    # run the remote lookup helper
    out_chunks: typing.List[bytes] = []
    err_chunks: typing.List[bytes] = []
    lookup_failed = connection.run_ssh(
        self._ssh_client,
        [*self._ssh_command_prefix, 'python3', '-m', 'ck.clickhouse.lookup'],
        iteration.empty_in(),
        iteration.collect_out(out_chunks),
        iteration.collect_out(err_chunks)
    )()

    if lookup_failed:
        raise exception.ShellError(self._host, b''.join(err_chunks).decode())

    # helper prints exactly two lines: data dir, then binary path
    self._ssh_default_data_dir, self._ssh_binary_file = \
        b''.join(out_chunks).decode().splitlines()
def stop(self, ping_interval: float = 0.1, ping_retry: int = 50) -> typing.Optional[int]:
    """Stop the remote ClickHouse server.

    Sends SIGTERM, then polls up to `ping_retry` times at
    `ping_interval` seconds; if the process survives, escalates to
    SIGKILL and waits until it is gone.

    Returns the pid of the stopped server, or None when no server was
    running.  Raises ShellError when the remote kill command fails.
    """
    pid = self.get_pid()

    if pid is None:
        return None

    # kill process: graceful shutdown via SIGTERM first
    stderr_list: typing.List[bytes] = []

    assert self._ssh_client is not None

    if connection.run_ssh(
            self._ssh_client,
            ['kill', '-15', str(pid)],
            iteration.empty_in(),
            iteration.empty_out(),
            iteration.collect_out(stderr_list)
    )():
        # fix: decode stderr so ShellError carries str, consistent with
        # the ShellError call in _require_ssh
        raise exception.ShellError(self._host, b''.join(stderr_list).decode())

    for _ in range(ping_retry):
        if self.get_pid() is None:
            break
        time.sleep(ping_interval)
    else:
        # still alive after the grace period: escalate to SIGKILL
        stderr_list = []

        if connection.run_ssh(
                self._ssh_client,
                ['kill', '-9', str(pid)],
                iteration.empty_in(),
                iteration.empty_out(),
                iteration.collect_out(stderr_list)
        )():
            # fix: decode stderr (same consistency fix as above)
            raise exception.ShellError(self._host, b''.join(stderr_list).decode())

    # wait until the process has actually disappeared
    while self.get_pid() is not None:
        time.sleep(ping_interval)

    return pid
def start(self, ping_interval: float = 0.1, ping_retry: int = 50) -> typing.Optional[int]:
    """Start a local ClickHouse daemon.

    Creates the data directory, writes the server configuration, spawns
    the daemon, and waits until a pid appears and the server answers
    pings.  Returns the new pid, or None when a server is already
    running.  Raises ServiceError when the daemon fails to come up.
    """
    pid = self.get_pid()

    if pid is not None:
        return None

    config_path = self._path.joinpath('config.xml')
    pid_path = self._path.joinpath('pid')

    # create dir
    self._path.mkdir(parents=True, exist_ok=True)

    # write the server configuration
    clickhouse.create_config(
        self._tcp_port,
        self._http_port,
        self._user,
        self._password,
        str(self._path),
        self._memory_limit,
        self._config
    )

    # launch the daemon process
    launch_failed = connection.run_process(
        [
            clickhouse.binary_file(),
            'server',
            '--daemon',
            f'--config-file={config_path}',
            f'--pid-file={pid_path}',
        ],
        iteration.empty_in(),
        iteration.empty_out(),
        iteration.empty_out()
    )()

    if launch_failed:
        raise exception.ServiceError(self._host, 'daemon')

    # wait for the pid file to appear
    for _ in range(ping_retry):
        pid = self.get_pid()

        if pid is not None:
            break

        time.sleep(ping_interval)
    else:
        raise exception.ServiceError(self._host, 'pid')

    # wait until the server answers pings; bail out if it dies meanwhile
    while not self.ping():
        time.sleep(ping_interval)

        if self.get_pid() is None:
            raise exception.ServiceError(self._host, f'pid_{pid}')

    return pid
def test_iteration_adhoc_in() -> None:
    """Ad-hoc input generators: empty, given, and concatenated."""
    # an empty generator yields nothing
    assert list(iteration.empty_in()) == []

    # a given generator yields exactly the provided chunks
    assert list(iteration.given_in([b'1', b'2', b'3'])) == [b'1', b'2', b'3']

    # concatenation chains generators in order
    concatenated = iteration.concat_in(
        iteration.given_in([b'1', b'2']),
        iteration.given_in([b'3'])
    )
    assert list(concatenated) == [b'1', b'2', b'3']
def query_file_async(
        self,
        query: str,
        path_in: typing.Optional[str] = None,
        path_out: typing.Optional[str] = None,
        method: typing.Optional[
            typing_extensions.Literal['tcp', 'http', 'ssh']
        ] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None
) -> typing.Callable[[], None]:
    """Run a query asynchronously, streaming from/to files.

    When `path_in`/`path_out` is None, the corresponding stream is
    empty.  Returns a join callable that completes the query.
    """
    # choose input stream: file contents or nothing
    gen_in = (
        iteration.empty_in()
        if path_in is None
        else iteration.file_in(path_in)
    )

    # choose output stream: file sink or nothing
    gen_out = (
        iteration.empty_out()
        if path_out is None
        else iteration.file_out(path_out)
    )

    return self._run(query, gen_in, gen_out, method, settings)
def start(self, ping_interval: float = 0.1, ping_retry: int = 50) -> typing.Optional[int]:
    """Start the remote ClickHouse server as a daemon.

    Creates the remote data directory, pushes the configuration through
    the remote `ck.clickhouse.setup` helper, launches the daemon, then
    waits until a pid appears and the server answers pings.

    Returns the new pid, or None when a server is already running.
    Raises ShellError on remote shell failures and ServiceError when
    the daemon fails to come up.
    """
    pid = self.get_pid()

    if pid is not None:
        return None

    config_path = self._path.joinpath('config.xml')
    pid_path = self._path.joinpath('pid')

    # create dir
    stderr_list: typing.List[bytes] = []

    if connection.run_ssh(
            self._ssh_client,
            ['mkdir', '--parents', str(self._path)],
            iteration.empty_in(),
            iteration.empty_out(),
            iteration.collect_out(stderr_list)
    )():
        # fix: decode stderr so ShellError carries str, consistent with
        # the ShellError call in _require_ssh
        raise exception.ShellError(self._host, b''.join(stderr_list).decode())

    # setup: feed the config dict to the remote setup helper via stdin
    stderr_list = []

    if connection.run_ssh(
            self._ssh_client,
            [*self._ssh_command_prefix, 'python3', '-m', 'ck.clickhouse.setup'],
            iteration.given_in([
                repr({
                    'tcp_port': self._tcp_port,
                    'http_port': self._http_port,
                    'user': self._user,
                    'password': self._password,
                    'data_dir': str(self._path),
                    'config': self._config,
                }).encode()
            ]),
            iteration.empty_out(),
            iteration.collect_out(stderr_list)
    )():
        # fix: decode stderr (same consistency fix as above)
        raise exception.ShellError(self._host, b''.join(stderr_list).decode())

    # run the daemon using the binary discovered by _require_ssh
    assert self._ssh_binary_file is not None

    if connection.run_ssh(
            self._ssh_client,
            [
                *self._ssh_command_prefix,
                self._ssh_binary_file,
                'server',
                '--daemon',
                f'--config-file={config_path}',
                f'--pid-file={pid_path}',
            ],
            iteration.empty_in(),
            iteration.empty_out(),
            iteration.empty_out()
    )():
        raise exception.ServiceError(self._host, 'daemon')

    # wait for server initialization: pid file first
    for _ in range(ping_retry):
        pid = self.get_pid()

        if pid is not None:
            break

        time.sleep(ping_interval)
    else:
        raise exception.ServiceError(self._host, 'pid')

    # then wait until the server answers pings; bail out if it dies
    while not self.ping():
        time.sleep(ping_interval)

        if self.get_pid() is None:
            raise exception.ServiceError(self._host, f'pid_{pid}')

    return pid
def query_pandas_async(
        self,
        query: str,
        dataframe: typing.Optional[pandas.DataFrame] = None,
        method: typing.Optional[
            typing_extensions.Literal['tcp', 'http', 'ssh']
        ] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None,
        join_interval: float = 0.1
) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
    """Run a query asynchronously with pandas input or output.

    When `dataframe` is None the query's result is read back as a
    DataFrame; otherwise `dataframe` is streamed in as the query input
    and the join callable returns None.  Data is exchanged with the
    server in Arrow IPC stream format (`format ArrowStream` is appended
    to the query).  Returns a join callable that waits for completion
    and re-raises any error from the worker thread.
    """
    # shared state between this scope, the worker thread, and join()
    batch = None
    error = None

    # prepare: an in-memory echo pipe bridges Arrow and the query stream
    read_stream, write_stream = iteration.echo_io()

    if dataframe is None:
        # no input: query output is written into the pipe
        gen_in = iteration.empty_in()
        gen_out = iteration.stream_out(write_stream)
    else:
        # dataframe input: the pipe feeds the query, output is discarded
        gen_in = iteration.stream_in(read_stream)
        gen_out = iteration.empty_out()

    raw_join = self._run(
        f'{query} format ArrowStream',
        gen_in,
        gen_out,
        method,
        settings
    )

    # create thread: Arrow (de)serialization runs concurrently with the
    # query so neither side blocks on the pipe
    def handle_batch() -> None:
        nonlocal dataframe
        nonlocal batch
        nonlocal error

        try:
            if dataframe is None:
                # read Arrow stream from the pipe into a DataFrame
                batch = pyarrow.RecordBatchStreamReader(read_stream)

                dataframe = batch.read_pandas()
            else:
                # write the DataFrame into the pipe as an Arrow stream
                table = pyarrow.Table.from_pandas(dataframe)
                batch = pyarrow.RecordBatchStreamWriter(
                    write_stream,
                    table.schema
                )

                batch.write_table(table)

                # clear so join() returns None for the input direction
                dataframe = None

            batch.close()
            # closing the write end unblocks the reader side of the pipe
            write_stream.close()
        except BaseException as raw_error:  # pylint: disable=broad-except
            # stash the error for join() to re-raise in the caller thread
            error = raw_error

    thread = threading.Thread(target=handle_batch)

    thread.start()

    # join thread
    def join() -> typing.Optional[pandas.DataFrame]:
        # poll with a timeout so a stored error is noticed promptly
        while error is None and thread.is_alive():
            thread.join(join_interval)

        if error is not None:
            raise error  # pylint: disable=raising-bad-type

        raw_join()

        return dataframe

    return join
def query_pandas_async(
        self,
        query: str,
        dataframe: typing.Optional[pandas.DataFrame] = None,
        encoding: typing.Optional[str] = 'utf-8',
        method: typing.Optional[
            typing_extensions.Literal['tcp', 'http', 'ssh']
        ] = None,
        settings: typing.Optional[typing.Dict[str, str]] = None,
        join_interval: float = 0.1
) -> typing.Callable[[], typing.Optional[pandas.DataFrame]]:
    """Run a query asynchronously with pandas input or output.

    When `dataframe` is None the query's result is read back as a
    DataFrame; otherwise `dataframe` is streamed in as the query input
    and the join callable returns None.  Data is exchanged with the
    server in Arrow IPC stream format.  When `encoding` is not None,
    bytes in result object columns are decoded to str (and str in input
    object columns encoded to bytes), recursing into nested containers.
    Returns a join callable that waits for completion and re-raises any
    error from the worker thread.
    """
    # shared state between this scope, the worker thread, and join()
    batch = None
    error = None

    # prepare: an in-memory echo pipe bridges Arrow and the query stream
    read_stream, write_stream = iteration.echo_io()

    if dataframe is None:
        # no input: query output is written into the pipe
        gen_in = iteration.empty_in()
        gen_out = iteration.stream_out(write_stream)
    else:
        # dataframe input: the pipe feeds the query, output is discarded
        gen_in = iteration.stream_in(read_stream)
        gen_out = iteration.empty_out()

    raw_join = self._run(
        f'{query} format ArrowStream',
        gen_in,
        gen_out,
        method,
        settings
    )

    # create thread: Arrow (de)serialization runs concurrently with the
    # query so neither side blocks on the pipe
    def handle_batch() -> None:
        nonlocal dataframe
        nonlocal batch
        nonlocal error

        try:
            if dataframe is None:
                # read Arrow stream from the pipe into a DataFrame
                batch = pyarrow.RecordBatchStreamReader(read_stream)

                dataframe = batch.read_pandas()

                if encoding is not None:
                    # recursively decode bytes values inside containers
                    def decode(value: typing.Any) -> typing.Any:
                        if type(value) is bytes:
                            assert encoding is not None

                            return value.decode(encoding)

                        if type(value) is bytearray:
                            assert encoding is not None

                            return value.decode(encoding)

                        if type(value) is tuple:
                            return tuple(decode(child) for child in value)

                        if type(value) is list:
                            return [decode(child) for child in value]

                        if type(value) is numpy.ndarray:
                            return numpy.array(
                                [decode(child) for child in value])

                        if type(value) is set:
                            return {decode(child) for child in value}

                        if type(value) is frozenset:
                            return frozenset(
                                decode(child) for child in value)

                        if type(value) is dict:
                            return {
                                key: decode(child)
                                for key, child in value.items()
                            }

                        return value

                    # only object ('O') columns can hold bytes values
                    dataframe = pandas.DataFrame({
                        column: (
                            dataframe[column].apply(decode)
                            if dataframe[column].dtype == 'O'
                            else dataframe[column]
                        )
                        for column in dataframe
                    })
            else:
                if encoding is not None:
                    # recursively encode str values inside containers
                    def encode(value: typing.Any) -> typing.Any:
                        if type(value) is str:
                            assert encoding is not None

                            return value.encode(encoding)

                        if type(value) is tuple:
                            return tuple(encode(child) for child in value)

                        if type(value) is list:
                            return [encode(child) for child in value]

                        if type(value) is numpy.ndarray:
                            return numpy.array(
                                [encode(child) for child in value])

                        if type(value) is set:
                            return {encode(child) for child in value}

                        if type(value) is frozenset:
                            return frozenset(
                                encode(child) for child in value)

                        if type(value) is dict:
                            return {
                                key: encode(child)
                                for key, child in value.items()
                            }

                        return value

                    # only object ('O') columns can hold str values
                    dataframe = pandas.DataFrame({
                        column: (
                            dataframe[column].apply(encode)
                            if dataframe[column].dtype == 'O'
                            else dataframe[column]
                        )
                        for column in dataframe
                    })

                # build the Arrow table column by column
                table = pyarrow.Table.from_arrays([
                    pyarrow.array(dataframe[column].values)
                    for column in dataframe
                ], dataframe.columns)
                batch = pyarrow.RecordBatchStreamWriter(
                    write_stream,
                    table.schema
                )

                batch.write_table(table)

                # clear so join() returns None for the input direction
                dataframe = None

            batch.close()
            # closing the write end unblocks the reader side of the pipe
            write_stream.close()
        except pyarrow.ArrowInvalid:
            # deliberate best-effort: an invalid/empty Arrow stream is
            # silently ignored rather than surfaced to the caller
            pass
        except BaseException as raw_error:  # pylint: disable=broad-except
            # stash the error for join() to re-raise in the caller thread
            error = raw_error

    thread = threading.Thread(target=handle_batch)

    thread.start()

    # join thread
    def join() -> typing.Optional[pandas.DataFrame]:
        # poll with a timeout so a stored error is noticed promptly
        while error is None and thread.is_alive():
            thread.join(join_interval)

        if error is not None:
            raise error  # pylint: disable=raising-bad-type

        raw_join()

        return dataframe

    return join
def test_iteration_empty_in() -> None:
    """The empty input generator yields nothing at all."""
    produced = list(iteration.empty_in())

    assert produced == []