def tile(cls, op):
    """Tile a write-table operand: create one upload split per input chunk,
    then reduce the splits through a tree of non-terminal commit chunks into
    a single terminal commit chunk that performs the actual table commit.
    """
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    # authenticate against ODPS with the bearer token from the cupid runtime
    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    o = ODPS(None, None, account=account, **op.odps_params)
    cupid_session = CupidSession(o)

    data_src = o.get_table(op.table_name)

    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(data_src)

    input_df = op.inputs[0]
    out_chunks = []
    out_chunk_shape = (0,) * len(input_df.shape)
    blocks = {}
    for chunk in input_df.chunks:
        # unique upload block id: timestamp prefix + random uuid suffix
        block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace('-', '')
        chunk_op = DataFrameWriteTableSplit(dtypes=op.dtypes,
                                            table_name=op.table_name,
                                            partition_spec=op.partition_spec,
                                            cupid_handle=to_str(upload_session.handle),
                                            block_id=block_id,
                                            write_batch_size=op.write_batch_size)
        out_chunk = chunk_op.new_chunk([chunk], shape=out_chunk_shape,
                                       index=chunk.index, dtypes=chunk.dtypes)
        out_chunks.append(out_chunk)
        blocks[block_id] = op.partition_spec

    # build commit tree, at most `combine_size` children per commit node
    combine_size = 8
    chunks = out_chunks
    # BUG FIX: use >= instead of > — with exactly `combine_size` chunks the
    # loop never ran and the assert below failed (the sibling _tile_cupid
    # implementation in this file already uses >=)
    while len(chunks) >= combine_size:
        new_chunks = []
        for i in range(0, len(chunks), combine_size):
            chks = chunks[i: i + combine_size]
            if len(chks) == 1:
                chk = chks[0]
            else:
                chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                   is_terminal=False)
                chk = chk_op.new_chunk(chks, shape=out_chunk_shape,
                                       dtypes=op.dtypes)
            new_chunks.append(chk)
        chunks = new_chunks

    assert len(chunks) < combine_size

    # terminal commit node performs the real table commit
    commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                table_name=op.table_name,
                                                blocks=blocks,
                                                cupid_handle=to_str(upload_session.handle),
                                                overwrite=op.overwrite,
                                                odps_params=op.odps_params,
                                                is_terminal=True)
    commit_table_chunk = commit_table_op.new_chunk(chunks, shape=out_chunk_shape,
                                                   dtypes=op.dtypes)

    out_df = op.outputs[0]
    new_op = op.copy()
    return new_op.new_dataframes(op.inputs, shape=out_df.shape,
                                 dtypes=out_df.dtypes,
                                 chunks=[commit_table_chunk],
                                 nsplits=((0,),) * len(out_chunk_shape))
def execute(cls, ctx, op):
    """Execute a write-table commit operand.

    Only the terminal commit node talks to ODPS: it rebuilds the cupid
    upload session from the stored handle and commits all uploaded blocks.
    Every node stores an empty DataFrame as its chunk result.
    """
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.io.table import CupidTableUploadSession

    if op.is_terminal:
        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None)
        odps_params = op.odps_params.copy()
        if project:
            # project supplied by the runtime environment wins over op params
            odps_params['project'] = project
        # prefer the in-cluster endpoint when available
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account,
                 project=odps_params['project'], endpoint=endpoint)
        cupid_session = CupidSession(o)

        # op.table_name is fully qualified as "<project>.<table>"
        project_name, table_name = op.table_name.split('.')
        upload_session = CupidTableUploadSession(session=cupid_session,
                                                 table_name=table_name,
                                                 project_name=project_name,
                                                 handle=op.cupid_handle,
                                                 blocks=op.blocks)
        upload_session.commit(overwrite=op.overwrite)

    ctx[op.outputs[0].key] = pd.DataFrame()
def __init__(self, odps, inst=None, project=None):
    """Initialize client state around an ODPS entry and a Cupid session;
    all cluster/session fields start unset until the cluster is launched.
    """
    self._odps = odps
    self._cupid_session = CupidSession(odps, project=project)
    # kubernetes instance to attach to, when a cluster is already running
    self._kube_instance = inst
    self._kube_url = None
    self._kube_client = None
    self._kube_namespace = None

    self._scheduler_key = None
    self._scheduler_config = None
    self._worker_config = None
    self._web_config = None
    self._endpoint = None
    self._with_notebook = False
    self._notebook_endpoint = None

    # Mars session and HTTP session, created once the cluster is reachable
    self._mars_session = None
    self._req_session = None
def _write_table_in_cupid(odps, df, table, partition=None, overwrite=True,
                          unknown_as_string=None):
    """Write a pandas DataFrame into an ODPS table through one cupid upload
    session, streaming the data as a sequence of arrow record batches.

    :param odps: ODPS entry object used to open the cupid session
    :param df: pandas DataFrame holding the data to write
    :param table: ODPS table object to write into
    :param partition: partition spec to write into, or None
    :param overwrite: whether the commit overwrites existing data
    :param unknown_as_string: accepted for interface compatibility; unused here
    """
    import pyarrow as pa
    from mars.utils import to_str
    from cupid import CupidSession
    from cupid.io.table.core import BlockWriter

    cupid_session = CupidSession(odps)
    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(table)
    block_writer = BlockWriter(_table_name=table.name,
                               _project_name=table.project.name,
                               _table_schema=table.schema,
                               _partition_spec=partition,
                               _block_id='0',
                               _handle=to_str(upload_session.handle))
    logger.debug('Start writing table block, block id: 0')
    with block_writer.open_arrow_writer() as cupid_writer:
        sink = pa.BufferOutputStream()

        batch_size = 1024
        batch_idx = 0
        batch_data = df[batch_size * batch_idx:batch_size * (batch_idx + 1)]
        batch_data = convert_pandas_object_to_string(batch_data)
        # infer the arrow schema from the first row of the frame
        schema = pa.RecordBatch.from_pandas(df[:1], preserve_index=False).schema
        arrow_writer = pa.RecordBatchStreamWriter(sink, schema)
        while len(batch_data) > 0:
            batch = pa.RecordBatch.from_pandas(batch_data, preserve_index=False)
            arrow_writer.write_batch(batch)
            batch_idx += 1
            # BUG FIX: run every batch through the object-to-string
            # conversion — previously only the first batch was converted,
            # so later batches were written with unconverted object columns
            batch_data = convert_pandas_object_to_string(
                df[batch_size * batch_idx:batch_size * (batch_idx + 1)])
        arrow_writer.close()
        cupid_writer.write(sink.getvalue())
    block_writer.commit()

    # register the single written block with the session, then commit
    upload_session._blocks = {'0': partition}
    upload_session.commit(overwrite=overwrite)
def _handle_create_table_upload_session(sock):
    """Socket-service handler: read a pickled session config from ``sock``,
    create a cupid upload session for the requested table and reply with
    its handle; on failure, reply with the exception info.
    """
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name
        # NOTE(review): sock.recv(cmd_len) may return fewer than cmd_len
        # bytes for large payloads — presumably configs stay small; verify
        session_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        cupid_ctx = context()

        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # runtime-provided project / endpoint override the request's params
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)

        cupid_session = CupidSession(o)
        data_src = o.get_table(session_config['table_name'])

        logger.debug('Start creating upload session from cupid.')
        upload_session = cupid_session.create_upload_session(data_src)

        ret_data = {
            'handle': upload_session.handle,
        }
        _write_request_result(sock, result=ret_data)
    except Exception:
        # FIX: was a bare `except:` which also caught KeyboardInterrupt /
        # SystemExit and reported them as request failures; only genuine
        # errors are reported back to the client now
        logger.exception('Failed to create upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def __init__(self, odps, inst=None, project=None):
    """Initialize client state around an ODPS entry and a Cupid session;
    supervisor/worker/notebook/graphscope fields start unset until launch.
    """
    from cupid import CupidSession

    self._odps = odps
    self._cupid_session = CupidSession(odps, project=project)
    # kubernetes instance to attach to, when a cluster is already running
    self._kube_instance = inst
    self._kube_url = None
    self._kube_client = None
    self._kube_namespace = None

    self._supervisor_key = None
    self._supervisor_config = None
    self._worker_config = None
    self._endpoint = None
    self._with_notebook = False
    self._notebook_endpoint = None
    self._with_graphscope = False
    self._graphscope_endpoint = None

    # Mars session and HTTP session, created once the cluster is reachable
    self._mars_session = None
    self._req_session = None
def _handle_commit_table_upload_session(sock):
    """Socket-service handler: read a pickled commit config from ``sock``,
    commit the referenced cupid upload session and acknowledge; on failure,
    reply with the exception info.
    """
    try:
        cmd_len, = struct.unpack('<I', sock.recv(4))
        # dict with odps_params, table_name, cupid_handle, blocks, overwrite
        # NOTE(review): sock.recv(cmd_len) may return fewer than cmd_len
        # bytes for large payloads — presumably configs stay small; verify
        commit_config = pickle.loads(sock.recv(cmd_len))

        from odps import ODPS
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.runtime import RuntimeContext
        from cupid.io.table import CupidTableUploadSession

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )

        cupid_ctx = context()

        odps_params = commit_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        # runtime-provided project / endpoint override the request's params
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project, endpoint=endpoint)

        cupid_session = CupidSession(o)

        # table_name is fully qualified as "<project>.<table>"
        project_name, table_name = commit_config['table_name'].split('.')
        upload_session = CupidTableUploadSession(
            session=cupid_session, table_name=table_name,
            project_name=project_name, handle=commit_config['cupid_handle'],
            blocks=commit_config['blocks'])
        upload_session.commit(overwrite=commit_config['overwrite'])

        _write_request_result(sock)
    except Exception:
        # FIX: was a bare `except:` which also caught KeyboardInterrupt /
        # SystemExit and reported them as request failures; only genuine
        # errors are reported back to the client now
        logger.exception('Failed to commit upload session')
        _write_request_result(sock, False, exc_info=sys.exc_info())
def execute(cls, ctx, op):
    """Execute a write-table commit operand.

    Only the terminal commit node talks to ODPS: it rebuilds the cupid
    upload session from the stored handle and commits all uploaded blocks.
    Every node stores an empty DataFrame as its chunk result.
    """
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.io.table import CupidTableUploadSession

    if op.is_terminal:
        bearer_token = context().get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        o = ODPS(None, None, account=account, **op.odps_params)
        cupid_session = CupidSession(o)

        # op.table_name is fully qualified as "<project>.<table>"
        project_name, table_name = op.table_name.split('.')
        upload_session = CupidTableUploadSession(
            session=cupid_session, table_name=table_name,
            project_name=project_name, handle=op.cupid_handle,
            blocks=op.blocks)
        upload_session.commit(overwrite=op.overwrite)

    ctx[op.outputs[0].key] = pd.DataFrame()
def _tile_cupid(cls, op):
    """Tile the write-table operand inside a Mars-on-Cupid cluster.

    Creates one upload split per input chunk, then reduces the splits
    through a tree of non-terminal commit chunks (fan-in of 8) into one
    terminal commit chunk that performs the actual table commit.
    """
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.runtime import RuntimeContext

    if not RuntimeContext.is_context_ready():
        raise SystemError(
            'No Mars cluster found, please create via `o.create_mars_cluster`.'
        )
    cupid_ctx = context()

    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        # project supplied by the runtime environment wins over op params
        odps_params['project'] = project
    # prefer the in-cluster endpoint when available
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account, project=odps_params['project'],
             endpoint=endpoint)
    cupid_session = CupidSession(o)

    data_src = o.get_table(op.table_name)

    logger.debug('Start creating upload session from cupid.')
    upload_session = cupid_session.create_upload_session(data_src)

    input_df = build_concatenated_rows_frame(op.inputs[0])
    out_df = op.outputs[0]
    out_chunks = []
    out_chunk_shape = (0, ) * len(input_df.shape)
    blocks = {}
    for chunk in input_df.chunks:
        # unique upload block id: timestamp prefix + random uuid suffix
        block_id = str(int(time.time())) + '_' + str(uuid.uuid4()).replace(
            '-', '')
        chunk_op = DataFrameWriteTableSplit(
            dtypes=op.dtypes,
            table_name=op.table_name,
            unknown_as_string=op.unknown_as_string,
            partition_spec=op.partition_spec,
            cupid_handle=to_str(upload_session.handle),
            block_id=block_id,
            write_batch_size=op.write_batch_size)
        out_chunk = chunk_op.new_chunk([chunk],
                                       shape=out_chunk_shape,
                                       index=chunk.index,
                                       index_value=out_df.index_value,
                                       dtypes=chunk.dtypes)
        out_chunks.append(out_chunk)
        blocks[block_id] = op.partition_spec

    # build commit tree, at most `combine_size` children per commit node
    combine_size = 8
    chunks = out_chunks
    while len(chunks) >= combine_size:
        new_chunks = []
        for i in range(0, len(chunks), combine_size):
            chks = chunks[i:i + combine_size]
            if len(chks) == 1:
                chk = chks[0]
            else:
                chk_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                   is_terminal=False)
                chk = chk_op.new_chunk(chks,
                                       shape=out_chunk_shape,
                                       index_value=out_df.index_value,
                                       dtypes=op.dtypes)
            new_chunks.append(chk)
        chunks = new_chunks

    assert len(chunks) < combine_size

    # terminal commit node performs the real table commit
    commit_table_op = DataFrameWriteTableCommit(dtypes=op.dtypes,
                                                table_name=op.table_name,
                                                blocks=blocks,
                                                cupid_handle=to_str(
                                                    upload_session.handle),
                                                overwrite=op.overwrite,
                                                odps_params=op.odps_params,
                                                is_terminal=True)
    commit_table_chunk = commit_table_op.new_chunk(
        chunks,
        shape=out_chunk_shape,
        dtypes=op.dtypes,
        index_value=out_df.index_value)

    new_op = op.copy()
    return new_op.new_dataframes(op.inputs,
                                 shape=out_df.shape,
                                 index_value=out_df.index_value,
                                 dtypes=out_df.dtypes,
                                 columns_value=out_df.columns_value,
                                 chunks=[commit_table_chunk],
                                 nsplits=((0, ), ) * len(out_chunk_shape))
class MarsCupidClient(object):
    """Client managing the lifecycle of a Mars cluster launched through
    Cupid on an ODPS kubernetes instance: submit, wait for readiness,
    create/restart Mars sessions, and stop the server.
    """

    def __init__(self, odps, inst=None, project=None):
        self._odps = odps
        self._cupid_session = CupidSession(odps, project=project)
        # kubernetes instance to attach to, when a cluster is already running
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._with_notebook = False
        self._notebook_endpoint = None

        # Mars session and HTTP session, created once the cluster is reachable
        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        return self._endpoint

    @property
    def notebook_endpoint(self):
        return self._notebook_endpoint

    @property
    def session(self):
        return self._mars_session

    @property
    def instance_id(self):
        return self._kube_instance.id

    def submit(self, image=None, scheduler_num=1, scheduler_cpu=8,
               scheduler_mem=32 * 1024 ** 3, worker_num=1, worker_cpu=8,
               worker_mem=32 * 1024 ** 3, worker_cache_mem=None,
               min_worker_num=None, worker_disk_num=1,
               worker_disk_size=100 * 1024 ** 3, web_num=1, web_cpu=1,
               web_mem=1024 ** 3, with_notebook=False, notebook_cpu=1,
               notebook_mem=2 * 1024 ** 3, timeout=None, extra_env=None,
               extra_modules=None, resources=None, create_session=True,
               priority=None, running_cluster=None, task_name=None, **kw):
        """Launch a Mars cluster (unless already attached to an instance)
        and optionally wait for readiness and create a default session.
        """
        try:
            async_ = kw.pop('async_', None)
            # compatible with early version
            mars_image = kw.pop('mars_image', None)
            default_resources = kw.pop('default_resources', None) or DEFAULT_RESOURCES
            instance_idle_timeout = kw.pop('instance_idle_timeout', None)
            if with_notebook is not None:
                self._with_notebook = bool(with_notebook)
            else:
                self._with_notebook = options.mars.launch_notebook

            if self._kube_instance is None:
                image = image or mars_image or build_image_name('mars')
                # always ship the mars extension alongside user modules
                extra_modules = extra_modules or []
                if isinstance(extra_modules, (tuple, list)):
                    extra_modules = list(extra_modules) + ['odps.mars_extension']
                else:
                    extra_modules = [extra_modules, 'odps.mars_extension']

                if resources is not None:
                    if isinstance(resources, (tuple, list)):
                        resources = list(resources)
                        resources.extend(default_resources)
                    else:
                        resources = [resources] + default_resources
                else:
                    resources = default_resources

                if worker_cache_mem is None:
                    # default cache: 48% of worker memory
                    worker_cache_mem = int(worker_mem * 0.48)
                else:
                    worker_cache_mem = worker_cache_mem

                cluster_args = dict(
                    image=image,
                    scheduler_num=scheduler_num,
                    scheduler_cpu=scheduler_cpu,
                    scheduler_mem=scheduler_mem,
                    worker_num=worker_num,
                    worker_cpu=worker_cpu,
                    worker_mem=worker_mem,
                    worker_cache_mem=worker_cache_mem,
                    min_worker_num=min_worker_num,
                    worker_disk_num=worker_disk_num,
                    worker_disk_size=worker_disk_size,
                    web_num=web_num,
                    web_cpu=web_cpu,
                    web_mem=web_mem,
                    with_notebook=with_notebook,
                    notebook_cpu=notebook_cpu,
                    notebook_mem=notebook_mem,
                    extra_env=extra_env,
                    extra_modules=extra_modules,
                    instance_idle_timeout=instance_idle_timeout,
                    timeout=timeout)
                # cluster args are passed to the app entrypoint base64-encoded
                command = '/srv/entrypoint.sh %s %s' % (
                    __name__.rsplit('.', 1)[0] + '.app',
                    base64.b64encode(json.dumps(cluster_args).encode()).decode()
                )

                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True, running_cluster=running_cluster,
                    priority=priority, app_image=build_image_name('mars'),
                    app_command=command, resources=resources,
                    task_name=task_name, **kw)
                write_log(self._kube_instance.get_logview_address())
            if async_:
                return self
            else:
                self.wait_for_success(create_session=create_session,
                                      min_worker_num=min_worker_num or worker_num)
                return self
        except KeyboardInterrupt:
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """Return True when the Mars web API answers with a non-error status."""
        try:
            resp = self._req_session.get(self._endpoint + '/api', timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        if resp.status_code >= 400:
            return False
        return True

    def count_workers(self):
        # worker count as reported by the Mars web API
        resp = self._req_session.get(self._endpoint + '/api/worker?action=count',
                                     timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   NOTEBOOK_NAME)

    def get_req_session(self):
        from ...rest import RestClient
        # go through the common proxy (signed requests) when configured
        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint,
                              self._odps.project)
        else:
            return requests.Session()

    def check_instance_status(self):
        """Raise an ODPS error if the backing instance terminated abnormally."""
        if self._kube_instance.is_terminated():
            for task_name, task in (self._kube_instance.get_task_statuses()).items():
                exc = None
                if task.status == Instance.Task.TaskStatus.FAILED:
                    exc = errors.parse_instance_error(
                        self._kube_instance.get_task_result(task_name))
                elif task.status != Instance.Task.TaskStatus.SUCCESS:
                    exc = errors.ODPSError('%s, status=%s'
                                           % (task_name, task.status.value))
                if exc:
                    exc.instance_id = self._kube_instance.id
                    raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """Poll until the Mars web service is up and enough workers joined,
        then optionally create the default Mars session.
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                    self._req_session = self.get_req_session()

                    # record the client-visible endpoint in the server log
                    self._req_session.post(
                        self._endpoint.rstrip('/') + '/api/logger',
                        data=dict(
                            content='Mars UI from client: ' + self._endpoint
                        ))
                if self._with_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)
                    self._req_session.post(
                        self._endpoint.rstrip('/') + '/api/logger',
                        data=dict(
                            content='Notebook UI from client: '
                                    + self._notebook_endpoint
                        ))
            except KeyboardInterrupt:
                raise
            except:
                # endpoint not resolvable yet — retry after a short pause
                time.sleep(1)
                continue
            if not self.check_service_ready():
                continue
            try:
                if self.count_workers() >= min_worker_num:
                    break
                else:
                    time.sleep(1)
            except:
                continue

        if create_session:
            try:
                self._mars_session = new_session(
                    self._endpoint, req_session=self._req_session).as_default()
            except KeyboardInterrupt:
                raise
            except:
                # stop the instance if session creation failed mid-run
                if self._kube_instance and self._kube_instance.status == \
                        self._kube_instance.Status.RUNNING:
                    self._kube_instance.stop()
                raise

    def restart_session(self):
        self._mars_session.close()
        self._mars_session = new_session(
            self._endpoint, req_session=self._req_session).as_default()

    def stop_server(self):
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None
def tile(cls, op):
    """Tile the read-table operand into one chunk per cupid table split.

    When the split count exceeds the backend limit, ``split_size`` is
    doubled and the download session is recreated, up to MAX_CHUNK_SIZE.
    """
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        # project supplied by the runtime environment wins over op params
        odps_params['project'] = project
    o = ODPS(None, None, account=account, **odps_params)
    cupid_session = CupidSession(o)

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or CHUNK_LIMIT

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    logger.debug('Start creating download session from cupid.')
    while True:
        try:
            download_session = cupid_session.create_download_session(
                data_src, split_size=split_size, columns=op.columns)
            break
        except CupidError:
            logger.debug(
                'The number of splits exceeds 100000, split_size is {}'.
                format(split_size))
            if split_size >= MAX_CHUNK_SIZE:
                raise
            else:
                # retry with coarser splits
                split_size *= 2

    logger.debug('%s table splits have been created.',
                 str(len(download_session.splits)))

    out_chunks = []
    # Ignore add_offset at this time.
    op._add_offset = False

    for idx, split in enumerate(download_session.splits):
        chunk_op = DataFrameReadTableSplit(
            cupid_handle=to_str(split.handle),
            split_index=split.split_index,
            split_file_start=split.split_file_start,
            split_file_end=split.split_file_end,
            schema_file_start=split.schema_file_start,
            schema_file_end=split.schema_file_end,
            add_offset=op.add_offset,
            dtypes=op.dtypes,
            sparse=op.sparse)
        # the chunk shape is unknown
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None,
                                       shape=(np.nan, df.shape[1]),
                                       dtypes=op.dtypes,
                                       index_value=index_value,
                                       columns_value=columns_value,
                                       index=(idx, 0))
        out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
    return new_op.new_dataframes(None,
                                 shape=df.shape,
                                 dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=out_chunks,
                                 nsplits=nsplits)
def tile(cls, op):
    """Tile the read-table operand into one chunk per cupid table split.

    Split size is shrunk for small tables so every worker gets data, the
    split count limit is handled by doubling ``split_size`` and retrying,
    and per-chunk row counts are estimated from split file sizes. An empty
    table yields a single placeholder chunk.
    """
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from mars.context import get_context

    cupid_ctx = context()
    if cupid_ctx is None:
        raise SystemError(
            'No Mars cluster found, please create via `o.create_mars_cluster`.'
        )

    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        # project supplied by the runtime environment wins over op params
        odps_params['project'] = project
    o = ODPS(None, None, account=account, **odps_params)
    cupid_session = CupidSession(o)
    mars_context = get_context()

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    try:
        data_store_size = data_src.size
    except ODPSError:
        # fail to get data size, just ignore
        pass
    else:
        if data_store_size < split_size and mars_context is not None:
            # get worker counts
            worker_count = max(len(mars_context.get_worker_addresses()), 1)
            # data is too small, split as many as number of cores
            split_size = data_store_size // worker_count
            # at least 1M
            split_size = max(split_size, 1 * 1024**2)
            logger.debug(
                'Input data size is too small, split_size is {}'.format(
                    split_size))

    logger.debug(
        'Start creating download session of table {} from cupid.'.format(
            op.table_name))
    while True:
        try:
            download_session = cupid_session.create_download_session(
                data_src, split_size=split_size, columns=op.columns)
            break
        except CupidError:
            logger.debug(
                'The number of splits exceeds 100000, split_size is {}'.
                format(split_size))
            if split_size >= MAX_CHUNK_SIZE:
                raise
            else:
                # retry with coarser splits
                split_size *= 2

    logger.debug('%s table splits have been created.',
                 str(len(download_session.splits)))

    if np.isnan(df.shape[0]):
        # total row count unknown — no estimates available
        est_chunk_rows = [None] * len(download_session.splits)
    else:
        # apportion the known row count by each split's file-size share
        sp_file_sizes = np.array([
            sp.split_file_end - sp.split_file_start
            for sp in download_session.splits
        ])
        total_size = sp_file_sizes.sum()
        est_chunk_rows = sp_file_sizes * df.shape[0] // total_size

    logger.warning('Estimated chunk rows: %r', est_chunk_rows)

    out_chunks = []
    # Ignore add_offset at this time.
    op._add_offset = False

    if len(download_session.splits) == 0:
        logger.debug('Table {} has no data'.format(op.table_name))
        chunk_op = DataFrameReadTableSplit()
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        out_chunk = chunk_op.new_chunk(None,
                                       shape=(np.nan, df.shape[1]),
                                       dtypes=op.dtypes,
                                       index_value=index_value,
                                       columns_value=columns_value,
                                       index=(0, 0))
        out_chunks = [out_chunk]
    else:
        for idx, split in enumerate(download_session.splits):
            chunk_op = DataFrameReadTableSplit(
                cupid_handle=to_str(split.handle),
                split_index=split.split_index,
                split_file_start=split.split_file_start,
                split_file_end=split.split_file_end,
                schema_file_start=split.schema_file_start,
                schema_file_end=split.schema_file_end,
                add_offset=op.add_offset,
                dtypes=op.dtypes,
                sparse=op.sparse,
                split_size=split_size,
                use_arrow_dtype=op.use_arrow_dtype,
                estimate_rows=est_chunk_rows[idx])
            # the chunk shape is unknown
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(df.dtypes.index, store_data=True)
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, df.shape[1]),
                                           dtypes=op.dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(idx, 0))
            out_chunks.append(out_chunk)

    if op.add_offset:
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan, ) * len(out_chunks), (df.shape[1], ))
    return new_op.new_dataframes(None,
                                 shape=df.shape,
                                 dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=out_chunks,
                                 nsplits=nsplits)
def tile(cls, op):
    """Tile the read-table operand into one data chunk plus one row-count
    chunk per cupid split; when ``add_offset`` is set, each later chunk is
    wired to the counts of all preceding chunks so a global offset-based
    index can be rebuilt at execution time.
    """
    import numpy as np
    import pandas as pd
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context

    bearer_token = context().get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    o = ODPS(None, None, account=account, **op.odps_params)
    cupid_session = CupidSession(o)

    df = op.outputs[0]
    split_size = df.extra_params.chunk_store_limit or options.tensor.chunk_store_limit

    data_src = o.get_table(op.table_name)
    if op.partition is not None:
        data_src = data_src.get_partition(op.partition)

    logger.debug('Start creating download session from cupid.')
    download_session = cupid_session.create_download_session(
        data_src, split_size=split_size)
    logger.debug('%s table splits have been created.',
                 str(len(download_session.splits)))

    out_chunks = []
    out_count_chunks = []
    for idx, split in enumerate(download_session.splits):
        chunk_op = DataFrameReadTableSplit(
            cupid_handle=to_str(split.handle),
            split_index=split.split_index,
            split_file_start=split.split_file_start,
            split_file_end=split.split_file_end,
            schema_file_start=split.schema_file_start,
            schema_file_end=split.schema_file_end,
            dtypes=op.dtypes,
            sparse=op.sparse)
        # the chunk shape is unknown
        index_value = parse_index(pd.RangeIndex(0))
        columns_value = parse_index(df.dtypes.index, store_data=True)
        # each split op yields two chunks: the data and its row count
        out_chunk, out_count_chunk = chunk_op.new_chunks(
            None,
            kws=[{
                'shape': (np.nan, df.shape[1]),
                'dtypes': op.dtypes,
                'index_value': index_value,
                'columns_value': columns_value,
                'index': (idx, )
            }, {
                'shape': (1, ),
                'index': (idx, )
            }])
        out_chunks.append(out_chunk)
        out_count_chunks.append(out_count_chunk)

    if op.add_offset:
        output_chunks = []
        for i, chunk in enumerate(out_chunks):
            if i == 0:
                # the first chunk needs no offset
                output_chunks.append(chunk)
                continue
            # chunk i depends on the row counts of all preceding chunks
            counts = out_count_chunks[:i]
            inputs = [chunk] + counts
            output_chunk = DataFrameReadTableWithOffset(
                dtypes=chunk.dtypes).new_chunk(
                    inputs,
                    shape=chunk.shape,
                    index=chunk.index,
                    dtypes=chunk.dtypes,
                    index_value=chunk.index_value,
                    columns_value=chunk.columns_value)
            output_chunks.append(output_chunk)
    else:
        output_chunks = out_chunks

    new_op = op.copy()
    nsplits = ((np.nan, ) * len(output_chunks), (df.shape[1], ))
    return new_op.new_dataframes(None,
                                 shape=df.shape,
                                 dtypes=op.dtypes,
                                 index_value=df.index_value,
                                 columns_value=df.columns_value,
                                 chunks=output_chunks,
                                 nsplits=nsplits)
class MarsCupidClient(object):
    """Client managing a Mars cluster started through the legacy Cupid
    application-config interface: submit, wait for readiness, create a
    default session, and stop the server.
    """

    def __init__(self, odps, inst=None):
        self._odps = odps
        self._cupid_session = CupidSession(odps)
        # kubernetes instance to attach to, when a cluster is already running
        self._kube_instance = inst
        self._kube_url = None
        self._kube_client = None
        self._kube_namespace = None

        self._scheduler_key = None
        self._scheduler_config = None
        self._worker_config = None
        self._web_config = None
        self._endpoint = None
        self._has_notebook = False
        self._notebook_endpoint = None

        # Mars session and HTTP session, created once the cluster is reachable
        self._mars_session = None
        self._req_session = None

    @property
    def endpoint(self):
        return self._endpoint

    @property
    def notebook_endpoint(self):
        return self._notebook_endpoint

    @property
    def session(self):
        return self._mars_session

    @property
    def instance_id(self):
        return self._kube_instance.id

    def submit(self, worker_num=1, worker_cpu=8, worker_mem=32, disk_num=1,
               min_worker_num=None, cache_mem=None, resources=None,
               module_path=None, create_session=True, priority=None,
               running_cluster=None, scheduler_num=1, notebook=None, **kw):
        """Launch a Mars cluster (unless already attached to an instance)
        and optionally wait for readiness and create a default session.
        Note: worker_mem / cache_mem here are specified in gigabytes.
        """
        try:
            async_ = kw.pop('async_', None)
            default_resources = kw.pop('default_resources', None) or DEFAULT_RESOURCES
            if notebook is not None:
                self._has_notebook = bool(notebook)
            else:
                self._has_notebook = options.mars.launch_notebook

            if self._kube_instance is None:
                # always ship the mars extension alongside user modules
                if module_path is not None:
                    if isinstance(module_path, (tuple, list)):
                        module_path = list(module_path) + [
                            'odps.mars_extension'
                        ]
                    else:
                        module_path = [module_path, 'odps.mars_extension']
                if resources is not None:
                    if isinstance(resources, (tuple, list)):
                        resources = list(resources)
                        resources.extend(default_resources)
                    else:
                        resources = [resources] + default_resources
                else:
                    resources = default_resources

                if cache_mem is None:
                    # default cache: 48% of worker memory, as a "<n>G" string
                    cache_mem = str(worker_mem * 0.48) + 'G'
                else:
                    cache_mem = str(cache_mem) + 'G'

                mars_config = {
                    'scheduler_num': scheduler_num,
                    'worker_num': worker_num,
                    'worker_cpu': worker_cpu,
                    'worker_mem': worker_mem,
                    'cache_mem': cache_mem or '',
                    'disk_num': disk_num,
                    'resources': resources,
                    'module_path': module_path or ['odps.mars_extension'],
                }
                # optional passthrough settings consumed from **kw
                if 'mars_app_image' in kw:
                    mars_config['mars_app_image'] = kw.pop('mars_app_image')
                if 'mars_image' in kw:
                    mars_config['mars_image'] = kw.pop('mars_image')
                if 'proxy_endpoint' in kw:
                    mars_config['proxy_endpoint'] = kw.pop('proxy_endpoint')
                if 'major_task_version' in kw:
                    mars_config['major_task_version'] = kw.pop(
                        'major_task_version')
                mars_config['scheduler_mem'] = kw.pop('scheduler_mem', 32)
                mars_config['scheduler_cpu'] = kw.pop('scheduler_cpu', 8)
                if self._has_notebook:
                    mars_config['notebook'] = True

                self._kube_instance = self._cupid_session.start_kubernetes(
                    async_=True, running_cluster=running_cluster,
                    priority=priority, app_name='mars',
                    app_config=mars_config, **kw)
                write_log(self._kube_instance.get_logview_address())
            if async_:
                return self
            else:
                self.wait_for_success(create_session=create_session,
                                      min_worker_num=min_worker_num or worker_num)
                return self
        except KeyboardInterrupt:
            self.stop_server()
            return self

    def check_service_ready(self, timeout=1):
        """Return True when the Mars web API answers with a non-error status."""
        try:
            resp = self._req_session.get(self._endpoint + '/api', timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            return False
        if resp.status_code >= 400:
            return False
        return True

    def count_workers(self):
        # worker count as reported by the Mars web API
        resp = self._req_session.get(self._endpoint + '/api/worker?action=count',
                                     timeout=1)
        return json.loads(resp.text)

    def get_logview_address(self):
        return self._kube_instance.get_logview_address()

    def get_mars_endpoint(self):
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   CUPID_APP_NAME)

    def get_notebook_endpoint(self):
        return self._cupid_session.get_proxied_url(self._kube_instance.id,
                                                   NOTEBOOK_NAME)

    def get_req_session(self):
        from ...rest import RestClient
        # go through the common proxy (signed requests) when configured
        if options.mars.use_common_proxy:
            return RestClient(self._odps.account, self._endpoint,
                              self._odps.project)
        else:
            return requests.Session()

    def check_instance_status(self):
        """Raise an ODPS error if the backing instance terminated abnormally."""
        if self._kube_instance.is_terminated():
            for task_name, task in (
                    self._kube_instance.get_task_statuses()).items():
                exc = None
                if task.status == Instance.Task.TaskStatus.FAILED:
                    exc = errors.parse_instance_error(
                        self._kube_instance.get_task_result(task_name))
                elif task.status != Instance.Task.TaskStatus.SUCCESS:
                    exc = errors.ODPSError('%s, status=%s'
                                           % (task_name, task.status.value))
                if exc:
                    exc.instance_id = self._kube_instance.id
                    raise exc

    def wait_for_success(self, min_worker_num=0, create_session=True):
        """Poll until the Mars web service is up and enough workers joined,
        then optionally create the default Mars session.
        """
        while True:
            self.check_instance_status()
            try:
                if self._endpoint is None:
                    self._endpoint = self.get_mars_endpoint()
                    write_log('Mars UI: ' + self._endpoint)
                    self._req_session = self.get_req_session()
                if self._has_notebook and self._notebook_endpoint is None:
                    self._notebook_endpoint = self.get_notebook_endpoint()
                    write_log('Notebook UI: ' + self._notebook_endpoint)
            except KeyboardInterrupt:
                raise
            except:
                # endpoint not resolvable yet — retry after a short pause
                time.sleep(1)
                continue
            if not self.check_service_ready():
                continue
            try:
                if self.count_workers() >= min_worker_num:
                    break
                else:
                    time.sleep(1)
            except:
                continue

        if create_session:
            try:
                self._mars_session = new_session(
                    self._endpoint, req_session=self._req_session).as_default()
            except KeyboardInterrupt:
                raise
            except:
                # stop the instance if session creation failed mid-run
                if self._kube_instance and self._kube_instance.status == \
                        self._kube_instance.Status.RUNNING:
                    self._kube_instance.stop()
                raise

    def stop_server(self):
        if self._kube_instance:
            self._kube_instance.stop()
            self._kube_instance = None
def _tile_cupid(cls, op):
    """Tile a table-read operand into chunks backed by Cupid download splits.

    Creates a Cupid download session per data source (the whole table, a
    single partition, or every matching partition), doubling ``split_size``
    until the split count is acceptable, and emits one
    ``DataFrameReadTableSplit`` chunk per split.

    Fix over the previous revision: when ``op.columns`` prunes columns,
    the empty-table branch and the final ``new_dataframes`` call used the
    unpruned ``op.dtypes`` while their shapes and ``columns_value`` were
    built from the pruned ``out_dtypes`` — inconsistent metadata.  Both now
    use ``out_dtypes``, matching the non-empty branch.
    """
    from odps import ODPS
    from odps.accounts import BearerTokenAccount
    from cupid import CupidSession, context
    from cupid.errors import CupidError
    from mars.context import get_context

    # authenticate against ODPS with the bearer token provided by the
    # cupid runtime context
    cupid_ctx = context()
    bearer_token = cupid_ctx.get_bearer_token()
    account = BearerTokenAccount(bearer_token)
    project = os.environ.get('ODPS_PROJECT_NAME', None)
    odps_params = op.odps_params.copy()
    if project:
        # runtime-provided project overrides the one passed by the client
        odps_params['project'] = project
    endpoint = os.environ.get(
        'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
    o = ODPS(None, None, account=account,
             project=odps_params['project'], endpoint=endpoint)
    cupid_session = CupidSession(o)
    mars_context = get_context()

    df = op.outputs[0]
    split_size = df.extra_params.chunk_bytes or READ_CHUNK_LIMIT

    # apply column pruning to the output metadata
    out_dtypes = df.dtypes
    out_shape = df.shape
    out_columns_value = df.columns_value
    if op.columns is not None:
        out_dtypes = out_dtypes[op.columns]
        out_shape = (df.shape[0], len(op.columns))
        out_columns_value = parse_index(out_dtypes.index, store_data=True)

    # resolve the data sources: whole table, one partition, or all
    # partitions matching the partition predicate
    table_obj = o.get_table(op.table_name)
    if not table_obj.schema.partitions:
        data_srcs = [table_obj]
    elif op.partition is not None and check_partition_exist(
            table_obj, op.partition):
        data_srcs = [table_obj.get_partition(op.partition)]
    else:
        data_srcs = list(table_obj.partitions)
        if op.partition is not None:
            data_srcs = filter_partitions(o, data_srcs, op.partition)

    out_chunks = []
    chunk_idx = 0

    for data_src in data_srcs:
        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            if data_store_size < split_size and mars_context is not None:
                # get worker counts
                worker_count = max(
                    len(mars_context.get_worker_addresses()), 1)
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024 ** 2)
                logger.debug(
                    'Input data size is too small, split_size is %s',
                    split_size)

        logger.debug(
            'Start creating download session of table %s from cupid, '
            'columns: %s', op.table_name, op.columns)
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=op.columns,
                    with_split_meta=op.with_split_meta_on_tile)
                break
            except CupidError:
                # cupid caps the number of splits; grow split_size and retry
                logger.debug(
                    'The number of splits exceeds 100000, split_size is %s',
                    split_size)
                if split_size >= MAX_CHUNK_SIZE:
                    raise
                else:
                    split_size *= 2

        logger.debug('%s table splits have been created.',
                     str(len(download_session.splits)))

        # estimate per-chunk row counts: prefer the exact meta row count,
        # fall back to a file-size-proportional share of the known total
        meta_chunk_rows = [
            split.meta_row_count for split in download_session.splits
        ]
        if np.isnan(out_shape[0]):
            est_chunk_rows = meta_chunk_rows
        else:
            sp_file_sizes = np.array([
                sp.split_file_end - sp.split_file_start
                for sp in download_session.splits
            ])
            total_size = sp_file_sizes.sum()
            ratio_chunk_rows = (sp_file_sizes * out_shape[0] //
                                total_size).tolist()
            est_chunk_rows = [
                mr if mr is not None else rr
                for mr, rr in zip(meta_chunk_rows, ratio_chunk_rows)
            ]

        partition_spec = str(data_src.partition_spec) \
            if getattr(data_src, 'partition_spec', None) else None

        # NOTE(review): warning level looks unintentional for a routine
        # diagnostic (all sibling messages are debug) — confirm upstream.
        logger.warning('Estimated chunk rows: %r', est_chunk_rows)

        if len(download_session.splits) == 0:
            # empty source: emit a single placeholder chunk so downstream
            # graph construction still sees one chunk per source
            logger.debug('Table %s has no data', op.table_name)
            chunk_op = DataFrameReadTableSplit()
            index_value = parse_index(pd.RangeIndex(0))
            columns_value = parse_index(out_dtypes.index, store_data=True)
            # use the pruned out_dtypes (was op.dtypes), consistent with
            # columns_value and with the non-empty branch below
            out_chunk = chunk_op.new_chunk(None,
                                           shape=(np.nan, out_shape[1]),
                                           dtypes=out_dtypes,
                                           index_value=index_value,
                                           columns_value=columns_value,
                                           index=(chunk_idx, 0))
            out_chunks.append(out_chunk)
            chunk_idx += 1
        else:
            for idx, split in enumerate(download_session.splits):
                chunk_op = DataFrameReadTableSplit(
                    cupid_handle=to_str(split.handle),
                    split_index=split.split_index,
                    split_file_start=split.split_file_start,
                    split_file_end=split.split_file_end,
                    schema_file_start=split.schema_file_start,
                    schema_file_end=split.schema_file_end,
                    add_offset=op.add_offset,
                    dtypes=out_dtypes,
                    sparse=op.sparse,
                    split_size=split_size,
                    string_as_binary=op.string_as_binary,
                    use_arrow_dtype=op.use_arrow_dtype,
                    estimate_rows=est_chunk_rows[idx],
                    partition_spec=partition_spec,
                    append_partitions=op.append_partitions,
                    meta_raw_size=split.meta_raw_size,
                    nrows=meta_chunk_rows[idx] or op.nrows,
                    memory_scale=op.memory_scale)
                # the chunk shape is unknown
                index_value = parse_index(pd.RangeIndex(0))
                columns_value = parse_index(out_dtypes.index,
                                            store_data=True)
                out_chunk = chunk_op.new_chunk(None,
                                               shape=(np.nan, out_shape[1]),
                                               dtypes=out_dtypes,
                                               index_value=index_value,
                                               columns_value=columns_value,
                                               index=(chunk_idx, 0))
                chunk_idx += 1
                out_chunks.append(out_chunk)

    if op.add_offset:
        # rebuild a contiguous RangeIndex across all chunks
        out_chunks = standardize_range_index(out_chunks)

    new_op = op.copy()
    nsplits = ((np.nan,) * len(out_chunks), (out_shape[1],))
    # use the pruned out_dtypes (was op.dtypes) to keep dtypes consistent
    # with out_shape and out_columns_value under column pruning
    return new_op.new_dataframes(None, shape=out_shape, dtypes=out_dtypes,
                                 index_value=df.index_value,
                                 columns_value=out_columns_value,
                                 chunks=out_chunks, nsplits=nsplits)
def _handle_create_table_download_session(sock):
    """Service handler: create a Cupid table download session.

    Reads a 4-byte little-endian length followed by a pickled request dict
    from *sock*, creates the download session, and writes the splits and the
    effective split size back through ``_write_request_result``.  Any failure
    is logged and reported back to the peer with its exc_info.

    Fix over the previous revision: ``sock.recv(n)`` may legally return
    fewer than *n* bytes, so both the length header and the pickled payload
    are now read with an exact-length loop instead of a single ``recv``.
    """
    def _recv_exact(n):
        # loop until exactly n bytes arrive; a bare recv may return short
        buf = bytearray()
        while len(buf) < n:
            piece = sock.recv(n - len(buf))
            if not piece:
                raise EOFError('connection closed while reading request')
            buf.extend(piece)
        return bytes(buf)

    try:
        cmd_len, = struct.unpack('<I', _recv_exact(4))
        # dict with odps_params, table_name, partition, columns,
        # worker_count, split_size, max_chunk_num
        # SECURITY: pickle.loads on socket data — the peer must be trusted
        # (internal Mars service channel only)
        session_config = pickle.loads(_recv_exact(cmd_len))

        from odps import ODPS
        from odps.errors import ODPSError
        from odps.accounts import BearerTokenAccount
        from cupid import CupidSession, context
        from cupid.errors import CupidError
        from cupid.runtime import RuntimeContext

        if not RuntimeContext.is_context_ready():
            raise SystemError(
                'No Mars cluster found, please create via `o.create_mars_cluster`.'
            )
        cupid_ctx = context()

        # build an ODPS entry from the runtime bearer token; env vars
        # override the client-supplied project/endpoint
        odps_params = session_config['odps_params']
        bearer_token = cupid_ctx.get_bearer_token()
        account = BearerTokenAccount(bearer_token)
        project = os.environ.get('ODPS_PROJECT_NAME', None) or odps_params['project']
        endpoint = os.environ.get(
            'ODPS_RUNTIME_ENDPOINT') or odps_params['endpoint']
        o = ODPS(None, None, account=account, project=project,
                 endpoint=endpoint)
        cupid_session = CupidSession(o)

        split_size = session_config['split_size']
        table_name = session_config['table_name']
        data_src = o.get_table(table_name)
        if session_config.get('partition') is not None:
            data_src = data_src.get_partition(session_config['partition'])

        try:
            data_store_size = data_src.size
        except ODPSError:
            # fail to get data size, just ignore
            pass
        else:
            worker_count = session_config['worker_count']
            if data_store_size < split_size and worker_count is not None:
                # data is too small, split as many as number of cores
                split_size = data_store_size // worker_count
                # at least 1M
                split_size = max(split_size, 1 * 1024 ** 2)
                # lazy %-style args, consistent with the other logger calls
                logger.debug(
                    'Input data size is too small, split_size is %s',
                    split_size)

        max_chunk_num = session_config['max_chunk_num']
        columns = session_config['columns']
        with_split_meta = session_config.get('with_split_meta_on_tile')

        logger.debug(
            'Start creating download session of table %s from cupid, columns %r',
            table_name, columns)
        while True:
            try:
                download_session = cupid_session.create_download_session(
                    data_src, split_size=split_size, columns=columns,
                    with_split_meta=with_split_meta)
                break
            except CupidError:
                # cupid caps the number of splits; grow split_size and retry
                logger.debug(
                    'The number of splits exceeds 100000, split_size is %s',
                    split_size)
                # NOTE(review): comparing a byte size against max_chunk_num
                # (a count) looks suspicious — confirm the intended bound.
                if split_size >= max_chunk_num:
                    raise
                else:
                    split_size *= 2

        ret_data = {
            'splits': download_session.splits,
            'split_size': split_size,
        }
        _write_request_result(sock, result=ret_data)
    except:
        # deliberate catch-all at the service boundary: the failure is
        # logged and reported back to the peer with exc_info
        logger.exception('Failed to create download session')
        _write_request_result(sock, False, exc_info=sys.exc_info())