def get_task_run_args(job_id, role, party_id, job_parameters, job_args, input_dsl):
    """Resolve a task's runtime input arguments from its DSL input section.

    For ``data`` inputs, each entry ``"<component>.<data_name>"`` is resolved
    either from ``job_args`` (when the component is the literal ``args``) or
    from the named upstream component's tracked output table. For ``model`` /
    ``isometric_model`` inputs, the upstream component's saved model is loaded
    via Tracking using the job's model id/version.

    :param job_id: job identifier passed through to Tracking
    :param role: this party's role (e.g. guest/host)
    :param party_id: this party's id
    :param job_parameters: dict holding at least 'model_id' and
        'model_version' (read only for model inputs)
    :param job_args: job-level arguments; ``job_args['data'][name]`` is
        expected to carry 'namespace' and 'name' when it points at a table
    :param input_dsl: mapping of input type ('data', 'model',
        'isometric_model') to its declared inputs
    :return: nested dict {input_type: {component_name: {data_type: table}}}
        for data, {input_type: {component_name: models}} for models
    :raises Exception: when a model key is neither '<comp>.<model>' nor
        'pipeline.<comp>.<model>'
    """
    task_run_args = {}
    for input_type, input_detail in input_dsl.items():
        if input_type == 'data':
            this_type_args = task_run_args.setdefault(input_type, {})
            for data_type, data_list in input_detail.items():
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    search_component_name, search_data_name = data_key_item[0], data_key_item[1]
                    if search_component_name == 'args':
                        # Fix: the original chained
                        # job_args.get('data', {}).get(search_data_name).get(...)
                        # raised AttributeError when search_data_name was absent
                        # (the inner .get returned None). Default to {} instead.
                        args_data = job_args.get('data', {}).get(search_data_name) or {}
                        if args_data.get('namespace', '') and args_data.get('name', ''):
                            data_table = session.table(
                                namespace=args_data['namespace'],
                                name=args_data['name'])
                        else:
                            # No table reference supplied for this arg.
                            data_table = None
                    else:
                        # Input produced by an upstream component: fetch its
                        # tracked output table.
                        data_table = Tracking(
                            job_id=job_id,
                            role=role,
                            party_id=party_id,
                            component_name=search_component_name
                        ).get_output_data_table(data_name=search_data_name)
                    args_from_component = this_type_args.setdefault(search_component_name, {})
                    args_from_component[data_type] = data_table
        elif input_type in ['model', 'isometric_model']:
            this_type_args = task_run_args.setdefault(input_type, {})
            for dsl_model_key in input_detail:
                dsl_model_key_items = dsl_model_key.split('.')
                if len(dsl_model_key_items) == 2:
                    search_component_name, search_model_name = dsl_model_key_items[0], dsl_model_key_items[1]
                elif len(dsl_model_key_items) == 3 and dsl_model_key_items[0] == 'pipeline':
                    # 'pipeline.<component>.<model>' form.
                    search_component_name, search_model_name = dsl_model_key_items[1], dsl_model_key_items[2]
                else:
                    raise Exception('get input {} failed'.format(input_type))
                models = Tracking(
                    job_id=job_id,
                    role=role,
                    party_id=party_id,
                    component_name=search_component_name,
                    model_id=job_parameters['model_id'],
                    model_version=job_parameters['model_version']
                ).get_output_model(model_name=search_model_name)
                this_type_args[search_component_name] = models
    return task_run_args
def get_task_run_args(job_id, role, party_id, task_id, job_args, job_parameters,
                      task_parameters, input_dsl, if_save_as_task_input_data,
                      filter_type=None, filter_attr=None):
    """Resolve a task's runtime input arguments from its DSL input section.

    Like the simpler variant, resolves ``data`` inputs from ``job_args`` or an
    upstream component's tracked output and ``model``/``isometric_model``
    inputs via Tracking. Additionally:

    * when ``if_save_as_task_input_data`` is true, each resolved data table is
      copied (``save_as``) into a task-scoped session namespace, preserving
      the original metas and schema;
    * ``filter_type`` restricts which input types are resolved at all;
    * ``filter_attr['data']`` replaces each data table with a dict of selected
      attributes (calling ``get_<attr>()`` on the table).

    :param task_id: task identifier used for the save-as session namespace
    :param task_parameters: may carry 'input_data_partition' (> 0 to override
        the source table's partition count on save-as)
    :param if_save_as_task_input_data: switch for the save-as copy step
    :param filter_type: optional iterable of input types to keep
    :param filter_attr: optional {'data': [attr, ...]} attribute projection
    :return: nested dict keyed by input type, component name, then data type
        (for data) or the loaded models (for model inputs)
    :raises Exception: when a model key is neither '<comp>.<model>' nor
        'pipeline.<comp>.<model>'
    """
    task_run_args = {}
    for input_type, input_detail in input_dsl.items():
        if filter_type and input_type not in filter_type:
            continue
        if input_type == 'data':
            this_type_args = task_run_args.setdefault(input_type, {})
            for data_type, data_list in input_detail.items():
                for data_key in data_list:
                    data_key_item = data_key.split('.')
                    search_component_name, search_data_name = data_key_item[0], data_key_item[1]
                    if search_component_name == 'args':
                        # Fix: the original chained
                        # job_args.get('data', {}).get(search_data_name).get(...)
                        # raised AttributeError when search_data_name was absent
                        # (the inner .get returned None). Default to {} instead.
                        args_data = job_args.get('data', {}).get(search_data_name) or {}
                        if args_data.get('namespace', '') and args_data.get('name', ''):
                            data_table = session.table(
                                namespace=args_data['namespace'],
                                name=args_data['name'])
                        else:
                            data_table = None
                    else:
                        data_table = Tracking(
                            job_id=job_id,
                            role=role,
                            party_id=party_id,
                            component_name=search_component_name
                        ).get_output_data_table(data_name=search_data_name)
                    args_from_component = this_type_args.setdefault(search_component_name, {})
                    # todo: If the same component has more than one identical input, save as is repeated
                    if if_save_as_task_input_data:
                        if data_table:
                            schedule_logger().info(
                                "start save as task {} input data table {} {}"
                                .format(task_id, data_table.get_namespace(), data_table.get_name()))
                            # Preserve metas/schema across save_as: the copy does
                            # not carry them over automatically.
                            origin_table_metas = data_table.get_metas()
                            origin_table_schema = data_table.schema
                            save_as_options = {
                                "store_type": StoreTypes.ROLLPAIR_IN_MEMORY
                            } if SAVE_AS_TASK_INPUT_DATA_IN_MEMORY else {}
                            data_table = data_table.save_as(
                                namespace=job_utils.generate_session_id(
                                    task_id=task_id, role=role, party_id=party_id),
                                name=data_table.get_name(),
                                # A positive explicit partition count wins over the
                                # source table's own partitioning.
                                partition=task_parameters['input_data_partition']
                                if task_parameters.get('input_data_partition', 0) > 0
                                else data_table.get_partitions(),
                                options=save_as_options)
                            data_table.save_metas(origin_table_metas)
                            data_table.schema = origin_table_schema
                            schedule_logger().info(
                                "save as task {} input data table to {} {} done"
                                .format(task_id, data_table.get_namespace(), data_table.get_name()))
                        else:
                            schedule_logger().info(
                                "pass save as task {} input data table, because the table is none"
                                .format(task_id))
                    else:
                        schedule_logger().info(
                            "pass save as task {} input data table, because the switch is off"
                            .format(task_id))
                    if not data_table or not filter_attr or not filter_attr.get("data", None):
                        args_from_component[data_type] = data_table
                    else:
                        # Project the table down to the requested attributes
                        # (dict comprehension replaces the original dict([...])).
                        args_from_component[data_type] = {
                            a: getattr(data_table, "get_{}".format(a))()
                            for a in filter_attr["data"]
                        }
        elif input_type in ['model', 'isometric_model']:
            this_type_args = task_run_args.setdefault(input_type, {})
            for dsl_model_key in input_detail:
                dsl_model_key_items = dsl_model_key.split('.')
                if len(dsl_model_key_items) == 2:
                    search_component_name, search_model_alias = dsl_model_key_items[0], dsl_model_key_items[1]
                elif len(dsl_model_key_items) == 3 and dsl_model_key_items[0] == 'pipeline':
                    # 'pipeline.<component>.<model>' form.
                    search_component_name, search_model_alias = dsl_model_key_items[1], dsl_model_key_items[2]
                else:
                    raise Exception('get input {} failed'.format(input_type))
                models = Tracking(
                    job_id=job_id,
                    role=role,
                    party_id=party_id,
                    component_name=search_component_name,
                    model_id=job_parameters['model_id'],
                    model_version=job_parameters['model_version']
                ).get_output_model(model_alias=search_model_alias)
                this_type_args[search_component_name] = models
    return task_run_args
def load_model_parameters(model_table_name, model_namespace):
    """Load a stored model's parameters as a plain dict.

    Opens the model table identified by (name, namespace) and collects every
    stored (meta_name, meta_value) pair into a dictionary.
    """
    model = table(model_table_name, model_namespace)
    # collect() yields (name, value) pairs, so dict() assembles them directly.
    return dict(model.collect())
def save_data_to_eggroll_table(data, namespace, table_name, partition=1):
    """Create a brand-new eggroll table and bulk-load *data* into it.

    The table is created with ``error_if_exist=True``, so an already-existing
    table with the same (name, namespace) is an error rather than being
    silently reused. Returns the populated table handle.
    """
    destination = table(
        table_name,
        namespace,
        partition=partition,
        create_if_missing=True,
        error_if_exist=True)
    destination.put_all(data)
    return destination
def host_ids_process(self, data_instances):
    # (host_id_process, 1)
    """Produce the host-side processed id pair table (the 'Za' values).

    When the intersect cache is enabled, tries to reuse a cached Za table
    whose version/namespace matches what the guest reports; otherwise (or
    when no cache version exists yet) recomputes the pairs from
    *data_instances* and stores them in the cache along with this host's
    RSA key material. Either way the version-match decision is sent to the
    guest via the ``cache_version_match_info`` transfer variable.

    :param data_instances: raw input ids to process when the cache cannot
        be used (opaque here; consumed by cal_host_ids_process_pair)
    :return: the host ids process pair table, or None when the cache is in
        use, versions match, and intersect-id sync is not required
    """
    if self.intersect_cache_param.use_cache:
        LOGGER.info("Use intersect cache.")
        if self.has_cache_version:
            # NOTE(review): 'verison' typo is part of the cache_utils API
            # name as called here — confirm it matches the helper module.
            current_version = cache_utils.host_get_current_verison(
                host_party_id=self.host_party_id,
                id_type=self.intersect_cache_param.id_type,
                encrypt_type=self.intersect_cache_param.encrypt_type,
                tag='Za')
            version = current_version.get('table_name')
            namespace = current_version.get('namespace')
            # Guest's view of the cache version, received over the wire.
            guest_current_version = self.transfer_variable.cache_version_info.get(0)
            LOGGER.info("current_version:{}".format(current_version))
            LOGGER.info("guest_current_version:{}".format(guest_current_version))
            # NOTE(review): the `current_version is not None` guard is
            # evaluated last, after current_version.get(...) was already
            # called above — if current_version could ever be None this
            # would raise before the guard takes effect; confirm intent.
            if guest_current_version.get('table_name') == version \
                    and guest_current_version.get('namespace') == namespace and \
                    current_version is not None:
                self.is_version_match = True
            else:
                self.is_version_match = False
            version_match_info = {
                'version_match': self.is_version_match,
                'version': version,
                'namespace': namespace
            }
            # Tell the guest whether our cached version matches theirs.
            self.transfer_variable.cache_version_match_info.remote(
                version_match_info, role=consts.GUEST, idx=0)
            host_ids_process_pair = None
            if not self.is_version_match or self.sync_intersect_ids:
                # if self.sync_intersect_ids is true, host will get the encrypted intersect id from guest,
                # which need the Za to decrypt them
                LOGGER.info("read Za from cache")
                host_ids_process_pair = session.table(
                    name=version,
                    namespace=namespace,
                    create_if_missing=True,
                    error_if_exist=False)
                # Cache table exists but is empty: recompute and refresh the
                # cache together with the RSA key used to produce it.
                if host_ids_process_pair.count() == 0:
                    host_ids_process_pair = self.cal_host_ids_process_pair(data_instances)
                    rsa_key = {
                        'rsa_e': self.e,
                        'rsa_d': self.d,
                        'rsa_n': self.n
                    }
                    self.store_cache(host_ids_process_pair, rsa_key=rsa_key)
        else:
            # No cache version exists yet: report a mismatch to the guest,
            # compute from scratch, and create a fresh cache entry.
            self.is_version_match = False
            LOGGER.info("is version_match:{}".format(self.is_version_match))
            namespace = cache_utils.gen_cache_namespace(
                id_type=self.intersect_cache_param.id_type,
                encrypt_type=self.intersect_cache_param.encrypt_type,
                tag='Za',
                host_party_id=self.host_party_id)
            version = cache_utils.gen_cache_version(namespace=namespace, create=True)
            version_match_info = {
                'version_match': self.is_version_match,
                'version': version,
                'namespace': namespace
            }
            self.transfer_variable.cache_version_match_info.remote(
                version_match_info, role=consts.GUEST, idx=0)
            host_ids_process_pair = self.cal_host_ids_process_pair(data_instances)
            rsa_key = {'rsa_e': self.e, 'rsa_d': self.d, 'rsa_n': self.n}
            self.store_cache(host_ids_process_pair,
                             rsa_key=rsa_key,
                             assign_version=version,
                             assign_namespace=namespace)
            LOGGER.info("remote version match info to guest")
    else:
        LOGGER.info("Not using cache, calculate Za using raw id")
        host_ids_process_pair = self.cal_host_ids_process_pair(data_instances)
    return host_ids_process_pair