Example #1
 def get_table_meta(self, table_info):
     schedule_logger(self.job_id).info(f'start get table meta:{table_info}')
     table_meta_dict = storage.StorageTableMeta(namespace=table_info.get("namespace"), name=table_info.get("table_name"), create_address=False).to_dict()
     schedule_logger(self.job_id).info(f'get table meta success: {table_meta_dict}')
     table_meta_dict["part_of_data"] = serialize_b64(table_meta_dict["part_of_data"], to_str=True)
     table_meta_dict["schema"] = serialize_b64(table_meta_dict["schema"], to_str=True)
     return table_meta_dict
Example #2
def table_api(table_func):
    config = request.json
    if table_func == 'table_info':
        table_key_count = 0
        table_partition = None
        table_schema = None
        table_name = config.get("name") or config.get("table_name")
        namespace = config.get("namespace")
        table_meta = storage.StorageTableMeta(name=table_name,
                                              namespace=namespace)
        if table_meta:
            table_key_count = table_meta.get_count()
            table_partition = table_meta.get_partitions()
            table_schema = table_meta.get_schema()
            exist = 1
        else:
            exist = 0
        return get_json_result(
            data={
                "table_name": table_name,
                "namespace": namespace,
                "exist": exist,
                "count": table_key_count,
                "partition": table_partition,
                "schema": table_schema
            })
    else:
        return get_json_result()
Example #3
def write_to_db(conf, table_name, file_name, namespace, partitions, head):
    db = MysqldbHelper(**conf)
    table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
    create_table = 'create table {}(id varchar(50) NOT NULL, features LONGTEXT, PRIMARY KEY(id))'.format(table_name)
    db.execute(create_table)
    print('create table {}'.format(table_name))

    with open(file_name, 'r') as f:
        if head:
            data_head = f.readline()
            header_source_item = data_head.split(',')
            table_meta.update_metas(schema={'header': ','.join(header_source_item[1:]).strip(), 'sid': header_source_item[0]})
        n = 0
        count = 0
        while True:
            data = list()
            lines = f.readlines(12400)

            if lines:
                sql = 'REPLACE INTO {}(id, features)  VALUES'.format(table_name)
                for line in lines:
                    count += 1
                    values = line.replace("\n", "").replace("\t", ",").split(",")
                    data.append((values[0], list_to_str(values[1:])))
                    sql += '("{}", "{}"),'.format(values[0], list_to_str(values[1:]))
                sql = ','.join(sql.split(',')[:-1]) + ';'
                if n == 0:
                    table_meta.update_metas(part_of_data=data, partitions=partitions)
                n += 1
                db.execute(sql)
                db.con.commit()
            else:
                break
        table_meta.update_metas(count=count)
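
A note on Example #3: it relies on a list_to_str helper that is not shown in the snippet. A plausible stand-in (an assumption, the real helper may differ) simply joins the feature values into the single string stored in the features column:

def list_to_str(values, delimiter=','):
    # hypothetical stand-in for the list_to_str helper used above:
    # join feature values into one delimited string for the `features` column
    return delimiter.join(str(v) for v in values)

assert list_to_str(['1.0', '2.5', '3']) == '1.0,2.5,3'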
Example #4
    def get_output_data_table(self, output_data_infos, tracker_client=None):
        """
        Get component output data tables; runs in the task executor process
        :param output_data_infos:
        :param tracker_client:
        :return:
        """
        output_tables_meta = {}
        if output_data_infos:
            for output_data_info in output_data_infos:
                schedule_logger(self.job_id).info(
                    "Get task {} {} output table {} {}".format(
                        output_data_info.f_task_id,
                        output_data_info.f_task_version,
                        output_data_info.f_table_namespace,
                        output_data_info.f_table_name))
                if not tracker_client:
                    data_table_meta = storage.StorageTableMeta(
                        name=output_data_info.f_table_name,
                        namespace=output_data_info.f_table_namespace)
                else:
                    data_table_meta = tracker_client.get_table_meta(
                        output_data_info.f_table_name,
                        output_data_info.f_table_namespace)

                output_tables_meta[
                    output_data_info.f_data_name] = data_table_meta
        return output_tables_meta
Example #5
 def get_computing_table(self, name, namespace, schema=None):
     storage_table_meta = storage.StorageTableMeta(name=name,
                                                   namespace=namespace)
     computing_table = session.get_computing_session().load(
         storage_table_meta.get_address(),
         schema=schema if schema else storage_table_meta.get_schema(),
         partitions=self.parameters.get("partitions"))
     return computing_table
Example #6
 def load(cls, cache: DataCache) -> typing.Tuple[typing.Dict[str, CTableABC], dict]:
     cache_data = {}
     for name, table in cache.data.items():
         storage_table_meta = storage.StorageTableMeta(name=table.name, namespace=table.namespace)
         computing_table = session.get_computing_session().load(
             storage_table_meta.get_address(),
             schema=storage_table_meta.get_schema(),
             partitions=table.partitions)
         cache_data[name] = computing_table
     return cache_data, cache.meta
Example #7
def download_upload(access_module):
    job_id = job_utils.generate_job_id()
    if access_module == "upload" and UPLOAD_DATA_FROM_CLIENT and not (request.json and request.json.get("use_local_data") == 0):
        file = request.files['file']
        filename = os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp', file.filename)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            file.save(filename)
        except Exception as e:
            shutil.rmtree(os.path.join(job_utils.get_job_directory(job_id), 'fate_upload_tmp'))
            raise e
        job_config = request.args.to_dict()
        if "namespace" in job_config and "table_name" in job_config:
            pass
        else:
            # higher than version 1.5.1, support eggroll run parameters
            job_config = json_loads(list(job_config.keys())[0])
        job_config['file'] = filename
    else:
        job_config = request.json
    required_arguments = ['work_mode', 'namespace', 'table_name']
    if access_module == 'upload':
        required_arguments.extend(['file', 'head', 'partition'])
    elif access_module == 'download':
        required_arguments.extend(['output_path'])
    else:
        raise Exception('can not support this operating: {}'.format(access_module))
    detect_utils.check_config(job_config, required_arguments=required_arguments)
    data = {}
    # compatibility
    if "table_name" in job_config:
        job_config["name"] = job_config["table_name"]
    if "backend" not in job_config:
        job_config["backend"] = 0
    for _ in ["work_mode", "backend", "head", "partition", "drop"]:
        if _ in job_config:
            job_config[_] = int(job_config[_])
    if access_module == "upload":
        if job_config.get('drop', 0) == 1:
            job_config["destroy"] = True
        else:
            job_config["destroy"] = False
        data['table_name'] = job_config["table_name"]
        data['namespace'] = job_config["namespace"]
        data_table_meta = storage.StorageTableMeta(name=job_config["table_name"], namespace=job_config["namespace"])
        if data_table_meta and not job_config["destroy"]:
            return get_json_result(retcode=100,
                                   retmsg='The data table already exists. '
                                          'If you still want to continue uploading, please add the parameter -drop.'
                                          ' 0 means not to delete and continue uploading, '
                                          '1 means to upload again after deleting the table')
    job_dsl, job_runtime_conf = gen_data_access_job_config(job_config, access_module)
    submit_result = DAGScheduler.submit({'job_dsl': job_dsl, 'job_runtime_conf': job_runtime_conf}, job_id=job_id)
    data.update(submit_result)
    return get_json_result(job_id=job_id, data=data)
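
A note on Example #7: when the client is newer than version 1.5.1, the whole upload config arrives as a single JSON-encoded key in the query string, which is why the handler falls back to json_loads(list(job_config.keys())[0]). A minimal sketch of that round trip with the standard library (the project uses its own json_loads wrapper; values are illustrative only):

import json

# client side (illustrative): the whole config becomes one query-string key
config = {"namespace": "experiment", "table_name": "breast", "head": 1, "partition": 4}
query_args = {json.dumps(config): ""}

# server side, as in Example #7: recover the config from the lone key
recovered = json.loads(list(query_args.keys())[0])
assert recovered == config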
Example #8
 def save_table_meta(self, meta):
     schedule_logger(self.job_id).info(f'start save table meta:{meta}')
     address = storage.StorageTableMeta.create_address(storage_engine=meta.get("engine"),
                                                       address_dict=meta.get("address"))
     table_meta = storage.StorageTableMeta(name=meta.get("name"), namespace=meta.get("namespace"), new=True)
     table_meta.set_metas(**meta)
     meta["address"] = address
     meta["part_of_data"] = deserialize_b64(meta["part_of_data"])
     meta["schema"] = deserialize_b64(meta["schema"])
     table_meta.create()
     schedule_logger(self.job_id).info('save table meta success')
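
A note on Examples #1 and #8: they form a round trip. get_table_meta base64-serializes part_of_data and schema before returning them over the tracker API, and save_table_meta deserializes them before persisting the meta. A minimal sketch of such helpers, assuming pickle plus base64 (the actual serialize_b64/deserialize_b64 helpers may differ):

import base64
import pickle

def serialize_b64(obj, to_str=False):
    # hypothetical stand-in: pickle the object and base64-encode it so it
    # survives JSON transport between processes
    raw = base64.b64encode(pickle.dumps(obj))
    return raw.decode('utf-8') if to_str else raw

def deserialize_b64(data):
    # accept str or bytes and reverse the encoding above
    if isinstance(data, str):
        data = data.encode('utf-8')
    return pickle.loads(base64.b64decode(data))

schema = {'header': 'x0,x1,x2', 'sid': 'id'}
payload = serialize_b64(schema, to_str=True)   # as in Example #1
assert deserialize_b64(payload) == schema      # as in Example #8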
Example #9
def get_input_data_min_partitions(input_data, role, party_id):
    min_partition = None
    if role != 'arbiter':
        for data_type, data_location in input_data[role][party_id].items():
            table_info = {
                'name': data_location.split('.')[1],
                'namespace': data_location.split('.')[0]
            }
            table_meta = storage.StorageTableMeta(
                name=table_info['name'], namespace=table_info['namespace'])
            if table_meta:
                table_partition = table_meta.get_partitions()
                if not min_partition or min_partition > table_partition:
                    min_partition = table_partition
    return min_partition
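
A note on Example #9: each data_location is expected to be a "namespace.name" string. A sketch of just the parsing step (hypothetical helper name, illustrative values):

def parse_data_location(data_location):
    # split "namespace.name" into its two parts; raises ValueError if malformed
    namespace, name = data_location.split('.', 1)
    return {'namespace': namespace, 'name': name}

assert parse_data_location('experiment.breast_hetero_guest') == {
    'namespace': 'experiment',
    'name': 'breast_hetero_guest',
}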
Example #10
def table_add():
    request_data = request.json
    detect_utils.check_config(request_data,
                              required_arguments=[
                                  "engine", "address", "namespace", "name",
                                  ("head", (0, 1)), "id_delimiter"
                              ])
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(
        storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get(
        "in_serialized", 1 if engine in {
            storage.StorageEngine.STANDALONE, storage.StorageEngine.EGGROLL,
            storage.StorageEngine.MYSQL
        } else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(
                retcode=100,
                retmsg='The data table already exists. '
                'If you still want to continue uploading, please add the parameter -drop. '
                '1 means to add again after deleting the table')
    id_name = request_data.get("id_name")
    feature_name = request_data.get("feature_name")
    schema = None
    if id_name and feature_name:
        schema = {'header': feature_name, 'sid': id_name}
    with storage.Session.build(
            storage_engine=engine,
            options=request_data.get("options")) as storage_session:
        storage_session.create_table(
            address=address,
            name=name,
            namespace=namespace,
            partitions=request_data.get('partitions', None),
            hava_head=request_data.get("head"),
            id_delimiter=request_data.get("id_delimiter"),
            in_serialized=in_serialized,
            schema=schema)
    return get_json_result(data={"table_name": name, "namespace": namespace})
Example #11
def table_disable():
    request_data = request.json
    adapter_request_data(request_data)
    disable = request.url.endswith("disable")
    tables_meta = storage.StorageTableMeta.query_table_meta(filter_fields=dict(
        **request_data))
    data = []
    if tables_meta:
        for table_meta in tables_meta:
            storage.StorageTableMeta(
                name=table_meta.f_name,
                namespace=table_meta.f_namespace).update_metas(disable=disable)
            data.append({
                'table_name': table_meta.f_name,
                'namespace': table_meta.f_namespace
            })
        return get_json_result(data=data)
    return get_json_result(retcode=101, retmsg='no table found')
Example #12
def table_download():
    request_data = request.json
    from fate_flow.component_env_utils.env_utils import import_component_output_depend
    import_component_output_depend()
    data_table_meta = storage.StorageTableMeta(
        name=request_data.get("name"), namespace=request_data.get("namespace"))
    if not data_table_meta:
        return error_response(
            response_code=210,
            retmsg=f'no table found: {request_data.get("namespace")}, {request_data.get("name")}')
    tar_file_name = 'table_{}_{}.tar.gz'.format(request_data.get("namespace"),
                                                request_data.get("name"))
    return TableStorage.send_table(
        output_tables_meta={"table": data_table_meta},
        tar_file_name=tar_file_name,
        need_head=request_data.get("head", True))
Example #13
def get_upload_info(jobs_run_conf):
    data = []

    for job_id, job_run_conf in jobs_run_conf.items():
        info = {}
        table_name = job_run_conf["name"]
        namespace = job_run_conf["namespace"]
        table_meta = storage.StorageTableMeta(name=table_name, namespace=namespace)
        if table_meta:
            partition = job_run_conf["partition"]
            info["upload_info"] = {
                "table_name": table_name,
                "namespace": namespace,
                "partition": partition,
                'upload_count': table_meta.get_count()
            }
            info["notes"] = job_run_conf["notes"]
            info["schema"] = table_meta.get_schema()
            data.append({job_id: info})
    return data
Example #14
 def get_table_meta(self, table_name, table_namespace):
     request_body = {"table_name": table_name, "namespace": table_namespace}
     response = api_utils.local_api(job_id=self.job_id,
                                    method='POST',
                                    endpoint='/tracker/{}/{}/{}/{}/{}/{}/table_meta/get'.format(
                                        self.job_id,
                                        self.component_name,
                                        self.task_id,
                                        self.task_version,
                                        self.role,
                                        self.party_id),
                                    json_body=request_body)
     if response['retcode'] != RetCode.SUCCESS:
         raise Exception(f"create table meta failed:{response['retmsg']}")
     else:
         data_table_meta = storage.StorageTableMeta(name=table_name,
                                                    namespace=table_namespace, new=True)
         data_table_meta.set_metas(**response["data"])
         data_table_meta.address = storage.StorageTableMeta.create_address(storage_engine=response["data"].get("engine"),
                                                                           address_dict=response["data"].get("address"))
         data_table_meta.part_of_data = deserialize_b64(data_table_meta.part_of_data)
         data_table_meta.schema = deserialize_b64(data_table_meta.schema)
         return data_table_meta
Example #15
def table_api(table_func):
    config = request.json
    if table_func == 'table_info':
        table_key_count = 0
        table_partition = None
        table_schema = None
        table_name = config.get("name") or config.get("table_name")
        namespace = config.get("namespace")
        table_meta = storage.StorageTableMeta(name=table_name,
                                              namespace=namespace)
        address = None
        enable = True
        origin = None
        if table_meta:
            table_key_count = table_meta.get_count()
            table_partition = table_meta.get_partitions()
            table_schema = table_meta.get_schema()
            address = table_meta.get_address().__dict__
            enable = not table_meta.get_disable()
            origin = table_meta.get_origin()
            exist = 1
        else:
            exist = 0
        return get_json_result(
            data={
                "table_name": table_name,
                "namespace": namespace,
                "exist": exist,
                "count": table_key_count,
                "partition": table_partition,
                "schema": table_schema,
                "enable": enable,
                "origin": origin,
                "address": address,
            })
    else:
        return get_json_result()
Example #16
 def component_check(cls, job, check_type="inheritance"):
     if check_type == "rerun":
         task_list = JobSaver.query_task(job_id=job.f_job_id,
                                         party_id=job.f_party_id,
                                         role=job.f_role,
                                         status=TaskStatus.SUCCESS,
                                         only_latest=True)
         tasks = {}
         for task in task_list:
             tasks[task.f_component_name] = task
     else:
         tasks = JobController.load_tasks(
             component_list=job.f_inheritance_info.get(
                 "component_list", []),
             job_id=job.f_inheritance_info.get("job_id"),
             role=job.f_role,
             party_id=job.f_party_id)
     tracker_dict = JobController.load_task_tracker(tasks)
     missing_dependence_component_list = []
     # data dependence
     for tracker in tracker_dict.values():
         table_infos = tracker.get_output_data_info()
         for table in table_infos:
             table_meta = storage.StorageTableMeta(
                 name=table.f_table_name, namespace=table.f_table_namespace)
             if not table_meta:
                 missing_dependence_component_list.append(
                     tracker.component_name)
                 continue
     if check_type == "rerun":
         return missing_dependence_component_list
     elif check_type == "inheritance":
         # reload component list
         return list(
             set(job.f_inheritance_info.get("component_list", [])) -
             set(missing_dependence_component_list))
Example #17
    def get_task_run_args(cls,
                          job_id,
                          role,
                          party_id,
                          task_id,
                          task_version,
                          job_args,
                          job_parameters: RunParameters,
                          task_parameters: RunParameters,
                          input_dsl,
                          filter_type=None,
                          filter_attr=None,
                          get_input_table=False):
        task_run_args = {}
        input_table = {}
        if 'idmapping' in role:
            return {}
        for input_type, input_detail in input_dsl.items():
            if filter_type and input_type not in filter_type:
                continue
            if input_type == 'data':
                this_type_args = task_run_args[input_type] = task_run_args.get(
                    input_type, {})
                for data_type, data_list in input_detail.items():
                    data_dict = {}
                    for data_key in data_list:
                        data_key_item = data_key.split('.')
                        data_dict[data_key_item[0]] = {data_type: []}
                    for data_key in data_list:
                        data_key_item = data_key.split('.')
                        search_component_name, search_data_name = data_key_item[
                            0], data_key_item[1]
                        storage_table_meta = None
                        if search_component_name == 'args':
                            args_data = job_args.get('data', {}).get(search_data_name, {})
                            if args_data.get('namespace', '') and args_data.get('name', ''):
                                storage_table_meta = storage.StorageTableMeta(
                                    name=args_data['name'],
                                    namespace=args_data['namespace'])
                        else:
                            tracker_client = TrackerClient(
                                job_id=job_id,
                                role=role,
                                party_id=party_id,
                                component_name=search_component_name)
                            upstream_output_table_infos_json = tracker_client.get_output_data_info(
                                data_name=search_data_name)
                            if upstream_output_table_infos_json:
                                tracker = Tracker(
                                    job_id=job_id,
                                    role=role,
                                    party_id=party_id,
                                    component_name=search_component_name)
                                upstream_output_table_infos = []
                                for _ in upstream_output_table_infos_json:
                                    upstream_output_table_infos.append(
                                        fill_db_model_object(
                                            Tracker.get_dynamic_db_model(
                                                TrackingOutputDataInfo,
                                                job_id)(), _))
                                output_tables_meta = tracker.get_output_data_table(
                                    output_data_infos=
                                    upstream_output_table_infos)
                                if output_tables_meta:
                                    storage_table_meta = output_tables_meta.get(
                                        search_data_name, None)
                        args_from_component = this_type_args[
                            search_component_name] = this_type_args.get(
                                search_component_name, {})
                        if get_input_table and storage_table_meta:
                            input_table[data_key] = {
                                'namespace':
                                storage_table_meta.get_namespace(),
                                'name': storage_table_meta.get_name()
                            }
                            computing_table = None
                        elif storage_table_meta:
                            LOGGER.info(
                                f"load computing table use {task_parameters.computing_partitions}"
                            )
                            computing_table = session.get_latest_opened(
                            ).computing.load(
                                storage_table_meta.get_address(),
                                schema=storage_table_meta.get_schema(),
                                partitions=task_parameters.computing_partitions
                            )
                        else:
                            computing_table = None

                        if not computing_table or not filter_attr or not filter_attr.get(
                                "data", None):
                            data_dict[search_component_name][data_type].append(
                                computing_table)
                            args_from_component[data_type] = data_dict[
                                search_component_name][data_type]
                        else:
                            args_from_component[data_type] = dict([
                                (a, getattr(computing_table,
                                            "get_{}".format(a))())
                                for a in filter_attr["data"]
                            ])
            elif input_type in ['model', 'isometric_model']:
                this_type_args = task_run_args[input_type] = task_run_args.get(
                    input_type, {})
                for dsl_model_key in input_detail:
                    dsl_model_key_items = dsl_model_key.split('.')
                    if len(dsl_model_key_items) == 2:
                        search_component_name, search_model_alias = dsl_model_key_items[
                            0], dsl_model_key_items[1]
                    elif len(dsl_model_key_items
                             ) == 3 and dsl_model_key_items[0] == 'pipeline':
                        search_component_name, search_model_alias = dsl_model_key_items[
                            1], dsl_model_key_items[2]
                    else:
                        raise Exception(
                            'get input {} failed'.format(input_type))
                    models = Tracker(
                        job_id=job_id,
                        role=role,
                        party_id=party_id,
                        component_name=search_component_name,
                        model_id=job_parameters.model_id,
                        model_version=job_parameters.model_version
                    ).get_output_model(model_alias=search_model_alias)
                    this_type_args[search_component_name] = models
        if get_input_table:
            return input_table
        return task_run_args
Example #18
 def save_output_data(self,
                      computing_table,
                      output_storage_engine,
                      output_storage_address: dict,
                      output_table_namespace=None,
                      output_table_name=None):
     if computing_table:
         if not output_table_namespace or not output_table_name:
             output_table_namespace, output_table_name = data_utils.default_output_table_info(
                 task_id=self.task_id, task_version=self.task_version)
         schedule_logger(self.job_id).info(
             'persisting the component output temporary table to {} {}'.
             format(output_table_namespace, output_table_name))
         partitions = computing_table.partitions
         schedule_logger(self.job_id).info(
             'output data table partitions is {}'.format(partitions))
         address_dict = output_storage_address.copy()
         if output_storage_engine == StorageEngine.EGGROLL:
             address_dict.update({
                 "name":
                 output_table_name,
                 "namespace":
                 output_table_namespace,
                 "storage_type":
                 storage.EggRollStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.STANDALONE:
             address_dict.update({
                 "name":
                 output_table_name,
                 "namespace":
                 output_table_namespace,
                 "storage_type":
                 storage.StandaloneStorageType.ROLLPAIR_LMDB
             })
         elif output_storage_engine == StorageEngine.HDFS:
             address_dict.update({
                 "path":
                 data_utils.default_output_fs_path(
                     name=output_table_name,
                     namespace=output_table_namespace,
                     prefix=address_dict.get("path_prefix"))
             })
         else:
             raise RuntimeError(
                 f"{output_storage_engine} storage is not supported")
         address = storage.StorageTableMeta.create_address(
             storage_engine=output_storage_engine,
             address_dict=address_dict)
         schema = {}
         # persistent table
         computing_table.save(address, schema=schema, partitions=partitions)
         part_of_data = []
         part_of_limit = 100
         for k, v in computing_table.collect():
             part_of_data.append((k, v))
             part_of_limit -= 1
             if part_of_limit == 0:
                 break
         table_count = computing_table.count()
         table_meta = storage.StorageTableMeta(
             name=output_table_name,
             namespace=output_table_namespace,
             new=True)
         table_meta.address = address
         table_meta.partitions = computing_table.partitions
         table_meta.engine = output_storage_engine
         table_meta.type = storage.EggRollStorageType.ROLLPAIR_LMDB
         table_meta.schema = schema
         table_meta.part_of_data = part_of_data
         table_meta.count = table_count
         table_meta.create()
         return output_table_namespace, output_table_name
     else:
         schedule_logger(self.job_id).info(
             'task id {} output data table is none'.format(self.task_id))
         return None, None
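
A note on Example #18: the part_of_data preview is just the first 100 rows collected from the computing table. The same sampling can be written with itertools.islice; a minimal sketch, assuming collect() yields (key, value) pairs as in the loop above:

from itertools import islice

def sample_part_of_data(computing_table, limit=100):
    # take at most `limit` (key, value) pairs from the table for preview metadata
    return list(islice(computing_table.collect(), limit))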
Example #19
def table_bind():
    request_data = request.json
    address_dict = request_data.get('address')
    engine = request_data.get('engine')
    name = request_data.get('name')
    namespace = request_data.get('namespace')
    address = storage.StorageTableMeta.create_address(
        storage_engine=engine, address_dict=address_dict)
    in_serialized = request_data.get(
        "in_serialized", 1 if engine in {
            storage.StorageEngine.STANDALONE, storage.StorageEngine.EGGROLL,
            storage.StorageEngine.MYSQL, storage.StorageEngine.PATH
        } else 0)
    destroy = (int(request_data.get("drop", 0)) == 1)
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if data_table_meta:
        if destroy:
            data_table_meta.destroy_metas()
        else:
            return get_json_result(
                retcode=100,
                retmsg='The data table already exists. '
                'If you still want to continue uploading, please add the parameter --drop'
            )
    id_column = request_data.get("id_column") or request_data.get("id_name")
    feature_column = request_data.get("feature_column") or request_data.get(
        "feature_name")
    schema = None
    if id_column and feature_column:
        schema = {'header': feature_column, 'sid': id_column}
    elif id_column:
        schema = {'sid': id_column, 'header': ''}
    sess = Session()
    storage_session = sess.storage(storage_engine=engine,
                                   options=request_data.get("options"))
    table = storage_session.create_table(
        address=address,
        name=name,
        namespace=namespace,
        partitions=request_data.get('partitions', None),
        hava_head=request_data.get("head"),
        schema=schema,
        id_delimiter=request_data.get("id_delimiter"),
        in_serialized=in_serialized,
        origin=request_data.get("origin", StorageTableOrigin.TABLE_BIND))
    response = get_json_result(data={
        "table_name": name,
        "namespace": namespace
    })
    if not table.check_address():
        response = get_json_result(
            retcode=100,
            retmsg=f'engine {engine} address {address_dict} check failed')
    else:
        DataTableTracker.create_table_tracker(
            table_name=name,
            table_namespace=namespace,
            entity_info={"have_parent": False},
        )
    sess.destroy_all_sessions()
    return response
Example #20
def get_table_info(name, namespace):
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    address = data_table_meta.get_address()
    schema = data_table_meta.get_schema()
    return address, schema
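
A note on Example #20: it assumes the table meta exists. A defensive variant (a sketch, not the project's helper) can check truthiness first, mirroring the "if table_meta:" guards used throughout these examples; the storage import is assumed to match the one used by the snippets above:

from fate_arch import storage  # assumed import, as in FATE Flow

def get_table_info_safe(name, namespace):
    # returns (address, schema), or (None, None) when no meta record is found
    data_table_meta = storage.StorageTableMeta(name=name, namespace=namespace)
    if not data_table_meta:
        return None, None
    return data_table_meta.get_address(), data_table_meta.get_schema()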