Example #1
 def upload_file(self,
                 input_file,
                 head,
                 job_id=None,
                 input_feature_count=None,
                 table=None,
                 without_block=True):
     if not table:
         table = self.table
     with open(input_file, "r") as fin:
         lines_count = 0
         if head is True:
             data_head = fin.readline()
             input_feature_count -= 1
             self.update_table_meta(data_head)
         n = 0
         fate_uuid = uuid.uuid1().hex
         get_line = self.get_line()
         while True:
             data = list()
             lines = fin.readlines(JobDefaultConfig.upload_max_bytes)
             line_index = 0
             if lines:
                 # self.append_data_line(lines, data, n)
                 for line in lines:
                     values = line.rstrip().split(
                         self.parameters["id_delimiter"])
                     k, v = get_line(
                         values=values,
                         line_index=line_index,
                         extend_sid=self.parameters["extend_sid"],
                          auto_increasing_sid=self.parameters["auto_increasing_sid"],
                         id_delimiter=self.parameters["id_delimiter"],
                         fate_uuid=fate_uuid,
                     )
                     data.append((k, v))
                     line_index += 1
                 if without_block:
                     lines_count += len(data)
                     save_progress = lines_count / input_feature_count * 100 // 1
                     job_info = {
                         "progress": save_progress,
                         "job_id": job_id,
                         "role": self.parameters["local"]["role"],
                         "party_id": self.parameters["local"]["party_id"],
                     }
                     ControllerClient.update_job(job_info=job_info)
                 table.put_all(data)
                 if n == 0 and without_block:
                     table.meta.update_metas(part_of_data=data)
             else:
                 return
             n += 1
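Example #1 reports upload progress to the FATE Flow server by posting a `job_info` dict through `ControllerClient.update_job`. The sketch below isolates that pattern; `build_progress_info` is a hypothetical helper name, the dict keys simply mirror the ones used above, and `ControllerClient` is assumed to be importable from the surrounding FATE Flow package (the exact module path depends on the FATE Flow version).

def build_progress_info(lines_count, total_count, job_id, role, party_id):
    # Same integer-percentage formula as in upload_file: x * 100 // 1
    progress = lines_count / total_count * 100 // 1
    return {
        "progress": progress,
        "job_id": job_id,
        "role": role,
        "party_id": party_id,
    }

# Hypothetical usage inside the batch loop shown above:
# job_info = build_progress_info(lines_count, input_feature_count, job_id,
#                                parameters["local"]["role"],
#                                parameters["local"]["party_id"])
# ControllerClient.update_job(job_info=job_info)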
Example #2
 def save_data_table(self,
                     job_id,
                     dst_table_name,
                     dst_table_namespace,
                     head=True):
     input_file = self.parameters["file"]
     input_feature_count = self.get_count(input_file)
     with open(input_file, 'r') as fin:
         lines_count = 0
         if head is True:
             data_head = fin.readline()
             input_feature_count -= 1
             _, meta = self.table.get_meta().update_metas(
                 schema=data_utils.get_header_schema(
                     header_line=data_head,
                     id_delimiter=self.parameters["id_delimiter"]))
             self.table.set_meta(meta)
         n = 0
         while True:
             data = list()
             lines = fin.readlines(self.MAX_BYTES)
             if lines:
                 for line in lines:
                     values = line.rstrip().split(
                         self.parameters["id_delimiter"])
                     data.append((
                         values[0],
                         data_utils.list_to_str(
                             values[1:],
                             id_delimiter=self.parameters["id_delimiter"])))
                 lines_count += len(data)
                 save_progress = lines_count / input_feature_count * 100 // 1
                 job_info = {
                     'progress': save_progress,
                     "job_id": job_id,
                     "role": self.parameters["local"]['role'],
                     "party_id": self.parameters["local"]['party_id']
                 }
                 ControllerClient.update_job(job_info=job_info)
                 self.table.put_all(data)
                 if n == 0:
                     self.table.get_meta().update_metas(part_of_data=data)
             else:
                 table_count = self.table.count()
                 self.table.get_meta().update_metas(
                     count=table_count,
                     partitions=self.parameters["partition"])
                 self.save_meta(dst_table_namespace=dst_table_namespace,
                                dst_table_name=dst_table_name,
                                table_count=table_count)
                 return table_count
             n += 1
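The inner loop of `save_data_table` turns each text line into a `(key, value)` pair: the first column is the row id and the remaining columns are re-joined with the same delimiter. A minimal, dependency-free sketch of that transformation, with `parse_data_line` as a hypothetical name standing in for the `data_utils.list_to_str` call above:

def parse_data_line(line, id_delimiter=","):
    """Split one delimited line into a (key, value) pair.

    Mirrors the inner loop of save_data_table: the first column is the
    row id, the remaining columns are re-joined with the same delimiter,
    which is what the data_utils.list_to_str call above appears to do.
    """
    values = line.rstrip().split(id_delimiter)
    return values[0], id_delimiter.join(values[1:])

# parse_data_line("id0,1.0,2.0,3.0") == ("id0", "1.0,2.0,3.0")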
Example #3
 def report_task_update_to_driver(cls, task_info):
     """
     Report task update to FATEFlow Server
     :param task_info: task status payload forwarded to the FATE Flow server
     :return: None
     """
     schedule_logger().info("report task {} {} {} {} to driver".format(
         task_info["task_id"],
         task_info["task_version"],
         task_info["role"],
         task_info["party_id"],
     ))
     ControllerClient.report_task(task_info=task_info)
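`report_task_update_to_driver` only exposes four fields of `task_info` in its log line. The snippet below sketches a payload with just those fields; treating them as sufficient is an assumption, since a real FATE Flow task report likely carries additional status fields, and the values shown are placeholders.

# Placeholder values; only the four fields logged above are included,
# and it is an assumption that no further fields are required here.
task_info = {
    "task_id": "202201010000000000000_reader_0",  # hypothetical id
    "task_version": 0,
    "role": "guest",
    "party_id": 9999,
}
# ControllerClient.report_task(task_info=task_info)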
Example #4
 def run(self, component_parameters=None, args=None):
     self.parameters = component_parameters["DownloadParam"]
     self.parameters["role"] = component_parameters["role"]
     self.parameters["local"] = component_parameters["local"]
     name, namespace = self.parameters.get("name"), self.parameters.get(
         "namespace")
     with open(os.path.abspath(self.parameters["output_path"]),
               "w") as fout:
         with storage.Session.build(
                 session_id=job_utils.generate_session_id(
                     self.tracker.task_id,
                     self.tracker.task_version,
                     self.tracker.role,
                     self.tracker.party_id,
                     suffix="storage",
                     random_end=True),
                 name=name,
                 namespace=namespace) as storage_session:
             data_table = storage_session.get_table()
             count = data_table.count()
             LOGGER.info('===== begin to export data =====')
             lines = 0
             job_info = {}
             job_info["job_id"] = self.tracker.job_id
             job_info["role"] = self.tracker.role
             job_info["party_id"] = self.tracker.party_id
             for key, value in data_table.collect():
                 if not value:
                     fout.write(key + "\n")
                 else:
                     fout.write(key +
                                self.parameters.get("delimiter", ",") +
                                str(value) + "\n")
                 lines += 1
                 if lines % 2000 == 0:
                     LOGGER.info(
                         "===== export {} lines =====".format(lines))
                 if lines % 10000 == 0:
                     job_info["progress"] = lines / count * 100 // 1
                     ControllerClient.update_job(job_info=job_info)
             job_info["progress"] = 100
             ControllerClient.update_job(job_info=job_info)
             self.callback_metric(
                 metric_name='data_access',
                 metric_namespace='download',
                 metric_data=[Metric("count", data_table.count())])
         LOGGER.info("===== export {} lines totally =====".format(lines))
         LOGGER.info('===== export data finish =====')
         LOGGER.info('===== export data file path:{} ====='.format(
             os.path.abspath(self.parameters["output_path"])))
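Examples #4 and #5 throttle their reporting the same way: a log line every 2,000 exported rows and a progress update every 10,000. A self-contained sketch of that throttling, with the reporting callable injected so the helper itself has no FATE Flow dependency (`report_export_progress` and `update_job` are illustrative names, not FATE Flow API):

import logging

LOGGER = logging.getLogger(__name__)


def report_export_progress(lines, count, job_info, update_job,
                           log_every=2000, report_every=10000):
    # `update_job` is injected (e.g. ControllerClient.update_job) so the
    # helper can be exercised without a FATE Flow server.
    if lines % log_every == 0:
        LOGGER.info("===== export %s lines =====", lines)
    if lines % report_every == 0:
        job_info["progress"] = lines / count * 100 // 1
        update_job(job_info=job_info)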
Example #5
 def _run(self, cpn_input: ComponentInputProtocol):
     self.parameters = cpn_input.parameters
     self.parameters["role"] = cpn_input.roles["role"]
     self.parameters["local"] = cpn_input.roles["local"]
     name, namespace = self.parameters.get("name"), self.parameters.get(
         "namespace")
     with open(os.path.abspath(self.parameters["output_path"]), "w") as fw:
         session = Session(
             job_utils.generate_session_id(
                 self.tracker.task_id,
                 self.tracker.task_version,
                 self.tracker.role,
                 self.tracker.party_id,
             ))
         data_table = session.get_table(name=name, namespace=namespace)
         if not data_table:
             raise Exception(f"table {name} {namespace} not found")
         count = data_table.count()
         LOGGER.info("===== begin to export data =====")
         lines = 0
         job_info = {}
         job_info["job_id"] = self.tracker.job_id
         job_info["role"] = self.tracker.role
         job_info["party_id"] = self.tracker.party_id
         for key, value in data_table.collect():
             if not value:
                 fw.write(key + "\n")
             else:
                 fw.write(key + self.parameters.get("delimiter", ",") +
                          str(value) + "\n")
             lines += 1
             if lines % 2000 == 0:
                 LOGGER.info("===== export {} lines =====".format(lines))
             if lines % 10000 == 0:
                 job_info["progress"] = lines / count * 100 // 1
                 ControllerClient.update_job(job_info=job_info)
         job_info["progress"] = 100
         ControllerClient.update_job(job_info=job_info)
         self.callback_metric(
             metric_name="data_access",
             metric_namespace="download",
             metric_data=[Metric("count", data_table.count())],
         )
         LOGGER.info("===== export {} lines totally =====".format(lines))
         LOGGER.info("===== export data finish =====")
         LOGGER.info("===== export data file path:{} =====".format(
             os.path.abspath(self.parameters["output_path"])))

 def report_task_info_to_driver(self):
     LOGGER.info("report {} {} {} {} {} to driver:\n{}".format(
         self.__class__.__name__, self.report_info["task_id"],
         self.report_info["task_version"], self.report_info["role"],
         self.report_info["party_id"], self.report_info))
     ControllerClient.report_task(self.report_info)
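All five examples go through the same two `ControllerClient` entry points, `update_job` and `report_task`, which the snippets call as class-level methods. For exercising the surrounding logic without a running FATE Flow server, a hypothetical in-memory stand-in such as the one below can be swapped in (e.g. by monkeypatching the `ControllerClient` name); it is not part of FATE Flow and merely records the payloads it receives.

class FakeControllerClient:
    """In-memory stand-in that only records the payloads it receives."""

    def __init__(self):
        self.job_updates = []
        self.task_reports = []

    def update_job(self, job_info=None):
        self.job_updates.append(job_info)

    def report_task(self, task_info=None):
        self.task_reports.append(task_info)


# client = FakeControllerClient()
# client.update_job(job_info={"progress": 50.0, "job_id": "test"})
# assert client.job_updates[-1]["progress"] == 50.0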