def do_export_file(job_id, _data):
    """Export a stored data table to a delimited text file on local disk.

    Args:
        job_id: job identifier passed to ``eggroll.init``.
        _data: dict-like job config with keys ``work_mode``, ``name``,
            ``namespace``, ``output_path`` and optional ``delimitor``
            (defaults to ",").

    Raises:
        ValueError: if any step of the export fails; the original
            exception is chained as the cause for debuggability.
    """
    try:
        work_mode = _data.get("work_mode")
        name = _data.get("name")
        namespace = _data.get("namespace")
        delimitor = _data.get("delimitor", ",")
        output_path = _data.get("output_path")
        eggroll.init(job_id, work_mode)
        with open(os.path.abspath(output_path), "w") as fout:
            data_table = storage.get_data_table(name=name, namespace=namespace)
            print('===== begin to export data =====')
            lines = 0
            for key, value in data_table.collect():
                # Rows with an empty value are written as a bare key.
                if not value:
                    fout.write(key + "\n")
                else:
                    fout.write(key + delimitor + value + "\n")
                lines += 1
                if lines % 2000 == 0:
                    print("===== export {} lines =====".format(lines))
            print("===== export {} lines totally =====".format(lines))
            print('===== export data finish =====')
    except Exception as e:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/
        # SystemExit and dropped the original traceback; narrow it and
        # chain the cause so the real failure is still visible.
        raise ValueError("cannot export data, please check json file") from e
def read_data(self, table_name, namespace, mode="fit"):
    """Read a dense data table and convert each row to a data instance.

    Args:
        table_name: name of the table in storage.
        namespace: storage namespace of the table.
        mode: "fit" to fit on the data, anything else to transform only.

    Returns:
        The converted data-instance table with its schema set.

    Raises:
        ValueError: if ``self.label_idx`` is not an int, or the data is
            empty / too narrow to contain the label column.
    """
    input_data = storage.get_data_table(table_name, namespace)
    LOGGER.info("start to read dense data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)

    input_data_features = None
    input_data_labels = None
    if self.with_label:
        # Exact-type check (bool intentionally rejected, matching the
        # original type-name comparison).
        if type(self.label_idx) is not int:
            raise ValueError("label index should be integer")

        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape or self.label_idx >= data_shape:
            raise ValueError("input data's value is empty, it does not contain a label")

        # Bind to locals so the closures don't capture `self`.
        label_idx = self.label_idx
        delimitor = self.delimitor

        def _strip_label(value):
            # Split once per record (original split each value twice)
            # and drop the label column.
            if data_shape == 1:
                return []
            cols = value.split(delimitor, -1)
            return cols[:label_idx] + cols[label_idx + 1:]

        input_data_features = input_data.mapValues(_strip_label)
        input_data_labels = input_data.mapValues(
            lambda value: value.split(delimitor, -1)[label_idx])
    else:
        input_data_features = input_data.mapValues(
            lambda value: [] if not value else value.split(self.delimitor, -1))

    if mode == "fit":
        data_instance = self.fit(input_data_features, input_data_labels,
                                 table_name, namespace)
    else:
        data_instance = self.transform(input_data_features, input_data_labels)

    set_schema(data_instance, self.header)
    return data_instance
def dtable(table_func):
    """REST handler for data-table operations.

    Only the 'tableInfo' action is implemented: it resolves the table
    name/namespace from the request JSON and returns the row count
    (0 when the table does not exist). Any other action returns an
    empty success result.
    """
    config = request.json
    if table_func == 'tableInfo':
        table_name, namespace = get_table_info(config=config,
                                               create=config.get('create', False))
        # Local renamed from `dtable` — the original shadowed this
        # view function's own name.
        data_table = storage.get_data_table(name=table_name, namespace=namespace)
        table_key_count = data_table.count() if data_table else 0
        return get_json_result(data={'table_name': table_name,
                                     'namespace': namespace,
                                     'count': table_key_count})
    else:
        return get_json_result()
def read_data(self, table_name, namespace, mode="fit"):
    """Read a sparse data table and convert its rows to data instances.

    Args:
        table_name: name of the table in storage.
        namespace: storage namespace of the table.
        mode: "fit" to fit on the data, anything else to transform only.

    Returns:
        The converted data-instance table with its schema set.
    """
    raw_table = storage.get_data_table(table_name, namespace)
    LOGGER.info("start to read sparse data and change data to instance")
    abnormal_detection.empty_table_detection(raw_table)

    # Dispatch to fit or transform depending on the requested mode.
    converter = self.fit if mode == "fit" else self.transform
    instances = converter(raw_table)

    set_schema(instances, self.header)
    return instances
def read_data(self, table_name, namespace, mode="fit"):
    """Read a sparse data table, validate it, and convert rows to instances.

    Args:
        table_name: name of the table in storage.
        namespace: storage namespace of the table.
        mode: "fit" to fit on the data, anything else to transform only.

    Returns:
        The converted data-instance table with its schema set.

    Raises:
        ValueError: if the table has no detectable data shape.
    """
    raw_table = storage.get_data_table(table_name, namespace)
    LOGGER.info("start to read sparse data and change data to instance")
    abnormal_detection.empty_table_detection(raw_table)

    # Guard: a table with no shape has empty values and cannot carry a label.
    if not data_overview.get_data_shape(raw_table):
        raise ValueError("input data's value is empty, it does not contain a label")

    instances = self.fit(raw_table) if mode == "fit" else self.transform(raw_table)

    set_schema(instances, self.header)
    return instances