def load(self, file=None, udc=None):
    outer_conf = ConfigTree()
    _ = self.load_regs()

    if file:
        file_conf = self.load_file(file)
        outer_conf = outer_conf.replace(file_conf)

    if udc:
        udc_conf = self.load_udc(udc)
        outer_conf = outer_conf.replace(udc_conf)

    if hasattr(self._cfg_mgr, LEVEL_SYSTEM_STR):
        for sys_key in self._cfg_mgr[LEVEL_SYSTEM_STR]:
            if sys_key not in outer_conf:
                raise KeyError(
                    i18n("Key '{}' isn't found in the configurations!")
                    .format(sys_key))
            setattr(self, sys_key, outer_conf[sys_key])

    if hasattr(self._cfg_mgr, LEVEL_REQUIRE_STR):
        for req_key in self._cfg_mgr[LEVEL_REQUIRE_STR]:
            if req_key in outer_conf:
                setattr(self, req_key, outer_conf[req_key])
            else:
                logging.warning(
                    i18n("Level-REQUIRE key '{}' has been set, "
                         "but not provided in the configurations.")
                    .format(req_key))

    self._conf = outer_conf
def run(self):
    hdfs = HDFS()

    dirs = config._build_dirs
    tmp_fmap_dir = dirs["tmp_fmap_dir"]
    tmp_ckpt_dir = dirs["tmp_ckpt_dir"]

    fmap = Fmap.load(tmp_fmap_dir)

    input_cls = model[config.MODEL.input_name]
    files_pattern = hdfs.hdfs_whole_path(
        Path(config.HDFS_TFRECORD_DIR).joinpath("part*").as_posix())
    files = tf.io.gfile.glob(files_pattern)
    dataset = input_cls(fmap).tfr_inputs(files)

    model_cls = model[config.MODEL.model_name]
    model_ins = model_cls(fmap)

    if tmp_ckpt_dir.joinpath("h5weights", "weights.h5").exists():
        model_ins.load_weights(tmp_ckpt_dir)
        logging.info(i18n("Model weights loaded successfully."))

    model_ins.evaluate_act(dataset)
    logging.info(i18n("Evaluating done."))
def initialize_hdfs(self, sc=None):
    if self._status:
        logging.info(i18n("Using existing HDFS handler."))
        return

    init_spark()
    from pyspark.sql import SparkSession
    from pyspark.context import SparkContext

    if sc is None:
        new_sc = SparkSession.builder \
            .appName("DLFlow.HDFS_Handler") \
            .master("local[*]") \
            .enableHiveSupport() \
            .getOrCreate() \
            .sparkContext
        logging.info(i18n("Using new local SparkContext."))
    else:
        new_sc = sc
        logging.info(i18n("Using preset SparkContext."))

    if not isinstance(new_sc, SparkContext):
        raise TypeError(
            i18n("Expected type {} for 'sc', but got {}").format(
                str(SparkContext), type(new_sc)))

    if new_sc._jsc is None:
        raise RuntimeError(i18n("SparkContext has already been closed!"))

    hadoop_conf = new_sc._jsc.hadoopConfiguration()
    self._header = hadoop_conf.get("fs.default.name")
    self._fs = new_sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
    self._sc = new_sc
    self._status = True
def _vertices_info(vertices_set):
    info = i18n("\nVertices node info:")
    detail = i18n("\nDetails about neighbors:")

    if vertices_set is None:
        info += "\n{:>16}".format("None")
        detail += "\n{:>16}".format("None")
    else:
        for vtx in vertices_set:
            info += "\n{:>16}: {:<16} head: {:<2} tail: {:<2}".format(
                vtx.name, vtx.node.__name__, vtx.in_degree, vtx.out_degree)

            head_vtx = []
            for in_edge in vtx.in_edges:
                head_vtx.append(in_edge.head.name)

            tail_vtx = []
            for out_edge in vtx.out_edges:
                tail_vtx.append(out_edge.tail.name)

            detail += "\n{:>16}: {:<16} head: [{}] / tail: [{}]".format(
                vtx.name, vtx.node.__name__,
                ", ".join(head_vtx), ", ".join(tail_vtx))

    return "{}{}".format(info, detail)
def safe_hdfs_path(self, base_dt, hdfs_patten, max_degrade=7):
    valid_dir, _ = self._degrade_hdfs(base_dt, hdfs_patten, max_degrade)

    if valid_dir is None:
        raise ValueError(
            i18n("Can't find a valid partition for {}").format(hdfs_patten))

    logging.info(i18n("Found valid partition {}").format(valid_dir))
    return valid_dir
def _timeit(*args, **kwargs):
    logging.info(i18n("Start running: {}...").format(task_name))

    time_enter = perf_counter()
    func(*args, **kwargs)
    time_elapsed = perf_counter() - time_enter

    logging.info(
        i18n("Task <{}> elapsed time: {:>.6f}s").format(
            task_name, time_elapsed))
def env_version_check():
    import tensorflow
    import sys

    v_major = sys.version_info[0]
    v_minor = sys.version_info[1]

    assert (v_major == 3 and v_minor >= 6) or v_major > 3, \
        i18n("This program requires at least Python 3.6")
    assert tensorflow.__version__.startswith("2."), \
        i18n("This program requires TensorFlow 2.x")
def _initialize(self):
    if self._dag_obj is None:
        raise RuntimeError(
            i18n("WorkflowDAG is not bound, please bind it first!"))

    if config.conf is None:
        raise RuntimeError(
            i18n("Bad configurations, please load it first!"))

    config.initialize()
    for vtx in self._dag_obj.DAG:
        self._tasks.append(vtx.node())
def _internal_collect(self):
    standard_task_dir = Path(DLFLOW_TASKS)
    internal_model_dir = Path(DLFLOW_MODELS)

    logging.info(
        i18n("Loading DLFlow standard tasks from: {}").format(
            standard_task_dir))
    self._collect(standard_task_dir)

    logging.info(
        i18n("Loading DLFlow internal models from: {}").format(
            internal_model_dir))
    self._collect(internal_model_dir)
def safe_hive_date(self, base_dt, hdfs_patten, max_degrade=7,
                   fmt_dt="%Y%m%d"):
    _, valid_dt = self._degrade_hdfs(base_dt, hdfs_patten, max_degrade)

    if valid_dt is None:
        raise ValueError(
            i18n("Can't find a valid partition for {}").format(hdfs_patten))

    valid_dt = valid_dt.strftime(fmt_dt)
    logging.info(i18n("Found valid Hive date {}").format(valid_dt))
    return valid_dt
def initialize_spark(self, app_name=None, spark_conf=None, reuse=False):
    if self._status and not reuse:
        logging.info(i18n("Using existing Spark session."))
        return
    elif self._status and reuse:
        self.close()
        if self._hdfs.status:
            self._hdfs.close()

    init_spark()
    from pyspark.sql import SparkSession

    _spark = SparkSession.builder

    if app_name is not None:
        _spark = _spark.appName(app_name)
        logging.info(i18n("Initializing Spark app: {}").format(app_name))

    if isinstance(spark_conf, dict):
        spark_conf = spark_conf.copy()
        self.spark_conf = spark_conf.copy()

        if "spark.executorEnv.PYSPARK_PYTHON" in spark_conf:
            os.environ["PYSPARK_PYTHON"] = \
                spark_conf["spark.executorEnv.PYSPARK_PYTHON"]

        _fmt_conf_str = "{\n"
        for k, v in spark_conf.items():
            _fmt_conf_str += "{}{}: {}\n".format(" " * 4, k, v)
        _fmt_conf_str += "}\n"
        logging.info(
            i18n("Using spark config:\n{s}").format(s=_fmt_conf_str))

        if "master" in spark_conf:
            _spark = _spark.master(spark_conf["master"])
            _ = spark_conf.pop("master")

        for conf_key, conf_value in spark_conf.items():
            _spark = _spark.config(conf_key, conf_value)

    self._spark = _spark.enableHiveSupport().getOrCreate()
    self._sc = self._spark.sparkContext
    self.app_name = app_name

    self._hdfs.initialize_hdfs(sc=self._sc)
    self._status = True

    logging.info(i18n("Spark Version: {}").format(self._spark.version))
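# Illustrative call (a sketch only; the receiver object and every value below
# are placeholders, not defaults taken from this file):
#
#     spark.initialize_spark(
#         app_name="DLFlow.Demo",
#         spark_conf={
#             "master": "yarn",
#             "spark.executor.memory": "4g",
#             "spark.executorEnv.PYSPARK_PYTHON": "/usr/bin/python3",
#         })
#
# "master" is popped and passed to builder.master(), the PYSPARK_PYTHON entry
# is mirrored into the local environment, and every remaining key is applied
# via builder.config() before the session is created.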
def __getreg__(cls, key):
    if key in cls.__MASKS__:
        reg_key = cls.__MASKS__[key]
    else:
        raise KeyError(i18n("Unknown key '{}'").format(key))

    return cls.__REGS__[reg_key]
def _collect(self, src_dir):
    src_dir = Path(src_dir).resolve()

    for cur, _, files in os.walk(src_dir):
        logging.debug(i18n("Scanning directory: {}").format(cur))

        if files:
            sys.path.append(cur)

            for file_name in files:
                for module_name in self.module_pattern.findall(file_name):
                    _ = __import__(module_name)
                    logging.info(
                        i18n(" * Loading module: {}").format(module_name))

            sys.path.remove(cur)
def _topology_sort(self):
    vertices_set = set(self._vertices.values())

    for vtx in vertices_set:
        vtx.tmp_in_degree = vtx.in_degree

    sorted_vertices = []
    while vertices_set:
        _kick_set = set()

        for vtx in vertices_set:
            if vtx.tmp_in_degree == 0:
                for edge in vtx.out_edges:
                    edge.tail.tmp_in_degree -= 1

                sorted_vertices.append(vtx)
                _kick_set.add(vtx)
                del vtx.tmp_in_degree

        if not _kick_set:
            _safe_set = set()
            for vtx in vertices_set:
                if vtx.out_degree == 0 or vtx.tmp_in_degree == 0:
                    _safe_set.add(vtx)
            vertices_set -= _safe_set

            err_info = i18n(
                "Can't build DAG, circular references "
                "are found, please check it!\n{}") \
                .format(self._vertices_info(vertices_set))
            logging.error(err_info)
            raise RuntimeError(err_info)

        vertices_set -= _kick_set

    return sorted_vertices
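# The loop above is Kahn's topological sort: each pass removes every vertex
# whose remaining in-degree is zero and decrements the in-degree of its
# successors. If a pass removes nothing while vertices remain, the leftovers
# contain a cycle; the error branch first strips vertices with an out-degree
# or remaining in-degree of zero so that _vertices_info reports only the
# suspicious ones. Illustrative example (hypothetical task names): with edges
# A->B and B->C the order is A, B, C; adding C->A leaves every in-degree at 1
# and raises the RuntimeError.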
def bucket(self, bucket):
    if isinstance(bucket, FeBucket):
        self._bucket = bucket
    else:
        raise TypeError(
            i18n("Parameter type expected {}, but got {}").format(
                FeBucket, type(bucket)))
def field(self, field):
    if isinstance(field, FeField):
        self._field = field
    else:
        raise TypeError(
            i18n("Parameter type expected {}, but got {}").format(
                FeField, type(field)))
def initialize(self, file=None, udc=None, log_level=None,
               log_dir=None, lang=None, mode="local"):
    config.load(file, udc)

    if lang:
        self._lang = lang
    elif "LANG" in os.environ:
        self._lang = os.environ["LANG"].split(".")[0].split("_")[0]
    i18n.initialize(self._lang)

    if log_level:
        self._log_level = log_level
    elif "LOG_LEVEL" in config:
        self._log_level = config.LOG_LEVEL

    if self._log_level:
        set_logging_level(self._log_level)
        logging_info = get_logging_info_str()
        logging.warning(
            "\n".join([
                i18n("Logging level is changed to '{}'")
                .format(self._log_level),
                logging_info
            ]))

    if log_dir:
        self._log_dir = log_dir
    elif "LOG_DIR" in config:
        self._log_dir = config.LOG_DIR

    if self._log_dir:
        log_name = Path(file).parts[-1].split(".")[0]
        set_logging_writer(log_name, self._log_dir)
        logging.info(i18n("Start writing log to {}").format(self._log_dir))

    collect = Collector()
    for key, desc in [("TASKS_DIR", "UDT"), ("MODELS_DIR", "Models")]:
        if key in config:
            collect(config[key], desc)

    self._steps = config.STEPS
    self._mode = mode.lower()
def run(self):
    engine_cls = ENGINES[self._mode]
    engine = engine_cls()

    if isinstance(engine, BaseEngine):
        engine.run(self._steps)

    logging.info(i18n("All tasks done."))
def load(self, fmap_dir):
    fmap = Fmap.load(fmap_dir)

    if isinstance(fmap, Fmap):
        self._fmap = fmap
    else:
        raise TypeError(
            i18n("Parameter type expected {}, but got {}").format(
                Fmap, type(fmap)))
def initialize(self):
    outer_conf = self._conf.copy()
    default_conf = self.load_regs()
    self._conf = default_conf.replace(self._conf)

    self._solver()
    logging.info(i18n("All parsed configurations:\n{}").format(self._conf))

    if hasattr(self._cfg_mgr, LEVEL_REQUIRE_STR):
        for req_key in self._cfg_mgr[LEVEL_REQUIRE_STR]:
            if req_key not in outer_conf:
                logging.warning(
                    i18n("Level-REQUIRE key '{}' has been set, "
                         "but not provided in the configurations.")
                    .format(req_key))
def __getconf__(self, key):
    if self._conf is None:
        raise NotInitializeError(i18n("ConfigTree is not set!"))

    if key not in self._conf:
        raise KeyError(key)

    return self._conf[key]
def init_spark():
    try:
        import pyspark
    except ModuleNotFoundError:
        try:
            logging.warning(
                i18n("No module named 'pyspark'. "
                     "Trying to call 'findspark'..."))
            import findspark
            findspark.init()
            logging.info(i18n("Loading pyspark succeeded."))
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                i18n("Cannot find module 'pyspark' or 'findspark'."))
def __iter__(self):
    if self._conf is None:
        raise NotInitializeError(i18n("ConfigTree is not set!"))

    def _generator():
        for k, v in self._conf.dense_dict.items():
            yield (k, v)

    return _generator()
def bind_workflow(self, param):
    if isinstance(param, (str, )):
        self._dag_obj = WorkflowDAG(param)
    elif isinstance(param, (WorkflowDAG, )):
        self._dag_obj = param
    else:
        raise TypeError(
            i18n("Parameter 'param' expects type 'str', "
                 "'WorkflowDAG' or 'None', but got '{}'!").format(
                type(param)))
def path(self, path):
    if not self._status:
        raise NotInitializeError(
            i18n("Cannot access HDFS because the "
                 "SparkContext wasn't initialized!"))

    path = Path(path).as_posix()
    jvm_hdfs_path = self._sc._jvm.org.apache.hadoop.fs.Path(path)

    return jvm_hdfs_path
def run(self):
    fit = config.MERGE.fit
    seed_sql = config.MERGE.seed_sql
    feature_config_file = Path(config.MERGE.config_file).resolve()

    if fit:
        assert feature_config_file.is_file(), \
            i18n("Input feature configuration does not exist "
                 "when fit=true: {}") \
            .format(feature_config_file)

    _spark_conf = config.SPARK.dense_dict
    params = []
    for k, v in _spark_conf.items():
        if k.startswith("spark."):
            _param = '--conf "{}={}"'.format(k, v)
        else:
            _param = '--{} "{}"'.format(k, v)
        params.append(_param)
    spark_conf = "\n".join(params)

    _model_dir = Path(config.HDFS_FEMODEL_DIR)
    hdfs_merge_dir = _model_dir.joinpath("merge").as_posix()
    out_dir = Path(config.RAW_FEATURES).as_posix()

    spark_submit = SparkJARExec(spark_conf=spark_conf,
                                seed_sql=seed_sql,
                                feature_config=feature_config_file,
                                feature_date=config.FEATURE_DATE,
                                feature_model_hdfs=hdfs_merge_dir,
                                feature_out_hdfs=out_dir,
                                fit=fit)
    spark_submit.run()

    logging.info(i18n("Feature merge done."))
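# Illustrative rendering (the keys and values are placeholders, not project
# defaults): if config.SPARK.dense_dict were
#     {"queue": "root.default", "spark.executor.memory": "4g"}
# the loop above would emit the two lines
#     --queue "root.default"
#     --conf "spark.executor.memory=4g"
# which are joined by newlines and handed to SparkJARExec as spark_conf.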
def add_feature(self, feature: Feature):
    if isinstance(feature, Feature):
        feature.field = self
        feature.set_offset(self._offset)
        self._offset += feature.size
        self._container[feature.name] = feature
        self._mcs = self._update_mcs(feature.mcs_dec)
    else:
        raise TypeError(
            i18n("Parameter type expected {}, but got {}").format(
                Feature, type(feature)))
def _solver(self):
    if self._conf is None:
        raise ValueError()

    dense_dict = self._conf.dense_dict
    total_key_set = set([i for i in dense_dict.keys()])

    G = {}
    for k in total_key_set:
        G[k] = {"tail": [], "tpl": None, "in": 0}

    for cur_key, cur_value in dense_dict.items():
        if isinstance(cur_value, (str, )):
            conf_template = ConfigTemplate(cur_value)
            G[cur_key]["tpl"] = conf_template
            G[cur_key]["in"] = len(conf_template.vars)

            for head_key in conf_template.vars:
                G[head_key]["tail"].append(cur_key)

    sort_keys = []
    while total_key_set:
        _kick_set = set()

        for key in total_key_set:
            _ref_key = G[key]

            if _ref_key["in"] == 0:
                for tail_key in _ref_key["tail"]:
                    G[tail_key]["in"] -= 1

                sort_keys.append(key)
                _kick_set.add(key)

        if not _kick_set:
            err_info = i18n(
                "Circular references are found in the "
                "following keys, please check it!\n{}") \
                .format(total_key_set)
            logging.error(err_info)
            raise RuntimeError(err_info)

        total_key_set -= _kick_set

    for key in sort_keys:
        if G[key]["tpl"]:
            conf_template = G[key]["tpl"]
            kws = {}
            for _k in conf_template.vars:
                kws[_k] = dense_dict[_k]
            dense_dict[key] = conf_template.render(kws)

    self._conf = self._cfg_loader.load(dense_dict, "dict")
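# Illustrative run of _solver (the "$KEY" placeholder syntax is an assumption
# about ConfigTemplate, not something defined in this file): given
#     {"BASE_DIR": "/data", "CKPT_DIR": "$BASE_DIR/ckpt", "LOG_DIR": "$CKPT_DIR/log"}
# the keys sort as BASE_DIR -> CKPT_DIR -> LOG_DIR and render to
#     {"BASE_DIR": "/data", "CKPT_DIR": "/data/ckpt", "LOG_DIR": "/data/ckpt/log"},
# while mutually referencing keys such as A="$B", B="$A" trigger the
# circular-reference RuntimeError above.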
def __init__(self, parser_name):
    self._parser = None
    self._normalizer = None

    initializer = {
        "spark": self._spark_parser_init
    }.get(parser_name.lower(), None)

    if initializer is None:
        raise ParserNotExists(
            i18n("Unknown feature parser '{}'").format(parser_name))

    initializer()
def hdfs_whole_path(self, path, header=None):
    if header is None:
        prefix = self._header.strip()
    else:
        prefix = header.strip()

    if path[0] == "/":
        new_path = "".join([prefix, path])
    elif prefix == path[:len(prefix)]:
        new_path = path
    else:
        raise ValueError(i18n("Illegal HDFS Path: {}").format(path))

    return new_path
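# Usage sketch (the prefix value is illustrative; self._header is read from
# fs.default.name in initialize_hdfs):
#     with self._header == "hdfs://nameservice1":
#         hdfs_whole_path("/user/dlflow/part*")    -> "hdfs://nameservice1/user/dlflow/part*"
#         hdfs_whole_path("hdfs://nameservice1/x") -> returned unchanged
#         hdfs_whole_path("relative/path")         -> raises ValueError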