Example #1
    def load(self, file=None, udc=None):
        outer_conf = ConfigTree()

        _ = self.load_regs()

        if file:
            file_conf = self.load_file(file)
            outer_conf = outer_conf.replace(file_conf)

        if udc:
            udc_conf = self.load_udc(udc)
            outer_conf = outer_conf.replace(udc_conf)

        if hasattr(self._cfg_mgr, LEVEL_SYSTEM_STR):
            for sys_key in self._cfg_mgr[LEVEL_SYSTEM_STR]:
                if sys_key not in outer_conf:
                    raise KeyError(
                        i18n("Key '{}' isn't found in the configurations!").
                        format(sys_key))

                setattr(self, sys_key, outer_conf[sys_key])

        if hasattr(self._cfg_mgr, LEVEL_REQUIRE_STR):
            for req_key in self._cfg_mgr[LEVEL_REQUIRE_STR]:
                if req_key in outer_conf:
                    setattr(self, req_key, outer_conf[req_key])
                else:
                    logging.warning(
                        i18n("Level-REQUIRE key '{}' has been set, "
                             "but not provide in configurations.").format(
                                 req_key))

        self._conf = outer_conf
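
The two replace() calls above imply a precedence order: the file configuration is applied first and any user-defined configuration (udc) overrides it. A minimal sketch of that merge order with plain dictionaries, assuming replace() behaves like a dict update in which the argument's values win on conflicts:

# Illustration only: plain dicts standing in for ConfigTree, assuming
# replace() lets the argument's values override existing keys.
file_conf = {"MODEL_DIR": "/models", "LOG_LEVEL": "INFO"}
udc_conf = {"LOG_LEVEL": "DEBUG"}

outer_conf = {}
outer_conf.update(file_conf)  # file-level settings first
outer_conf.update(udc_conf)   # user-defined configuration overrides them

print(outer_conf)  # {'MODEL_DIR': '/models', 'LOG_LEVEL': 'DEBUG'}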
Example #2
    def run(self):
        hdfs = HDFS()

        dirs = config._build_dirs
        tmp_fmap_dir = dirs["tmp_fmap_dir"]
        tmp_ckpt_dir = dirs["tmp_ckpt_dir"]

        fmap = Fmap.load(tmp_fmap_dir)

        input_cls = model[config.MODEL.input_name]

        files_pattern = hdfs.hdfs_whole_path(
            Path(config.HDFS_TFRECORD_DIR).joinpath("part*").as_posix())
        files = tf.io.gfile.glob(files_pattern)
        dataset = input_cls(fmap).tfr_inputs(files)

        model_cls = model[config.MODEL.model_name]
        model_ins = model_cls(fmap)

        if tmp_ckpt_dir.joinpath("h5weights", "weights.h5").exists():
            logging.info(i18n("Loading model weight success."))
            model_ins.load_weights(tmp_ckpt_dir)

        model_ins.evaluate_act(dataset)

        logging.info(i18n("Evaluating done."))
Example #3
    def initialize_hdfs(self, sc=None):
        if self._status:
            logging.info(i18n("Using exists HDFS Handler."))
            return

        init_spark()
        from pyspark.sql import SparkSession
        from pyspark.context import SparkContext

        if sc is None:
            new_sc = SparkSession.builder \
                                 .appName("DLFlow.HDSF_Handler") \
                                 .master("local[*]") \
                                 .enableHiveSupport() \
                                 .getOrCreate() \
                                 .sparkContext
            logging.info(i18n("Using new local SparkContext."))
        else:
            new_sc = sc
            logging.info(i18n("Using presetting SparkContext."))

        if not isinstance(new_sc, SparkContext):
            raise TypeError(
                i18n("Expected type {} for 'sc', but got {}").format(
                    str(SparkContext), type(new_sc)))

        if new_sc._jsc is None:
            raise RuntimeError(i18n("SparkContext has already closed!"))

        hadoop_conf = new_sc._jsc.hadoopConfiguration()
        self._header = hadoop_conf.get("fs.default.name")

        self._fs = new_sc._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
        self._sc = new_sc
        self._status = True
Example #4
    def _vertices_info(vertices_set):
        info = i18n("\nVertices node info:")
        detail = i18n("\nDetail about neighbor:")

        if vertices_set is None:
            info += "\n{:>16}".format("None")
            detail += "\n{:>16}".format("None")
        else:
            for vtx in vertices_set:
                info += "\n{:>16}: {:<16} head: {:<2} tail: {:<2}".format(
                    vtx.name, vtx.node.__name__, vtx.in_degree, vtx.out_degree)

                head_vtx = []
                for in_edge in vtx.in_edges:
                    head_vtx.append(in_edge.head.name)

                tail_vtx = []
                for out_edge in vtx.out_edges:
                    tail_vtx.append(out_edge.tail.name)

                detail += "\n{:>16}: {:<16} head: [{}]  /  tail: [{}]".format(
                    vtx.name, vtx.node.__name__, ", ".join(head_vtx),
                    ", ".join(tail_vtx))

        return "{}{}".format(info, detail)
Example #5
    def safe_hdfs_path(self, base_dt, hdfs_patten, max_degrade=7):
        valid_dir, _ = self._degrade_hdfs(base_dt, hdfs_patten, max_degrade)

        if valid_dir is None:
            raise ValueError(
                i18n("Can't find valid partition on {}").format(hdfs_patten))

        logging.info(i18n("Find valid partition {}").format(valid_dir))
        return valid_dir
Example #6
        def _timeit(*args, **kwargs):
            logging.info(i18n("Start running: {}...").format(task_name))

            time_enter = perf_counter()
            func(*args, **kwargs)
            time_elapsed = perf_counter() - time_enter

            logging.info(
                i18n("Task <{}> spend time: {:>.6f}s").format(
                    task_name, time_elapsed))
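
_timeit above is the inner wrapper of a timing decorator: task_name and func come from the enclosing scope, which is not shown. A self-contained sketch of how such a decorator factory might look (the names timeit and demo_task are illustrative, not the project's actual API):

import logging
from functools import wraps
from time import perf_counter

def timeit(task_name):
    """Hypothetical decorator factory that logs a task's wall-clock time."""
    def _decorator(func):
        @wraps(func)
        def _timeit(*args, **kwargs):
            logging.info("Start running: %s...", task_name)
            time_enter = perf_counter()
            result = func(*args, **kwargs)
            time_elapsed = perf_counter() - time_enter
            logging.info("Task <%s> spent time: %.6fs", task_name, time_elapsed)
            return result
        return _timeit
    return _decorator

@timeit("demo")
def demo_task():
    return sum(range(1_000_000))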
Example #7
def env_version_check():
    import tensorflow
    import sys

    v_major = sys.version_info[0]
    v_minor = sys.version_info[1]
    assert (v_major == 3 and v_minor >= 6) or v_major > 3, \
        i18n("This program requires at least Python 3.6")

    assert tensorflow.__version__.startswith("2."), \
        i18n("This program require Tensorflow 2.0")
Example #8
    def _initialize(self):
        if self._dag_obj is None:
            raise RuntimeError(
                i18n("WorkflowDAG is not bound, please bind it first!"))

        if config.conf is None:
            raise RuntimeError(
                i18n("Bad configurations, please load it first!"))
        config.initialize()

        for vtx in self._dag_obj.DAG:
            self._tasks.append(vtx.node())
Example #9
    def _internal_collect(self):
        standard_task_dir = Path(DLFLOW_TASKS)
        internal_model_dir = Path(DLFLOW_MODELS)

        logging.info(
            i18n("Loading DLFlow standard task. From: {}").format(
                standard_task_dir))
        self._collect(standard_task_dir)

        logging.info(
            i18n("Loading DLFlow internal model. From: {}").format(
                internal_model_dir))
        self._collect(internal_model_dir)
Example #10
    def safe_hive_date(self,
                       base_dt,
                       hdfs_patten,
                       max_degrade=7,
                       fmt_dt="%Y%m%d"):
        _, valid_dt = self._degrade_hdfs(base_dt, hdfs_patten, max_degrade)

        if valid_dt is None:
            raise ValueError(
                i18n("Can't find valid partition on {}").format(hdfs_patten))

        valid_dt = valid_dt.strftime(fmt_dt)
        logging.info(i18n("Find valid hive date {}").format(valid_dt))
        return valid_dt
Example #11
    def initialize_spark(self, app_name=None, spark_conf=None, reuse=False):
        if self._status and not reuse:
            logging.info(i18n("Using exists spark session."))
            return
        elif self._status and reuse:
            self.close()

        if self._hdfs.status:
            self._hdfs.close()

        init_spark()
        from pyspark.sql import SparkSession

        _spark = SparkSession.builder

        if app_name is not None:
            _spark = _spark.appName(app_name)
            logging.info(i18n("Initializing Spark App: {}").format(app_name))

        if isinstance(spark_conf, dict):
            spark_conf = spark_conf.copy()
            self.spark_conf = spark_conf.copy()

            if "spark.executorEnv.PYSPARK_PYTHON" in spark_conf:
                os.environ["PYSPARK_PYTHON"] = \
                    spark_conf["spark.executorEnv.PYSPARK_PYTHON"]

            _fmt_conf_str = "{\n"
            for k, v in spark_conf.items():
                _fmt_conf_str += "{}{}: {}\n".format(" " * 4, k, v)
            _fmt_conf_str += "}\n"
            logging.info(
                i18n("Using spark config:\n{s}").format(s=_fmt_conf_str))

            if "master" in spark_conf:
                _spark = _spark.master(spark_conf["master"])
                _ = spark_conf.pop("master")

            for conf_key, conf_value in spark_conf.items():
                _spark = _spark.config(conf_key, conf_value)

        self._spark = _spark.enableHiveSupport().getOrCreate()
        self._sc = self._spark.sparkContext
        self.app_name = app_name

        self._hdfs.initialize_hdfs(sc=self._sc)
        self._status = True

        logging.info(i18n("Spark Version: {}").format(self._spark.version))
Example #12
    def __getreg__(cls, key):
        if key in cls.__MASKS__:
            reg_key = cls.__MASKS__[key]
        else:
            raise KeyError(i18n("Unknown key '{}'").format(key))

        return cls.__REGS__[reg_key]
Example #13
    def _collect(self, src_dir):
        src_dir = Path(src_dir).resolve()

        for cur, _, files in os.walk(src_dir):
            logging.debug(i18n("Scanning directory: {}").format(cur))

            if files:
                sys.path.append(cur)

                for file_name in files:
                    for module_name in self.module_pattern.findall(file_name):
                        _ = __import__(module_name)
                        logging.info(
                            i18n("  * Loading module: {}").format(module_name))

                sys.path.remove(cur)
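
Each matching file name is imported as a module while its directory sits temporarily on sys.path. The actual self.module_pattern is defined elsewhere; the sketch below uses a hypothetical pattern only to illustrate the findall() step:

import re

# Hypothetical pattern; the real self.module_pattern is not shown above.
module_pattern = re.compile(r"^([A-Za-z_]\w*)\.py$")

for file_name in ["task_train.py", "__init__.py", "notes.txt"]:
    for module_name in module_pattern.findall(file_name):
        print(module_name)  # prints: task_train, __init__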
Example #14
    def _topology_sort(self):
        vertices_set = set(self._vertices.values())
        for vtx in vertices_set:
            vtx.tmp_in_degree = vtx.in_degree

        sorted_vertices = []
        while vertices_set:
            _kick_set = set()
            for vtx in vertices_set:
                if vtx.tmp_in_degree == 0:
                    for edge in vtx.out_edges:
                        edge.tail.tmp_in_degree -= 1
                    sorted_vertices.append(vtx)
                    _kick_set.add(vtx)
                    del vtx.tmp_in_degree

            if not _kick_set:
                _safe_set = set()
                for vtx in vertices_set:
                    if vtx.out_degree == 0 or vtx.tmp_in_degree == 0:
                        _safe_set.add(vtx)
                vertices_set -= _safe_set

                err_info = i18n(
                    "Can't build DAG, circular references "
                    "are found, please check it!\n{}") \
                    .format(self._vertices_info(vertices_set))

                logging.error(err_info)
                raise RuntimeError(err_info)

            vertices_set -= _kick_set

        return sorted_vertices
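
The method above is Kahn's algorithm: each round emits the vertices whose remaining in-degree is zero, and an empty round means a cycle. The same idea on a plain adjacency dict, as a minimal standalone sketch:

def topo_sort(adjacency):
    """Kahn's algorithm on {node: [successors]}; raises on cycles."""
    in_degree = {node: 0 for node in adjacency}
    for successors in adjacency.values():
        for node in successors:
            in_degree[node] += 1

    remaining = set(adjacency)
    ordered = []
    while remaining:
        ready = {node for node in remaining if in_degree[node] == 0}
        if not ready:
            raise RuntimeError("Circular references: {}".format(remaining))
        for node in ready:
            for successor in adjacency[node]:
                in_degree[successor] -= 1
        ordered.extend(sorted(ready))
        remaining -= ready
    return ordered

print(topo_sort({"a": ["b", "c"], "b": ["c"], "c": []}))  # ['a', 'b', 'c']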
Example #15
 def bucket(self, bucket):
     if isinstance(bucket, FeBucket):
         self._bucket = bucket
     else:
         raise TypeError(
             i18n("Parameter type expected {}, but got {}").format(
                 FeBucket, type(bucket)))
Example #16
 def field(self, field):
     if isinstance(field, FeField):
         self._field = field
     else:
         raise TypeError(
             i18n("Parameter type expected {}, but got {}").format(
                 FeField, type(field)))
Example #17
    def initialize(self,
                   file=None,
                   udc=None,
                   log_level=None,
                   log_dir=None,
                   lang=None,
                   mode="local"):

        config.load(file, udc)

        if lang:
            self._lang = lang
        elif "LANG" in os.environ:
            self._lang = os.environ["LANG"].split(".")[0].split("_")[0]

        i18n.initialize(self._lang)

        if log_level:
            self._log_level = log_level
        elif "LOG_LEVEL" in config:
            self._log_level = config.LOG_LEVEL

        if self._log_level:
            set_logging_level(self._log_level)
            logging_info = get_logging_info_str()
            logging.warning(i18n("Logging level is changed to '{}'")
                            .format(self._log_level)
                            .join("", logging_info))

        if log_dir:
            self._log_dir = log_dir
        elif "LOG_DIR" in config:
            self._log_dir = config.LOG_DIR

        if self._log_dir:
            log_name = Path(file).parts[-1].split(".")[0]
            set_logging_writer(log_name, self._log_dir)
            logging.info(i18n("Start writing log to {}").format(self._log_dir))

        collect = Collector()
        for key, desc in [("TASKS_DIR", "UDT"),
                          ("MODELS_DIR", "Models")]:
            if key in config:
                collect(config[key], desc)

        self._steps = config.STEPS
        self._mode = mode.lower()
Example #18
    def run(self):
        engine_cls = ENGINES[self._mode]
        engine = engine_cls()

        if isinstance(engine, BaseEngine):
            engine.run(self._steps)

        logging.info(i18n("All task done."))
Example #19
 def load(self, fmap_dir):
     fmap = Fmap.load(fmap_dir)
     if isinstance(fmap, Fmap):
         self._fmap = fmap
     else:
         raise TypeError(
             i18n("Parameter type expected {}, but got {}").format(
                 Fmap, type(fmap)))
Example #20
    def initialize(self):
        outer_conf = self._conf.copy()

        default_conf = self.load_regs()

        self._conf = default_conf.replace(self._conf)

        self._solver()
        logging.info(i18n("All parsed configurations:\n{}").format(self._conf))

        if hasattr(self._cfg_mgr, LEVEL_REQUIRE_STR):
            for req_key in self._cfg_mgr[LEVEL_REQUIRE_STR]:
                if req_key not in outer_conf:
                    logging.warning(
                        i18n("Level-REQUIRE key '{}' has been set, "
                             "but not provide in configurations.").format(
                                 req_key))
Example #21
    def __getconf__(self, key):
        if self._conf is None:
            raise NotInitializeError(i18n("ConfigTree is not set!"))

        if key not in self._conf:
            raise KeyError(key)

        return self._conf[key]
Example #22
def init_spark():
    try:
        import pyspark

    except ModuleNotFoundError:

        try:
            logging.warning(
                i18n("No model named 'pyspark'. "
                     "Trying to call 'findspark'..."))
            import findspark
            findspark.init()
            logging.info(i18n("Loading pyspark success."))

        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                i18n("Can not find model 'pysaprk' or 'findspark'."))
Example #23
    def __iter__(self):
        if self._conf is None:
            raise NotInitializeError(i18n("ConfigTree is not set!"))

        def _generator():
            for k, v in self._conf.dense_dict.items():
                yield (k, v)

        return _generator()
Example #24
 def bind_workflow(self, param):
     if isinstance(param, (str, )):
         self._dag_obj = WorkflowDAG(param)
     elif isinstance(param, (WorkflowDAG, )):
         self._dag_obj = param
     else:
         raise TypeError(
             i18n("Parameter 'param' expect type are 'str', "
                  "'WorkflowDAG' or 'None', but find '{}'!").format(
                      type(param)))
Example #25
    def path(self, path):
        if not self._status:
            raise NotInitializeError(
                i18n("Can not access HDFS! Because "
                     "SparkContext wasn't initialize!"))

        path = Path(path).as_posix()
        jvm_hdfs_path = self._sc._jvm.org.apache.hadoop.fs.Path(path)

        return jvm_hdfs_path
Example #26
    def run(self):
        fit = config.MERGE.fit

        seed_sql = config.MERGE.seed_sql

        feature_config_file = Path(config.MERGE.config_file).resolve()

        if fit:
            assert feature_config_file.is_file(), \
                i18n("Input feature configuration is not exists "
                     "when fit=true: {}") \
                .format(feature_config_file)

        _spark_conf = config.SPARK.dense_dict
        params = []
        for k, v in _spark_conf.items():
            if k.startswith("spark."):
                _param = '--conf "{}={}"'.format(k, v)
            else:
                _param = '--{} "{}"'.format(k, v)

            params.append(_param)

        spark_conf = "\n".join(params)

        _model_dir = Path(config.HDFS_FEMODEL_DIR)
        hdfs_merge_dir = _model_dir.joinpath("merge").as_posix()

        out_dir = Path(config.RAW_FEATURES).as_posix()

        spark_submit = SparkJARExec(spark_conf=spark_conf,
                                    seed_sql=seed_sql,
                                    feature_config=feature_config_file,
                                    feature_date=config.FEATURE_DATE,
                                    feature_model_hdfs=hdfs_merge_dir,
                                    feature_out_hdfs=out_dir,
                                    fit=fit)

        spark_submit.run()

        logging.info(i18n("Feature merge done."))
Example #27
    def add_feature(self, feature: Feature):
        if isinstance(feature, Feature):
            feature.field = self
            feature.set_offset(self._offset)

            self._offset += feature.size
            self._container[feature.name] = feature
            self._mcs = self._update_mcs(feature.mcs_dec)
        else:
            raise TypeError(
                i18n("Parameter type expected {}, but got {}").format(
                    Feature, type(feature)))
Example #28
    def _solver(self):
        if self._conf is None:
            raise ValueError()

        dense_dict = self._conf.dense_dict
        total_key_set = set([i for i in dense_dict.keys()])

        G = {}
        for k in total_key_set:
            G[k] = {"tail": [], "tpl": None, "in": 0}

        for cur_key, cur_value in dense_dict.items():
            if isinstance(cur_value, (str, )):
                conf_template = ConfigTemplate(cur_value)
                G[cur_key]["tpl"] = conf_template
                G[cur_key]["in"] = len(conf_template.vars)
                for head_key in conf_template.vars:
                    G[head_key]["tail"].append(cur_key)

        sort_keys = []
        while total_key_set:
            _kick_set = set()

            for key in total_key_set:
                _ref_key = G[key]
                if _ref_key["in"] == 0:
                    for tail_key in _ref_key["tail"]:
                        G[tail_key]["in"] -= 1
                    sort_keys.append(key)
                    _kick_set.add(key)

            if not _kick_set:
                err_info = i18n(
                    "Circular references are found in the "
                    "following keys, please check it!\n{}") \
                    .format(total_key_set)
                logging.error(err_info)
                raise RuntimeError(err_info)

            total_key_set -= _kick_set

        for key in sort_keys:
            if G[key]["tpl"]:
                conf_template = G[key]["tpl"]

                kws = {}
                for _k in conf_template.vars:
                    kws[_k] = dense_dict[_k]

                dense_dict[key] = conf_template.render(kws)

        self._conf = self._cfg_loader.load(dense_dict, "dict")
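
The solver treats every string value as a template whose referenced keys must be rendered first, so values are resolved in dependency order. A standalone sketch of that idea, assuming ${KEY}-style placeholders (the real ConfigTemplate syntax may differ):

import re
from string import Template

# Hypothetical flattened configuration with ${KEY}-style references.
dense_dict = {
    "BASE_DIR": "/data/dlflow",
    "MODEL_DIR": "${BASE_DIR}/models",
    "CKPT_DIR": "${MODEL_DIR}/ckpt",
}

placeholder = re.compile(r"\$\{(\w+)\}")

resolved = {}
pending = dict(dense_dict)
while pending:
    progressed = False
    for key, value in list(pending.items()):
        refs = placeholder.findall(value) if isinstance(value, str) else []
        if all(ref in resolved for ref in refs):
            if refs:
                value = Template(value).substitute(resolved)
            resolved[key] = value
            del pending[key]
            progressed = True
    if not progressed:
        raise RuntimeError("Circular references: {}".format(set(pending)))

print(resolved["CKPT_DIR"])  # /data/dlflow/models/ckpt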
Example #29
    def __init__(self, parser_name):
        self._parser = None
        self._normalizer = None

        initializer = {
            "spark": self._spark_parser_init
        }.get(parser_name.lower(), None)

        if initializer is None:
            raise ParserNotExists(
                i18n("Unknown feature parser '{}'").format(parser_name))

        initializer()
Example #30
    def hdfs_whole_path(self, path, header=None):
        if header is None:
            prefix = self._header.strip()
        else:
            prefix = header.strip()

        if path[0] == "/":
            new_path = "".join([prefix, path])
        elif prefix == path[:len(prefix)]:
            new_path = path
        else:
            raise ValueError(i18n("Illegal HDFS Path: {}").format(path))

        return new_path
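
Hypothetical usage of hdfs_whole_path, restated as a standalone function and assuming the handler's header was read from fs.default.name as 'hdfs://nameservice1':

def hdfs_whole_path(path, header="hdfs://nameservice1"):
    """Standalone restatement of the method above, for illustration only."""
    prefix = header.strip()
    if path[0] == "/":
        return "".join([prefix, path])
    elif prefix == path[:len(prefix)]:
        return path
    raise ValueError("Illegal HDFS Path: {}".format(path))

print(hdfs_whole_path("/user/dlflow/tfrecord/part*"))
# -> hdfs://nameservice1/user/dlflow/tfrecord/part*
print(hdfs_whole_path("hdfs://nameservice1/user/dlflow"))
# -> hdfs://nameservice1/user/dlflow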