Example #1
    def __init__(self, args, args2):
        """
        Spark version for reading data from HDFS and then writing to Hive
        @param args: dict
        inputUrl: String
        outputUrl: String
        """
        self.originalDF = None
        self.outputUrl1 = args["output"][0]["value"]

        self.type = args["param"]["type"]
        self.inputUrl1 = args["param"]["path"]
        self.DF = None

        self.dataUtil = utils.dataUtil(args2)
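
For orientation, here is a minimal sketch of how a component like the one above might be constructed. The nesting of the args dict is inferred from the attributes read in these snippets; the concrete values, paths, and the contents of args2 are assumptions, not taken from the project itself.

# Hypothetical args/args2; the nesting mirrors the keys read in the snippets
# above (args["output"][0]["value"], args["param"]["type"], args["param"]["path"]).
args = {
    "input": [],
    "output": [{"value": "/user/idsw/output1"}],
    "param": {"type": "csv", "path": "hdfs:///user/idsw/input1"},
}
args2 = {}  # assumed to carry connection settings consumed by utils.dataUtil

# component = SomeComponent(args, args2)  # "SomeComponent" is a placeholder class name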
Example #2
    def __init__(self, args, args2):
        """
        Standalone version for training model
        @param args: dict
        featureCols: list
        labelCol: String
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.originalDF = None
        self.inputUrl1 = args["input"][0]["value"]
        self.inputUrl2 = args["input"][1]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None
        self.dataUtil = utils.dataUtil(args2)
Example #3
    def __init__(self, args, args2):
        """
        Standalone version for initializing RandomForest regressor
        @param args: dict
        n_estimators: int
        criterion: string one of "mse" and "mae"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.dataUtil = utils.dataUtil(args2)
Example #4
    def __init__(self, args, args2):
        """
        Standalone version for evaluating clustering model
        @param args: dict
        featureCols: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.featureCols = args["param"]["features"]

        self.originalDF = None
        self.metric_df = None

        self.dataUtil = utils.dataUtil(args2)
Example #5
    def __init__(self, args, args2):
        """
        Standalone version for evaluating regressor
        @param args: dict
        featureCols: list
        labelCol: string
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.labelCol = args["param"]["label"]
        self.originalDF = None
        self.metric_df = None

        self.dataUtil = utils.dataUtil(args2)
Example #6
    def __init__(self, args, args2):
        """
        Standalone version for splitting data, including byRatio and byThreshold
        @param args: dict
        splitBy: string one of byRatio and byThreshold
        ratio: double
        thresholdColumn: string
        threshold: double
        """
        self.originalDF = None
        self.DF1 = None
        self.DF2 = None
        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]
        self.param = args["param"]
        self.dataUtil = utils.dataUtil(args2)
Example #7
    def __init__(self, args, args2):
        """
        Spark version for evaluating clustering model
        @param args: dict
        featureCols: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.inputUrl2 = args["input"][1]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]

        self.originalDF = None
        self.model = None
        self.result = None

        self.dataUtil = utils.dataUtil(args2)
Example #8
    def __init__(self, args, args2):
        """
        Python version for conducting PCA on the input dataset
        @param args: dict
        inputUrl1: String
        outputUrl1: String
        columns: list
        k: int
        """
        self.originalDF = None
        self.transformDF = None
        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]

        self.columns = args["param"]["columns"]
        self.k = args["param"]["k"]

        self.dataUtil = utils.dataUtil(args2)
Example #9
    def __init__(self, args, args2):
        """
        Standalone version for initializing KMeans clustering
        @param args: dict
        K: int
        init: string one of "k-means++" and "random"
        n_init: int
        max_iter: int
        tol: float
        """
        # init logging
        self.logger = logging.getLogger(self.__class__.__name__)

        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.dataUtil = utils.dataUtil(args2)
Example #10
    def __init__(self, args, args2):
        """
        Standalone version for prediction with classification and regression algorithms
        @param args: dict
        featureCols: list
        labelCol: string
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.inputUrl2 = args["input"][1]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.featureCols = args["param"]["features"]

        self.originalDF = None
        self.transformDF = None
        self.model = None

        self.dataUtil = utils.dataUtil(args2)
Example #11
    def __init__(self, args, args2):
        """
        Standalone version for type transformation
        @param args: dict
        toDoubleColumns: list
        defaultDoubleValue: double
        toIntColumns: list
        defaultIntValue: int
        toCategoricalColumns: list
        mode: string
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.transformedDF = None
        self.paramDF = None

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = self.outputUrl1 + "toCategorical"
        self.param = args["param"]
        try:
            self.toDoubleColumns = self.param["toDouble"]
        except KeyError:
            self.toDoubleColumns = None
        try:
            self.defaultDoubleValue = self.param["defaultDoubleValue"]
        except KeyError:
            self.defaultDoubleValue = 0.0
        try:
            self.toIntColumns = self.param["toInt"]
        except KeyError:
            self.toIntColumns = None
        try:
            self.defaultIntValue = self.param["defaultIntValue"]
        except KeyError:
            self.defaultIntValue = 0
        try:
            self.toCategoricalColumns = self.param["toCategoricalColumns"]
        except KeyError:
            self.toCategoricalColumns = None

        self.mode = self.param["mode"]

        self.dataUtil = utils.dataUtil(args2)
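
The repeated try/except KeyError blocks above read optional keys from args["param"] and fall back to defaults. A more compact sketch of the same default handling using dict.get, with the same key names and defaults as the snippet; this is an alternative idiom, not the project's own code, and the example param dict is hypothetical.

# Hypothetical param dict; key names mirror the snippet above.
param = {"toDouble": ["price"], "defaultDoubleValue": 0.0, "mode": "fit"}

toDoubleColumns = param.get("toDouble")                    # None if the key is absent
defaultDoubleValue = param.get("defaultDoubleValue", 0.0)  # same default as above
toIntColumns = param.get("toInt")
defaultIntValue = param.get("defaultIntValue", 0)
toCategoricalColumns = param.get("toCategoricalColumns")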
Example #12
    def __init__(self, args, args2):
        """
        Standalone version for initializing and training auto-sklearn model
        @param args: dict
        time_limit: int
        time_per_model: int
        ensemble_size: int
        metric: string one of "roc_auc", "accuracy", "precision" and "f1"
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.originalDF = None
        self.model = None
        self.dataUtil = utils.dataUtil(args2)
        current_time = str(time.time())
        self.tmp_folder = 'tmp/auto_regression_' + current_time + "_tmp"
        self.out_folder = 'tmp/auto_regression_' + current_time + "_out"
Example #13
    def __init__(self, args, args2):
        """
        Spark version for initializing RandomForest multi-class classifier
        @param args: dict
        n_estimators: int
        criterion: string one of "gini" and "entropy"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.dataUtil = utils.dataUtil(args2)

        self.logger.info("initializing SparkSession")

        self.spark = utils.init_spark()
Example #14
    def __init__(self, args, args2):
        """
        Standalone version for evaluating binary classifier
        @param args: dict
        featureCols: list
        labelCol: string
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.inputUrl1 = args["input"][0]["value"]
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]
        self.labelCol = args["param"]["label"]
        try:
            self.posLabel = args["param"]["posLabel"]
        except Exception:
            self.posLabel = None
        self.originalDF = None
        self.metric_df = None
        self.roc_pr = None

        self.dataUtil = utils.dataUtil(args2)
Example #15
    def __init__(self, args, args2):
        """
        Standalone version for normalizing data, including minmax and zscore
        @param args: dict
        columns: list
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.originalDF = None
        self.transformedDF = None
        self.parameterDF = None
        self.inputUrl1 = args["input"][0]["value"]
        try:
            self.inputUrl2 = args["input"][1]["value"]
        except IndexError:
            self.inputUrl2 = ""
        self.outputUrl1 = args["output"][0]["value"]
        self.outputUrl2 = args["output"][1]["value"]
        try:
            self.columns = args["param"]["columns"]
        except KeyError:
            self.columns = None
        self.param = args["param"]
        self.dataUtil = utils.dataUtil(args2)
Example #16
    modelPath = args["input"][0]["value"] if args["input"][0]["type"] == "model" else None
    chosenClass = None

    if classType == "Py.Distributed":
        chosenClass = getattr(
            importlib.import_module("idsw-spark." + moduleName), className)
    elif classType == "Py.Standalone":
        chosenClass = getattr(importlib.import_module("idsw." + moduleName),
                              className)
    elif classType.startswith("Py"):
        # For ML components that do not specify standalone or distributed mode: if the input
        # parameters include an ML model, determine whether it is a standalone or a distributed
        # model and handle each case separately
        if modelPath is not None:
            import joblib
            try:
                hdfs = utils.dataUtil(args2)._get_HDFS_connection()
                with hdfs.open(modelPath.replace("hdfs://", "")) as reader:
                    joblib.load(reader)
                    chosenClass = getattr(
                        importlib.import_module("idsw." + moduleName),
                        className)
            except OSError as e:
                # if str(e) == "Could not open file: %s, mode: rb File does not exist: %s" % (modelPath, modelPath):
                chosenClass = getattr(
                    importlib.import_module("idsw-spark." + moduleName),
                    className)
        # If no ML model is involved, default to the standalone module
        else:
            chosenClass = getattr(
                importlib.import_module("idsw." + moduleName), className)
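
The snippet stops once chosenClass has been resolved. A minimal sketch of what presumably follows, where the instantiation call is inferred from the constructors shown in the earlier examples and everything else is an assumption:

# Hypothetical continuation of the dispatch above: instantiate whichever class
# was resolved with the same args/args2 dicts the constructors expect. Any
# further lifecycle calls (loading input, running, writing output) are not
# shown in the source snippet.
if chosenClass is None:
    raise ValueError("unsupported classType: %s" % classType)
component = chosenClass(args, args2)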