def __init__(self, args, args2):
    """
    Spark version for reading data from HDFS and then writing to HIVE
    @param args: dict
        inputUrl: String
        outputUrl: String
    """
    self.originalDF = None
    self.outputUrl1 = args["output"][0]["value"]
    self.type = args["param"]["type"]
    self.inputUrl1 = args["param"]["path"]
    self.DF = None
    self.dataUtil = utils.dataUtil(args2)
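# A minimal sketch of the args dict this constructor reads, assembled only from the
# keys accessed above; the concrete values (table name, type string, HDFS path) are
# hypothetical placeholders, not values taken from a real job configuration.
example_reader_args = {
    "output": [{"value": "demo_db.demo_table"}],                 # -> self.outputUrl1
    "param": {
        "type": "csv",                                           # -> self.type
        "path": "hdfs://namenode:8020/user/demo/input.csv",      # -> self.inputUrl1
    },
}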
def __init__(self, args, args2):
    """
    Standalone version for training a model
    @param args: dict
        featureCols: list
        labelCol: String
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.inputUrl1 = args["input"][0]["value"]
    self.inputUrl2 = args["input"][1]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for initializing RandomForest regressor
    @param args: dict
        n_estimators: int
        criterion: string, one of "mse" and "mae"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for evaluating clustering model
    @param args: dict
        featureCols: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.featureCols = args["param"]["features"]
    self.originalDF = None
    self.metric_df = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for evaluating regressor
    @param args: dict
        featureCols: list
        labelCol: string
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.labelCol = args["param"]["label"]
    self.originalDF = None
    self.metric_df = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for splitting data, either byRatio or byThreshold
    @param args: dict
        splitBy: string, one of "byRatio" and "byThreshold"
        ratio: double
        thresholdColumn: string
        threshold: double
    """
    self.originalDF = None
    self.DF1 = None
    self.DF2 = None
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = args["output"][1]["value"]
    self.param = args["param"]
    self.dataUtil = utils.dataUtil(args2)
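# Hedged sketches of the two args["param"] shapes the split component accepts,
# inferred from the docstring above; the ratio, column name and threshold values
# are illustrative assumptions only.
split_by_ratio_param = {"splitBy": "byRatio", "ratio": 0.8}
split_by_threshold_param = {"splitBy": "byThreshold",
                            "thresholdColumn": "age",            # hypothetical column
                            "threshold": 30.0}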
def __init__(self, args, args2):
    """
    Spark version for evaluating clustering model
    @param args: dict
        featureCols: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.inputUrl2 = args["input"][1]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.originalDF = None
    self.model = None
    self.result = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Python version for conducting PCA on the input dataset
    @param args: dict
        inputUrl1: String
        outputUrl1: String
        columns: list
        k: int
    """
    self.originalDF = None
    self.transformDF = None
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.columns = args["param"]["columns"]
    self.k = args["param"]["k"]
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for initializing KMeans clustering
    @param args: dict
        K: int
        init: string, one of "k-means++" and "random"
        n_init: int
        max_iter: int
        tol: float
    """
    # init logging
    self.logger = logging.getLogger(self.__class__.__name__)
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
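# Illustrative args["param"] for the KMeans constructor above; key names follow the
# docstring and the values are typical scikit-learn-style settings, assumed here
# rather than taken from the project.
kmeans_param_example = {"K": 8, "init": "k-means++", "n_init": 10,
                        "max_iter": 300, "tol": 1e-4}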
def __init__(self, args, args2):
    """
    Standalone version for making predictions with classification and regression algorithms
    @param args: dict
        featureCols: list
        labelCol: string
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.inputUrl2 = args["input"][1]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.featureCols = args["param"]["features"]
    self.originalDF = None
    self.transformDF = None
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
def __init__(self, args, args2):
    """
    Standalone version for type transformation
    @param args: dict
        toDoubleColumns: list
        defaultDoubleValue: double
        toIntColumns: list
        defaultIntValue: int
        toCategoricalColumns: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformedDF = None
    self.paramDF = None
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = self.outputUrl1 + "toCategorical"
    self.param = args["param"]
    # optional parameters fall back to defaults when absent
    self.toDoubleColumns = self.param.get("toDouble")
    self.defaultDoubleValue = self.param.get("defaultDoubleValue", 0.0)
    self.toIntColumns = self.param.get("toInt")
    self.defaultIntValue = self.param.get("defaultIntValue", 0)
    self.toCategoricalColumns = self.param.get("toCategoricalColumns")
    self.mode = self.param["mode"]
    self.dataUtil = utils.dataUtil(args2)
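# Sketch of a possible args["param"] for the type-transformation component, covering
# the optional keys handled above; the column names, defaults and mode value are
# hypothetical placeholders.
type_transform_param_example = {
    "toDouble": ["price", "weight"],
    "defaultDoubleValue": 0.0,
    "toInt": ["age"],
    "defaultIntValue": 0,
    "toCategoricalColumns": ["gender"],
    "mode": "fit",   # assumed value; the full set of supported modes is not shown here
}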
def __init__(self, args, args2):
    """
    Standalone version for initializing and training auto-sklearn model
    @param args: dict
        time_limit: int
        time_per_model: int
        ensemble_size: int
        metric: string, one of "roc_auc", "accuracy", "precision" and "f1"
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.originalDF = None
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
    current_time = str(time.time())
    self.tmp_folder = 'tmp/auto_regression_' + current_time + "_tmp"
    self.out_folder = 'tmp/auto_regression_' + current_time + "_out"
def __init__(self, args, args2):
    """
    Spark version for initializing RandomForest multi-class classifier
    @param args: dict
        n_estimators: int
        criterion: string, one of "gini" and "entropy"
        max_depth: int
        min_samples_split: int
        min_samples_leaf: int
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.outputUrl1 = args["output"][0]["value"]
    self.param = args["param"]
    self.model = None
    self.dataUtil = utils.dataUtil(args2)
    self.logger.info("initializing SparkSession")
    self.spark = utils.init_spark()
def __init__(self, args, args2):
    """
    Standalone version for evaluating binary classifier
    @param args: dict
        featureCols: list
        labelCol: string
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.inputUrl1 = args["input"][0]["value"]
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = args["output"][1]["value"]
    self.labelCol = args["param"]["label"]
    try:
        self.posLabel = args["param"]["posLabel"]
    except Exception:
        self.posLabel = None
    self.originalDF = None
    self.metric_df = None
    self.roc_pr = None
    self.dataUtil = utils.dataUtil(args2)
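# Hedged example of args["param"] for the binary-classifier evaluator above: "label"
# is required, "posLabel" is optional and defaults to None when absent. Both values
# below are placeholders.
binary_eval_param_example = {"label": "is_churn", "posLabel": 1}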
def __init__(self, args, args2):
    """
    Standalone version for normalizing data, including minmax and zscore
    @param args: dict
        columns: list
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.originalDF = None
    self.transformedDF = None
    self.parameterDF = None
    self.inputUrl1 = args["input"][0]["value"]
    try:
        self.inputUrl2 = args["input"][1]["value"]
    except IndexError:
        self.inputUrl2 = ""
    self.outputUrl1 = args["output"][0]["value"]
    self.outputUrl2 = args["output"][1]["value"]
    try:
        self.columns = args["param"]["columns"]
    except KeyError:
        self.columns = None
    self.param = args["param"]
    self.dataUtil = utils.dataUtil(args2)
modelPath = args["input"][0]["value"] if args["input"][0]["type"] == "model" else None
chosenClass = None
if classType == "Py.Distributed":
    chosenClass = getattr(importlib.import_module("idsw-spark." + moduleName), className)
elif classType == "Py.Standalone":
    chosenClass = getattr(importlib.import_module("idsw." + moduleName), className)
elif classType.startswith("Py"):
    # For ML components that do not specify standalone or distributed mode: if the
    # input contains a machine-learning model, determine whether it is a standalone
    # or a distributed model and handle each case accordingly
    if modelPath is not None:
        import joblib
        try:
            hdfs = utils.dataUtil(args2)._get_HDFS_connection()
            with hdfs.open(modelPath.replace("hdfs://", "")) as reader:
                joblib.load(reader)
            chosenClass = getattr(importlib.import_module("idsw." + moduleName), className)
        except OSError as e:
            # if str(e) == "Could not open file: %s, mode: rb File does not exist: %s" % (modelPath, modelPath):
            chosenClass = getattr(importlib.import_module("idsw-spark." + moduleName), className)
    else:
        # If no machine-learning model is involved, default to the standalone module
        chosenClass = getattr(importlib.import_module("idsw." + moduleName), className)
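# Rough usage sketch for the dispatch fragment above, assuming the surrounding
# runner has already parsed classType, moduleName, className, args and args2 from
# the job description; the concrete values shown are hypothetical.
# classType = "Py.Standalone"
# moduleName = "preprocessing"          # hypothetical module name
# className = "TypeTransform"           # hypothetical class name
# component = chosenClass(args, args2)  # instantiate the resolved component class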