def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'correctTypos' builtin.

    :param strings: Frame of strings to be corrected
    :param frequency_threshold: Strings that occur above this frequency level will not be corrected
    :param distance_threshold: Max distance at which strings are considered similar
    :param is_verbose: Print debug information
    :return: 'OperationNode' (MultiReturn) containing the five outputs of correctTypos
    """
    args = {'strings': strings, **kwargs}
    ctx = strings.sds_context
    outputs = [
        Frame(ctx, ''),
        Scalar(ctx, ''),
        Scalar(ctx, ''),
        Matrix(ctx, ''),
        Frame(ctx, ''),
    ]
    op = MultiReturn(ctx, 'correctTypos', outputs, named_input_nodes=args)
    # wire each output back to the multi-return op it originates from
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix, params: List, paramRanges: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'hyperband' builtin.

    :param X_train: training features
    :param y_train: training labels
    :param X_val: validation features
    :param y_val: validation labels
    :param params: hyper-parameter names
    :param paramRanges: per hyper-parameter, first column specifies min, second column max value
    :param verbose: if TRUE print messages are activated
    :return: 'OperationNode' (MultiReturn) containing the two outputs of hyperband
    """
    args = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'params': params,
        'paramRanges': paramRanges,
        **kwargs,
    }
    ctx = X_train.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'hyperband', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def gridSearch(X: Matrix, y: Matrix, train: str, predict: str, params: List, paramValues: List, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'gridSearch' builtin.

    :param X: input features
    :param y: input labels
    :param train: name ft of the train function to call via ft(trainArgs)
    :param predict: name fp of the loss function to call via fp((predictArgs,B))
    :param params: names of the hyper-parameters to search over
    :param paramValues: candidate values (column vectors) per hyper-parameter
    :param cv: flag enabling k-fold cross validation, otherwise training loss
    :param cvk: if cv=TRUE, specifies the number of folds, otherwise ignored
    :param verbose: flag for verbose debug output
    :return: 'OperationNode' (MultiReturn); the best model is returned as a
        column-major linearized column vector
    """
    args = {
        'X': X,
        'y': y,
        'train': train,
        'predict': predict,
        'params': params,
        'paramValues': paramValues,
        **kwargs,
    }
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'gridSearch', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix, params: Iterable, paramRanges: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'hyperband' builtin.

    :param X_train: training features
    :param y_train: training labels
    :param X_val: validation features
    :param y_val: validation labels
    :param params: hyper-parameter names
    :param paramRanges: per hyper-parameter min (col 1) and max (col 2) values
    :return: 'OperationNode' (MultiReturn) containing the two outputs of hyperband
    """
    args = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'params': params,
        'paramRanges': paramRanges,
        **kwargs,
    }
    ctx = X_train.sds_context
    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    op = MultiReturn(ctx, 'hyperband', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def __getitem__(self, i) -> 'Frame':
    """Index/slice this frame by rows and/or columns.

    Supported forms: an int/slice (and (row, col) tuples thereof), a list of
    row indices, or a tuple of an empty row slice and a list of column
    indices. Indices in lists are 0-based and must be non-negative.

    :param i: index, slice, list of indices, or a 2-tuple thereof
    :return: a new 'Frame' representing the selection
    :raises ValueError: if more than two dimensions are indexed
    :raises NotImplementedError: if both tuple elements are lists
    """
    if isinstance(i, tuple) and len(i) > 2:
        raise ValueError("Maximum of two dimensions are allowed")
    elif isinstance(i, list):
        check_no_less_than_zero(i)
        # renamed from 'slice' to avoid shadowing the builtin
        row_idx = self.sds_context.from_numpy(np.array(i)) + 1
        select = Matrix(self.sds_context, "table",
                        [row_idx, 1, self.nRow(), 1])
        # keep only the requested rows via removeEmpty with a selection vector
        ret = Frame(self.sds_context, "removeEmpty", [], {
            'target': self,
            'margin': '"rows"',
            'select': select
        })
        return ret
    elif isinstance(i, tuple) and isinstance(i[0], list) and isinstance(
            i[1], list):
        raise NotImplementedError("double slicing is not supported yet")
    elif isinstance(i, tuple) and check_is_empty_slice(
            i[0]) and isinstance(i[1], list):
        check_no_less_than_zero(i[1])
        col_idx = self.sds_context.from_numpy(np.array(i[1])) + 1
        select = Matrix(self.sds_context, "table",
                        [col_idx, 1, self.nCol(), 1])
        ret = Frame(self.sds_context, "removeEmpty", [], {
            'target': self,
            'margin': '"cols"',
            'select': select
        })
        return ret
    else:
        # fall back to a DML bracket-slice expression, e.g. X[1:3, 2]
        sliceIns = get_slice_string(i)
        return Frame(self.sds_context, '', [self, sliceIns], brackets=True)
def winsorize(X: Matrix, verbose: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'winsorize' builtin.

    :param X: input matrix
    :param verbose: to print output on screen
    :return: 'OperationNode' (MultiReturn) containing the three outputs of winsorize
    """
    args = {'X': X, 'verbose': verbose, **kwargs}
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'winsorize', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def slicefinder(X: Matrix, e: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'slicefinder' builtin.

    :param X: input features
    :param e: error vector
    :param k: number of subsets required
    :param maxL: max level L (conjunctions of L predicates), 0 unlimited
    :param minSup: min support (min number of rows per slice)
    :param alpha: weight in [0,1]: 0 only size, 1 only error
    :param tpEval: flag for task-parallel slice evaluation
    :param tpBlksz: block size for task-parallel execution (num slices)
    :param selFeat: flag for removing one-hot-encoded features that don't
        satisfy the constraint and/or have zero error
    :param verbose: flag for verbose debug output
    :return: 'OperationNode' (MultiReturn) containing the three outputs of slicefinder
    """
    args = {'X': X, 'e': e, **kwargs}
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'slicefinder', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def applyAndEvaluate(trainData: Frame, testData: Frame, pip: Frame, applyFunc: Frame, hp: Matrix, evaluationFunc: str, evalFunHp: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'applyAndEvaluate' builtin.

    :param trainData: training data frame
    :param testData: test data frame
    :param pip: pipeline frame
    :param applyFunc: apply-function frame
    :param hp: hyper-parameter matrix
    :param evaluationFunc: name of the evaluation function
    :param evalFunHp: hyper-parameters for the evaluation function
    :return: 'OperationNode' (MultiReturn) containing the three outputs of applyAndEvaluate
    """
    args = {
        'trainData': trainData,
        'testData': testData,
        'pip': pip,
        'applyFunc': applyFunc,
        'hp': hp,
        'evaluationFunc': evaluationFunc,
        'evalFunHp': evalFunHp,
        **kwargs,
    }
    ctx = trainData.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'applyAndEvaluate', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def gaussianClassifier(D: Matrix, C: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'gaussianClassifier' builtin.

    :param D: data matrix
    :param C: class labels
    :param varSmoothing: smoothing factor for variances
    :param verbose: print accuracy of the training set
    :return: 'OperationNode' (MultiReturn) containing the four outputs of gaussianClassifier
    """
    args = {'D': D, 'C': C, **kwargs}
    ctx = D.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), List(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'gaussianClassifier', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def scale(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'scale' builtin.

    :param X: input feature matrix
    :param center: indicates whether or not to center the feature matrix
    :param scale: indicates whether or not to scale the feature matrix
    :return: 'OperationNode' (MultiReturn) containing the three outputs of scale
    """
    args = {'X': X, **kwargs}
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'scale', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def fixInvalidLengths(F1: Frame, mask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'fixInvalidLengths' builtin.

    :param F1: input frame
    :param mask: mask matrix
    :return: 'OperationNode' (MultiReturn) containing the four outputs of fixInvalidLengths
    """
    args = {'F1': F1, 'mask': mask, **kwargs}
    ctx = F1.sds_context
    outputs = [Frame(ctx, ''), Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'fixInvalidLengths', outputs, named_input_nodes=args)
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def tomeklink(X: Matrix, y: Matrix):
    """
    Build the DAG node for the 'tomeklink' builtin.

    :param X: data matrix (nxm)
    :param y: label matrix (nx1)
    :return: 'OperationNode' (MultiReturn) containing the three outputs of tomeklink
    """
    ctx = X.sds_context
    outputs = [Matrix(ctx, ''), Matrix(ctx, ''), Matrix(ctx, '')]
    op = MultiReturn(ctx, 'tomeklink', outputs,
                     named_input_nodes={'X': X, 'y': y})
    for out in outputs:
        out._unnamed_input_nodes = [op]
    return op
def denialConstraints(dataFrame: Frame, constraintsFrame: Frame):
    """
    Build the DAG node for the 'denialConstraints' builtin.

    :param dataFrame: frame whose columns represent the variables of the data
        and whose rows correspond to records/instances; a column indexing the
        instances from 1 to N (N = number of instances) is recommended
    :param constraintsFrame: frame with fixed columns, each row representing
        one denial constraint (e.g. value comparisons between variables of
        two instances)
    :return: 'OperationNode' containing the indexes of dataFrame rows that
        violate a constraint and the index of the fulfilled denial
        constraint; with no wrong instances (0 constraints fulfilled) the
        result is matrix(0,1,2)
    """
    args = {'dataFrame': dataFrame, 'constraintsFrame': constraintsFrame}
    return Matrix(dataFrame.sds_context, 'denialConstraints',
                  named_input_nodes=args)
def univar(X: Matrix, types: Matrix):
    """
    Build the DAG node for the 'univar' builtin (univariate statistics).

    :param X: input matrix
    :param types: column type codes (1 scale, 2 nominal, 3 ordinal)
    :return: 'OperationNode' containing the univariate statistics
    """
    return Matrix(X.sds_context, 'univar',
                  named_input_nodes={'X': X, 'types': types})
def discoverFD(X: Matrix, Mask: Matrix, threshold: float):
    """
    Build the DAG node for the 'discoverFD' builtin (functional dependencies).

    :param X: input matrix
    :param Mask: column mask selecting which columns to process
    :param threshold: threshold for dependency discovery
    :return: 'OperationNode' containing the discovered dependencies
    """
    return Matrix(X.sds_context, 'discoverFD',
                  named_input_nodes={'X': X, 'Mask': Mask,
                                     'threshold': threshold})
def components(G: OperationNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Build the DAG node for the 'components' builtin (connected components).

    :param G: graph adjacency matrix; must be a matrix operation node
    :return: 'OperationNode' containing the component assignment
    """
    G._check_matrix_op()
    args = {'G': G, **kwargs}
    return Matrix(G.sds_context, 'components', named_input_nodes=args)
def outlierByArima(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'outlierByArima' builtin.

    :param X: input matrix (time series)
    :return: 'OperationNode' containing the outlier result
    """
    args = {'X': X, **kwargs}
    return Matrix(X.sds_context, 'outlierByArima', named_input_nodes=args)
def vectorToCsv(mask: OperationNode) -> Matrix:
    """
    Build the DAG node for the 'vectorToCsv' builtin.

    :param mask: input vector; must be a matrix operation node
    :return: 'OperationNode' containing the result
    """
    mask._check_matrix_op()
    return Matrix(mask.sds_context, 'vectorToCsv',
                  named_input_nodes={'mask': mask})
def lenetTrain(X: Matrix, Y: Matrix, X_val: Matrix, Y_val: Matrix, C: int, Hin: int, Win: int, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the 'lenetTrain' builtin.

    :param X: training images
    :param Y: training labels
    :param X_val: validation images
    :param Y_val: validation labels
    :param C: number of input channels (dimensionality of input depth)
    :param Hin: input width
    :param Win: input height
    :param batch_size: batch size
    :param epochs: number of epochs
    :param lr: learning rate
    :param mu: momentum value
    :param decay: learning rate decay
    :param reg: regularization strength
    :param seed: seed for model initialization
    :param verbose: flag indicates if function should print to stdout
    :return: 'OperationNode' containing the trained model
    """
    args = {
        'X': X,
        'Y': Y,
        'X_val': X_val,
        'Y_val': Y_val,
        'C': C,
        'Hin': Hin,
        'Win': Win,
        **kwargs,
    }
    return Matrix(X.sds_context, 'lenetTrain', named_input_nodes=args)
def from_numpy(self, mat: np.ndarray, *args: Sequence[VALID_INPUT_TYPES],
               **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """Generate DAGNode representing matrix with data given by a numpy
    array, which will be sent to SystemDS on need.

    Only 1-D and 2-D arrays are supported; a 1-D array of length n is
    treated as an n x 1 column matrix.

    :param mat: the numpy array (1-D or 2-D)
    :param args: unnamed parameters
    :param kwargs: named parameters
    :return: A Matrix
    :raises ValueError: if mat has more than two dimensions
    """
    # placeholder path; substituted when the data is actually transferred
    unnamed_params = ['\'./tmp/{file_name}\'']

    if mat.ndim == 2:
        named_params = {'rows': mat.shape[0], 'cols': mat.shape[1]}
    elif mat.ndim == 1:
        # 1-D input becomes a column vector
        named_params = {'rows': mat.shape[0], 'cols': 1}
    else:
        # TODO Support tensors.
        raise ValueError("Only two dimensional arrays supported")

    unnamed_params.extend(args)
    named_params.update(kwargs)
    return Matrix(self, 'read', unnamed_params, named_params, local_data=mat)
def winsorize(X: Matrix, verbose: bool):
    """
    Build the DAG node for the 'winsorize' builtin.

    :param X: input matrix
    :param verbose: to print output on screen
    :return: 'OperationNode' containing the winsorized matrix
    """
    return Matrix(X.sds_context, 'winsorize',
                  named_input_nodes={'X': X, 'verbose': verbose})
def univar(X: Matrix, types: Matrix):
    """
    Build the DAG node for the 'univar' builtin (univariate statistics).

    :param X: input matrix
    :param types: column type codes
    :return: 'OperationNode' containing the univariate statistics
    """
    return Matrix(X.sds_context, 'univar',
                  named_input_nodes={'X': X, 'types': types})
def toOneHot(X: Matrix, numClasses: int):
    """
    Build the DAG node for the 'toOneHot' builtin (one-hot encoding).

    :param X: input matrix of class indices
    :param numClasses: number of columns; must be greater than or equal to
        the largest value in X
    :return: 'OperationNode' containing the one-hot encoded matrix
    """
    return Matrix(X.sds_context, 'toOneHot',
                  named_input_nodes={'X': X, 'numClasses': numClasses})
def mcc(predictions: Matrix, labels: Matrix):
    """
    Build the DAG node for the 'mcc' builtin (Matthews correlation coefficient).

    :param predictions: predicted labels
    :param labels: ground-truth labels
    :return: 'OperationNode' containing the coefficient
    """
    return Matrix(predictions.sds_context, 'mcc',
                  named_input_nodes={'predictions': predictions,
                                     'labels': labels})
def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
    """ Read a file from disk. Supported types include:
    CSV, Matrix Market (coordinate), Text (i,j,v), SystemDS Binary, etc.
    See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions for more details
    :return: an Operation Node, containing the read data the operationNode read can be of types, Matrix, Frame or Scalar.
    """
    # Normalize PathLike (e.g. pathlib.Path) to str so the ".mtd"
    # concatenation below cannot raise TypeError.
    path = os.fspath(path)
    mtd_filepath = path + ".mtd"
    # If a metadata file is present, its data_type overrides any kwarg.
    if os.path.exists(mtd_filepath):
        with open(mtd_filepath) as jspec_file:
            mtd = json.load(jspec_file)
            kwargs["data_type"] = mtd["data_type"]

    data_type = kwargs.get("data_type", None)
    file_format = kwargs.get("format", None)
    if data_type == "matrix":
        kwargs["data_type"] = f'"{data_type}"'
        return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
    elif data_type == "frame":
        kwargs["data_type"] = f'"{data_type}"'
        # the format string must be quoted for the DML read() call
        if isinstance(file_format, str):
            kwargs["format"] = f'"{kwargs["format"]}"'
        return Frame(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
    elif data_type == "scalar":
        kwargs["data_type"] = f'"{data_type}"'
        output_type = OutputType.from_str(kwargs.get("value_type", None))
        kwargs["value_type"] = f'"{output_type.name}"'
        return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs,
                      output_type=output_type)
    elif data_type == "list":
        # Reading a list takes no extra arguments.
        return List(self, "read", [f'"{path}"'])

    # Unknown type: fall back to a generic OperationNode and warn.
    kwargs["data_type"] = None
    print("WARNING: Unknown type read please add a mtd file, or specify in arguments")
    return OperationNode(self, "read", [f'"{path}"'], named_input_nodes=kwargs)
def discoverFD(X: OperationNode, Mask: OperationNode, threshold: float) -> Matrix:
    """
    Build the DAG node for the 'discoverFD' builtin (functional dependencies).

    :param X: input; must be a matrix operation node
    :param Mask: column mask; must be a matrix operation node
    :param threshold: threshold for dependency discovery
    :return: 'OperationNode' containing the discovered dependencies
    """
    X._check_matrix_op()
    Mask._check_matrix_op()
    return Matrix(X.sds_context, 'discoverFD',
                  named_input_nodes={'X': X, 'Mask': Mask,
                                     'threshold': threshold})
def outlierBySd(X: OperationNode, max_iterations: int, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Build the DAG node for the 'outlierBySd' builtin.

    :param X: input; must be a matrix operation node
    :param max_iterations: maximum number of iterations
    :return: 'OperationNode' containing the outlier result
    """
    X._check_matrix_op()
    args = {'X': X, 'max_iterations': max_iterations, **kwargs}
    return Matrix(X.sds_context, 'outlierBySd', named_input_nodes=args)
def kmeansPredict(X: Matrix, C: Matrix):
    """
    Build the DAG node for the 'kmeansPredict' builtin.

    :param X: data matrix
    :param C: cluster centroids
    :return: 'OperationNode' containing the cluster assignments
    """
    return Matrix(X.sds_context, 'kmeansPredict',
                  named_input_nodes={'X': X, 'C': C})
def getAccuracy(y: OperationNode, yhat: OperationNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Build the DAG node for the 'getAccuracy' builtin.

    :param y: ground-truth labels; must be a matrix operation node
    :param yhat: predicted labels; must be a matrix operation node
    :return: 'OperationNode' containing the accuracy
    """
    y._check_matrix_op()
    yhat._check_matrix_op()
    args = {'y': y, 'yhat': yhat, **kwargs}
    return Matrix(y.sds_context, 'getAccuracy', named_input_nodes=args)
def smote(X: OperationNode, mask: OperationNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Build the DAG node for the 'smote' builtin (minority oversampling).

    :param X: input data; must be a matrix operation node
    :param mask: column mask; must be a matrix operation node
    :return: 'OperationNode' containing the oversampled data
    """
    X._check_matrix_op()
    mask._check_matrix_op()
    args = {'X': X, 'mask': mask, **kwargs}
    return Matrix(X.sds_context, 'smote', named_input_nodes=args)