Example #1
0
def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'correctTypos'.

    :param strings: Input frame; its ``sds_context`` is used for every node created here
    :param frequency_threshold: (keyword) Strings that occur above this frequency level will not be corrected
    :param distance_threshold: (keyword) Max distance at which strings are considered similar
    :param is_verbose: (keyword) Print debug information
    :return: 'MultiReturn' node whose five outputs are, in order,
        Frame, Scalar, Scalar, Matrix, Frame
    """
    ctx = strings.sds_context
    named_inputs = {'strings': strings, **kwargs}

    # One placeholder node per value produced by the operation.
    outputs = [
        Frame(ctx, ''),
        Scalar(ctx, ''),
        Scalar(ctx, ''),
        Matrix(ctx, ''),
        Frame(ctx, ''),
    ]
    multi_op = MultiReturn(ctx, 'correctTypos', outputs,
                           named_input_nodes=named_inputs)

    # Every output lists the shared multi-return node as its only input.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #2
0
def hyperband(X_train: Matrix,
              y_train: Matrix,
              X_val: Matrix,
              y_val: Matrix,
              params: List,
              paramRanges: Matrix,
              **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'hyperband'.

    :param X_train: Passed through as named input 'X_train'; also supplies the SystemDS context
    :param y_train: Passed through as named input 'y_train'
    :param X_val: Passed through as named input 'X_val'
    :param y_val: Passed through as named input 'y_val'
    :param params: Passed through as named input 'params'
    :param paramRanges: Presumably one row per hyper parameter, where the first column
        specifies the min and the second column the max value (the generated
        description was truncated -- confirm)
    :param verbose: (keyword) If TRUE print messages are activated
    :return: 'MultiReturn' node whose two outputs are, in order, Matrix and Frame
    """
    ctx = X_train.sds_context
    named_inputs = {
        'X_train': X_train,
        'y_train': y_train,
        'X_val': X_val,
        'y_val': y_val,
        'params': params,
        'paramRanges': paramRanges,
        **kwargs,
    }

    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    multi_op = MultiReturn(ctx, 'hyperband', outputs, named_input_nodes=named_inputs)

    # Both placeholders depend solely on the multi-return node.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #3
0
def gridSearch(X: Matrix,
               y: Matrix,
               train: str,
               predict: str,
               params: List,
               paramValues: List,
               **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'gridSearch'.

    NOTE(review): several entries of the generated parameter list were garbled;
    the hedged descriptions below are reconstructed from the surviving fragments
    and should be confirmed against the builtin's own documentation.

    :param X: Passed through as named input 'X'; also supplies the SystemDS context
    :param y: Passed through as named input 'y'
    :param train: Name ft of the train function to call via ft(trainArgs)
    :param predict: Name fp of the loss function to call via fp((predictArgs,B))
    :param params: Presumably the names of the hyper-parameters to search -- confirm
    :param paramValues: Presumably the candidate values (column vectors) for the
        hyper-parameters in 'params' -- confirm
    :param numB: (keyword) Maximum number of parameters in model B (the generated text
        mentions passing the max because the size may vary with parameters like
        icpt or multi-class classification)
    :param cv: (keyword) flag enabling k-fold cross validation, otherwise training loss
    :param cvk: (keyword) if cv=TRUE, specifies the number of folds, otherwise ignored
    :param verbose: (keyword) flag for verbose debug output
    :return: 'MultiReturn' node whose two outputs are, in order, Matrix and Frame
        (the generated text mentions a column-major linearized column vector)
    """
    ctx = X.sds_context
    named_inputs = {
        'X': X,
        'y': y,
        'train': train,
        'predict': predict,
        'params': params,
        'paramValues': paramValues,
        **kwargs,
    }

    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    multi_op = MultiReturn(ctx, 'gridSearch', outputs, named_input_nodes=named_inputs)

    # Both placeholders depend solely on the multi-return node.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #4
0
def hyperband(X_train: Matrix, y_train: Matrix, X_val: Matrix, y_val: Matrix,
              params: Iterable, paramRanges: Matrix,
              **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'hyperband'.

    All six positional arguments are forwarded as named inputs under their own
    names; ``kwargs`` are merged in afterwards.

    :param X_train: Forwarded as 'X_train'; also supplies the SystemDS context
    :param y_train: Forwarded as 'y_train'
    :param X_val: Forwarded as 'X_val'
    :param y_val: Forwarded as 'y_val'
    :param params: Forwarded as 'params'
    :param paramRanges: Forwarded as 'paramRanges'
    :return: 'MultiReturn' node whose two outputs are, in order, Matrix and Frame
    """
    ctx = X_train.sds_context
    named_inputs = dict(X_train=X_train, y_train=y_train,
                        X_val=X_val, y_val=y_val,
                        params=params, paramRanges=paramRanges)
    named_inputs.update(kwargs)

    outputs = [Matrix(ctx, ''), Frame(ctx, '')]
    multi_op = MultiReturn(ctx, 'hyperband', outputs,
                           named_input_nodes=named_inputs)

    # Wire each placeholder output back to the operation that produces it.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #5
0
 def __getitem__(self, i) -> 'Frame':
     """
     Index into this frame and return the resulting 'Frame' node.

     Supported forms of ``i``:

     * a list -- selects rows via a 'removeEmpty' node with margin '"rows"';
     * a tuple whose first element passes ``check_is_empty_slice`` and whose
       second element is a list -- selects columns via 'removeEmpty' with
       margin '"cols"';
     * anything else -- rendered with ``get_slice_string`` and applied as a
       bracket expression.

     :param i: Index expression (list, tuple of at most two entries, or a value
         accepted by ``get_slice_string``)
     :return: 'Frame' node representing the selection
     :raises ValueError: if ``i`` is a tuple with more than two entries
     :raises NotImplementedError: if both tuple entries are lists
     """

     def _pick(indices, extent_of, margin):
         # Shared list-selection path. The indices are shifted by one before
         # being handed to "table" (presumably converting 0-based Python
         # indices to 1-based ones -- confirm).
         check_no_less_than_zero(indices)
         shifted = self.sds_context.from_numpy(np.array(indices)) + 1
         mask = Matrix(self.sds_context, "table",
                       [shifted, 1, extent_of(), 1])
         return Frame(self.sds_context, "removeEmpty", [],
                      {'target': self, 'margin': margin, 'select': mask})

     is_tuple = isinstance(i, tuple)

     if is_tuple and len(i) > 2:
         raise ValueError("Maximum of two dimensions are allowed")

     if isinstance(i, list):
         return _pick(i, self.nRow, '"rows"')

     if is_tuple and isinstance(i[0], list) and isinstance(i[1], list):
         raise NotImplementedError("double slicing is not supported yet")

     if is_tuple and check_is_empty_slice(i[0]) and isinstance(i[1], list):
         return _pick(i[1], self.nCol, '"cols"')

     # Fallback: plain slicing expressed as a bracket operation.
     return Frame(self.sds_context, '', [self, get_slice_string(i)],
                  brackets=True)
Example #6
0
def winsorize(X: Matrix, verbose: bool,
              **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'winsorize'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param verbose: To print output on screen
    :return: 'MultiReturn' node with three Matrix outputs
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'verbose': verbose, **kwargs}

    # Three placeholder matrices, one per returned value.
    outputs = [Matrix(ctx, '') for _ in range(3)]
    multi_op = MultiReturn(ctx, 'winsorize', outputs,
                           named_input_nodes=named_inputs)

    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #7
0
def slicefinder(X: Matrix, e: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'slicefinder'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param e: Forwarded as named input 'e'
    :param k: (keyword) Number of subsets required
    :param maxL: (keyword) level L (conjunctions of L predicates), 0 unlimited
    :param minSup: (keyword) support (min number of rows per slice)
    :param alpha: (keyword) [0,1]: 0 only size, 1 only error
    :param tpEval: (keyword) for task-parallel slice evaluation
    :param tpBlksz: (keyword) size for task-parallel execution (num slices)
    :param selFeat: (keyword) for removing one-hot-encoded features that don't satisfy
        the constraint and/or have zero error
    :param verbose: (keyword) for verbose debug output
    :return: 'MultiReturn' node with three Matrix outputs
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'e': e, **kwargs}

    # Three placeholder matrices, one per returned value.
    outputs = [Matrix(ctx, '') for _ in range(3)]
    multi_op = MultiReturn(ctx, 'slicefinder', outputs,
                           named_input_nodes=named_inputs)

    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #8
0
def applyAndEvaluate(trainData: Frame, testData: Frame, pip: Frame,
                     applyFunc: Frame, hp: Matrix, evaluationFunc: str,
                     evalFunHp: Matrix,
                     **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'applyAndEvaluate'.

    Each positional argument is forwarded as a named input under its own name;
    ``kwargs`` are merged in afterwards.

    :param trainData: Forwarded as 'trainData'; also supplies the SystemDS context
    :param testData: Forwarded as 'testData'
    :param pip: Forwarded as 'pip'
    :param applyFunc: Forwarded as 'applyFunc'
    :param hp: Forwarded as 'hp'
    :param evaluationFunc: Forwarded as 'evaluationFunc'
    :param evalFunHp: Forwarded as 'evalFunHp'
    :return: 'MultiReturn' node with three Matrix outputs
    """
    ctx = trainData.sds_context
    named_inputs = dict(trainData=trainData, testData=testData, pip=pip,
                        applyFunc=applyFunc, hp=hp,
                        evaluationFunc=evaluationFunc, evalFunHp=evalFunHp)
    named_inputs.update(kwargs)

    # Three placeholder matrices, one per returned value.
    outputs = [Matrix(ctx, '') for _ in range(3)]
    multi_op = MultiReturn(ctx, 'applyAndEvaluate', outputs,
                           named_input_nodes=named_inputs)

    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #9
0
def gaussianClassifier(D: Matrix, C: Matrix,
                       **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'gaussianClassifier'.

    :param D: Forwarded as named input 'D'; also supplies the SystemDS context
    :param C: Forwarded as named input 'C'
    :param varSmoothing: (keyword) Smoothing factor for variances
    :param verbose: (keyword) Print accuracy of the training set
    :return: 'MultiReturn' node whose four outputs are, in order,
        Matrix, Matrix, List, Matrix
    """
    ctx = D.sds_context
    named_inputs = {'D': D, 'C': C, **kwargs}

    outputs = [
        Matrix(ctx, ''),
        Matrix(ctx, ''),
        List(ctx, ''),
        Matrix(ctx, ''),
    ]
    multi_op = MultiReturn(ctx, 'gaussianClassifier', outputs,
                           named_input_nodes=named_inputs)

    # Every output lists the shared multi-return node as its only input.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #10
0
def scale(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'scale'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param center: (keyword) Indicates whether or not to center the feature matrix
    :param scale: (keyword) Indicates whether or not to scale the feature matrix
    :return: 'MultiReturn' node with three Matrix outputs
    """
    ctx = X.sds_context
    named_inputs = {'X': X, **kwargs}

    # Three placeholder matrices, one per returned value.
    outputs = [Matrix(ctx, '') for _ in range(3)]
    multi_op = MultiReturn(ctx, 'scale', outputs,
                           named_input_nodes=named_inputs)

    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #11
0
def fixInvalidLengths(F1: Frame, mask: Matrix,
                      **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the multi-output operation node named 'fixInvalidLengths'.

    :param F1: Forwarded as named input 'F1'; also supplies the SystemDS context
    :param mask: Forwarded as named input 'mask'
    :return: 'MultiReturn' node whose four outputs are, in order,
        Frame, Matrix, Matrix, Matrix
    """
    ctx = F1.sds_context
    named_inputs = {'F1': F1, 'mask': mask, **kwargs}

    outputs = [
        Frame(ctx, ''),
        Matrix(ctx, ''),
        Matrix(ctx, ''),
        Matrix(ctx, ''),
    ]
    multi_op = MultiReturn(ctx, 'fixInvalidLengths', outputs,
                           named_input_nodes=named_inputs)

    # Every output lists the shared multi-return node as its only input.
    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #12
0
def tomeklink(X: Matrix, y: Matrix):
    """
    Create the multi-output operation node named 'tomeklink'.

    :param X: Data Matrix (nxm); also supplies the SystemDS context
    :param y: Label Matrix (nx1)
    :return: 'MultiReturn' node with three Matrix outputs
    """
    ctx = X.sds_context

    # Three placeholder matrices, one per returned value.
    outputs = [Matrix(ctx, '') for _ in range(3)]
    multi_op = MultiReturn(ctx, 'tomeklink', outputs,
                           named_input_nodes={'X': X, 'y': y})

    for node in outputs:
        node._unnamed_input_nodes = [multi_op]

    return multi_op
Example #13
0
def denialConstraints(dataFrame: Frame,
                      constraintsFrame: Frame):
    """
    Create the 'Matrix' operation node named 'denialConstraints'.

    :param dataFrame: frame whose columns represent the variables of the data and
        whose rows correspond to instances. Recommended: a column indexing the
        instances from 1 to N (N=number of instances).
    :param constraintsFrame: frame with fixed columns and each row representing one constraint.
    :return: 'Matrix' node. Per the generated description it shows the indexes of the
        dataframe that are wrong and the index of the denial constraint that is
        fulfilled; with no wrong instances to show (0 constraints fulfilled) the
        result is wronginstances=matrix(0,1,2).

    NOTE(review): the generated docstring this replaces had an example table
    mis-parsed into ``:param`` entries, losing the header row and the leading
    columns. The surviving fragments are kept below for reference; confirm the
    full example against the builtin's own documentation.

    Constraint semantics fragment (instanceCompare): if the value of variable 1
    in instance 1 is lower/higher than the value of variable 1 in instance 2,
    then the value of variable 2 in instance 2 can't be lower/higher than the
    value of variable 2 in instance 2.

    dataFrame rows (trailing columns: ..., yrs.service, sex, salary)::

        1   19   18   Male     139750
        2   20   16   Male     173200
        3   3    3    Male     79750.56
        4   45   39   Male     115000
        5   40   40   Male     141500
        6   6    6    Male     97000
        7   30   23   Male     175000
        8   45   45   Male     147765
        9   21   20   Male     119250
        10  18   18   Female   129000
        11  12   8    Male     119800
        12  7    2    Male     79800
        13  1    1    Male     77700

    constraintsFrame rows::

        1   yrs.since.phd  <   yrs.service
        2   rank        Prof   yrs.service    ><   salary
        3   salary         =   78182
        4   discipline  B      yrs.service    >    yrs.since.phd
    """
    named_inputs = dict(dataFrame=dataFrame, constraintsFrame=constraintsFrame)
    return Matrix(dataFrame.sds_context, 'denialConstraints',
                  named_input_nodes=named_inputs)
Example #14
0
def univar(X: Matrix, types: Matrix):
    """
    Create the 'Matrix' operation node named 'univar'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param types: Forwarded as named input 'types'. Per the generated text, 2 stands
        for nominal and 3 for ordinal; the meaning of 1 was lost in
        generation -- confirm
    :return: 'Matrix' node for the operation
    """
    return Matrix(X.sds_context, 'univar',
                  named_input_nodes=dict(X=X, types=types))
Example #15
0
def discoverFD(X: Matrix, Mask: Matrix, threshold: float):
    """
    Create the 'Matrix' operation node named 'discoverFD'.

    NOTE(review): the generated parameter text ("will: second column from
    processing") was truncated and could not be attributed to a parameter.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param Mask: Forwarded as named input 'Mask'
    :param threshold: Forwarded as named input 'threshold'
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, Mask=Mask, threshold=threshold)
    return Matrix(X.sds_context, 'discoverFD', named_input_nodes=named_inputs)
Example #16
0
def components(G: OperationNode,
               **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Create the 'Matrix' operation node named 'components'.

    ``G._check_matrix_op()`` is invoked first (presumably validating that G is a
    matrix operation -- confirm); any exception it raises propagates.

    :param G: Forwarded as named input 'G'; also supplies the SystemDS context
    :param kwargs: Additional named inputs merged in after 'G'
    :return: 'Matrix' node for the operation
    """
    G._check_matrix_op()
    named_inputs = {'G': G, **kwargs}
    return Matrix(G.sds_context, 'components', named_input_nodes=named_inputs)
Example #17
0
def outlierByArima(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the 'Matrix' operation node named 'outlierByArima'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param kwargs: Additional named inputs merged in after 'X'
    :return: 'Matrix' node for the operation
    """
    named_inputs = {'X': X, **kwargs}
    return Matrix(X.sds_context, 'outlierByArima', named_input_nodes=named_inputs)
Example #18
0
def vectorToCsv(mask: OperationNode) -> Matrix:
    """
    Create the 'Matrix' operation node named 'vectorToCsv'.

    ``mask._check_matrix_op()`` is invoked first (presumably validating that
    mask is a matrix operation -- confirm); any exception it raises propagates.

    :param mask: Forwarded as named input 'mask'; also supplies the SystemDS context
    :return: 'Matrix' node for the operation
    """
    mask._check_matrix_op()
    return Matrix(mask.sds_context, 'vectorToCsv',
                  named_input_nodes=dict(mask=mask))
Example #19
0
def lenetTrain(X: Matrix,
               Y: Matrix,
               X_val: Matrix,
               Y_val: Matrix,
               C: int,
               Hin: int,
               Win: int,
               **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Create the 'Matrix' operation node named 'lenetTrain'.

    NOTE(review): the generated text describes Hin as "Input width" and Win as
    "Input height"; the names suggest the reverse -- confirm before relying on it.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param Y: Forwarded as named input 'Y'
    :param X_val: Forwarded as named input 'X_val'
    :param Y_val: Forwarded as named input 'Y_val'
    :param C: Number of input channels (dimensionality of input depth)
    :param Hin: Input width (as generated; see note above)
    :param Win: Input height (as generated; see note above)
    :param batch_size: (keyword) Batch size
    :param epochs: (keyword) Number of epochs
    :param lr: (keyword) Learning rate
    :param mu: (keyword) Momentum value
    :param decay: (keyword) Learning rate decay
    :param reg: (keyword) Regularization strength
    :param seed: (keyword) Seed for model initialization
    :param verbose: (keyword) Flag indicates if function should print to stdout
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, Y=Y, X_val=X_val, Y_val=Y_val,
                        C=C, Hin=Hin, Win=Win)
    named_inputs.update(kwargs)
    return Matrix(X.sds_context, 'lenetTrain', named_input_nodes=named_inputs)
Example #20
0
    def from_numpy(self, mat: np.ndarray,
                   *args: Sequence[VALID_INPUT_TYPES],
                   **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
        """Generate DAGNode representing matrix with data given by a numpy array, which will be sent to SystemDS
        on need.

        One-dimensional arrays are treated as a single column (``cols`` = 1).

        :param mat: the numpy array (one- or two-dimensional)
        :param args: unnamed parameters, appended after the file placeholder
        :param kwargs: named parameters, merged over the derived 'rows'/'cols'
        :return: A Matrix
        :raises ValueError: if ``mat`` has neither one nor two dimensions
        """
        shape = mat.shape
        if len(shape) == 2:
            rows, cols = shape
        elif len(shape) == 1:
            # A vector becomes a single-column matrix.
            rows, cols = shape[0], 1
        else:
            # TODO Support tensors.
            raise ValueError("Only two dimensional arrays supported")

        # The first unnamed parameter is a path template containing the
        # literal '{file_name}' placeholder; it is passed through verbatim.
        unnamed_params = ['\'./tmp/{file_name}\'', *args]
        named_params = {'rows': rows, 'cols': cols, **kwargs}
        return Matrix(self, 'read', unnamed_params, named_params, local_data=mat)
Example #21
0
def winsorize(X: Matrix,
              verbose: bool):
    """
    Create the 'Matrix' operation node named 'winsorize'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param verbose: Forwarded as named input 'verbose'
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, verbose=verbose)
    return Matrix(X.sds_context, 'winsorize', named_input_nodes=named_inputs)
Example #22
0
def univar(X: Matrix,
           types: Matrix):
    """
    Create the 'Matrix' operation node named 'univar'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param types: Forwarded as named input 'types'
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, types=types)
    return Matrix(X.sds_context, 'univar', named_input_nodes=named_inputs)
Example #23
0
def toOneHot(X: Matrix, numClasses: int):
    """
    Create the 'Matrix' operation node named 'toOneHot'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param numClasses: Number of columns, must be greater than or equal to largest value in X
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, numClasses=numClasses)
    return Matrix(X.sds_context, 'toOneHot', named_input_nodes=named_inputs)
Example #24
0
def mcc(predictions: Matrix,
        labels: Matrix):
    """
    Create the 'Matrix' operation node named 'mcc'.

    :param predictions: Forwarded as named input 'predictions'; also supplies the SystemDS context
    :param labels: Forwarded as named input 'labels'
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(predictions=predictions, labels=labels)
    return Matrix(predictions.sds_context, 'mcc', named_input_nodes=named_inputs)
Example #25
0
    def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
        """ Read a file from disk. Supported types include:
        CSV, Matrix Market(coordinate), Text(i,j,v), SystemDS Binary, etc.
        See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions for more details

        If a metadata file ``<path>.mtd`` exists, its "data_type" entry overrides
        any ``data_type`` given in ``kwargs``.

        :param path: location of the file, as a string or any ``os.PathLike`` object
        :param kwargs: named read arguments; ``data_type``, ``format`` and ``value_type``
            are inspected here and quoted where needed, everything is forwarded
            to the created node (except for lists, which take no extra arguments)
        :return: an Operation Node, containing the read data the operationNode read can be of types, Matrix, Frame or Scalar.
        """
        # Normalise to a plain string so that path-like objects such as
        # pathlib.Path work with the string concatenation below.
        path_str = os.fspath(path)
        quoted_path = f'"{path_str}"'

        mtd_filepath = path_str + ".mtd"
        if os.path.exists(mtd_filepath):
            with open(mtd_filepath) as jspec_file:
                mtd = json.load(jspec_file)
                kwargs["data_type"] = mtd["data_type"]

        data_type = kwargs.get("data_type", None)
        file_format = kwargs.get("format", None)
        if data_type == "matrix":
            kwargs["data_type"] = f'"{data_type}"'
            return Matrix(self, "read", [quoted_path], named_input_nodes=kwargs)
        elif data_type == "frame":
            kwargs["data_type"] = f'"{data_type}"'
            # Only string formats are wrapped in quotes.
            if isinstance(file_format, str):
                kwargs["format"] = f'"{kwargs["format"]}"'
            return Frame(self, "read", [quoted_path], named_input_nodes=kwargs)
        elif data_type == "scalar":
            kwargs["data_type"] = f'"{data_type}"'
            output_type = OutputType.from_str(kwargs.get("value_type", None))
            kwargs["value_type"] = f'"{output_type.name}"'
            return Scalar(self, "read", [quoted_path], named_input_nodes=kwargs, output_type=output_type)
        elif data_type == "list":
            # Reading a list have no extra arguments.
            return List(self, "read", [quoted_path])

        # Unknown or missing data type: fall back to a generic operation node.
        kwargs["data_type"] = None
        print("WARNING: Unknown type read please add a mtd file, or specify in arguments")
        return OperationNode(self, "read", [quoted_path], named_input_nodes=kwargs)
Example #26
0
def discoverFD(X: OperationNode, Mask: OperationNode,
               threshold: float) -> Matrix:
    """
    Create the 'Matrix' operation node named 'discoverFD'.

    ``_check_matrix_op()`` is invoked on X and then on Mask before the node is
    built (presumably validating that both are matrix operations -- confirm).

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param Mask: Forwarded as named input 'Mask'
    :param threshold: Forwarded as named input 'threshold'
    :return: 'Matrix' node for the operation
    """
    for operand in (X, Mask):
        operand._check_matrix_op()
    named_inputs = dict(X=X, Mask=Mask, threshold=threshold)
    return Matrix(X.sds_context, 'discoverFD', named_input_nodes=named_inputs)
Example #27
0
def outlierBySd(X: OperationNode, max_iterations: int,
                **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Create the 'Matrix' operation node named 'outlierBySd'.

    ``X._check_matrix_op()`` is invoked first (presumably validating that X is a
    matrix operation -- confirm); any exception it raises propagates.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param max_iterations: Forwarded as named input 'max_iterations'
    :param kwargs: Additional named inputs merged in afterwards
    :return: 'Matrix' node for the operation
    """
    X._check_matrix_op()
    named_inputs = {'X': X, 'max_iterations': max_iterations, **kwargs}
    return Matrix(X.sds_context, 'outlierBySd', named_input_nodes=named_inputs)
Example #28
0
def kmeansPredict(X: Matrix,
                  C: Matrix):
    """
    Create the 'Matrix' operation node named 'kmeansPredict'.

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param C: Forwarded as named input 'C'
    :return: 'Matrix' node for the operation
    """
    named_inputs = dict(X=X, C=C)
    return Matrix(X.sds_context, 'kmeansPredict', named_input_nodes=named_inputs)
Example #29
0
def getAccuracy(y: OperationNode, yhat: OperationNode,
                **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Create the 'Matrix' operation node named 'getAccuracy'.

    ``_check_matrix_op()`` is invoked on y and then on yhat before the node is
    built (presumably validating that both are matrix operations -- confirm).

    :param y: Forwarded as named input 'y'; also supplies the SystemDS context
    :param yhat: Forwarded as named input 'yhat'
    :param kwargs: Additional named inputs merged in afterwards
    :return: 'Matrix' node for the operation
    """
    for operand in (y, yhat):
        operand._check_matrix_op()
    named_inputs = {'y': y, 'yhat': yhat, **kwargs}
    return Matrix(y.sds_context, 'getAccuracy', named_input_nodes=named_inputs)
Example #30
0
def smote(X: OperationNode, mask: OperationNode,
          **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Matrix:
    """
    Create the 'Matrix' operation node named 'smote'.

    ``_check_matrix_op()`` is invoked on X and then on mask before the node is
    built (presumably validating that both are matrix operations -- confirm).

    :param X: Forwarded as named input 'X'; also supplies the SystemDS context
    :param mask: Forwarded as named input 'mask'
    :param kwargs: Additional named inputs merged in afterwards
    :return: 'Matrix' node for the operation
    """
    for operand in (X, mask):
        operand._check_matrix_op()
    named_inputs = {'X': X, 'mask': mask, **kwargs}
    return Matrix(X.sds_context, 'smote', named_input_nodes=named_inputs)