Ejemplo n.º 1
0
def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'correctTypos' builtin.

    :param strings: Input frame; its ``sds_context`` is used for every node created here
    :param frequency_threshold: (via kwargs) Strings that occur above this frequency level will not be corrected
    :param distance_threshold: (via kwargs) Max distance at which strings are considered similar
    :param is_verbose: (via kwargs) Print debug information
    :return: 'MultiReturn' node with five outputs, in order: Frame, Scalar, Scalar, Matrix, Frame
    """
    ctx = strings.sds_context
    named_inputs = {'strings': strings, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Frame, Scalar, Scalar, Matrix, Frame)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'correctTypos',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 2
0
def executePipeline(pipeline: Frame, Xtrain: Matrix, Ytrain: Matrix,
                    Xtest: Matrix, Ytest: Matrix, metaList: List,
                    hyperParameters: Matrix, flagsCount: int, verbose: bool,
                    **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'executePipeline' builtin.

    All positional parameters are forwarded to the operation under their own
    names; extra keyword arguments are forwarded as additional named inputs.

    :param pipeline: Input frame; its ``sds_context`` is used for every node created here
    :param flagsCount: --- (no description available upstream)
    :return: 'MultiReturn' node with nine outputs, in order:
        Matrix, Matrix, Matrix, Matrix, Scalar, Matrix, Matrix, Scalar, List
    """
    ctx = pipeline.sds_context
    named_inputs = {
        'pipeline': pipeline,
        'Xtrain': Xtrain,
        'Ytrain': Ytrain,
        'Xtest': Xtest,
        'Ytest': Ytest,
        'metaList': metaList,
        'hyperParameters': hyperParameters,
        'flagsCount': flagsCount,
        'verbose': verbose,
        **kwargs,
    }

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Matrix, Matrix, Scalar,
                    Matrix, Matrix, Scalar, List)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'executePipeline',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 3
0
def garch(X: Matrix, kmax: int, momentum: float, start_stepsize: float,
          end_stepsize: float, start_vicinity: float, end_vicinity: float,
          sim_seed: int, verbose: bool):
    """
    Build the multi-return operation node for the 'garch' builtin.

    Note from the upstream description: optimization converges slowly
    (sort of simulated annealing/gradient descent).

    :param X: The input Matrix (described upstream as the matrix "to apply Arima on")
    :param kmax: Number of iterations
    :param momentum: Momentum for momentum-gradient descent (set to 0 to deactivate)
    :param start_stepsize: Initial gradient-descent stepsize
    :param end_stepsize: gradient-descent stepsize at end (linear descent)
    :param start_vicinity: proportion of randomness of restart-location for gradient descent at beginning
    :param end_vicinity: same at end (linear decay)
    :param sim_seed: seed for simulation of process on fitted coefficients
    :param verbose: verbosity, comments during fitting
    :return: 'MultiReturn' node with five outputs (Matrix, Matrix, Scalar, Scalar, Scalar),
        described upstream as: simulated garch(1,1) process on fitted coefficients,
        variances of simulated fitted process, constant term of fitted process,
        1-st arch-coefficient of fitted process, 1-st garch-coefficient of fitted process
    """
    ctx = X.sds_context
    named_inputs = {
        'X': X,
        'kmax': kmax,
        'momentum': momentum,
        'start_stepsize': start_stepsize,
        'end_stepsize': end_stepsize,
        'start_vicinity': start_vicinity,
        'end_vicinity': end_vicinity,
        'sim_seed': sim_seed,
        'verbose': verbose,
    }

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Scalar, Scalar, Scalar)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'garch',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 4
0
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix,
                       **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'multiLogRegPredict' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param B: Forwarded as named input 'B'
    :param Y: Forwarded as named input 'Y'
    :param verbose: (via kwargs) flag specifying if logging information should be printed
    :return: 'MultiReturn' node with three outputs, in order: Matrix, Matrix, Scalar
        (the upstream description mentions a value of accuracy)
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'B': B, 'Y': Y, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    outputs = [
        Matrix(ctx, ''),
        Matrix(ctx, ''),
        Scalar(ctx, ''),
    ]

    op = MultiReturn(ctx,
                     'multiLogRegPredict',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 5
0
    def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
        """ Read a file from disk. Supported types include:
        CSV, Matrix Market(coordinate), Text(i,j,v), SystemDS Binary, etc.
        See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions for more details

        If a file named ``<path>.mtd`` exists it is parsed as JSON and its
        "data_type" entry replaces any ``data_type`` passed in kwargs.

        :param path: Location of the file to read; a str or any ``os.PathLike`` object
        :param kwargs: Named arguments forwarded to the read operation; ``data_type``,
            ``format`` and ``value_type`` are inspected here to choose the node type
        :return: an Operation Node containing the read data; depending on the data type
            it is a Matrix, Frame, Scalar or List, or a plain OperationNode when the
            data type is unknown.
        """
        # Normalize once: `path + ".mtd"` raises TypeError for pathlib.Path and
        # other non-str PathLike objects, although the signature allows them.
        path = os.fspath(path)
        quoted_path = f'"{path}"'

        mtd_filepath = path + ".mtd"
        if os.path.exists(mtd_filepath):
            with open(mtd_filepath) as jspec_file:
                mtd = json.load(jspec_file)
                kwargs["data_type"] = mtd["data_type"]

        data_type = kwargs.get("data_type", None)
        file_format = kwargs.get("format", None)
        if data_type == "matrix":
            kwargs["data_type"] = f'"{data_type}"'
            return Matrix(self, "read", [quoted_path], named_input_nodes=kwargs)
        elif data_type == "frame":
            kwargs["data_type"] = f'"{data_type}"'
            # Only string formats are quoted; other values are passed on unchanged.
            if isinstance(file_format, str):
                kwargs["format"] = f'"{kwargs["format"]}"'
            return Frame(self, "read", [quoted_path], named_input_nodes=kwargs)
        elif data_type == "scalar":
            kwargs["data_type"] = f'"{data_type}"'
            output_type = OutputType.from_str(kwargs.get("value_type", None))
            kwargs["value_type"] = f'"{output_type.name}"'
            return Scalar(self, "read", [quoted_path], named_input_nodes=kwargs, output_type=output_type)
        elif data_type == "list":
            # Reading a list takes no extra arguments, so kwargs are not forwarded.
            return List(self, "read", [quoted_path])

        # Unknown or missing data type: fall back to a generic node and warn.
        kwargs["data_type"] = None
        print("WARNING: Unknown type read please add a mtd file, or specify in arguments")
        return OperationNode(self, "read", [quoted_path], named_input_nodes=kwargs)
Ejemplo n.º 6
0
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix,
                       **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'multiLogRegPredict' builtin.

    :param X: Data Matrix X; its ``sds_context`` is used for every node created here
    :param B: Regression parameters betas
    :param Y: Response vector Y
    :param verbose: (via kwargs) / (no description available upstream)
    :return: 'MultiReturn' node with three outputs (Matrix, Matrix, Scalar), described
        upstream as: matrix m of predicted means/probabilities, predicted response
        vector, scalar value of accuracy
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'B': B, 'Y': Y, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Scalar)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'multiLogRegPredict',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 7
0
 def to_string(self, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> 'Scalar':
     """ Create a 'toString' node that converts this node to a string representation.

     :param kwargs: Forwarded as the fourth positional argument of the `Scalar`
         constructor (presumably its named input nodes -- confirm against `Scalar`)
     :return: `Scalar` with string output type containing the string.
     """
     string_node = Scalar(self.sds_context, 'toString', [self], kwargs,
                          output_type=OutputType.STRING)
     return string_node
Ejemplo n.º 8
0
def gmm(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'gmm' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param n_components: (via kwargs) Number of n_components in the Gaussian mixture model
    :param model: (via kwargs) "VVV": unequal variance (full),each component has its own general covariance matrix
    :param init_param: (via kwargs) initialize weights with "kmeans" or "random"
    :param iterations: (via kwargs) Number of iterations
    :param reg_covar: (via kwargs) regularization parameter for covariance matrix
    :param tol: (via kwargs) tolerance value for convergence
    :return: 'MultiReturn' node with seven outputs, in order:
        Matrix, Matrix, Scalar, Scalar, Matrix, Matrix, Matrix
        (the partial upstream description mentions estimated parameters,
        information criterion for best iteration, and kth class)
    """
    ctx = X.sds_context
    named_inputs = {'X': X, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Scalar, Scalar, Matrix, Matrix, Matrix)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'gmm',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 9
0
    def scalar(self, v: Dict[str, VALID_INPUT_TYPES]) -> Scalar:
        """ Construct a scalar value; this can contain str, float, double, integers and booleans.

        A string that is not already enclosed in a pair of double quotes or a
        pair of single quotes is wrapped in double quotes. The empty string is
        therefore turned into ``""`` (two quote characters).

        :param v: The value the scalar should hold.
        :return: A scalar containing the given value.
        """
        if isinstance(v, str):
            # startswith/endswith are safe on the empty string, whereas
            # indexing v[0] / v[-1] raises IndexError for it.
            double_quoted = v.startswith('"') and v.endswith('"')
            single_quoted = v.startswith("'") and v.endswith("'")
            if not (double_quoted or single_quoted):
                v = f'"{v}"'

        # output type assign simply assigns the given variable to the value
        # therefore the output type is assign.
        return Scalar(self, v, assign=True, output_type=OutputType.from_str(v))
Ejemplo n.º 10
0
def outlierByIQR(X: Matrix, k: float, max_iterations: int,
                 **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'outlierByIQR' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param k: a constant used to discern outliers k*IQR
    :param max_iterations: values: 0 = arbitrary number of iterations until all outliers are removed
        (upstream description is truncated)
    :param isIterative: (via kwargs) iterative repair or single repair
    :param repairMethod: (via kwargs) values: 0 = delete rows having outliers
        (upstream description is truncated)
    :param verbose: (via kwargs) flag specifying if logging information should be printed
    :return: 'MultiReturn' node with six outputs, in order:
        Matrix, Matrix, Matrix, Matrix, Scalar, Scalar
        (the partial upstream description mentions a matrix x with no outliers)
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'k': k, 'max_iterations': max_iterations, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Matrix, Matrix, Scalar, Scalar)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'outlierByIQR',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 11
0
    def var(self, axis: int = None) -> 'OperationNode':
        """Calculate variance of matrix.

        :param axis: 0 builds a 'colVars' node, 1 builds a 'rowVars' node,
            None (the default) builds a 'var' node over the complete input
        :return: `Matrix` node for axis 0 or 1, `Scalar` node when axis is None
        :raises ValueError: if axis is anything other than 0, 1 or None
        """
        if axis is None:
            return Scalar(self.sds_context, 'var', [self])

        if axis == 0:
            op_name = 'colVars'
        elif axis == 1:
            op_name = 'rowVars'
        else:
            raise ValueError(
                f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")

        return Matrix(self.sds_context, op_name, [self])
Ejemplo n.º 12
0
def gmm(X: Matrix, verbose: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'gmm' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param verbose: Forwarded as named input 'verbose'
    :param kwargs: Additional named inputs forwarded to the operation
    :return: 'MultiReturn' node with seven outputs, in order:
        Matrix, Matrix, Scalar, Scalar, Matrix, Matrix, Matrix
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'verbose': verbose, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Scalar, Scalar, Matrix, Matrix, Matrix)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'gmm',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 13
0
def outlierBySd(X: Matrix, max_iterations: int,
                **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'outlierBySd' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param max_iterations: values: 0 = arbitrary number of iteration until all outliers are removed
        (upstream description is truncated)
    :param k: (via kwargs) threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
    :param repairMethod: (via kwargs) values: 0 = delete rows having outliers, 1 = replace outliers as zeros
    :return: 'MultiReturn' node with five outputs, in order:
        Matrix, Matrix, Matrix, Scalar, Scalar
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'max_iterations': max_iterations, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Matrix, Scalar, Scalar)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'outlierBySd',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 14
0
    def mean(self, axis: int = None) -> 'OperationNode':
        """Calculate mean of matrix.

        The return annotation is 'OperationNode' rather than 'Matrix' because a
        `Scalar` node is returned when no axis is given.

        :param axis: 0 builds a 'colMeans' node, 1 builds a 'rowMeans' node,
            None (the default) builds a 'mean' node over the complete input
        :return: `Matrix` node for axis 0 or 1, `Scalar` node when axis is None
        :raises ValueError: if axis is anything other than 0, 1 or None
        """
        if axis == 0:
            return Matrix(self.sds_context, 'colMeans', [self])
        elif axis == 1:
            return Matrix(self.sds_context, 'rowMeans', [self])
        elif axis is None:
            return Scalar(self.sds_context, 'mean', [self])
        raise ValueError(
            f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}"
        )
Ejemplo n.º 15
0
def topk_cleaning(dataTrain: Frame, primitives: Frame, parameters: Frame,
                  evaluationFunc: str, evalFunHp: Matrix,
                  **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'topk_cleaning' builtin.

    All positional parameters are forwarded to the operation under their own
    names; extra keyword arguments are forwarded as additional named inputs.

    :param dataTrain: Input frame; its ``sds_context`` is used for every node created here
    :return: 'MultiReturn' node with six outputs, in order:
        Frame, Matrix, Matrix, Scalar, Matrix, Frame
    """
    ctx = dataTrain.sds_context
    named_inputs = {
        'dataTrain': dataTrain,
        'primitives': primitives,
        'parameters': parameters,
        'evaluationFunc': evaluationFunc,
        'evalFunHp': evalFunHp,
        **kwargs,
    }

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Frame, Matrix, Matrix, Scalar, Matrix, Frame)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'topk_cleaning',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 16
0
def dbscan(X: Matrix,
           **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'dbscan' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param eps: (via kwargs) Maximum distance between two points for one to be considered reachable for the other.
    :param minPts: (via kwargs) Number of points in a neighborhood for a point to be considered as a core point
    :return: 'MultiReturn' node with three outputs, in order: Matrix, Matrix, Scalar
    """
    ctx = X.sds_context
    named_inputs = {'X': X, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    outputs = [
        Matrix(ctx, ''),
        Matrix(ctx, ''),
        Scalar(ctx, ''),
    ]

    op = MultiReturn(ctx, 'dbscan', outputs, named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 17
0
def mice(X: Matrix, cMask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the multi-return operation node for the 'mice' builtin.

    :param X: Forwarded as named input 'X'; its ``sds_context`` is used for every node created here
    :param cMask: Forwarded as named input 'cMask'
    :param iter: (via kwargs) Number of iteration for multiple imputations
    :param threshold: (via kwargs) confidence value [0, 1] for robust imputation, values will
        only be imputed if value has probability greater than threshold, only categorical data
        (reassembled from a garbled upstream description -- confirm against the builtin's documentation)
    :param verbose: (via kwargs) Boolean value.
    :return: 'MultiReturn' node with five outputs, in order:
        Matrix, Matrix, Scalar, Frame, List
        (the upstream description of the outputs is garbled and is not reproduced here)
    """
    ctx = X.sds_context
    named_inputs = {'X': X, 'cMask': cMask, **kwargs}

    # One placeholder per output, created in the order the operation returns them.
    output_types = (Matrix, Matrix, Scalar, Frame, List)
    outputs = [node_type(ctx, '') for node_type in output_types]

    op = MultiReturn(ctx,
                     'mice',
                     outputs,
                     named_input_nodes=named_inputs)

    # Every placeholder receives its own single-element list pointing back at op.
    for node in outputs:
        node._unnamed_input_nodes = [op]

    return op
Ejemplo n.º 18
0
 def nCol(self) -> 'Scalar':
     """Create an 'ncol' operation node with this node as its only input.

     :return: `Scalar` node representing the operation
     """
     ncol_node = Scalar(self.sds_context, 'ncol', [self])
     return ncol_node
Ejemplo n.º 19
0
 def nRow(self) -> 'Scalar':
     """Create an 'nrow' operation node with this node as its only input.

     :return: `Scalar` node representing the operation
     """
     nrow_node = Scalar(self.sds_context, 'nrow', [self])
     return nrow_node
Ejemplo n.º 20
0
 def as_scalar(self) -> Scalar:
     """Create an 'as.scalar' node for this entry of its source list.

     The entry is looked up in ``self._list_source`` under ``self._key``; the
     new node is also stored in the source list's ``_outputs`` under the same key.

     :return: `Scalar` node wrapping the looked-up entry
     """
     source = self._list_source
     entry = source[self._key]
     scalar_node = Scalar(self.sds_context, "as.scalar", [entry])
     # Record the new node on the source list under the same key.
     source._outputs[self._key] = scalar_node
     return scalar_node