def correctTypos(strings: Frame, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `correctTypos` SystemDS operation.

    :param strings: Input frame of strings to correct.
    :param frequency_threshold: Strings that occur above this frequency level
        will not be corrected.
    :param distance_threshold: Max distance at which strings are considered similar.
    :param is_verbose: Print debug information.
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {'strings': strings}
    params_dict.update(kwargs)

    sds = strings.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Frame(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
        Matrix(sds, ''),
        Frame(sds, ''),
    ]

    op = MultiReturn(sds, 'correctTypos', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def executePipeline(pipeline: Frame, Xtrain: Matrix, Ytrain: Matrix, Xtest: Matrix, Ytest: Matrix, metaList: List, hyperParameters: Matrix, flagsCount: int, verbose: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `executePipeline` SystemDS operation.

    :param flagsCount: ---
    :param test: ---
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {
        'pipeline': pipeline,
        'Xtrain': Xtrain,
        'Ytrain': Ytrain,
        'Xtest': Xtest,
        'Ytest': Ytest,
        'metaList': metaList,
        'hyperParameters': hyperParameters,
        'flagsCount': flagsCount,
        'verbose': verbose,
    }
    params_dict.update(kwargs)

    sds = pipeline.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        List(sds, ''),
    ]

    op = MultiReturn(sds, 'executePipeline', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def garch(X: Matrix, kmax: int, momentum: float, start_stepsize: float, end_stepsize: float, start_vicinity: float, end_vicinity: float, sim_seed: int, verbose: bool):
    """
    Build the DAG node for the `garch` SystemDS operation.

    :param X: The input Matrix to apply Arima on.
    :param kmax: Number of iterations.
    :param momentum: Momentum for momentum-gradient descent (set to 0 to deactivate).
    :param start_stepsize: Initial gradient-descent stepsize.
    :param end_stepsize: Gradient-descent stepsize at end (linear descent).
    :param start_vicinity: Proportion of randomness of restart-location for
        gradient descent at beginning.
    :param end_vicinity: Same at end (linear decay).
    :param sim_seed: Seed for simulation of process on fitted coefficients.
    :param verbose: Verbosity, comments during fitting.
    :return: 'OperationNode' containing simulated garch(1,1) process on fitted
        coefficients & variances of simulated fitted process & constant term of
        fitted process & 1-st arch-coefficient of fitted process & 1-st
        garch-coefficient of fitted process & drawbacks: slow convergence of
        optimization (sort of simulated annealing/gradient descent).
    """
    # No **kwargs here: every parameter of the script is required.
    params_dict = {
        'X': X,
        'kmax': kmax,
        'momentum': momentum,
        'start_stepsize': start_stepsize,
        'end_stepsize': end_stepsize,
        'start_vicinity': start_vicinity,
        'end_vicinity': end_vicinity,
        'sim_seed': sim_seed,
        'verbose': verbose,
    }

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'garch', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `multiLogRegPredict` SystemDS operation.

    :param X: Data Matrix X.
    :param B: Regression parameters betas.
    :param Y: Response vector Y.
    :param verbose: /
    :return: 'OperationNode' containing matrix m of predicted
        means/probabilities & predicted response vector & scalar value of
        accuracy.
    """
    params_dict = {'X': X, 'B': B, 'Y': Y}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'multiLogRegPredict', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
# NOTE(review): duplicate definition of `multiLogRegPredict` — at import time
# this redefinition shadows the earlier one in this module. Confirm against
# the generator whether one of the two should be removed.
def multiLogRegPredict(X: Matrix, B: Matrix, Y: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `multiLogRegPredict` SystemDS operation.

    :param verbose: Flag specifying if logging information should be printed.
    :return: 'OperationNode' containing value of accuracy.
    """
    params_dict = {'X': X, 'B': B, 'Y': Y}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'multiLogRegPredict', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def gmm(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `gmm` SystemDS operation.

    :param n_components: Number of n_components in the Gaussian mixture model.
    :param model: "VVV": unequal variance (full), each component has its own
        general covariance matrix.
    :param init_param: Initialize weights with "kmeans" or "random".
    :param iterations: Number of iterations.
    :param reg_covar: Regularization parameter for covariance matrix.
    :param tol: Tolerance value for convergence.
    :return: 'OperationNode' containing of estimated parameters & information
        criterion for best iteration & kth class.
    """
    params_dict = {'X': X}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
    ]

    op = MultiReturn(sds, 'gmm', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def outlierByIQR(X: Matrix, k: float, max_iterations: int, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `outlierByIQR` SystemDS operation.

    :param k: A constant used to discern outliers k*IQR.
    :param isIterative: Iterative repair or single repair.
    :param repairMethod: Values: 0 = delete rows having outliers,
    :param max_iterations: Values: 0 = arbitrary number of iteraition until
        all outliers are removed,
    :param verbose: Flag specifying if logging information should be printed.
    :return: 'OperationNode' containing meaning & matrix x with no outliers.
    """
    params_dict = {'X': X, 'k': k, 'max_iterations': max_iterations}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'outlierByIQR', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
# NOTE(review): duplicate definition of `gmm` (different signature: `verbose`
# is positional here) — at import time this redefinition shadows the earlier
# one in this module. Confirm against the generator whether one of the two
# should be removed.
def gmm(X: Matrix, verbose: bool, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `gmm` SystemDS operation.

    :param X: Input matrix.
    :param verbose: Verbosity flag.
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {'X': X, 'verbose': verbose}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
    ]

    op = MultiReturn(sds, 'gmm', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def outlierBySd(X: Matrix, max_iterations: int, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `outlierBySd` SystemDS operation.

    :param k: Threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively
        (3-sigma rule).
    :param repairMethod: Values: 0 = delete rows having outliers,
        1 = replace outliers as zeros.
    :param max_iterations: Values: 0 = arbitrary number of iteration until
        all outliers are removed,
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {'X': X, 'max_iterations': max_iterations}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'outlierBySd', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def topk_cleaning(dataTrain: Frame, primitives: Frame, parameters: Frame, evaluationFunc: str, evalFunHp: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `topk_cleaning` SystemDS operation.

    :param dataTrain: Training data frame.
    :param primitives: Frame of cleaning primitives.
    :param parameters: Frame of primitive parameters.
    :param evaluationFunc: Name of the evaluation function.
    :param evalFunHp: Hyper-parameters for the evaluation function.
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {
        'dataTrain': dataTrain,
        'primitives': primitives,
        'parameters': parameters,
        'evaluationFunc': evaluationFunc,
        'evalFunHp': evalFunHp,
    }
    params_dict.update(kwargs)

    sds = dataTrain.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Frame(sds, ''),
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Matrix(sds, ''),
        Frame(sds, ''),
    ]

    op = MultiReturn(sds, 'topk_cleaning', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def dbscan(X: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `dbscan` SystemDS operation.

    :param eps: Maximum distance between two points for one to be considered
        reachable for the other.
    :param minPts: Number of points in a neighborhood for a point to be
        considered as a core point.
    :return: 'OperationNode' containing the multi-return operation.
    """
    params_dict = {'X': X}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
    ]

    op = MultiReturn(sds, 'dbscan', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op
def mice(X: Matrix, cMask: Matrix, **kwargs: Dict[str, VALID_INPUT_TYPES]):
    """
    Build the DAG node for the `mice` SystemDS operation.

    :param iter: Number of iteration for multiple imputations.
    :param threshold: Confidence value [0, 1] for robust imputation, values
        will only be imputed
    :param if: value has probability greater than threshold,
    :param only: categorical data
    :param verbose: Boolean value.
    :return: 'OperationNode' containing are represented with empty string
        i.e ",," in csv file & n are storing continuos/numeric data and
        variables with & storing categorical data.
    """
    params_dict = {'X': X, 'cMask': cMask}
    params_dict.update(kwargs)

    sds = X.sds_context
    # Placeholder output nodes; types and order mirror the script's returns.
    output_nodes = [
        Matrix(sds, ''),
        Matrix(sds, ''),
        Scalar(sds, ''),
        Frame(sds, ''),
        List(sds, ''),
    ]

    op = MultiReturn(sds, 'mice', output_nodes,
                     named_input_nodes=params_dict)
    # Register the multi-return op as the producer of every output node.
    for node in output_nodes:
        node._unnamed_input_nodes = [op]
    return op