Example #1
0
    def test_scale(self):
        """Verify the 'scale' transform over a whole column, with and without model parameters."""
        frame = pd.DataFrame({'col1': [1.0, 2.0, 3.0], 'col2': [4.0, 5.0, 6.0]})

        # Case 1: empty model -> full standardization of one column.
        # The transform returns an ndarray.
        result = transform(resolve_full_name('sklearn.preprocessing:scale'), 'all',
                           frame['col1'], None, {}, None)

        self.assertEqual(len(result), 3)

        # A standardized column has zero mean and unit population std.
        self.assertAlmostEqual(result.mean(), 0.0)
        self.assertAlmostEqual(result.std(ddof=0), 1.0)

        # Case 2: model disables std scaling, so only the mean is removed.
        # The population std of centered [4, 5, 6] is sqrt(2/3).
        frame = pd.DataFrame({'col1': [1.0, 2.0, 3.0], 'col2': [4.0, 5.0, 6.0]})
        params = {'with_mean': True, 'with_std': False}
        result = transform(resolve_full_name('sklearn.preprocessing:scale'), 'all',
                           frame['col2'], None, params, None)

        self.assertAlmostEqual(result.mean(), 0.0)
        self.assertAlmostEqual(result.std(ddof=0), 0.816496580927726)
Example #2
0
    def test_UDF(self):
        """Verify user-defined functions applied row-wise: single-column and row inputs, with and without a model."""
        data = {'col1': [1.0, 2.0, 3.0], 'col2': [4.0, 5.0, 6.0]}

        # Single-column (DataFrame) input, empty model.
        frame = pd.DataFrame(data)
        result = transform(resolve_full_name('test_transform:udf1'), 'one',
                           frame[['col2']], None, {}, None)

        self.assertEqual(len(result), 3)

        for idx, expected in enumerate([5.0, 6.0, 7.0]):
            self.assertAlmostEqual(result[idx], expected)

        # Model provided; its parameters are flattened into the UDF call.
        frame = pd.DataFrame(data)
        result = transform(resolve_full_name('test_transform:udf2'), 'one',
                           frame['col2'], None, {'addition': 1.0}, None)

        self.assertEqual(len(result), 3)

        for idx, expected in enumerate([5.0, 6.0, 7.0]):
            self.assertAlmostEqual(result[idx], expected)

        # Whole-row input (two columns), empty model: udf3 sums the row.
        frame = pd.DataFrame(data)
        result = transform(resolve_full_name('test_transform:udf3'), 'one',
                           frame[['col1', 'col2']], None, {}, None)

        self.assertEqual(len(result), 3)

        for idx, expected in enumerate([5.0, 7.0, 9.0]):
            self.assertAlmostEqual(result[idx], expected)

        # Whole-row input with a model: udf4 sums the row plus 'addition'.
        frame = pd.DataFrame(data)
        result = transform(resolve_full_name('test_transform:udf4'), 'one',
                           frame[['col1', 'col2']], None, {'addition': 1.0}, None)

        self.assertEqual(len(result), 3)

        for idx, expected in enumerate([6.0, 8.0, 10.0]):
            self.assertAlmostEqual(result[idx], expected)
Example #3
0
    def evaluate(self):
        """
        Evaluate this column.

        For each concrete definition (the base definition with each extension
        imposed on it): resolve the column function, select input columns,
        prepare a model (de-reference, reuse, or train one), call ``transform``
        and attach the resulting column(s) to the parent table's data frame.
        """
        log.info("  ===> Start evaluating column '{0}'".format(self.id))

        #
        # Stage 1: Ensure that "data" field is ready for applying column operations
        #
        table = self.table.data  # Table the columns will be added to

        #
        # Stage 2: Generate a list of concrete definitions by imposing extensions on the base definition
        # "extensions" field determine family or not.
        #
        concrete_definitions = self.get_definitions()
        num_extensions = len(concrete_definitions)

        for i, definition in enumerate(concrete_definitions):

            #
            # Stage 3. Resolve the function
            #
            func_name = definition.get('function')
            func = resolve_full_name(func_name)
            if not func:
                log.warning(
                    "Cannot resolve user-defined function '{0}'. Skip column definition."
                    .format(func_name))
                # NOTE(review): 'break' aborts ALL remaining definitions of the
                # family although the message says "skip" — confirm whether
                # 'continue' was intended (same applies to every break below).
                break

            scope = definition.get('scope')

            #
            # Stage 4. Prepare input data argument to pass to the function (as the first argument)
            #
            data = table
            inputs = definition.get('inputs')
            if inputs is None:
                inputs = []  # Missing 'inputs' is treated as an empty column list
            inputs = get_columns(inputs, data)
            if inputs is None:
                log.warning(
                    "Error reading column list. Skip column definition.")
                break

            # Validation: check if all explicitly specified columns available
            if not all_columns_exist(inputs, data):
                log.warning(
                    "Not all columns available. Skip column definition.".
                    format())
                break

            # Select only specified columns
            data = data[inputs]

            # Optional cast hint for the data/labels arguments ('ndarray' or frame)
            data_type = definition.get('data_type')

            #
            # Stage 5. Prepare model object to pass to the function (as the second argument)
            # It can be necessary to instantiate the argument object by using the specified class
            # It can be necessary to generate (train) a model (we need some specific logic to determine such a need)
            #
            model_ref = definition.get('model')
            model_type = definition.get('model_type')
            if model_ref and isinstance(model_ref,
                                        str) and model_ref.startswith('$'):
                log.info("Load model from {0}.".format(model_ref))
                model = get_value(
                    model_ref
                )  # De-reference model which can be represented by-reference (if it is a string starting with $)
            else:
                model = model_ref

            # Train a model only when no (de-referenced) model exists and a
            # 'train' section is provided.
            train = definition.get('train')
            if not model and train:

                # 1. Resolve train function
                train_func_name = train.get('function')
                train_func = resolve_full_name(train_func_name)
                if not train_func:
                    log.warning(
                        "Cannot resolve user-defined training function '{0}'. Skip training."
                        .format(train_func_name))
                    break

                # 2. Filter rows for train data
                train_table = table
                train_row_filter = train.get("row_filter")
                if train_row_filter:
                    train_table = apply_row_filter(table, train_row_filter)

                # 3. Select columns to use for training
                train_data = train_table
                train_inputs = train.get('inputs')
                if train_inputs is None:
                    train_inputs = inputs  # Inherit from the 'apply' section
                train_inputs = get_columns(train_inputs, train_data)
                if train_inputs is None:
                    log.warning(
                        "Error reading column list for training. Skip column training."
                    )
                    break

                # Validation: check if all explicitly specified columns available
                if not all_columns_exist(train_inputs, train_data):
                    log.warning(
                        "Not all columns available for training. Skip column definition."
                        .format())
                    break

                # Select only specified columns
                train_data = train_data[train_inputs]

                # 3. Determine labels
                # - no labels at all (no argument is expected) - unsupervised learning
                # - explicitly specified outputs
                # - use output column specified in the transformation (but it has to be already available, e.g., loaded from source data, while the transformation will overwrite it)
                labels = train.get('outputs')
                if not labels:
                    labels = definition.get(
                        'outputs'
                    )  # Same columns as used by the transformation

                if labels:
                    labels = get_columns(labels, table)
                    if labels is None:
                        log.warning(
                            "Error reading column list. Skip column definition."
                        )
                        break
                    train_labels = train_table[
                        labels]  # Select only specified columns
                else:
                    train_labels = None  # Do not pass any labels at all (unsupervised)

                # 4. Retrieve hyper-model
                train_model = train.get('model', {})

                # Cast data argument
                # labels_arg is only bound when train_labels is not None; its
                # use below is guarded by the same condition.
                if data_type == 'ndarray':
                    data_arg = train_data.values
                    if train_labels is not None:
                        labels_arg = train_labels.values
                else:
                    data_arg = train_data
                    if train_labels is not None:
                        labels_arg = train_labels

                # 5. Call the function and generate a model
                if train_labels is None:
                    model = train_func(data_arg, **train_model)
                else:
                    # NOTE(review): train_model defaults to {} above, so this
                    # branch fires only when the 'model' key is explicitly
                    # null in the definition — confirm that is intended.
                    if train_model is None:
                        model = train_func(data_arg, labels_arg)
                    else:
                        model = train_func(data_arg, labels_arg, **train_model)

                # 6. Each time a new model is generated, we store it in the model field of the definition
                if model and model_ref:
                    log.info("Store trained model in {0}.".format(model_ref))
                    set_value(model_ref, model)

            elif not model and not train:
                model = {}  # Default: empty model (no parameters) for the transform call

            #
            # Stage 6. Apply function.
            # Depending on the "scope" the system will organize a loop over records, windows or make single call
            # It also depends on the call options (how and what to pass in data and model arguments, flatten json, ndarry or Series etc.)
            #
            out = transform(func, scope, data, data_type, model, model_type)

            #
            # Stage 7. Post-process the result by renaming the output columns accordingly (some convention is needed to know what output to expect)
            #
            outputs = definition.get('outputs', [])
            if isinstance(
                    outputs, str
            ):  # If a single name is provided (not a list), then we wrap into a list
                outputs = [outputs]
            if not outputs:
                # 'id' shadows the builtin; it is used only locally here.
                id = definition.get('id')
                # TODO: We could use a smarter logic here by finding a parameter of the extension which really changes (is overwritten): inputs, function, outputs, scope, model etc.
                if num_extensions > 1:
                    id = id + '_' + str(i)  # Disambiguate family members by extension index
                outputs.append(id)

            # TODO: There result could be a complex object, while some option (like 'result_path') could provide a path to access it, so we need to be able to retrieve the result (either here or in transform function)
            # TODO: The result can be Series/listndarray(1d or 2d) and we need to convert it to DataFrame by using the original index.
            out = pd.DataFrame(out)  # Result can be ndarray
            # NOTE(review): this inner loop rebinds the outer loop variable 'i';
            # harmless today because the outer enumerate re-binds it on the next
            # iteration, but a different inner name would be safer.
            for i, c in enumerate(out.columns):
                if outputs and i < len(
                        outputs):  # Explicitly specified output column name
                    n = outputs[i]
                else:  # Same name - overwrite input column
                    n = inputs[i]
                table[n] = out[
                    c]  # A column is attached by matching indexes so indexes have to be consistent (the same)

        #
        # Stage 8. Post-process the whole family
        #

        log.info("  <=== Finish evaluating column '{0}'".format(self.id))
Example #4
0
    def evaluate(self):
        """
        Evaluate this column.
        Evaluation logic depends on the operation (definition) kind.

        For each concrete definition (base definition plus extensions):
        determine the operation from the 'window' field, resolve the column
        function, select input columns, prepare the model via
        ``self.prepare_model``, call ``transform`` and attach the resulting
        column(s) to the parent table's data frame.
        """
        log.info("  ---> Start evaluating column '{0}'".format(self.id))

        #
        # Stage 1: Ensure that the data field (with table data) is ready for applying column operations
        #
        table = self.table.data  # Table the columns will be added to

        #
        # Stage 2: Generate a list of concrete definitions by imposing extensions on the base definition
        # "extensions" field determine family or not.
        #
        concrete_definitions = self.get_definitions()
        num_extensions = len(concrete_definitions)

        # Essentially, we evaluate several columns independently
        for i, definition in enumerate(concrete_definitions):

            window = definition.get('window')

            # Derive the default operation from the window size:
            # no window / 'one' / '1' -> per-record, 'all' -> whole table,
            # anything else -> rolling window.
            operation = definition.get('operation')
            if not operation:  # Default
                if window is None or window == 'one' or window == '1':
                    operation = 'calculate'  # Default
                elif window == 'all':
                    operation = 'all'
                else:
                    operation = 'roll'

            #
            # Stage 3. Resolve the function
            #
            func_name = definition.get('function')
            if not func_name:
                # NOTE(review): .format(func_name) is a no-op here (the message
                # has no placeholder), and 'break' aborts the remaining family
                # definitions although the message says "skip" — confirm both
                # (the break remark applies to every break below).
                log.warning("Column function is not specified. Skip column definition.".format(func_name))
                break

            func = resolve_full_name(func_name)
            if not func:
                log.warning("Cannot resolve user-defined function '{0}'. Skip column definition.".format(func_name))
                break

            #
            # Stage 4. Prepare input data argument to pass to the function (as the first argument)
            #
            data = table

            inputs = definition.get('inputs', [])
            inputs = get_columns(inputs, data)
            if inputs is None:
                log.warning("Error reading column list. Skip column definition.")
                break

            # Validation: check if all explicitly specified columns available
            if not all_columns_exist(inputs, data):
                log.warning("Not all columns available. Skip column definition.".format())
                break

            # Select only the specified input columns
            data = data[inputs]

            # Optional cast hint for the data argument ('ndarray' or frame)
            data_type = definition.get('data_type')

            #
            # Stage 5. Prepare model object to pass to the function (as the second argument)
            #
            model_type = definition.get('model_type')
            model = self.prepare_model(definition, inputs)
            if model is None:
                # prepare_model signals failure with None (it is expected to
                # have logged the reason); abandon the family.
                break

            #
            # Stage 6. Apply function.
            # Depending on the "window" the system will organize a loop over records, windows or make single call
            # It also depends on the call options (how and what to pass in data and model arguments, flatten json, ndarry or Series etc.)
            #
            out = transform(func, window, data, data_type, model, model_type)

            #
            # Stage 7. Post-process the result by renaming the output columns accordingly (some convention is needed to know what output to expect)
            #
            outputs = definition.get('outputs', [])
            if isinstance(outputs, str):  # If a single name is provided (not a list), then we wrap into a list
                outputs = [outputs]
            if not outputs:
                # 'id' shadows the builtin; it is used only locally here.
                id = definition.get('id')
                # TODO: We could use a smarter logic here by finding a parameter of the extension which really changes (is overwritten): inputs, function, outputs, window, model etc.
                if num_extensions > 1:
                    id = id + '_' + str(i)  # Disambiguate family members by extension index
                outputs.append(id)

            # TODO: There result could be a complex object, while some option (like 'result_path') could provide a path to access it, so we need to be able to retrieve the result (either here or in transform function)
            # TODO: The result can be Series/listndarray(1d or 2d) and we need to convert it to DataFrame by using the original index.
            out = pd.DataFrame(out)  # Result can be ndarray
            # NOTE(review): this inner loop rebinds the outer loop variable 'i';
            # harmless today because the outer enumerate re-binds it on the next
            # iteration, but a different inner name would be safer.
            for i, c in enumerate(out.columns):
                if outputs and i < len(outputs):  # Explicitly specified output column name
                    n = outputs[i]
                else:  # Same name - overwrite input column
                    n = inputs[i]
                table[n] = out[c]  # A column is attached by matching indexes so indexes have to be consistent (the same)

        #
        # Stage 8. Post-process the whole family
        #

        log.info("  <--- Finish evaluating column '{0}'".format(self.id))