Exemple #1
0
 def time_new(self, columns):
     """Benchmark combine_columns with return_result='new' over 1/4 of columns."""
     start = int(columns / 4)
     stop = int(columns / 2)
     base_utils.combine_columns(
         self.large_dataframe_with_many_columns,
         list(range(start, stop)),  # Just 1/4 of columns.
         self.list_of_many_dataframe_columns,
         return_result='new',
         add_index_columns=True,
     )
Exemple #2
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """Combine produced columns back into the inputs.

        When 'replace_index_columns' is enabled together with
        return_result == 'append', produced index columns replace their
        originals in place while all other produced columns are appended;
        otherwise a single combine_columns call honours 'return_result'.

        Raises:
            exceptions.PrimitiveNotFittedError: If produce is called before fit.
        """
        if not self._fitted:
            raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

        assert self._add_semantic_types is not None
        assert self._remove_semantic_types is not None

        columns_to_use, output_columns = self._produce_columns(
            inputs, self._add_semantic_types, self._remove_semantic_types)

        if self.hyperparams['replace_index_columns'] and self.hyperparams[
                'return_result'] == 'append':
            assert len(columns_to_use) == len(output_columns)

            index_columns = inputs.metadata.get_index_columns()

            # Split produced columns into index columns and the rest, so the
            # two groups can be combined with different strategies below.
            index_columns_to_use = []
            other_columns_to_use = []
            index_output_columns = []
            other_output_columns = []
            for column_to_use, output_column in zip(columns_to_use,
                                                    output_columns):
                if column_to_use in index_columns:
                    index_columns_to_use.append(column_to_use)
                    index_output_columns.append(output_column)
                else:
                    other_columns_to_use.append(column_to_use)
                    other_output_columns.append(output_column)

            # Index columns replace their originals...
            outputs = base_utils.combine_columns(
                inputs,
                index_columns_to_use,
                index_output_columns,
                return_result='replace',
                add_index_columns=self.hyperparams['add_index_columns'])
            # ...while the remaining produced columns are appended.
            outputs = base_utils.combine_columns(
                outputs,
                other_columns_to_use,
                other_output_columns,
                return_result='append',
                add_index_columns=self.hyperparams['add_index_columns'])
        else:
            outputs = base_utils.combine_columns(
                inputs,
                columns_to_use,
                output_columns,
                return_result=self.hyperparams['return_result'],
                add_index_columns=self.hyperparams['add_index_columns'])

        return base.CallResult(outputs)
Exemple #3
0
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """Transform the selected input columns with the fitted estimator.

        Returns:
            CallResult wrapping the transformed columns combined back into the
            inputs according to the 'return_result' hyper-parameter.

        Raises:
            PrimitiveNotFittedError: If the underlying estimator is not fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # Fix: unpack dropped columns here instead of calling
        # _get_columns_to_fit a second time before combine_columns.
        sk_inputs, columns_to_use, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.transform(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            target_columns_metadata = self._copy_columns_metadata(inputs.metadata, self._training_indices, self.hyperparams)
            output = self._wrap_predictions(inputs, sk_output, target_columns_metadata)

            # Restore the original names of the transformed columns.
            output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices]
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices + dropped_cols,
                                             columns_list=output)
        return CallResult(outputs)
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """Predict targets for the selected input columns.

        Returns:
            CallResult wrapping the predictions combined with the inputs
            according to the 'return_result' hyper-parameter.

        Raises:
            PrimitiveNotFittedError: If the estimator (or this primitive)
                was never fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.predict(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            # The fitted check comes after predict() for primitives that allow
            # predicting without fitting, like GaussianProcessRegressor.
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(sk_output):
                sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output)
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)

        return CallResult(outputs)
Exemple #5
0
    def produce(self,
                *,
                inputs: Inputs,
                iterations: int = None,
                timeout: float = None) -> base.CallResult[Outputs]:
        """
        Inputs:  ndarray of features
        Returns: Pandas DataFrame Containing predictions

        Raises:
            PrimitiveNotFittedError: If the primitive was not fitted.
        """
        # Inference
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        XTest, columns_to_use = self._select_inputs_columns(inputs)

        # Fix: initialize unconditionally; previously output_columns was only
        # bound inside the if-branch, so an empty column selection raised
        # NameError at the combine_columns call below.
        output_columns = []
        if len(XTest.columns):
            # Prediction
            YpredCCF, _, _ = predictFromCCF(self._CCF, XTest)

            output_columns = [self._wrap_predictions(YpredCCF)]

        outputs = base_utils.combine_columns(
            inputs,
            columns_to_use,
            output_columns,
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'])

        return base.CallResult(outputs)
Exemple #6
0
    def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame. Time series data up to outlier detection.

        Returns:
            Container DataFrame
            1 marks Outliers, 0 marks normal.

        Raises:
            PrimitiveNotFittedError: If the primitive was not fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            # Restrict scoring to the columns selected during fit.
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:

            if self.hyperparams['return_subseq_inds']:

                # Point outlier detectors expose no subsequence indices, so
                # synthesize left/right window bounds from step/window size.
                if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD
                    pred_score = self._clf.decision_function(sk_inputs.values).ravel()
                    left_inds_ = numpy.arange(0, len(pred_score), self.step_size)
                    right_inds_ = left_inds_ + self.window_size
                    # Clamp the final window so it does not run past the data.
                    right_inds_[right_inds_ > len(pred_score)] = len(pred_score)

                else:
                    # Subsequence detectors return (scores, left, right) directly.
                    pred_score, left_inds_, right_inds_ = self._clf.decision_function(sk_inputs.values)

                # print(pred_score.shape, left_inds_.shape, right_inds_.shape)

                # Emit three columns: score, window start index, window end index.
                sk_output = numpy.concatenate((numpy.expand_dims(pred_score, axis=1),
                                               numpy.expand_dims(left_inds_, axis=1),
                                               numpy.expand_dims(right_inds_, axis=1)), axis=1)

            else:
                if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD
                    sk_output = self._clf.decision_function(sk_inputs.values)

                else:
                    # Discard the subsequence indices; only scores are requested.
                    sk_output, _, _ = self._clf.decision_function(sk_inputs.values)

            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)
        return CallResult(outputs)
Exemple #7
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after HPFilter.

        Raises:
            PrimitiveNotFittedError: If column selection failed.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # Column selection happens at produce time; the filter itself is
        # stateless, so no separate fit step is performed.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._hpfilter(sk_inputs,
                                       lamb=self.hyperparams['lamb'])
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)

            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]

        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        return CallResult(outputs)
Exemple #8
0
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame

        Returns:
            Container DataFrame added with absolute and phase value in columns
            named 'column_name_fft_abs' and 'column_name_fft_phse'. These
            values correspond to the absolute and angle values for a complex
            number we get as FFT coefficients.

        Raises:
            PrimitiveNotFittedError: If column selection failed.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # Fix: the assert message referenced an undefined name 'dataframe',
        # which itself raised NameError whenever the assertion failed.
        assert isinstance(inputs, container.DataFrame), type(inputs)

        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            # Rebuild a fresh DataFrame restricted to the selected columns.
            cols = [inputs.columns[x] for x in self._training_indices]
            sk_inputs = container.DataFrame(data=inputs.iloc[:, self._training_indices].values, columns=cols, generate_metadata=True)

        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.produce(sk_inputs)

            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)

        return base.CallResult(outputs)
Exemple #9
0
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame containing hmean of time series

        Raises:
            PrimitiveNotFittedError: If column selection failed.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        self.logger.info('Statistical Hmean  Primitive called')

        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        statistical_hmean_input = inputs
        if self.hyperparams['use_semantic_types']:
            statistical_hmean_input = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            statistical_hmean_output = self._hmean(statistical_hmean_input, self.hyperparams["window_size"])

            if sparse.issparse(statistical_hmean_output):
                statistical_hmean_output = statistical_hmean_output.toarray()
            outputs = self._wrap_predictions(inputs, statistical_hmean_output)

            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)

        self.logger.info('Statistical Hmean  Primitive returned')

        return base.CallResult(outputs)
Exemple #10
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame. Time series data up to Wavelet transform.

        Returns:
            [cA_n, cD_n, cD_n-1, …, cD2, cD1]: Container DataFrame after Wavelet Transformation.
            Ordered frame of coefficients arrays where n denotes the level of decomposition. The first element (cA_n) of the result is approximation coefficients array and the following elements (cD_n - cD_1) are details coefficients arrays.

        Raises:
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # Fix: the assert message printed type(container.DataFrame) — the type
        # of the class object itself — instead of the offending input's type.
        assert isinstance(inputs,
                          container.DataFrame), type(inputs)

        _, self._columns_to_produce = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = inputs.columns

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._columns_to_produce]
        output_columns = []
        if len(self._columns_to_produce) > 0:
            sk_output = self._clf.produce(sk_inputs,
                                          self.hyperparams['inverse'])
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._columns_to_produce,
            columns_list=output_columns)

        return base.CallResult(outputs)
Exemple #11
0
    def produce(self, *, inputs: FileReaderInputs, timeout: float = None, iterations: int = None) -> base.CallResult[FileReaderOutputs]:
        """Read every selected file column and combine the results back in."""
        column_indices = self._get_columns(inputs.metadata)

        produced = []
        for index in column_indices:
            produced.append(self._produce_column(inputs, index))

        outputs = base_utils.combine_columns(
            inputs,
            column_indices,
            produced,
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
        )

        # Appended columns keep boundary metadata pointing at old indices;
        # reassign it to the new column positions.
        if self.hyperparams['return_result'] == 'append':
            outputs.metadata = self._reassign_boundaries(outputs.metadata, column_indices)

        return base.CallResult(outputs)
Exemple #12
0
    def time_replace2(self, columns):
        """Benchmark combine_columns replacing every column of a wide frame."""
        cols = 5000
        column_names = [str(i) for i in range(cols)]
        large_dataframe_with_many_columns = container.DataFrame(
            {name: list(range(5)) for name in column_names},
            columns=column_names,
            generate_metadata=True)
        list_of_many_dataframe_columns = []
        for i in range(int(cols / 2)):
            name = str(i)
            list_of_many_dataframe_columns.append(
                container.DataFrame({name: list(range(5, 1000))},
                                    columns=[name],
                                    generate_metadata=True))

        base_utils.combine_columns(
            large_dataframe_with_many_columns,
            list(range(cols)),  # All of the columns.
            list_of_many_dataframe_columns,
            return_result='replace',
            add_index_columns=True,
        )
Exemple #13
0
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame. Time series data up to scale.

        Returns:
            Container DataFrame after scaling.

        Raises:
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # Fix: the assert message referenced an undefined name 'dataframe',
        # which itself raised NameError whenever the assertion failed.
        assert isinstance(inputs, container.DataFrame), type(inputs)

        _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = inputs.columns
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._columns_to_produce]
        output_columns = []
        if len(self._columns_to_produce) > 0:
            sk_output = self._clf.produce(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._columns_to_produce,
                                             columns_list=output_columns)

        return base.CallResult(outputs)
Exemple #14
0
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame. Time series data up to standardlize.

        Returns:
            Container DataFrame after standardlization.

        Raises:
            PrimitiveNotFittedError: If the primitive was not fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.transform(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)

        return CallResult(outputs)
Exemple #15
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Predict labels with the ArrayFire k-NN over the selected columns.

        Raises:
            PrimitiveNotFittedError: If the primitive was not fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        sk_inputs, columns_to_use = self._get_columns_to_fit(
            inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                af_inputs = af.from_ndarray(sk_inputs.values)
                weight_by_dist = self._weights == 'distance'
                dist_type = self._get_dist_type(self.hyperparams['dist_type'])
                af_output = self._predict(af_inputs, self._data, self._labels,        \
                                          self.hyperparams['n_neighbors'], dist_type, \
                                          weight_by_dist)
                af_ndarray_output = af_output.to_ndarray().astype('int32')
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError(
                    "Primitive not fitted.") from error
            # For primitives that allow predicting without fitting like GaussianProcessRegressor
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(af_ndarray_output):
                af_ndarray_output = af_ndarray_output.toarray()
            output = self._wrap_predictions(inputs, af_ndarray_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._target_column_indices,
            columns_list=output)

        return CallResult(outputs)
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Predict target values for the selected, normalized input columns.

        Raises:
            PrimitiveNotFittedError: If the primitive was not fitted.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs, columns_to_use = self._get_columns_to_fit(
            inputs, self.hyperparams)

        output = []
        if len(sk_inputs.columns):
            af_inputs = af.from_ndarray(sk_inputs.values.astype('float32'))

            # Normalize feature values
            # NOTE(review): 'train_feats' is not defined in this scope, so this
            # branch raises NameError whenever _max_feature_value_defined is
            # False. It presumably should use a maximum stored at fit time —
            # confirm against the fit() implementation.
            if not self._max_feature_value_defined:
                self._max_feature_value = af.max(train_feats)
            af_inputs = af_inputs / self._max_feature_value

            af_output = self._predict(af_inputs, self._weights)
            ndarray_output = af_output.to_ndarray()

            output = self._wrap_predictions(inputs, ndarray_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")

        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._target_column_indices,
            columns_list=output)

        return CallResult(outputs)
Exemple #17
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after Truncated SVD.

        Raises:
            PrimitiveNotFittedError: If fitting was not possible.
            RuntimeError: If no input columns were selected and
                'error_on_no_input' is set.
        """
        # A fresh trmf model is built on every call; produce() both fits and
        # transforms.
        self._clf = trmf(
            lags=self.hyperparams['lags'],
            K=self.hyperparams['K'],
            lambda_f=self.hyperparams['lambda_f'],
            lambda_x=self.hyperparams['lambda_x'],
            lambda_w=self.hyperparams['lambda_w'],
            alpha=self.hyperparams['alpha'],
            eta=self.hyperparams['eta'],
            max_iter=self.hyperparams['max_iter'],
            F_step=self.hyperparams['F_step'],
            X_step=self.hyperparams['X_step'],
            W_step=self.hyperparams['W_step'],
        )

        # Scale each column by its own maximum.
        # NOTE(review): a column whose max is 0 divides by zero here — confirm
        # inputs are expected to be non-degenerate.
        tmp = inputs.copy()
        for col in inputs.columns:
            tmp[col] = inputs[col] / inputs[col].max()

        self._inputs = tmp
        self._fitted = False

        # Get cols to fit.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Fix: Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:

            sk_output = self._clf.get_X()

            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        return CallResult(outputs)
Exemple #18
0
    def test_combine_columns_compact_metadata(self):
        """Exercise ``utils.combine_columns`` with compact metadata across the
        'append', 'new', and 'replace' ``return_result`` modes.

        Builds a five-column ``main`` frame plus two two-column frames
        (``columns2``, ``columns3``) and checks both the combined cell values
        and the resulting per-column metadata for each mode.
        """
        # Five-column main frame; metadata is generated compactly and the
        # first three columns are given explicit metadata names.
        main = container.DataFrame(
            {
                'a1': [1, 2, 3],
                'b1': [4, 5, 6],
                'c1': [7, 8, 9],
                'd1': [10, 11, 12],
                'e1': [13, 14, 15]
            }, {
                'top_level': 'main',
            },
            generate_metadata=False)
        main.metadata = main.metadata.generate(main, compact=True)
        main.metadata = main.metadata.update_column(0, {'name': 'aaa111'})
        main.metadata = main.metadata.update_column(1, {
            'name': 'bbb111',
            'extra': 'b_column'
        })
        main.metadata = main.metadata.update_column(2, {'name': 'ccc111'})

        columns2 = container.DataFrame({
            'a2': [21, 22, 23],
            'b2': [24, 25, 26]
        }, {
            'top_level': 'columns2',
        },
                                       generate_metadata=False)
        columns2.metadata = columns2.metadata.generate(columns2, compact=True)
        columns2.metadata = columns2.metadata.update_column(
            0, {'name': 'aaa222'})
        columns2.metadata = columns2.metadata.update_column(
            1, {'name': 'bbb222'})

        columns3 = container.DataFrame({
            'a3': [31, 32, 33],
            'b3': [34, 35, 36]
        }, {
            'top_level': 'columns3',
        },
                                       generate_metadata=False)
        columns3.metadata = columns3.metadata.generate(columns3, compact=True)
        columns3.metadata = columns3.metadata.update_column(
            0, {'name': 'aaa333'})
        columns3.metadata = columns3.metadata.update_column(
            1, {'name': 'bbb333'})

        # 'append': all new columns are appended after all of main's columns;
        # the given column indices [1, 2] do not affect placement here.
        result = utils.combine_columns(main, [1, 2], [columns2, columns3],
                                       return_result='append',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [1, 4, 7, 10, 13, 21, 24, 31, 34],
            [2, 5, 8, 11, 14, 22, 25, 32, 35],
            [3, 6, 9, 12, 15, 23, 26, 33, 36],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'main',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        9,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa111',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'bbb111',
                    'extra': 'b_column',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'ccc111',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'd1',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 4],
                'metadata': {
                    'name': 'e1',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 5],
                'metadata': {
                    'name': 'aaa222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 6],
                'metadata': {
                    'name': 'bbb222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 7],
                'metadata': {
                    'name': 'aaa333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 8],
                'metadata': {
                    'name': 'bbb333',
                    'structural_type': 'numpy.int64',
                },
            }])

        # 'new': the result contains only the new columns; no index columns
        # are carried over because add_index_columns=False.
        result = utils.combine_columns(main, [1, 2], [columns2, columns3],
                                       return_result='new',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [21, 24, 31, 34],
            [22, 25, 32, 35],
            [23, 26, 33, 36],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'columns2',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        4,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa222',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'bbb222',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'aaa333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'bbb333',
                    'structural_type': 'numpy.int64',
                },
            }])

        # 'replace' of columns [1, 2]: all four new columns are inserted in
        # their place; the remaining main columns follow after them.
        result = utils.combine_columns(main, [1, 2], [columns2, columns3],
                                       return_result='replace',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [1, 21, 24, 31, 34, 10, 13],
            [2, 22, 25, 32, 35, 11, 14],
            [3, 23, 26, 33, 36, 12, 15],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'main',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        7,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa111',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'aaa222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'bbb222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'aaa333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 4],
                'metadata': {
                    'name': 'bbb333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 5],
                'metadata': {
                    'name': 'd1',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 6],
                'metadata': {
                    'name': 'e1',
                    'structural_type': 'numpy.int64',
                },
            }])

        # 'replace' of all main columns: only the new columns remain, but the
        # top-level metadata still comes from main.
        result = utils.combine_columns(main, [0, 1, 2, 3, 4],
                                       [columns2, columns3],
                                       return_result='replace',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [21, 24, 31, 34],
            [22, 25, 32, 35],
            [23, 26, 33, 36],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'main',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        4,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'bbb222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'aaa333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'bbb333',
                    'structural_type': 'numpy.int64',
                },
            }])

        # 'replace' of the last column [4]: the new columns replace it and
        # extend the frame past main's original width.
        result = utils.combine_columns(main, [4], [columns2, columns3],
                                       return_result='replace',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [1, 4, 7, 10, 21, 24, 31, 34],
            [2, 5, 8, 11, 22, 25, 32, 35],
            [3, 6, 9, 12, 23, 26, 33, 36],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'main',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        8,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa111',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'bbb111',
                    'extra': 'b_column',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'ccc111',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'd1',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 4],
                'metadata': {
                    'structural_type': 'numpy.int64',
                    'name': 'aaa222',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 5],
                'metadata': {
                    'structural_type': 'numpy.int64',
                    'name': 'bbb222',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 6],
                'metadata': {
                    'structural_type': 'numpy.int64',
                    'name': 'aaa333',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 7],
                'metadata': {
                    'structural_type': 'numpy.int64',
                    'name': 'bbb333',
                },
            }])

        # 'replace' of alternating columns [0, 2, 4]: new columns are
        # interleaved with the surviving main columns.
        result = utils.combine_columns(main, [0, 2, 4], [columns2, columns3],
                                       return_result='replace',
                                       add_index_columns=False)

        self.assertEqual(result.values.tolist(), [
            [21, 4, 24, 10, 31, 34],
            [22, 5, 25, 11, 32, 35],
            [23, 6, 26, 12, 33, 36],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'top_level':
                    'main',
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        6,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'],
                'metadata': {
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name': 'aaa222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name': 'bbb111',
                    'extra': 'b_column',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 2],
                'metadata': {
                    'name': 'bbb222',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 3],
                'metadata': {
                    'name': 'd1',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 4],
                'metadata': {
                    'name': 'aaa333',
                    'structural_type': 'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 5],
                'metadata': {
                    'name': 'bbb333',
                    'structural_type': 'numpy.int64',
                },
            }])
Exemple #19
0
    def test_combine_columns_new_with_index_noncompact_metadata(self):
        """Check that ``return_result='new'`` with ``add_index_columns=True``
        produces a frame whose index (``d3mIndex``) column is present ahead of
        the new columns, using non-compact metadata."""
        # Main frame: a d3mIndex primary-key column and two attribute columns,
        # with per-column semantic types set explicitly.
        main = container.DataFrame(
            {
                'd3mIndex': [1, 2, 3],
                'b1': [4, 5, 6],
                'c1': [7, 8, 9]
            },
            columns=['d3mIndex', 'b1', 'c1'],
            generate_metadata=False)
        main.metadata = main.metadata.generate(main, compact=False)
        main.metadata = main.metadata.update_column(
            0, {
                'name':
                'd3mIndex',
                'semantic_types': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
                ]
            })
        main.metadata = main.metadata.update_column(
            1, {
                'name':
                'b1',
                'semantic_types':
                ['https://metadata.datadrivendiscovery.org/types/Attribute']
            })
        main.metadata = main.metadata.update_column(
            2, {
                'name':
                'c1',
                'semantic_types':
                ['https://metadata.datadrivendiscovery.org/types/Attribute']
            })

        # New columns frame: its own d3mIndex primary key plus one attribute.
        columns = container.DataFrame({
            'd3mIndex': [1, 2, 3],
            'b2': [4, 5, 6]
        },
                                      columns=['d3mIndex', 'b2'],
                                      generate_metadata=False)
        columns.metadata = columns.metadata.generate(columns, compact=False)
        columns.metadata = columns.metadata.update_column(
            0, {
                'name':
                'd3mIndex',
                'semantic_types': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
                ]
            })
        columns.metadata = columns.metadata.update_column(
            1, {
                'name':
                'b2',
                'semantic_types':
                ['https://metadata.datadrivendiscovery.org/types/Attribute']
            })

        # 'new' with add_index_columns=True: the result is d3mIndex followed
        # by b2, and the column metadata (names, semantic types) is preserved.
        result = utils.combine_columns(main, [], [columns],
                                       return_result='new',
                                       add_index_columns=True)

        self.assertEqual(result.values.tolist(), [
            [1, 4],
            [2, 5],
            [3, 6],
        ])

        self.assertEqual(
            d3m_utils.to_json_structure(
                result.metadata.to_internal_simple_structure()),
            [{
                'selector': [],
                'metadata': {
                    'schema':
                    metadata_base.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    'd3m.container.pandas.DataFrame',
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ],
                        'length':
                        3,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__'],
                'metadata': {
                    'dimension': {
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ],
                        'length':
                        2,
                    },
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 0],
                'metadata': {
                    'name':
                    'd3mIndex',
                    'semantic_types': [
                        'http://schema.org/Integer',
                        'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
                    ],
                    'structural_type':
                    'numpy.int64',
                },
            }, {
                'selector': ['__ALL_ELEMENTS__', 1],
                'metadata': {
                    'name':
                    'b2',
                    'semantic_types': [
                        'https://metadata.datadrivendiscovery.org/types/Attribute'
                    ],
                    'structural_type':
                    'numpy.int64',
                },
            }])
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Inputs]:
        """Add SIMON annotations to the input frame's metadata.

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult -- input frame with augmented metadata
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # The column-combination logic below is adapted from
        # d3m.primitives.schema_discovery.profiler.Common.
        assert self._add_semantic_types is not None
        assert self._remove_semantic_types is not None

        selected, produced = self._produce_columns(
            inputs, self._add_semantic_types, self._remove_semantic_types)

        replace_index = (self.hyperparams['replace_index_columns']
                         and self.hyperparams['return_result'] == 'append')
        if replace_index:
            # Index columns are replaced in place while every other column is
            # appended, so the index keeps its position in the output.
            assert len(selected) == len(produced)

            index_columns = inputs.metadata.get_index_columns()
            pairs = list(zip(selected, produced))
            index_pairs = [(c, o) for c, o in pairs if c in index_columns]
            other_pairs = [(c, o) for c, o in pairs if c not in index_columns]

            combined = base_utils.combine_columns(
                inputs,
                [c for c, _ in index_pairs],
                [o for _, o in index_pairs],
                return_result='replace',
                add_index_columns=self.hyperparams['add_index_columns'])
            combined = base_utils.combine_columns(
                combined,
                [c for c, _ in other_pairs],
                [o for _, o in other_pairs],
                return_result='append',
                add_index_columns=self.hyperparams['add_index_columns'])
        else:
            combined = base_utils.combine_columns(
                inputs,
                selected,
                produced,
                return_result=self.hyperparams['return_result'],
                add_index_columns=self.hyperparams['add_index_columns'])

        return CallResult(combined, has_finished=self._is_fit)
Exemple #21
0
	def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
		"""Compute the Matrix Profile of the selected columns.

		Args:

			inputs: Container DataFrame

			timeout: Default

			iterations: Default

		Raises:

			RuntimeError: if no input columns were selected and
				'error_on_no_input' is enabled.
			PrimitiveNotFittedError: if no input columns were selected.

		Returns:

		    Container DataFrame containing Matrix Profile of selected columns
		"""
		# Get cols to fit.
		self._fitted = False
		self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
		self._input_column_names = self._training_inputs.columns

		if len(self._training_indices) > 0:
			self._fitted = True
		else:
			if self.hyperparams['error_on_no_input']:
				raise RuntimeError("No input columns were selected")
			# logger.warn is a deprecated alias of logger.warning.
			self.logger.warning("No input columns were selected")

		if not self._fitted:
			raise PrimitiveNotFittedError("Primitive not fitted.")

		sk_inputs = inputs
		if self.hyperparams['use_semantic_types']:
			sk_inputs = inputs.iloc[:, self._training_indices]

		# NOTE: the original also had a duplicate "no input columns" branch
		# here; it was unreachable because the check above already raises
		# whenever no columns are selected, so it has been removed.
		output_columns = []
		if len(self._training_indices) > 0:
			sk_output = self._clf.produce(sk_inputs)
			if sparse.issparse(sk_output):
				sk_output = sk_output.toarray()
			outputs = self._wrap_predictions(inputs, sk_output)
			# Restore the original column names when the shapes match.
			if len(outputs.columns) == len(self._input_column_names):
				outputs.columns = self._input_column_names
			output_columns = [outputs]

		outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
							   add_index_columns=self.hyperparams['add_index_columns'],
							   inputs=inputs, column_indices=self._training_indices,
							   columns_list=output_columns)

		return CallResult(outputs)
Exemple #22
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """Produce a binary (one-hot-style) encoding of the selected columns.

            Args:
                inputs: Container DataFrame

            Raises:
                RuntimeError: if no input columns were selected and
                    'error_on_no_input' is enabled.
                PrimitiveNotFittedError: if no input columns were selected.

            Returns:
                Container DataFrame added with binary version of a column a sort of one hot encoding of values under different columns
                named as "column name_category value" for all the columns passed in list while building the pipeline
        """
        # BUG FIX: the assertion message previously referenced an undefined
        # name ('dataframe'), raising NameError instead of AssertionError.
        assert isinstance(inputs, container.DataFrame), type(inputs)

        self._fitted = False
        # Select the columns this primitive should operate on.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # logger.warn is a deprecated alias of logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            # Rebuild a frame restricted to the selected columns, keeping
            # their original names and regenerating metadata.
            cols = [inputs.columns[x] for x in self._training_indices]
            sk_inputs = container.DataFrame(
                data=inputs.iloc[:, self._training_indices].values,
                columns=cols,
                generate_metadata=True)

        # NOTE: the original also had a duplicate "no input columns" branch
        # here; it was unreachable because the check above already raises
        # whenever no columns are selected, so it has been removed.
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.produce(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            output_columns = [outputs]

        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        return base.CallResult(outputs)
Exemple #23
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data by applying the rule-based filter.

        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after the rule-based filter is applied,
            combined with the input columns per the 'return_result'
            hyperparameter.

        Raises:
            RuntimeError: if the column indices embedded in the 'rule'
                hyperparameter do not match the selected columns, or if no
                input columns were selected and 'error_on_no_input' is set.
        """
        # Determine which columns this primitive should operate on.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        # The 'rule' string encodes target column indices as '#<index>#';
        # they must agree exactly with the columns selected for fitting.
        # \d+ (rather than \d*) skips a malformed empty token '##' instead
        # of crashing on int('').
        operated_col = [
            int(x.strip('#'))
            for x in re.findall(r'#\d+#', self.hyperparams['rule'])
        ]
        if set(operated_col) != set(self._training_indices):
            raise RuntimeError(
                "Column numbers in 'rule' and 'use_columns' are not matched.")

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # logger.warning replaces the deprecated logger.warn alias.
            self.logger.warning("No input columns were selected")

        output_columns = []

        if len(self._training_indices) > 0:
            sk_output = self._rule_based_filter(inputs,
                                                self.hyperparams['rule'])
            # Densify sparse results before wrapping them in a container.
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)

            # Preserve the original column names when the shapes line up.
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        # Merge the produced columns back into the input frame according to
        # the 'return_result' policy (new / append / replace).
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        return CallResult(outputs)
Exemple #24
0
    def produce(
        self,
        *,
        inputs: base.FileReaderInputs,
        timeout: float = None,
        iterations: int = None,
    ) -> base_prim.CallResult[base.FileReaderOutputs]:
        """
        Load per-band image files into one stacked image per group.

        Flattens the input dataframe so that each output row holds one
        group's loaded images (a matrix of [band, x, y]), loading the files
        in parallel with joblib, then merges the image column back into a
        deduplicated copy of the input frame.

        Args:
            inputs: Container DataFrame whose rows reference image files
                (one row per band) plus a grouping-key column.

        Returns:
            CallResult wrapping a DataFrame with one row per group and the
            loaded image stack combined per the 'return_result' hyperparam.
        """
        logger.debug(f"Producing {__name__}")

        # Column holding the file references; return the input unchanged if
        # there is nothing to operate on.
        columns_to_use = self._get_columns(inputs.metadata)
        inputs_clone = inputs.copy()
        if len(columns_to_use) == 0:
            return base_prim.CallResult(inputs_clone)
        column_index = columns_to_use[0]

        # Resolve the band column: prefer metadata, fall back to hyperparam.
        band_column_indices = self._get_band_column(inputs.metadata)
        if len(band_column_indices) == 0:
            band_column_name = self.hyperparams["band_column"]
        else:
            band_column_name = inputs.columns[band_column_indices[0]]

        # need to flatten the dataframe, creating a list of files per tile
        grouping_column = self._get_grouping_key_column(inputs_clone)
        if grouping_column < 0:
            self.logger.warning(
                "no columns to use for grouping key so returning loaded images as output"
            )
            return base_prim.CallResult(inputs_clone)

        # Base URI that file references are relative to, per column metadata.
        base_uri = inputs_clone.metadata.query(
            (metadata_base.ALL_ELEMENTS,
             column_index))["location_base_uris"][0]
        grouping_name = inputs_clone.columns[grouping_column]
        file_column_name = inputs_clone.columns[column_index]

        start = time.time()
        logger.debug("Loading images")

        # group by grouping key to get all the images loaded in one row
        groups = inputs_clone.groupby([grouping_name], sort=False)

        # use the max dimension for the first group as the max dimension for all groups
        group_key = groups[grouping_name].first()[0]
        max_dimension = self._get_group_image_size(groups.get_group(group_key),
                                                   file_column_name,
                                                   band_column_name, base_uri)

        # load images for each group and store them in a matrix of [band, x, y]
        jobs = [
            delayed(self._load_image_group)(
                group[1][file_column_name],
                group[1][band_column_name],
                base_uri,
                max_dimension,
            ) for group in tqdm(groups, total=len(groups))
        ]
        groups = Parallel(n_jobs=self.hyperparams["n_jobs"],
                          backend="loky",
                          verbose=10)(jobs)
        end = time.time()
        logger.debug(f"Loaded images in {end-start}s")

        logger.debug("Updating metadata")
        start = time.time()

        # auto-generate metadata for one row's worth of data - necessary to avoid having
        # the generation step traverse all of the data, which is extremely slow
        first_df = container.DataFrame(
            {
                file_column_name: [groups[0]]
            }, generate_metadata=True).reset_index(drop=True)
        # NOTE(review): DataFrame.append was removed in pandas 2.x — confirm
        # the environment pins pandas < 2 or migrate this to pd.concat.
        rest_df = container.DataFrame({file_column_name: groups[1:]})
        grouped_df = first_df.append(rest_df, ignore_index=True)

        # Fix up the row count and mark the flattened column as an image.
        grouped_df.metadata = grouped_df.metadata.update(
            (), {"dimension": {
                "length": grouped_df.shape[0]
            }})
        grouped_df.metadata = grouped_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0), "http://schema.org/ImageObject")
        end = time.time()
        logger.debug(f"Updated metadata in {end-start}s")

        # only keep one row / group from the input - use the first band value to select against
        first_band = inputs_clone[band_column_name][0]
        first_groups = inputs_clone.loc[inputs_clone[band_column_name] ==
                                        first_band].reset_index(drop=True)

        # Merge the grouped image column back into the deduplicated frame.
        outputs = base_utils.combine_columns(
            first_groups,
            [column_index],
            [grouped_df],
            return_result=self.hyperparams["return_result"],
            add_index_columns=self.hyperparams["add_index_columns"],
        )
        if self.hyperparams["return_result"] == "append":
            outputs.metadata = self._reassign_boundaries(
                outputs.metadata, columns_to_use)
        # Row count changed by the dedup above; keep metadata consistent.
        outputs.metadata = outputs.metadata.update(
            (), {"dimension": {
                "length": outputs.shape[0]
            }})

        # If a FloatVector column exists but no LocationPolygon column does,
        # tag the first vector column as a polygon so downstream primitives
        # can locate it.
        polygon_columns = outputs.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/LocationPolygon",
             ))
        vector_columns = outputs.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/FloatVector", ))
        if len(vector_columns) > 0 and len(polygon_columns) == 0:
            outputs.metadata = outputs.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, vector_columns[0]),
                "https://metadata.datadrivendiscovery.org/types/LocationPolygon",
            )

        return base_prim.CallResult(outputs)