def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """ 
    
        # locate target columns, falling back through weaker target semantic types
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]
 
        X_test = inputs.copy()
        if len(index):
            X_test = X_test.drop(columns=list(inputs)[index[0]])
        if len(target_names):
            X_test = X_test.drop(columns=target_names)
        X_test = X_test.values
        
        # special semi-supervised case - during training, only produce rows with labels
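        # np.flatnonzero on the boolean frame yields the positional indices of
        # the labelled rows, keeping inputs and X_test aligned after selection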
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

        # attach column metadata for the appended cluster label column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = int
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        
        # update dataframe-level metadata: the output has a single tabular column
        # (df_dict_1 is filled in below; df_dict['dimension'] keeps a reference to it)
        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) 
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
        df_dict_1['length'] = 1        
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)
                
        return CallResult(utils_cp.append_columns(inputs, sc_df))
                  
    def _scalar_filter(self, inputs, vector_column):
        max_value = self.hyperparams["maxs"]
        min_value = self.hyperparams["mins"]
        indices = inputs.index.tolist()

        # treat missing bounds as unbounded
        if min_value is None:
            min_value = float("-inf")
        if max_value is None:
            max_value = float("inf")

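        # A row is kept when every element of its vector passes both bound
        # checks (hypothetical example: with mins=0 and maxs=10 the vector
        # [1, 5, 9] passes and [1, 5, 11] fails); the 'inclusive' hyperparameter
        # below decides whether passing rows are kept or dropped.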
        try:
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)

            rows = np.logical_and(
                self._min_comparison_op(
                    rows,
                    min_value,
                ),
                self._max_comparision_op(rows, max_value),
            )
            rows_to_keep = rows.sum(axis=1) == rows.shape[1]
        except ValueError:
            # rows had uneven lengths; fall back to filtering row by row
            rows = inputs.iloc[:, vector_column]

            def _filter_r(row, min_val, max_val):
                return np.logical_and(
                    self._min_comparison_op(
                        row,
                        min_val,
                    ),
                    self._max_comparision_op(
                        row,
                        max_val,
                    ),
                )

            rows = rows.apply(
                _filter_r,
                args=(min_value, max_value),
            )
            rows_to_keep = rows.apply(np.sum) == rows.apply(np.shape).apply(
                np.take, args=([0]))
        if self.hyperparams["inclusive"]:
            rows_to_keep = [
                indices[j] for j in range(len(indices)) if rows_to_keep[j]
            ]
        else:
            rows_to_keep = [
                indices[j] for j in range(len(indices)) if not rows_to_keep[j]
            ]
        return dataframe_utils.select_rows(inputs, rows_to_keep)
Example #3
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data

        Returns
        -------
        Outputs
            The output depends on the required_output hyperparameter: either a dataframe containing a single column
            where each entry is the cluster ID, or the input dataframe with the cluster ID of each row added as an additional feature.
        """

        # find target and index variables
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.copy()
        if len(index):
            X_test = X_test.drop(columns=list(inputs)[index[0]])
        if len(target_names):
            X_test = X_test.drop(columns=target_names)
        X_test = X_test.values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        if self.hyperparams['required_output'] == 'feature':

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=['cluster_labels']))

            # attach column metadata for the appended cluster label column
            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = int
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, hdb_df))
        else:

            hdb_df = d3m_DataFrame(
                pandas.DataFrame(self.clf.fit_predict(X_test),
                                 columns=[target_names[0]]))

            hdb_df = pandas.concat([inputs.d3mIndex, hdb_df], axis=1)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = int
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            col_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = int
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                hdb_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            hdb_df.metadata = hdb_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(hdb_df)
Example #4
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : numpy ndarray of size (number_of_time_series, time_series_length) containing new time series 

        Returns
        -------
        Outputs
            The output is a dataframe containing a single column where each entry is the associated series' cluster number.
        """

        hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
        )['primitive_code']['class_type_arguments']['Hyperparams']
        ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
            hyperparams=hyperparams_class.defaults().replace(
                {"dataframe_resource": "learningData"}))
        metadata_inputs = ds2df_client.produce(inputs=inputs).value

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            formatted_inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            formatted_inputs = d3m_DataFrame(
                ds2df_client.produce(inputs=inputs).value)

        # store information on target, index variable
        targets = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = metadata_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(metadata_inputs)[t] for t in targets]
        index = metadata_inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

        # parse values from output of time series formatter
        n_ts = len(formatted_inputs.d3mIndex.unique())
        if n_ts == formatted_inputs.shape[0]:
            X_test = formatted_inputs.drop(
                columns=list(formatted_inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(formatted_inputs.shape[0] / n_ts)
            X_test = np.array(formatted_inputs.value).reshape(n_ts, ts_sz)
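            # hypothetical shapes: 100 series stored long-form as 5000 rows of
            # length-50 readings reshape to a (100, 50) matrix; this assumes
            # every series has the same length ts_sz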

        # special semi-supervised case - during training, only produce rows with labels
        series = metadata_inputs[target_names] != ''
        if series.any().any():
            metadata_inputs = dataframe_utils.select_rows(
                metadata_inputs, np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sloth_df = d3m_DataFrame(
            pandas.DataFrame(self.clf.fit_predict(X_test),
                             columns=['cluster_labels']))
        # attach column metadata for the appended cluster label column
        col_dict = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = int
        col_dict['name'] = 'cluster_labels'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
            'https://metadata.datadrivendiscovery.org/types/CategoricalData')
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        df_dict = dict(sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sloth_df.metadata = sloth_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(metadata_inputs, sloth_df))
Example #5
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        vector_column = self._get_floatvector_column(inputs.metadata)
        if vector_column is None:
            return base.CallResult(inputs)

        maxs = self.hyperparams["maxs"]
        mins = self.hyperparams["mins"]

        if isinstance(mins, (float, int)):
            return base.CallResult(self._scalar_filter(inputs, vector_column))

        indices = inputs.index.tolist()

        mins = [float("-inf") if i == None else i for i in mins]
        maxs = [float("inf") if i == None else i for i in maxs]


        try:
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)

            filter_length = rows.shape[1]

            rows = np.logical_and(
                self._min_comparison_op(
                    rows[:, :filter_length],
                    mins,
                ),
                self._max_comparision_op(rows[:, :filter_length], maxs),
            )
            rows_to_keep = rows.sum(axis=1) == filter_length
        except ValueError:
            # rows had uneven length
            rows = inputs.iloc[:, vector_column]
            # get length of each vector
            vector_lengths = rows.apply(np.shape).apply(np.take, args=([0]))
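            # (np.shape followed by np.take(..., 0) extracts the length of each
            # row, whether it is a list or an ndarray)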

            filter_lengths = vector_lengths.values
            # need this to loop over lengths array while keeping vectorised
            # apply function over rows
            count_for_ref = [0]

            def _filter_r(row, filter_lengths, mins, maxs, counter):
                # in case fewer filters than row length
                filterable_range = min(filter_lengths[counter[0]], len(mins))

                mins_for_filter = np.array(mins[:filterable_range])
                maxs_for_filter = np.array(maxs[:filterable_range])

                filtered_row = np.logical_and(
                    self._min_comparison_op(row[:filterable_range],
                                            mins_for_filter),
                    self._max_comparision_op(
                        row[:filterable_range],
                        maxs_for_filter,
                    ),
                )
                counter[0] += 1
                return filtered_row

            rows = rows.apply(
                _filter_r,
                args=(filter_lengths, mins, maxs, count_for_ref),
            )
            rows_to_keep = rows.apply(np.sum).values == filter_lengths

        if self.hyperparams["inclusive"]:
            indices_to_keep = [
                indices[j] for j in range(len(indices)) if rows_to_keep[j]
            ]
        else:
            indices_to_keep = [
                indices[j] for j in range(len(indices)) if not rows_to_keep[j]
            ]

        outputs = dataframe_utils.select_rows(inputs, indices_to_keep)

        return base.CallResult(outputs)
예제 #6
0
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None
                ) -> CallResult[container.pandas.DataFrame]:
        """
        Parameters
        ----------
        inputs : D3M dataframe with associated metadata.

        Returns
        -------
        Outputs
            For unsupervised problems: the output is a dataframe containing the primary key and each series' predicted cluster number.
            For semi-supervised problems: the output is the input dataframe with an additional 'cluster_labels' feature.
        """

        # store information on target, index variable
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        # load and reshape training data
        n_ts = len(inputs.d3mIndex.unique())
        if n_ts == inputs.shape[0]:
            X_test = inputs.drop(columns=list(inputs)[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            ts_sz = int(inputs.shape[0] / n_ts)
            X_test = np.array(inputs.value).reshape(n_ts, ts_sz, 1)
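            # reshape to (n_series, series_length, 1); the trailing singleton
            # dimension is presumably what the underlying time-series k-means
            # implementation (e.g. tslearn) expects for univariate series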

        # special semi-supervised case - during training, only produce rows with labels
        if self.clustering:

            sloth_df = d3m_DataFrame(
                pandas.DataFrame(self._kmeans.predict(X_test),
                                 columns=[target_names[0]]))

            sloth_df = pandas.concat([inputs.d3mIndex, sloth_df], axis=1)

            # first column ('d3mIndex')

            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = type("1")
            col_dict['name'] = index_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)

            # second column ('Class')
            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
            col_dict['structural_type'] = type("1")
            col_dict['name'] = target_names[0]
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 1), col_dict)

            df_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 2
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(sloth_df)

        else:
            series = inputs[target_names] != ''
            if series.any().any():
                inputs = dataframe_utils.select_rows(inputs,
                                                     np.flatnonzero(series))
                X_test = X_test[np.flatnonzero(series)]

            sloth_df = d3m_DataFrame(
                pandas.DataFrame(self._kmeans.predict(X_test),
                                 columns=['cluster_labels']))

            # add clusters as a feature in the main dataframe - last column ('clusters')
            col_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
            col_dict['structural_type'] = int
            col_dict['name'] = 'cluster_labels'
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute',
                'https://metadata.datadrivendiscovery.org/types/CategoricalData'
            )
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, 0), col_dict)
            df_dict = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict_1 = dict(
                sloth_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
            df_dict['dimension'] = df_dict_1
            df_dict_1['name'] = 'columns'
            df_dict_1['semantic_types'] = (
                'https://metadata.datadrivendiscovery.org/types/TabularColumn',
            )
            df_dict_1['length'] = 1
            sloth_df.metadata = sloth_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, ), df_dict)

            return CallResult(utils_cp.append_columns(inputs, sloth_df))