Example no. 1
class MetafeatureExtractor(FeaturizationTransformerPrimitiveBase[Inputs,
                                                                 Outputs,
                                                                 Hyperparams]):
    """
    A primitive which takes a DataFrame and computes metafeatures on the data.
    The target column is identified by the 'https://metadata.datadrivendiscovery.org/types/TrueTarget' label in its 'semantic_types' metadata;
    otherwise, the primitive assumes there is no target column and returns only metafeatures that do not involve targets.
    If the DataFrame metadata does not include semantic type labels for each column, columns are classified as CATEGORICAL or NUMERIC according
    to their dtype: int and float are NUMERIC, all others are CATEGORICAL.
    Metafeatures are stored in the DataFrame's metadata object, and the DataFrame itself is returned unchanged.
    """

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '28d12214-8cb0-4ac0-8946-d31fcbcd4142',
        'version':
        __version__,
        'name':
        'Dataset Metafeature Extraction',
        'source': {
            'name': 'byu-dml',
            'contact': 'mailto:[email protected]',
            'uris': ['https://github.com/byu-dml/d3m-primitives']
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'byudml',
            'version': __version__,
        }],
        'location_uris': [
            'https://github.com/byu-dml/d3m-primitives/blob/master/byudml/metafeature_extraction/metafeature_extraction.py'
        ],
        'python_path':
        __metafeature_path__,
        'primitive_family':
        metadata_base.PrimitiveFamily.METALEARNING,
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
            metadata_base.PrimitiveAlgorithmType.STATISTICAL_MOMENT_ANALYSIS,
            metadata_base.PrimitiveAlgorithmType.INFORMATION_THEORETIC_METAFEATURE_EXTRACTION,
            # metadata_base.PrimitiveAlgorithmType.LANDMARKING_METAFEATURE_EXTRACTION, # TODO
            # metadata_base.PrimitiveAlgorithmType.MODEL_BASED_METAFEATURE_EXTRACTION, # TODO
            metadata_base.PrimitiveAlgorithmType.STATISTICAL_METAFEATURE_EXTRACTION,
        ],
    })

    _mapping_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'metalearn_to_d3m_map.json')

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str,
                                           DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

    # prepare the data, target_series, and column_types arguments necessary for metafeature computation
    def _get_data_for_metafeature_computation(self, metadata, data):
        column_types = {}
        target_col_names = []
        target_series = None
        for col_pos, column_name in enumerate(data.columns):
            column_metadata = metadata.query_column(col_pos)
            semantic_types = column_metadata.get('semantic_types', tuple())
            column_name = column_metadata.get('name', column_name)
            if not self._remove_redacted_column(data, column_name,
                                                semantic_types):
                self._update_column_type(data, column_name, semantic_types,
                                         column_types)
                self._append_target_column_name(column_name, semantic_types,
                                                target_col_names)
        if INDEX_COLUMN_NAME in data.columns:
            data.drop(INDEX_COLUMN_NAME, axis=1, inplace=True)
            del column_types[INDEX_COLUMN_NAME]
        if len(target_col_names) == 1:
            target_series = data[target_col_names[0]]
            data.drop(target_col_names[0], axis=1, inplace=True)
        elif len(target_col_names) > 1:
            self.logger.warning(
                f'\nWARNING: Target dependent metafeatures are not supported for multi-label datasets and will not be computed\n'
            )
        return data, target_series, column_types

    def _d3m_metafeature_name_to_metalearn_functions(self,
                                                     d3m_metafeature_name):
        metalearn_functions = []
        mapping = json.load(open(self._mapping_file_path))
        for function_name, properties in mapping.items():
            metafeature_name = properties['data_metafeatures_path'].split(
                '.')[0]
            if metafeature_name == d3m_metafeature_name:
                metalearn_functions.append(function_name)
        return metalearn_functions

    # recursively adds a value to a dictionary given a series of one or more keys
    def _place_value(self, dictionary, path, value):
        if len(path) == 0:
            return value
        sub_dict = dictionary.get(path[0], {})
        dictionary[path[0]] = self._place_value(sub_dict, path[1:], value)
        return dictionary

    # parses the mapping file to obtain a list of all the metalearn metafeatures that are classified as inexpensive
    def _get_inexpensive_subset(self):
        inexpensive_subset = []
        mapping = json.load(open(self._mapping_file_path))
        for key, value in mapping.items():
            if value['computation_time'] == 'inexpensive':
                d3m_metafeature_name = value['data_metafeatures_path'].split(
                    '.')[0]
                if d3m_metafeature_name not in inexpensive_subset:
                    inexpensive_subset.append(d3m_metafeature_name)
        return inexpensive_subset

    # returns the user's desired metafeature set according to hyperparam object
    def _get_metafeatures_to_compute(self):
        if self.hyperparams['metafeature_subset'] == 'CUSTOM':
            return self.hyperparams['metafeatures_to_compute']
        elif self.hyperparams['metafeature_subset'] == 'INEXPENSIVE':
            return self._get_inexpensive_subset()
        elif self.hyperparams['metafeature_subset'] == 'ALL':
            # Just get every metafeature name in the mapping
            mapping = json.load(open(self._mapping_file_path))
            return [
                mf_obj['data_metafeatures_path'].split('.')[0]
                for mf_obj in mapping.values()
            ]

    def _get_landmarking_metafeatures(self):
        landmarking_mfs = []
        mapping = json.load(open(self._mapping_file_path))
        for key, value in mapping.items():
            if 'landmarking' in value:
                if value['landmarking'] == True:
                    landmarking_mfs.append(key)
        return landmarking_mfs

    # set the 'primitive' and 'random_seed' fields for metafeatures whose results could vary depending on implementation
    def _set_implementation_fields(self, data_metafeatures,
                                   data_metafeatures_path):
        landmarking_name = data_metafeatures_path[0]
        if landmarking_name not in data_metafeatures:
            primitive_field_path = [landmarking_name, 'primitive']
            random_seed_field_path = [landmarking_name, 'random_seed']
            primitive_field_val = {
                'id': self.metadata.query()['id'],
                'version': __version__,
                'python_path': self.metadata.query()['python_path'],
                'name': self.metadata.query()['name']
            }
            if 'digest' in self.metadata.query():
                primitive_field_val['digest'] = self.metadata.query()['digest']
            random_seed_field_val = self.random_seed
            data_metafeatures = self._place_value(data_metafeatures,
                                                  primitive_field_path,
                                                  primitive_field_val)
            data_metafeatures = self._place_value(data_metafeatures,
                                                  random_seed_field_path,
                                                  random_seed_field_val)
        return data_metafeatures

    # populate metadata with metafeatures and return it
    def _populate_metadata(
        self,
        metafeatures,
        metadata,
    ):
        dataframe_metadata = dict(metadata.query((), ))
        data_metafeatures = dataframe_metadata.get('data_metafeatures', {})
        mapping = json.load(open(self._mapping_file_path))
        for column_name in metafeatures.columns:
            if column_name[-4:] != 'Time':
                data_metafeatures_path = mapping[column_name][
                    'data_metafeatures_path'].split('.')
                metafeature_val = metafeatures[column_name][0]
                if pd.notna(metafeature_val) and metafeature_val not in (
                        mf_consts.TIMEOUT, mf_consts.NO_TARGETS,
                        mf_consts.NUMERIC_TARGETS):
                    if column_name in self._get_landmarking_metafeatures():
                        data_metafeatures = self._set_implementation_fields(
                            data_metafeatures, data_metafeatures_path)
                    if mapping[column_name]['required_type'] == 'integer':
                        metafeature_val = int(metafeature_val)
                    data_metafeatures = self._place_value(
                        data_metafeatures, data_metafeatures_path,
                        metafeature_val)
        dataframe_metadata['data_metafeatures'] = data_metafeatures
        if 'schema' not in dataframe_metadata:
            dataframe_metadata[
                'schema'] = 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json'
        if 'structural_type' not in dataframe_metadata:
            dataframe_metadata['structural_type'] = DataFrame
        metadata = metadata.update((), dataframe_metadata)
        return metadata

    # given a d3m DataFrame, return it with the computed metafeatures (specified by the hyperparams) added to its metadata
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not isinstance(inputs, DataFrame):
            raise ValueError(
                'inputs must be an instance of \'d3m.container.pandas.DataFrame\''
            )
        metadata = self._produce(inputs.metadata, copy.copy(inputs))

        inputs.metadata = metadata.generate(inputs)

        return CallResult(inputs)

    # add the column types to the column_types dict and convert the column to the appropriate data types if necessary
    def _update_column_type(self, data, column_name, semantic_types,
                            column_types):
        if ('http://schema.org/Float' in semantic_types
                or 'http://schema.org/Integer' in semantic_types) \
                and 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in semantic_types:
            column_types[column_name] = mf_consts.NUMERIC
            actual_type = str(data[column_name].dtype)
            if 'int' not in actual_type and 'float' not in actual_type:
                data[column_name] = pd.to_numeric(data[column_name])
        else:
            column_types[column_name] = mf_consts.CATEGORICAL

    # remove redacted column from data by checking if it has one of the redacted semantic types
    def _remove_redacted_column(self, data, column_name, semantic_types):
        if 'https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData' in semantic_types \
            or 'https://metadata.datadrivendiscovery.org/types/RedactedTarget' in semantic_types:
            data.drop(column_name, axis=1, inplace=True)
            return True
        return False

    # check if a column is a target and if so add it to the target_col_names list
    def _append_target_column_name(self, column_name, semantic_types,
                                   target_col_names):
        if 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in semantic_types:
            target_col_names.append(column_name)

    def _produce(self, metadata, data):
        # get data related inputs for the metafeature computation
        data, target_series, column_types = self._get_data_for_metafeature_computation(
            metadata, data)

        # translate d3m metafeature names to metalearn names
        d3m_metafeatures_to_compute = self._get_metafeatures_to_compute()
        if d3m_metafeatures_to_compute is not None:
            metalearn_metafeatures_to_compute = []
            for mf in d3m_metafeatures_to_compute:
                metalearn_functions = self._d3m_metafeature_name_to_metalearn_functions(
                    mf)
                metalearn_metafeatures_to_compute.extend(metalearn_functions)
        else:
            metalearn_metafeatures_to_compute = None

        # compute metafeatures and return in metadata
        metafeatures = Metafeatures().compute(
            data,
            target_series,
            column_types=column_types,
            metafeature_ids=metalearn_metafeatures_to_compute,
            seed=self.random_seed)
        metafeature_df = pd.DataFrame.from_dict([{
            mf: metafeatures[mf][mf_consts.VALUE_KEY]
            for mf in metafeatures
        }])
        metadata = self._populate_metadata(metafeature_df, metadata)
        return metadata
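
A minimal usage sketch for the primitive above (not part of the original module; assumes the MetafeatureExtractor and Hyperparams classes defined above are in scope): build a small d3m container DataFrame with generated metadata, run produce, and read the computed metafeatures back out of the DataFrame metadata.

# Hypothetical usage sketch; assumes the class definitions above plus the d3m
# container package.
import pandas as pd
from d3m import container

df = container.DataFrame(
    pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['x', 'y', 'x']}),
    generate_metadata=True,
)

extractor = MetafeatureExtractor(hyperparams=Hyperparams.defaults())
result = extractor.produce(inputs=df)

# The DataFrame itself is returned unchanged; metafeatures live in its metadata.
print(result.value.metadata.query(()).get('data_metafeatures', {}))
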
class RandomSamplingImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                             Params,
                                                             Hyperparams]):
    """
    This imputes missing values in a DataFrame by independently sampling known values from each column. If the training
    data has no known values in a particular column, no values are imputed for it. Alternatively, columns with missing values
    can be dropped. By default, columns consisting entirely of missing values are dropped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id':
        'ebfeb6f0-e366-4082-b1a7-602fd50acc96',
        'version':
        __version__,
        'name':
        'Random Sampling Imputer',
        'source': {
            'name': 'byu-dml',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/byu-dml/d3m-primitives',
            ]
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'byudml',
            'version': __version__,
        }],
        'location_uris': [
            'https://github.com/byu-dml/d3m-primitives/blob/master/byudml/imputer/random_sampling_imputer.py',
        ],
        'python_path':
        __imputer_path__,
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.IMPUTATION,
        ],
        'effects': [
            # not the case if empty columns are just ignored
            metadata_base.PrimitiveEffect.NO_MISSING_VALUES,
        ]
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._random_state = np.random.RandomState(self.random_seed)

        self._fitted: bool = False
        self._training_inputs: Inputs = None
        self._known_values = None
        self._drop_cols = None
        self._drop_col_indices = None

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._fitted = False
        self._training_inputs = inputs
        self._known_values = []
        self._drop_cols = []
        self._drop_col_indices = []

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None:
            raise d3m_exceptions.MissingValueError(
                'set_training_data must be called before fit')

        # operate on columns by index, not name
        for i, (col_name, col) in enumerate(self._training_inputs.items()):
            drop_col = False
            if self.hyperparams['drop_missing_values']:
                if self.hyperparams['how'] == 'all' and col.isnull().all():
                    drop_col = True
                elif self.hyperparams['how'] == 'any' and col.isnull().any():
                    drop_col = True
            self._drop_cols.append(drop_col)
            if drop_col:
                self._drop_col_indices.append(i)

            col_known_values = None
            if not drop_col:
                col_known_values = col.dropna(axis=0, how='any').tolist()
            self._known_values.append(col_known_values)

        self._fitted = True
        self._training_inputs = None  # free memory

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            raise d3m_exceptions.PrimitiveNotFittedError(
                'fit must be called before produce')

        if inputs.shape[1] != len(self._known_values):
            raise d3m_exceptions.DimensionalityMismatchError(
                'The number of input columns does not match the training data: {} != {}'
                .format(inputs.shape[1], len(self._known_values)))

        outputs = inputs.copy()
        for i, (col_name, col) in enumerate(inputs.items()):
            if self._drop_cols[i]:
                assert self._known_values[i] is None
            else:
                indices_of_missing_values = col.isnull()
                n_missing = indices_of_missing_values.sum()
                n_known = len(self._known_values[i])
                if n_missing > 0 and n_known > 0:  # n_known == 0 implies drop_missing_values == False
                    outputs.loc[indices_of_missing_values,
                                col_name] = self._random_state.choice(
                                    self._known_values[i],
                                    n_missing,
                                    replace=True)
                    # TODO: update column metadata?

        outputs = outputs.remove_columns(self._drop_col_indices)

        # TODO: update global metadata if any values were imputed?

        return CallResult(outputs)

    def get_params(self) -> Params:
        if not self._fitted:
            raise d3m_exceptions.PrimitiveNotFittedError(
                'fit must be called before get_params')
        return Params(known_values=self._known_values,
                      drop_cols=self._drop_cols,
                      drop_col_indices=self._drop_col_indices)

    def set_params(self, *, params: Params) -> None:
        self._fitted = True
        self._training_inputs = None
        self._known_values = params['known_values']
        self._drop_cols = params['drop_cols']
        self._drop_col_indices = params['drop_col_indices']
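
A minimal fit/produce sketch (not from the original file; assumes the RandomSamplingImputer and Hyperparams classes defined above): train on a DataFrame containing gaps, then impute the same frame.

# Hypothetical usage sketch; assumes the class definitions above and the d3m
# container package.
import numpy as np
import pandas as pd
from d3m import container

train = container.DataFrame(
    pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, np.nan]}),
    generate_metadata=True,
)

imputer = RandomSamplingImputer(hyperparams=Hyperparams.defaults())
imputer.set_training_data(inputs=train)
imputer.fit()

# Per the docstring, the default hyperparams drop columns made up entirely of
# missing values, so column 'b' is removed; the gap in 'a' is filled by
# sampling from the known values [1.0, 3.0].
imputed = imputer.produce(inputs=train).value
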
class TimeSeriesFormatterPrimitive(
        transformer.TransformerPrimitiveBase[container.Dataset,
                                             container.Dataset, Hyperparams]):
    """
    Reads the time series files from a given column in an input dataset resource into a new M x N data resource,
    where each value in a time series occupies one of the M rows. Each row has N columns, representing the union of
    the fields found in the time series files and in the main data resource. The loading process assumes that
    each series file has an identical set of timestamps. The `GroupingKey` semantic type will be added to the
    column that contains the file names, and the time column will be marked with the `Time` semantic type.

    Example output::

        filename    | time      | value     | label     |
        -------------------------------------------------
        f1.csv      | 0         | 0.1       | alpha     |
        f1.csv      | 1         | 0.12      | alpha     |
        f1.csv      | 2         | 0.13      | alpha     |
        f2.csv      | 0         | 0.72      | bravo     |
        f2.csv      | 1         | 0.77      | bravo     |
        f2.csv      | 2         | 0.67      | bravo     |
    """

    _semantic_types = (
        "https://metadata.datadrivendiscovery.org/types/FileName",
        "https://metadata.datadrivendiscovery.org/types/Timeseries",
        "http://schema.org/Text",
        "https://metadata.datadrivendiscovery.org/types/Attribute",
    )
    _media_types = ("text/csv", )
    _resource_id = "learningData"

    __author__ = ("Uncharted Software", )
    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "6a1ce3ee-ee70-428b-b1ff-0490bdb23023",
        "version":
        version.__version__,
        "name":
        "Time series formatter",
        "python_path":
        "d3m.primitives.data_transformation.time_series_formatter.DistilTimeSeriesFormatter",
        "keywords": ["series", "reader", "csv"],
        "source": {
            "name":
            "Distil",
            "contact":
            "mailto:[email protected]",
            "uris": [
                "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/time_series_formatter.py",
                "https://gitlab.com/uncharted-distil/distil-primitives",
            ],
        },
        "installation": [
            CYTHON_DEP,
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives"
                .format(git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        "supported_media_types":
        _media_types,
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        # if this is a single resource dataset we don't need to reformat it
        if len(inputs) < 2:
            return base.CallResult(inputs)

        # find the main resource if supplied, infer if not
        main_resource_id, main_resource = base_utils.get_tabular_resource(
            inputs, self.hyperparams["main_resource_id"])
        if main_resource_id is None:
            raise exceptions.InvalidArgumentValueError(
                "no main resource specified")

        # find the csv file column resource if supplied, infer if not
        file_index = self.hyperparams["file_col_index"]
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, main_resource_id,
                                            file_index):
                raise exceptions.InvalidArgumentValueError(
                    "column idx=" + str(file_index) +
                    " from does not contain csv file names")
        else:
            file_index = self._find_csv_file_column(inputs.metadata,
                                                    main_resource_id)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    "no column from contains csv file names")

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_id,
                                        file_index)
        csv_paths = [
            os.path.join(base_path, local_path)
            for local_path in inputs[main_resource_id].iloc[:, file_index]
        ]
        new_dfs = [pd.read_csv(path) for path in csv_paths]
        original_dfs = [
            pd.DataFrame(
                np.tile(row, (df.shape[0], 1)),
                columns=inputs[main_resource_id].columns,
                index=df.index,
            ) for row, df in zip(inputs[main_resource_id].values, new_dfs)
        ]
        combined_dfs = [
            original_df.join(new_df)
            for original_df, new_df in zip(original_dfs, new_dfs)
        ]
        output_data = pd.concat(combined_dfs)
        timeseries_dataframe = container.DataFrame(output_data)
        timeseries_dataframe.reset_index(drop=True, inplace=True)

        # make sure that all timeseries have the same length; most downstream tasks will appreciate this.
        if self.hyperparams["equal_length"]:
            min_length = (timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).count().min().
                          values[0])
            group_count = timeseries_dataframe.groupby(
                timeseries_dataframe.columns[file_index]).cumcount()
            timeseries_dataframe = timeseries_dataframe.assign(
                group_count=group_count)
            timeseries_dataframe = timeseries_dataframe[
                timeseries_dataframe["group_count"] < min_length]
            timeseries_dataframe = timeseries_dataframe.drop(["group_count"],
                                                             axis=1)

        # create a dataset to hold the result
        timeseries_dataset = container.Dataset(
            {self._resource_id: timeseries_dataframe}, generate_metadata=True)
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"id": inputs.metadata.query(())["id"]})
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (), {"digest": inputs.metadata.query(())["digest"]})

        # copy main resource column metadata to timeseries dataframe
        num_main_resource_cols = inputs.metadata.query(
            (main_resource_id,
             metadata_base.ALL_ELEMENTS))["dimension"]["length"]
        for i in range(num_main_resource_cols):
            source = inputs.metadata.query(
                (main_resource_id, metadata_base.ALL_ELEMENTS, i))
            timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                i, source, at=(self._resource_id, ))

        # remove the foreign key entry from the filename column if it exists
        metadata = dict(
            timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS, file_index)))
        metadata["foreign_key"] = metadata_base.NO_VALUE
        timeseries_dataset.metadata = timeseries_dataset.metadata.update(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            metadata)

        # copy timeseries column metadata to timeseries if it's available in the metadata (which is not necessarily true anymore)
        source = self._find_timeseries_metadata(inputs)
        i = 0
        start_idx = 0
        if source is not None:
            for col_info in source["file_columns"]:
                timeseries_dataset.metadata = timeseries_dataset.metadata.update_column(
                    i + num_main_resource_cols,
                    col_info,
                    at=(self._resource_id, ))
                i += 1
            # flag all other columns as attributes
            start_idx = i + num_main_resource_cols
        else:
            # loop over the appended time series columns
            start_idx = original_dfs[0].shape[1]

        for i in range(start_idx, timeseries_dataframe.shape[1]):
            timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
                (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            struct_type = timeseries_dataset.metadata.query(
                (self._resource_id, metadata_base.ALL_ELEMENTS,
                 i))["structural_type"]
            if struct_type == np.float64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Float",
                    ))
            elif struct_type == np.int64:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Integer",
                    ))
            else:
                timeseries_dataset.metadata = (
                    timeseries_dataset.metadata.add_semantic_type(
                        (self._resource_id, metadata_base.ALL_ELEMENTS, i),
                        "http://schema.org/Text",
                    ))

        # mark the filename column as a grouping key
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS, file_index),
            "https://metadata.datadrivendiscovery.org/types/GroupingKey",
        )

        # mark the d3mIndex as a primary multi-key since there are now multiple instances of the value present
        primary_index_col = (
            timeseries_dataset.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",
                 ),
                at=(self._resource_id, ),
            ))
        timeseries_dataset.metadata = timeseries_dataset.metadata.remove_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        timeseries_dataset.metadata = timeseries_dataset.metadata.add_semantic_type(
            (self._resource_id, metadata_base.ALL_ELEMENTS,
             primary_index_col[0]),
            "https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey",
        )

        return base.CallResult(timeseries_dataset)

    @classmethod
    def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                              res_id: str) -> typing.Optional[int]:
        indices = inputs_metadata.list_columns_with_semantic_types(
            cls._semantic_types, at=(res_id, ))
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, res_id, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            res_id: str, column_index: int) -> bool:
        # check to see if a given column is a file pointer that points to a csv file
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata["structural_type"] != str:
            return False

        # check if a foreign key exists
        if "foreign_key" not in column_metadata:
            return False

        ref_col_index = column_metadata["foreign_key"]["column_index"]
        ref_res_id = column_metadata["foreign_key"]["resource_id"]

        return cls._is_csv_file_reference(inputs_metadata, ref_res_id,
                                          ref_col_index)

    @classmethod
    def _is_csv_file_reference(cls,
                               inputs_metadata: metadata_base.DataMetadata,
                               res_id: int, column_index: int) -> bool:
        # check to see if the column is a csv resource
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata["structural_type"] != str:
            return False

        semantic_types = column_metadata.get("semantic_types", [])
        media_types = column_metadata.get("media_types", [])

        semantic_types_set = set(semantic_types)
        _semantic_types_set = set(cls._semantic_types)

        return bool(
            semantic_types_set.intersection(_semantic_types_set)) and set(
                cls._media_types).issubset(media_types)

    @classmethod
    def _find_timeseries_metadata(
        cls, dataset: container.Dataset
    ) -> typing.Optional[metadata_base.DataMetadata]:
        # loop over the dataset to find the resource that contains the timeseries file col info
        for resource_id, resource in dataset.items():
            metadata = dataset.metadata.query(
                (resource_id, metadata_base.ALL_ELEMENTS, 0))
            if "file_columns" in metadata:
                return metadata
        return None

    def _get_base_path(
        self,
        inputs_metadata: metadata_base.DataMetadata,
        res_id: str,
        column_index: int,
    ) -> str:
        # get the base uri from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        ref_col_index = column_metadata["foreign_key"]["column_index"]
        ref_res_id = column_metadata["foreign_key"]["resource_id"]

        return inputs_metadata.query((ref_res_id, metadata_base.ALL_ELEMENTS,
                                      ref_col_index))["location_base_uris"][0]

    def _get_ref_resource(
        self,
        inputs_metadata: metadata_base.DataMetadata,
        res_id: str,
        column_index: int,
    ) -> str:
        # get the referenced resource from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))
        ref_res_id = column_metadata["foreign_key"]["resource_id"]

        return ref_res_id
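
A usage sketch for the formatter (not from the original module; the dataset URI is a placeholder): load a D3M timeseries dataset and flatten it into the long form shown in the class docstring.

# Hypothetical usage sketch; assumes the TimeSeriesFormatterPrimitive and
# Hyperparams classes defined above. The dataset URI is a placeholder.
from d3m import container

dataset = container.Dataset.load('file:///path/to/datasetDoc.json')

formatter = TimeSeriesFormatterPrimitive(hyperparams=Hyperparams.defaults())
result = formatter.produce(inputs=dataset)

# The flattened table is stored under the 'learningData' resource id.
long_form = result.value['learningData']
print(long_form.head())
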
Example no. 4
class ColumnParserPrimitive(
    transformer.TransformerPrimitiveBase[
        container.DataFrame, container.DataFrame, Hyperparams
    ]
):
    """
    A primitive which parses columns and sets the appropriate dtypes according to their respective metadata.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "e8e78214-9770-4c26-9eae-a45bd0ede91a",
            "version": version.__version__,
            "name": "Column Parser",
            "python_path": "d3m.primitives.data_transformation.column_parser.DistilColumnParser",
            "source": {
                "name": "Distil",
                "contact": "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/column_parser.py",
                    "https://gitlab.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format(
                        git_commit=d3m_utils.current_git_commit(
                            os.path.dirname(__file__)
                        ),
                    ),
                },
            ],
            "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION],
            "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        }
    )

    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        start = time.time()
        logger.debug(f"Producing {__name__}")

        cols = self._get_columns(inputs.metadata)
        # outputs = container.DataFrame(generate_metadata=False)
        outputs = [None] * inputs.shape[1]

        parsing_semantics = self.hyperparams["parsing_semantics"]

        def fromstring(x: str) -> np.ndarray:
            # if column isn't a string, we'll just pass it through assuming it doesn't need to be parsed
            if type(x) is not str:
                return x

            return np.fromstring(x, dtype=float, sep=",")

        for col_index in range(len(inputs.columns)):
            if col_index in cols:
                column_metadata = inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, col_index)
                )
                semantic_types = column_metadata.get("semantic_types", [])
                desired_semantics = set(semantic_types).intersection(parsing_semantics)
                if desired_semantics:
                    if (
                        "https://metadata.datadrivendiscovery.org/types/FloatVector"
                        in desired_semantics
                    ):
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            fromstring, convert_dtype=False
                        )
                        if outputs[col_index].shape[0] > 0:
                            inputs.metadata = inputs.metadata.update_column(
                                col_index,
                                {"structural_type": type(outputs[col_index][0])},
                            )
                    elif "http://schema.org/DateTime" in desired_semantics:
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            utils.parse_datetime_to_float,
                            fuzzy=self.hyperparams["fuzzy_time_parsing"],
                            convert_dtype=False,
                        )
                        inputs.metadata = inputs.metadata.update_column(
                            col_index, {"structural_type": float}
                        )
                    elif (
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                        in desired_semantics
                    ):
                        # need to make sure if a categorical type is a numeric string, convert it
                        if inputs[inputs.columns[col_index]][0].isnumeric():
                            outputs[col_index] = pd.to_numeric(
                                inputs.iloc[:, col_index],
                                errors=self.hyperparams["error_handling"],
                            )
                            if outputs[col_index].shape[0] > 0:
                                updated_type = type(outputs[col_index][0].item())
                                inputs.metadata = inputs.metadata.update_column(
                                    col_index, {"structural_type": updated_type}
                                )
                        else:
                            # if it's categorical but not numerical, ensure the string stays
                            outputs[col_index] = inputs.iloc[:, col_index]
                    else:
                        outputs[col_index] = pd.to_numeric(
                            inputs.iloc[:, col_index],
                            errors=self.hyperparams["error_handling"],
                        )
                        # Update structural type to reflect the results of the to_numeric call.  We can't rely on the semantic type because
                        # error coercion may result in a type becoming a float due to the presence of NaN.
                        if outputs[col_index].shape[0] > 0:
                            updated_type = type(outputs[col_index][0].item())
                            inputs.metadata = inputs.metadata.update_column(
                                col_index, {"structural_type": updated_type}
                            )
                else:
                    # columns without specified semantics need to be concatenated
                    outputs[col_index] = inputs.iloc[:, col_index]
            else:
                # columns not specified still need to be concatenated
                outputs[col_index] = inputs.iloc[:, col_index]

        outputs = container.DataFrame(pd.concat(outputs, axis=1))
        outputs.metadata = inputs.metadata
        end = time.time()
        logger.debug(f"Produce {__name__} completed in {end - start} ms")

        return base.CallResult(outputs)

    def _get_columns(
        self, inputs_metadata: metadata_base.DataMetadata
    ) -> typing.List[int]:
        def can_use_column(column_index: int) -> bool:
            return True

        columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(
            inputs_metadata,
            self.hyperparams["use_columns"],
            self.hyperparams["exclude_columns"],
            can_use_column,
        )

        if self.hyperparams["use_columns"] and columns_not_to_use:
            self.logger.warning(
                "Not all specified columns can parsed. Skipping columns: %(columns)s",
                {
                    "columns": columns_not_to_use,
                },
            )

        return columns_to_use
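
A short sketch of the parser in use (not from the original module; `profiled_df` is an assumed d3m DataFrame whose columns already carry semantic_types metadata from an upstream step): columns whose semantic types intersect the parsing_semantics hyperparameter are converted, everything else passes through.

# Hypothetical usage sketch; assumes the ColumnParserPrimitive and Hyperparams
# classes defined above, and a d3m DataFrame `profiled_df` with semantic_types
# column metadata attached upstream.
parser = ColumnParserPrimitive(hyperparams=Hyperparams.defaults())
parsed = parser.produce(inputs=profiled_df).value

# Structural types in the metadata now reflect the parsed dtypes.
print(parsed.dtypes)
print(parsed.metadata.query((metadata_base.ALL_ELEMENTS, 0)).get('structural_type'))
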
Example no. 5
class duke(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "46612a42-6120-3559-9db9-3aa9a76eb94f",
        'version':
        __version__,
        'name':
        "duke",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords':
        ['Dataset Descriptor', 'Text', 'NLP', 'Abstractive Summarization'],
        'source': {
            'name':
            __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/duke-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type':
                metadata_base.PrimitiveInstallationType.PIP,
                'package_uri':
                'git+https://github.com/NewKnowledge/duke-d3m-wrapper.git@{git_commit}#egg=DukeD3MWrapper'
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
            {
                "type":
                "TGZ",
                "key":
                "en.model",
                "file_uri":
                "http://public.datadrivendiscovery.org/en_1000_no_stem.tar.gz",
                "file_digest":
                "3b1238137bba14222ae7c718f535c68a3d7190f244296108c895f1abe8549861"
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.distil.duke',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         volumes=volumes)

        self.volumes = volumes

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce a summary for the tabular dataset input

        Parameters
        ----------
        inputs : Input pandas frame
        Returns
        -------
        Outputs
            The output is a string summary
        """
        """ Accept a pandas data frame, returns a string summary
        frame: a pandas data frame containing the data to be processed
        -> a string summary
        """

        # sub-sample percentage of records from data frame
        if not self.hyperparams:
            self.hyperparams['records'] = 1
        records = self.hyperparams['records']
        frame = inputs.sample(frac=records)

        # cast frame data type back to original, if numeric, to ensure
        # that duke can drop them, and not skew results (since d3m
        #  preprocessing prims turn everything into str/object)
        tmp = frame
        for i in range(frame.shape[1]):
            if (frame.metadata.query_column(i)['semantic_types'][0] ==
                    'http://schema.org/Integer'):
                tmp[frame.columns[i]] = tmp[frame.columns[i]].replace('', 0)
                tmp[frame.columns[i]] = pandas.to_numeric(
                    tmp[frame.columns[i]], errors='coerce')
                # converting a string value like '32.0' to an int directly results in an error, so we first
                # convert everything to a float
                tmp = tmp.astype({frame.columns[i]: float})
                tmp = tmp.astype({frame.columns[i]: int})
            elif (frame.metadata.query_column(i)['semantic_types'][0] ==
                  'http://schema.org/Float'):
                tmp[frame.columns[i]] = tmp[frame.columns[i]].replace('', 0)
                tmp[frame.columns[i]] = pandas.to_numeric(
                    tmp[frame.columns[i]], errors='coerce')
                tmp = tmp.astype({frame.columns[i]: float})
            # not yet sure if dropping CategoricalData is ideal, but it appears to work...
            # some categorical data may contain useful information, but the d3m transformation is not reversible
            # and we are not aware of a way to distinguish numerical from non-numerical CategoricalData
            elif (frame.metadata.query_column(i)['semantic_types'][0] ==
                  'https://metadata.datadrivendiscovery.org/types/CategoricalData'
                  ):
                tmp = tmp.drop(columns=[frame.columns[i]])

        # print('beginning summarization... \n')

        # get the path to the ontology class tree
        resource_package = "Duke"
        resource_path = '/'.join(
            ('ontologies', 'class-tree_dbpedia_2016-10.json'))
        tree_path = pkg_resources.resource_filename(resource_package,
                                                    resource_path)
        embedding_path = self.volumes['en.model'] + "/en_1000_no_stem/en.model"
        row_agg_func = mean_of_rows
        tree_agg_func = parent_children_funcs(np.mean, max)
        source_agg_func = mean_of_rows
        max_num_samples = 1e6
        verbose = True

        duke = DatasetDescriptor(
            dataset=tmp,
            tree=tree_path,
            embedding=embedding_path,
            row_agg_func=row_agg_func,
            tree_agg_func=tree_agg_func,
            source_agg_func=source_agg_func,
            max_num_samples=max_num_samples,
            verbose=verbose,
        )

        print('initialized duke dataset descriptor \n')

        N = 5
        out_tuple = duke.get_top_n_words(N)
        print('finished summarization \n')
        out_df_duke = pandas.DataFrame.from_records(list(out_tuple)).T
        out_df_duke.columns = ['subject tags', 'confidences']

        # initialize the output dataframe as input dataframe (results will be appended to it)
        # out_df = d3m_DataFrame(inputs)

        # create metadata for the duke output dataframe
        duke_df = d3m_DataFrame(out_df_duke)
        # first column ('subject tags')
        col_dict = dict(duke_df.metadata.query(
            (metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("it is a string")
        col_dict['name'] = "subject tags"
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        duke_df.metadata = duke_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('confidences')
        col_dict = dict(duke_df.metadata.query(
            (metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1.0")
        col_dict['name'] = "confidences"
        col_dict['semantic_types'] = (
            'http://schema.org/Float',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        duke_df.metadata = duke_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        # concatenate final output frame -- not real consensus from program, so commenting out for now
        #out_df = utils_cp.append_columns(out_df, duke_df)

        return CallResult(duke_df)
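
A usage sketch (not from the original module; the volume path and input frame are placeholders) showing how the primitive is wired to the 'en.model' static volume declared in its installation metadata.

# Hypothetical usage sketch; assumes the duke and Hyperparams classes defined
# above. `tabular_df` is a placeholder d3m DataFrame whose columns carry
# semantic_types metadata, and the volume path must point at the directory
# where en_1000_no_stem.tar.gz was extracted.
volumes = {'en.model': '/static/volumes/en.model'}

summarizer = duke(hyperparams=Hyperparams.defaults(), volumes=volumes)
tags = summarizer.produce(inputs=tabular_df).value

# Output has two columns: 'subject tags' and 'confidences'.
print(tags)
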
Example no. 6
class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs,
                                                                Outputs,
                                                                Hyperparams]):
    """
    Check whether the series data is consistent in its time interval, and process it if it is not.

    Parameters
    ----------
    continuity_option: enumeration
        Choose ablation or imputation.
            ablation: delete some rows and increase the timestamp interval to keep the timestamps consistent
            imputation: linearly impute the absent timestamps to keep the timestamps consistent
    interval: float
        Only used for imputation; gives the timestamp interval. 'interval' should be an integral multiple of the timestamp spacing, or the timestamp spacing should be an integral multiple of 'interval'.
    """

    __author__: "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name":
        "continuity validation primitive",
        "python_path":
        "d3m.primitives.tods.data_processing.continuity_validation",
        "source": {
            'name':
            'DATA Lab at Texas A&M University',
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/ContinuityValidation.py'
            ]
        },
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.CONTINUITY_VALIDATION,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id":
        "ef8fb025-d157-476c-8e2e-f8fe56162195",
        "hyperparams_to_tune": ['continuity_option', 'interval'],
        "version":
        "0.0.1",
    })

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame with consistent timestamp

        """
        # self.logger.warning('Hi, ContinuityValidation.produce was called!')
        if self.hyperparams['continuity_option'] == 'ablation':
            outputs = self._continuity_ablation(inputs)

        if self.hyperparams['continuity_option'] == 'imputation':
            outputs = self._continuity_imputation(inputs)

        outputs.reset_index(drop=True, inplace=True)
        self._update_metadata(outputs)

        # self._write(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def _continuity_ablation(self, inputs: Inputs):

        ablation_set = self._find_ablation_set(inputs)
        inputs = inputs.loc[inputs['timestamp'].isin(ablation_set)].copy()

        inputs.sort_values("timestamp", inplace=True)
        inputs['d3mIndex'] = list(range(inputs.shape[0]))

        return inputs

    def _find_ablation_set(self, inputs):
        """
        Find the longest series with minimum timestamp interval of inputs
        """
        # find the min interval and max interval
        min_interval = inputs.iloc[1]['timestamp'] - inputs.iloc[0]['timestamp']
        for i in range(2, inputs.shape[0]):
            curr_interval = inputs.iloc[i]['timestamp'] - inputs.iloc[
                i - 1]['timestamp']
            if min_interval > curr_interval:
                min_interval = curr_interval

        max_interval = (
            (inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']) +
            min_interval * (2 - inputs.shape[0]))

        print((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']),
              inputs.shape[0])

        interval = min_interval
        ablation_set = list()
        origin_set = set(inputs['timestamp'])

        print(min_interval, max_interval)

        while interval <= max_interval:
            start = 0
            while (inputs.iloc[start]['timestamp'] <=
                   inputs.iloc[0]['timestamp'] + max_interval) and (
                       inputs.iloc[start]['timestamp'] <=
                       inputs.iloc[-1]['timestamp']):
                tmp_list = list()
                tmp = utils.numpy.arange(start=inputs.iloc[start]['timestamp'],
                                         step=interval,
                                         stop=inputs.iloc[-1]['timestamp'])

                for i in tmp:
                    if i in origin_set:
                        tmp_list.append(i)
                    else:
                        break

                ablation_set.append(tmp_list)
                start += 1

            interval += min_interval

        max_size_index = 0
        for i in range(1, len(ablation_set)):
            if len(ablation_set[i]) > len(ablation_set[max_size_index]):
                max_size_index = i
        return ablation_set[max_size_index]

    def _continuity_imputation(self, inputs: Inputs):
        """
        Linearly impute the missing timestamps and values of inputs
        """
        interval = self.hyperparams['interval']
        time1 = inputs.iloc[0]['timestamp']

        for i in range(1, inputs.shape[0]):
            time2 = inputs.iloc[i]['timestamp']
            if time2 - time1 != interval:

                blank_number = int(
                    (time2 - time1) / interval
                )  # number of intervals between the two original timestamps; blank_number - 1 rows are imputed
                for j in range(1, blank_number):

                    new_row = {
                        'timestamp': [time1 + interval * j],
                        'ground_truth': [int(inputs.iloc[i]['ground_truth'])]
                    }

                    for col in list(inputs.columns.values):
                        if not col in [
                                'd3mIndex', 'timestamp', 'ground_truth'
                        ]:
                            new_row[col] = [
                                inputs.iloc[i - 1][col] +
                                (inputs.iloc[i][col] - inputs.iloc[i - 1][col])
                                / blank_number * j
                            ]

                    inputs = inputs.append(utils.pandas.DataFrame(new_row),
                                           ignore_index=True,
                                           sort=False)

            time1 = time2

        inputs.sort_values("timestamp", inplace=True)
        inputs['d3mIndex'] = list(range(inputs.shape[0]))
        return inputs

    def _write(self, inputs: Inputs):
        """
        Write inputs to the current directory; for testing only.
        """
        inputs.to_csv(str(time.time()) + '.csv')
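
A small sketch of the imputation mode (not from the original module): with interval=1.0, the gap between timestamps 2.0 and 5.0 is filled with two linearly interpolated rows, following value_prev + (value_next - value_prev) / blank_number * j as in _continuity_imputation above.

# Hypothetical usage sketch; assumes the ContinuityValidation and Hyperparams
# classes defined above and the d3m container package.
import pandas as pd
from d3m import container

df = container.DataFrame(
    pd.DataFrame({
        'd3mIndex': [0, 1],
        'timestamp': [2.0, 5.0],
        'value': [10.0, 16.0],
        'ground_truth': [0, 0],
    }),
    generate_metadata=True,
)

hp = Hyperparams.defaults().replace({'continuity_option': 'imputation', 'interval': 1.0})
validated = ContinuityValidation(hyperparams=hp).produce(inputs=df).value

# Rows for timestamps 3.0 and 4.0 are inserted with value 12.0 and 14.0
# (10 + 6/3 * 1 and 10 + 6/3 * 2), and d3mIndex is regenerated as 0..3.
print(validated)
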
Example no. 7
class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs,
                                                                 Outputs,
                                                                 Hyperparams]):
    """
    Check whether the series data contains duplicate rows for a single timestamp, and process them if duplication exists.

    Parameters
    ----------
    keep_option: enumeration
        When dropping rows, choose to keep the first one or calculate the average
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name":
        "duplication validation primitive",
        "python_path":
        "d3m.primitives.tods.data_processing.duplication_validation",
        "source": {
            'name':
            'DATA Lab at Texas A&M University',
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py'
            ]
        },
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.DUPLICATION_VALIDATION,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "id":
        "cf6d8137-73d8-496e-a2e3-49f941ee716d",
        "hyperparams_to_tune": ['keep_option'],
        "version":
        "0.0.1",
    })

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """
        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame after dropping duplicate timestamps
        """
        # self.logger.warning('Hi, DuplicationValidation.produce was called!')

        if self.hyperparams['keep_option'] == 'first':
            outputs = self._timestamp_keep_first(inputs)

        if self.hyperparams['keep_option'] == 'average':
            outputs = self._timestamp_keep_average(inputs)

        self._update_metadata(outputs)

        # self._write(outputs)
        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs)

    def _timestamp_keep_first(self, inputs: Inputs):
        return inputs.drop_duplicates(subset=['timestamp'], keep='first')

    def _timestamp_keep_average(self, inputs: Inputs):
        inputs_copy = inputs.copy()
        inputs = inputs.drop_duplicates(subset=['timestamp'], keep='first')

        inputs_copy = inputs_copy.groupby('timestamp').mean().reset_index()

        for col in list(inputs.columns.values):
            if not col in ['d3mIndex', 'timestamp', 'ground_truth']:

                inputs[col] = inputs_copy[col].values

        return inputs

    def _write(self, inputs: Inputs):
        """
        Write inputs to the current directory; for testing only.
        """
        inputs.to_csv(str(time.time()) + '.csv')
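
For reference, the two keep_option strategies reduce to standard pandas operations. The sketch below uses assumed column names and toy data.

# Minimal sketch of the two deduplication strategies (keep first vs. average), assumed columns.
import pandas as pd

df = pd.DataFrame({'timestamp': [1, 1, 2, 3, 3],
                   'value':     [1.0, 3.0, 2.0, 4.0, 6.0]})

keep_first = df.drop_duplicates(subset=['timestamp'], keep='first')
keep_average = df.groupby('timestamp', as_index=False).mean()

print(keep_first)    # timestamps 1, 2, 3 with values 1.0, 2.0, 4.0
print(keep_average)  # timestamps 1, 2, 3 with values 2.0, 2.0, 5.0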
Example n. 8
0
class DeepArPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
        Primitive that applies a deep autoregressive forecasting algorithm for time series
        prediction. The implementation is based on this paper: https://arxiv.org/pdf/1704.04110.pdf
        and this implementation: https://gluon-ts.mxnet.io/index.html

        Training inputs: 1) Feature dataframe, 2) Target dataframe
        Outputs: Dataframe with predictions for specific time series at specific future time instances 
    
        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object
        
        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "3410d709-0a13-4187-a1cb-159dd24b584b",
            "version": __version__,
            "name": "DeepAR",
            "keywords": [
                "time series",
                "forecasting",
                "recurrent neural network",
                "autoregressive",
            ],
            "source": {
                "name": __author__,
                "contact": __contact__,
                "uris": [
                    "https://github.com/kungfuai/d3m-primitives",
                ],
            },
            "installation": [
                {"type": "PIP", "package": "cython", "version": "0.29.16"}, 
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            "python_path": "d3m.primitives.time_series_forecasting.lstm.DeepAR",
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
        }
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._freq = None
        self._is_fit = False
        self._all_preds = None

    def get_params(self) -> Params:
        return Params(
            deepar_dataset = self._deepar_dataset,
            timestamp_column = self._timestamp_column,
            real_cols = self._real_columns,
            group_cols = self._grouping_columns,
            cat_cols = self._cat_columns,
            output_column = self._output_column,
            freq = self._freq,
            reind_freq = self._reind_freq,
            is_fit = self._is_fit,
            min_trains = self._min_trains
        )

    def set_params(self, *, params: Params) -> None:
        self._deepar_dataset = params['deepar_dataset']
        self._timestamp_column = params['timestamp_column']
        self._real_columns = params['real_cols']
        self._grouping_columns = params['group_cols']
        self._cat_columns = params['cat_cols']
        self._output_column = params['output_column']
        self._freq = params['freq']
        self._reind_freq = params['reind_freq']
        self._is_fit = params['is_fit']
        self._min_trains = params['min_trains']

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data
        
            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
            
            Raises:
                ValueError: If no column is marked as a target
        """

        self._output_column = outputs.columns[0]

        frame = inputs.append_columns(outputs)
        self._get_cols(frame)
        self._set_freq(frame)
        frame, self._min_trains, max_train_length, _ = self._reindex(frame)
        self._check_window_support(max_train_length)

        self._deepar_dataset = DeepARDataset(
            frame, 
            self._grouping_columns,
            self._cat_columns,
            self._real_columns,
            self._timestamp_column,
            self._target_column,
            self._freq,
            self.hyperparams['prediction_length'],
            self.hyperparams['context_length'],
            self._target_semantic_types,
            self.hyperparams['count_data']
        )
        self._train_data = self._deepar_dataset.get_data()

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """ Fits DeepAR model using training data from set_training_data and hyperparameters
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, considered (default: {None})
            
            Returns:
                CallResult[None]
        """

        if iterations is None:
            iterations = self.hyperparams["epochs"]
            has_finished = True
        else:
            has_finished = False

        estimator = DeepAREstimator(
            freq=self._freq,
            prediction_length=self.hyperparams['prediction_length'],
            context_length=self.hyperparams['context_length'],
            use_feat_static_cat=self._deepar_dataset.has_cat_cols() or self._deepar_dataset.has_group_cols(),
            use_feat_dynamic_real=self._deepar_dataset.has_real_cols(),
            cardinality=self._deepar_dataset.get_cardinality(),
            distr_output=self._deepar_dataset.get_distribution_type(),
            dropout_rate=self.hyperparams['dropout_rate'],
            trainer=Trainer(
                epochs=iterations,
                learning_rate=self.hyperparams['learning_rate'], 
                batch_size=self.hyperparams['training_batch_size'],
                num_batches_per_epoch=self.hyperparams['steps_per_epoch']
            )
        )

        logger.info(f"Fitting for {iterations} iterations")
        start_time = time.time()
        predictor = estimator.train(self._train_data)
        predictor.batch_size = self.hyperparams['inference_batch_size']
        self._is_fit = True
        logger.info(f"Fit for {iterations} epochs, took {time.time() - start_time}s")

        if not os.path.isdir(self.hyperparams['weights_dir']):
            os.mkdir(self.hyperparams['weights_dir'])
        predictor.serialize(Path(self.hyperparams['weights_dir']))

        return CallResult(None, has_finished=has_finished)
        
    def produce(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """ Produce primitive's predictions for specific time series at specific future time instances
            * these specific timesteps / series are specified implicitly by input dataset

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit
            
            Returns:
                CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
                    prediction slice = specific horizon idx for specific series in specific regression 
        """
        if self._all_preds is None:
            self._all_preds, self._pred_intervals = self._produce(inputs)

        point_estimates = np.concatenate(
            [series[0][idxs] for series, idxs in zip(self._all_preds, self._pred_intervals)]
        )
        
        result_df = container.DataFrame(
            {self._output_column: point_estimates},
            generate_metadata=True,
        )

        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=self._is_fit)

    def produce_confidence_intervals(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """ produce quantiles for each prediction timestep in dataframe
        
        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
        
        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})
        
        Raises:
            PrimitiveNotFittedError: if primitive not fit
        
        Returns:
            CallResult[Outputs] -- dataframe with one column per quantile, median (0.50) first

            Ex.
                0.50 | 0.05 | 0.95
                -------------------
                 5   |   3  |   7
                 6   |   4  |   8
                 5   |   3  |   7
                 6   |   4  |   8
        """

        if self._all_preds is None:
            self._all_preds, self._pred_intervals = self._produce(inputs)

        all_quantiles = [[] for q in range(len(self.hyperparams['quantiles']) + 1)]
        for series, idxs in zip(self._all_preds, self._pred_intervals):
            for i, quantile in enumerate(series):
                all_quantiles[i].append(quantile[idxs])
        all_quantiles = [np.concatenate(quantile) for quantile in all_quantiles]

        col_names = (0.5,) + self.hyperparams['quantiles']
        result_df = container.DataFrame(
            {col_name: quantile for col_name, quantile in zip(col_names, all_quantiles)},
            generate_metadata=True,
        )

        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=self._is_fit)

    def _get_col_names(self, col_idxs, all_col_names):
        """ transform column indices to column names """
        return [all_col_names[i] for i in col_idxs]
        
    def _process_special_col(self, col_list, col_type):
        """ private util function that warns if multiple special columns 
        """

        if len(col_list) == 0:
            return None
        elif len(col_list) > 1:
            logger.warning(
                f"""More than one {col_type} is marked. This primitive will use the first."""
            )
        return col_list[0]

    def _sort_by_timestamp(self, frame):
        """ private util function: convert to pd datetime and sort
        """

        time_name = frame.columns[self._timestamp_column]
        new_frame = frame.copy()

        if "http://schema.org/Integer" in frame.metadata.query_column_field(
            self._timestamp_column, "semantic_types"
        ):
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column] - 1, 
                unit = 'D'
            )
            self._freq = 'D'
            self._reind_freq = 'D'
        else:
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column], 
                unit = 's'
            )
        return new_frame.sort_values(by = time_name)

    def _set_freq(self, frame):
        """ sets frequency using differences in timestamp column in data frame
            ASSUMPTION: frequency is the same across all grouped time series
        """

        if len(self._grouping_columns) == 0:
            if self._freq is None:
                diff = frame.iloc[1, self._timestamp_column] - frame.iloc[0, self._timestamp_column]
                self._freq, self._reind_freq = calculate_time_frequency(diff, model = 'gluon')
        else:
            if self._freq is None:
                g_cols = self._get_col_names(self._grouping_columns, frame.columns)
                for g, df in frame.groupby(g_cols, sort = False):
                    diff = df.iloc[1, self._timestamp_column] - df.iloc[0, self._timestamp_column]
                    break
                self._freq, self._reind_freq = calculate_time_frequency(diff, model = 'gluon')

    def _robust_reindex(self, frame):
        """ reindex dataframe IFF it has > 1 row, interpolate real-valued columns, forward-filling
            categorical and grouping columns """ 

        frame = self._sort_by_timestamp(frame)
        original_times = frame.iloc[:, self._timestamp_column]
        frame = frame.drop_duplicates(subset = frame.columns[self._timestamp_column])
        frame.index = frame.iloc[:, self._timestamp_column]
        if frame.shape[0] > 1:
            frame = frame.reindex(
                pd.date_range(
                    frame.index[0], 
                    frame.index[-1], 
                    freq = self._reind_freq,
                )
            )
        frame.iloc[:, self._real_columns] = frame.iloc[:, self._real_columns].interpolate()
        frame.iloc[:, self._cat_columns + self._grouping_columns] = \
            frame.iloc[:, self._cat_columns + self._grouping_columns].ffill()

        return frame, original_times

    def _reindex(self, frame):
        """ reindex data, keeping NA values for target column, but interpolating feature columns
        """ 

        if len(self._grouping_columns) == 0:
            df, original_times = self._robust_reindex(frame)
            return df, [df.index[0]], df.shape[0], original_times
        else:
            all_dfs, min_trains, original_times = [], {}, OrderedDict()
            max_train_length = 0
            g_cols = self._get_col_names(self._grouping_columns, frame.columns)
            for grp, df in frame.groupby(g_cols, sort = False):
                df, orig_times = self._robust_reindex(df)
                if df.shape[0] > max_train_length:
                    max_train_length = df.shape[0]
                all_dfs.append(df)
                min_trains[grp] = df.index[0]
                original_times[grp] = orig_times
            return pd.concat(all_dfs), min_trains, max_train_length, original_times

    def _get_cols(self, frame):
        """ private util function: get indices of important columns from metadata 
        """

        input_metadata = frame.metadata

        # get target idx (first column by default)
        target_columns = input_metadata.list_columns_with_semantic_types(
            (
                "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
                "https://metadata.datadrivendiscovery.org/types/TrueTarget",
                "https://metadata.datadrivendiscovery.org/types/Target",
            )
        )
        if len(target_columns) == 0:
            raise ValueError("At least one column must be marked as a target")

        self._target_column = self._process_special_col(
            target_columns, "target column"
        )

        # get timestamp idx (first column by default)
        timestamp_columns = input_metadata.list_columns_with_semantic_types(
            (
                "https://metadata.datadrivendiscovery.org/types/Time",
                "http://schema.org/DateTime",
            )
        )
        self._timestamp_column = self._process_special_col(
            timestamp_columns, "timestamp column"
        )

        # get grouping idx 
        self._grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey",)
        )
        suggested_group_cols = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",)
        )
        if len(self._grouping_columns) == 0:
            self._grouping_columns = suggested_group_cols

        def diff(li1, li2): 
            return list(set(li1) - set(li2))

        # categorical columns
        self._cat_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/CategoricalData",)
        )
        self._cat_columns = diff(self._cat_columns, self._grouping_columns + suggested_group_cols)

        # real valued columns
        self._real_columns = input_metadata.list_columns_with_semantic_types(
            ("http://schema.org/Integer", "http://schema.org/Float")
        )

        self._real_columns = diff(
            self._real_columns, 
            [self._timestamp_column] + [self._target_column] + self._grouping_columns
        )

        # determine whether targets are count data
        self._target_semantic_types = input_metadata.query_column_field(
            self._target_column, "semantic_types"
        )

    def _check_window_support(self, max_train_length):
        """ ensures that at least one series of target series is >= context_length """

        if max_train_length < self.hyperparams['prediction_length']:
            raise ValueError(
                f"This training set does not support a prediction length of {self.hyperparams['prediction_length']} " +
                f"because its longest series has length {max_train_length} observations. Please " +
                f"choose a shorter prediction length."
            )

    def _get_pred_intervals(self, original_times):
        """ private util function that retrieves unevenly spaced prediction intervals from data frame 
        """

        if len(self._grouping_columns) == 0:
            intervals = discretize_time_difference(
                original_times,
                self._min_trains[0],
                self._freq, 
                zero_index = True
            )
            all_intervals = [np.array(intervals) + 1]
        else:
            all_intervals = []
            for grp, times in original_times.items():
                if grp in self._min_trains.keys():
                    intervals = discretize_time_difference(
                        times,
                        self._min_trains[grp],
                        self._freq, 
                        zero_index = True
                    )
                else:
                    logger.info(
                        f'Series with category {grp} did not exist in training data; ' +
                        f'its predictions will be returned as np.nan.'
                    )
                    intervals = np.zeros(times.shape[0]).astype(int)
                all_intervals.append(np.array(intervals) + 1)
        return all_intervals

    def _produce(self, inputs: Inputs):
        """ internal produce method to support produce() and produce_confidence_intervals() methods """
        
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        test_frame = inputs.copy()
        deepar_forecast = DeepARForecast(
            self._deepar_dataset,
            self.hyperparams['weights_dir'],
            self.hyperparams['output_mean'],
            self.hyperparams['number_samples'],
            self.hyperparams['quantiles']
        )
        test_frame, _, _, original_times = self._reindex(test_frame)
        pred_intervals = self._get_pred_intervals(original_times)

        st = time.time()
        preds = deepar_forecast.predict(test_frame, pred_intervals)
        logger.info(f'Making predictions took {time.time() - st}s')
        return preds, pred_intervals
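
Outside the D3M wrapper, the GluonTS workflow this primitive builds on looks roughly like the sketch below. It assumes a gluonts release where Trainer lives in gluonts.mx.trainer (older releases used gluonts.trainer); the toy series and settings are illustrative only.

# Minimal GluonTS DeepAR sketch (assumed gluonts version; exact import paths vary by release).
import numpy as np
from gluonts.dataset.common import ListDataset
from gluonts.model.deepar import DeepAREstimator
from gluonts.mx.trainer import Trainer

train_ds = ListDataset(
    [{"target": np.sin(np.arange(200) / 10.0), "start": "2020-01-01"}],
    freq="D",
)

estimator = DeepAREstimator(
    freq="D",
    prediction_length=14,
    trainer=Trainer(epochs=2, num_batches_per_epoch=10),  # kept tiny for a quick run
)
predictor = estimator.train(training_data=train_ds)

forecast = next(iter(predictor.predict(train_ds)))
print(forecast.quantile(0.5))   # median prediction for the next 14 steps
print(forecast.quantile(0.05))  # lower quantile, as used by produce_confidence_intervals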
Example n. 9
0
class MatrixProfile(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
	"""
	A primitive that computes the matrix profile of a DataFrame using the Stumpy package.
	Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html

	Parameters
	----------
	T_A : ndarray
	    The time series or sequence for which to compute the matrix profile
	m : int
	    Window size
	T_B : ndarray
	    The time series or sequence that contains your query subsequences
	    of interest. Default is `None` which corresponds to a self-join.
	ignore_trivial : bool
	    Set to `True` if this is a self-join. Otherwise, for an AB-join, set this
	    to `False`. Default is `True`.

	Returns
	-------
	out : ndarray
	    The first column consists of the matrix profile, the second column
	    consists of the matrix profile indices, the third column consists of
	    the left matrix profile indices, and the fourth column consists of
	    the right matrix profile indices.
	"""

	
	metadata = metadata_base.PrimitiveMetadata({
		'__author__': "DATA Lab @Texas A&M University",
		'name': "Matrix Profile",
		#'python_path': 'd3m.primitives.tods.feature_analysis.matrix_profile',
		'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile',
		'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:[email protected]',
                   'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']},
		'algorithm_types': [metadata_base.PrimitiveAlgorithmType.MATRIX_PROFILE,], 
		'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
		'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')),
		'hyperparams_to_tune': ['window_size'],
		'version': '0.0.2',		
		})


	def __init__(self, *, hyperparams: Hyperparams) -> None:
		super().__init__(hyperparams=hyperparams)
		self._clf = MP(window_size = hyperparams['window_size'])
		self.primitiveNo = PrimitiveCount.primitive_no
		PrimitiveCount.primitive_no+=1

	def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:	

		"""

		Args:

			inputs: Container DataFrame

			timeout: Default

			iterations: Default

		Returns:

		    Container DataFrame containing Matrix Profile of selected columns
		
		"""

		# Get cols to fit.
		self._fitted = False
		self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
		self._input_column_names = self._training_inputs.columns


		if len(self._training_indices) > 0:
			self._fitted = True
		else:
			if self.hyperparams['error_on_no_input']:
				raise RuntimeError("No input columns were selected")
			self.logger.warn("No input columns were selected")

		if not self._fitted:
			raise PrimitiveNotFittedError("Primitive not fitted.")
		
		sk_inputs = inputs
		if self.hyperparams['use_semantic_types']:
			sk_inputs = inputs.iloc[:, self._training_indices]
		output_columns = []
		if len(self._training_indices) > 0:
			sk_output = self._clf.produce(sk_inputs)
			if sparse.issparse(sk_output):
				sk_output = sk_output.toarray()
			outputs = self._wrap_predictions(inputs, sk_output)
			
			if len(outputs.columns) == len(self._input_column_names):
				outputs.columns = self._input_column_names
			output_columns = [outputs]

		else:
			if self.hyperparams['error_on_no_input']:
				raise RuntimeError("No input columns were selected")
			self.logger.warn("No input columns were selected")

		outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
							   add_index_columns=self.hyperparams['add_index_columns'],
							   inputs=inputs, column_indices=self._training_indices,
							   columns_list=output_columns)
		#print(outputs)
		#CallResult(outputs)
		#print("___")
		print(outputs.columns)
		#outputs.columns = [str(x) for x in outputs.columns]

		return CallResult(outputs)

		# assert isinstance(inputs, container.DataFrame), type(container.DataFrame)
		# _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams)
		
		# #print("columns_to_produce ", self._columns_to_produce)
		
		# outputs = inputs
		# if len(self._columns_to_produce) > 0:
		# 	for col in self.hyperparams['use_columns']:
		# 		output = self._clf.produce(inputs.iloc[ : ,col])
				
		# 		outputs = pd.concat((outputs, pd.DataFrame({inputs.columns[col]+'_matrix_profile': output[:,0], 
		# 					inputs.columns[col]+'_matrix_profile_indices': output[:,1], 
		# 					inputs.columns[col]+'_left_matrix_profile_indices': output[:,2], 
		# 					inputs.columns[col]+'_right_matrix_profile_indices': output[:,3]})), axis = 1)

		# else:
		# 	if self.hyperparams['error_on_no_input']:
		# 		raise RuntimeError("No input columns were selected")
		# 	self.logger.warn("No input columns were selected")

		# #print(outputs)
		# self._update_metadata(outputs)

		# return base.CallResult(outputs)



	def _update_metadata(self, outputs):
		outputs.metadata = outputs.metadata.generate(outputs)
 
	@classmethod
	def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):

		"""

			Select columns to fit.
			Args:
				inputs: Container DataFrame
				hyperparams: d3m.metadata.hyperparams.Hyperparams

			Returns:
				list

		"""

		if not hyperparams['use_semantic_types']:
			return inputs, list(range(len(inputs.columns)))

		inputs_metadata = inputs.metadata

		

		def can_produce_column(column_index: int) -> bool:
			return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

		columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
					   use_columns=hyperparams['use_columns'],
					   exclude_columns=hyperparams['exclude_columns'],
					   can_use_column=can_produce_column)


		"""
		Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2)
		columns_to_produce is still [2]
		"""
		return inputs.iloc[:, columns_to_produce], columns_to_produce
		

	@classmethod
	def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:

		"""

			Output whether a column can be processed.
				Args:
					inputs_metadata: d3m.metadata.base.DataMetadata
					column_index: int

				Returns:
					bool

		"""

		column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

		accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np
		accepted_semantic_types = set()
		accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

		# print(column_metadata)
		# print(column_metadata['structural_type'], accepted_structural_types)

		if not issubclass(column_metadata['structural_type'], accepted_structural_types):
			return False

		semantic_types = set(column_metadata.get('semantic_types', []))

		# print(column_metadata)
		# print(semantic_types, accepted_semantic_types)

		if len(semantic_types) == 0:
			cls.logger.warning("No semantic types found in column metadata")
			return False

		# Making sure all accepted_semantic_types are available in semantic_types
		if len(accepted_semantic_types - semantic_types) == 0:
			return True

		return False

	def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:

		"""

			Wrap predictions into dataframe
		Args:
			inputs: Container Dataframe
			predictions: array-like data (n_samples, n_features)

		Returns:
			Dataframe

		"""

		outputs = d3m_dataframe(predictions, generate_metadata=True)
		target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo)
		outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
		return outputs



	@classmethod
	def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
									target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:

		"""

			Update metadata for selected columns.
				Args:
					inputs_metadata: metadata_base.DataMetadata
					outputs: Container Dataframe
					target_columns_metadata: list

				Returns:
					d3m.metadata.base.DataMetadata

		"""

		outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

		for column_index, column_metadata in enumerate(target_columns_metadata):
			column_metadata.pop("structural_type", None)
			outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

		return outputs_metadata


	@classmethod
	def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo):
		"""
		Add target columns metadata
		Args:
			outputs_metadata: metadata.base.DataMetadata
			hyperparams: d3m.metadata.hyperparams.Hyperparams

		Returns:
			List[OrderedDict]
		"""
		outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
		target_columns_metadata: List[OrderedDict] = []
		for column_index in range(outputs_length):
			column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index)
			column_metadata = OrderedDict()
			semantic_types = set()
			semantic_types.add(hyperparams["return_semantic_type"])
			column_metadata['semantic_types'] = list(semantic_types)

			column_metadata["name"] = str(column_name)
			target_columns_metadata.append(column_metadata)
		return target_columns_metadata
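
The Stumpy call wrapped by this primitive can be exercised directly. The minimal sketch below assumes the stumpy package described in the docstring and shows the four-column output that gets turned into new DataFrame columns.

# Minimal stumpy sketch: stumpy.stump returns [matrix profile, profile idx, left idx, right idx].
import numpy as np
import stumpy

series = np.array([1.0, 2.0, 3.0, 2.0, 1.0, 2.0, 3.0, 2.0, 1.0])
window_size = 3

mp = stumpy.stump(series, m=window_size)
print(mp[:, 0])  # matrix profile (distance to the nearest-neighbour subsequence)
print(mp[:, 1])  # matrix profile indices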
Example n. 10
0
class So_GaalPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params,
                                                       Hyperparams]):
    """Single-Objective Generative Adversarial Active Learning.
    SO-GAAL directly generates informative potential outliers to assist the
    classifier in describing a boundary that can separate outliers from normal
    data effectively. Moreover, to prevent the generator from falling into the
    mode collapsing problem, the network structure of SO-GAAL is expanded from
    a single generator (SO-GAAL) to multiple generators with different
    objectives (MO-GAAL) to generate a reasonable reference distribution for
    the whole dataset.
    Read more in the :cite:`liu2019generative`.
    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    stop_epochs : int, optional (default=20)
        The number of epochs of training.
    lr_d : float, optional (default=0.01)
        The learning rate of the discriminator.
    lr_g : float, optional (default=0.0001)
        The learning rate of the generator.
    decay : float, optional (default=1e-6)
        The decay parameter for SGD.
    momentum : float, optional (default=0.9)
        The momentum parameter for SGD.
    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.
    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    __author__ = "DATA Lab at Texas A&M University",
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '56e6cfe9-d9e9-495f-83da-cfed6fa27da1',
        'version':
        '0.1.0',
        'name':
        'So_Gaal Anomaly Detection',
        'python_path':
        'd3m.primitives.tods.detection_algorithm.pyod_sogaal',
        'keywords': ['Time Series', 'GAN'],
        "hyperparams_to_tune":
        ['stop_epochs', 'lr_d', 'lr_g', 'decay', 'momentum'],
        'source': {
            'name':
            'DATA Lab at Texas A&M University',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/detection_algorithm/PyodSoGaal.py'
            ],
            'contact':
            'mailto:[email protected]'
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.
            format(git_commit=d3m_utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,  #
            random_seed: int = 0,
            docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = SO_GAAL(
            stop_epochs=hyperparams['stop_epochs'],
            lr_d=hyperparams['lr_d'],
            lr_g=hyperparams['lr_g'],
            decay=hyperparams['decay'],
            momentum=hyperparams['momentum'],
            contamination=hyperparams['contamination'],
        )

        return

    def set_training_data(self, *, inputs: Inputs) -> None:
        """
        Set training data for outlier detection.
        Args:
            inputs: Container DataFrame

        Returns:
            None
        """
        super().set_training_data(inputs=inputs)

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Fit model with training data.
        Args:
            *: Container DataFrame. Time series data used for fitting.

        Returns:
            None
        """
        return super().fit()

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame. Time series data to run outlier detection on.

        Returns:
            Container DataFrame
            1 marks Outliers, 0 marks normal.
        """
        return super().produce(inputs=inputs,
                               timeout=timeout,
                               iterations=iterations)

    def get_params(self) -> Params:
        """
        Return parameters.
        Args:
            None

        Returns:
            class Params
        """
        return super().get_params()

    def set_params(self, *, params: Params) -> None:
        """
        Set parameters for outlier detection.
        Args:
            params: class Params

        Returns:
            None
        """
        super().set_params(params=params)
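
Stripped of the D3M wrapper, the PyOD model configured in __init__ is used roughly as in the sketch below; the synthetic data and the tiny stop_epochs value are illustrative assumptions.

# Minimal PyOD SO_GAAL sketch mirroring the hyperparameters wired up in __init__ above.
import numpy as np
from pyod.models.so_gaal import SO_GAAL

rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0, 1, size=(95, 2)),   # inliers
               rng.normal(6, 1, size=(5, 2))])   # a few outliers

clf = SO_GAAL(stop_epochs=2, lr_d=0.01, lr_g=0.0001, contamination=0.05)  # stop_epochs kept tiny for a quick run
clf.fit(X)

print(clf.labels_[:10])           # 0 = inlier, 1 = outlier on the training data
print(clf.decision_scores_[:10])  # higher score = more anomalous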
Example n. 11
0
class SeededGraphMatching( UnsupervisedLearnerPrimitiveBase[Inputs, Outputs,Params, Hyperparams]):
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'ff22e721-e4f5-32c9-ab51-b90f32603a56',
        'version': "0.1.0",
        'name': "jhu.sgm",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.SeededGraphMatching',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['graph matching'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/jhu_primitives/sgm/sgm.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
        },
        'installation': [{
                'type': 'UBUNTU',
                'package': 'r-base',
                'version': '3.4.2'
            },
            {
                'type': 'UBUNTU',
                'package': 'libxml2-dev',
                'version': '2.9.4'
            },
            {
                'type': 'UBUNTU',
                'package': 'libpcre3-dev',
                'version': '2.9.4'
            },{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
        }],
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.FRANK_WOLFE_ALGORITHM
        ],
        'primitive_family': metadata_module.PrimitiveFamily.GRAPH_MATCHING
       })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._training_dataset = None
        self._g1 = None
        self._g2 = None
        self._g1_node_attributes = None
        self._g2_node_attributes = None

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return CallResult(None)

    def set_training_data(self,*,inputs: Inputs) -> None:
        self._training_dataset = inputs
        self._g1 = self._training_dataset['0']
        self._g2 = self._training_dataset['1']
        self._g1_node_attributes = list(networkx.get_node_attributes(self._g1, 'nodeID').values())
        self._g2_node_attributes = list(networkx.get_node_attributes(self._g2, 'nodeID').values())
        #technically, this is unsupervised, as there is no fit function
        #instead, we just hang on to the training data and run produce with the two graphs and seeds
        #and use that to predict later on.

    def get_params(self) -> Params:
        return Params()

    def set_params(self, *, params: Params) -> None:
        pass
    #UnsupervisedLearner
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        #produce takes the training dataset and runs seeded graph matching using the seeds
        #then predicts using the resulting permutation_matrix

        permutation_matrix = np.asmatrix(self._seeded_graph_match(training_data=self._training_dataset))

        predictions = self._get_predictions(permutation_matrix=permutation_matrix, inputs = inputs)

        return base.CallResult(predictions)

    def _get_predictions(self,*, permutation_matrix: np.matrix, inputs: Inputs):
        testing = inputs['2']

        threshold = self.hyperparams['threshold']

        for i in range(testing.shape[0]):
            testing['match'][i] = 0
            v1 = testing['G1.nodeID'][i]
            v2 = testing['G2.nodeID'][i]
            found = False
            j = 0
            while not found:
                if self._g1_node_attributes[j] == int(v1):
                    found = True
                    v1 = j
                j += 1
            # print(found)
            found = False
            j = 0

            while not found:
                if self._g2_node_attributes[j] == int(v2):
                    found = True
                    v2 = j
                j += 1

            if permutation_matrix[v1, v2] > threshold:
                testing['match'][i] = 1
            else:
                testing['match'][i] = 0

        df = container.DataFrame({"d3mIndex": testing['d3mIndex'], "match": testing['match']})
        return df

    def _seeded_graph_match(self,*, training_data = None):
        if training_data is None:
            training_data = self._training_dataset
        seeds = training_data['2']

        new_seeds = pd.DataFrame(
            {'G1.nodeID': seeds['G1.nodeID'], 'G2.nodeID': seeds['G2.nodeID'], 'match': seeds['match']})
        new_seeds = new_seeds[new_seeds['match'] == '1']
        # we now have a seeds correspondence of nodeIDs,
        #  but we need a seed correspondence of actual vertex numbers

        # initialize the integer values to nothing:
        new_seeds['g1_vertex'] = ""
        new_seeds['g2_vertex'] = ""

        # for every seed, locate the corresponding vertex integer
        for j in range(new_seeds.shape[0]):
            found = False
            i = 0
            while not found:
                if (int(new_seeds['G1.nodeID'][j]) == self._g1_node_attributes[i]):
                    new_seeds['g1_vertex'][j] = i
                    found = True
                i += 1

        for j in range(new_seeds.shape[0]):
            found = False
            i = 0
            while not found:
                if (int(new_seeds['G2.nodeID'][j]) == self._g2_node_attributes[i]):
                    new_seeds['g2_vertex'][j] = i
                    found = True
                i += 1

        # store the vertex pairs as an m x 2 array and convert to a matrix
        seeds_array = np.array(new_seeds[['g1_vertex', 'g2_vertex']])
        seeds_array = seeds_array.astype(int)

        seeds = seeds_array
        nr, nc = seeds.shape
        seeds = ro.r.matrix(seeds, nrow=nr, ncol=nc)
        ro.r.assign("seeds", seeds)

        g1_matrix = networkx.to_numpy_array(self._g1)
        nr, nc = g1_matrix.shape
        g1_matrix = ro.r.matrix(g1_matrix, nrow=nr, ncol=nc)
        ro.r.assign("g1_matrix", g1_matrix)

        g2_matrix = networkx.to_numpy_array(self._g2)
        nr, nc = g2_matrix.shape
        g2_matrix = ro.r.matrix(g2_matrix, nrow=nr, ncol=nc)
        ro.r.assign("g2_matrix", g2_matrix)

        reps = self.hyperparams['reps']
        ro.r.assign("reps",reps)

        # run the R code:
        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "sgm.interface.R")
        path = file_path_conversion(path, uri="")

        cmd = """
                source("%s")
                fn <- function(g1_matrix, g2_matrix, seeds,reps) {
                    sgm.interface(g1_matrix, g2_matrix, seeds,reps)
                }
                """ % path
        
        result = np.array(ro.r(cmd)(g1_matrix, g2_matrix, seeds,reps))

        return container.ndarray(result)
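
The nodeID-to-vertex lookup and thresholding in _get_predictions can be expressed more compactly with dictionaries. The sketch below is a simplified, hypothetical rewrite on toy data, not the primitive's actual code path.

# Simplified sketch of turning a soft permutation matrix into match predictions (toy data).
import numpy as np

g1_node_ids = [10, 11, 12]            # nodeID attribute values of graph 1 vertices
g2_node_ids = [20, 21, 22]            # nodeID attribute values of graph 2 vertices
g1_index = {node_id: i for i, node_id in enumerate(g1_node_ids)}
g2_index = {node_id: i for i, node_id in enumerate(g2_node_ids)}

permutation_matrix = np.array([[0.9, 0.1, 0.0],
                               [0.2, 0.7, 0.1],
                               [0.0, 0.1, 0.9]])
threshold = 0.5

pairs = [(10, 20), (11, 22), (12, 22)]  # candidate (G1.nodeID, G2.nodeID) pairs to score
matches = [int(permutation_matrix[g1_index[v1], g2_index[v2]] > threshold) for v1, v2 in pairs]
print(matches)  # [1, 0, 1]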
Example n. 12
0
class TimeSeriesBinnerPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    Bins time series values according to the binning_operation. The time value can be
    a datetime stamp or an integer. If there is a GroupingKey column, binning is applied
    per group. Binning is applied to every value column that is set, not just a single column.

    Currently works for downsampling. If the column with the time semantic is a datetime,
    it can upsample but will leave NaN values for the time.
    """

    _grouping_key_semantic = (
        "https://metadata.datadrivendiscovery.org/types/GroupingKey", )
    _time_semantic = ("https://metadata.datadrivendiscovery.org/types/Time", )
    _target_semantic = (
        "https://metadata.datadrivendiscovery.org/types/Target", )

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "5fee7a91-b843-4636-a21e-a02bf0fd7f3a",
        "version":
        version.__version__,
        "name":
        "Time series binner",
        "python_path":
        "d3m.primitives.data_transformation.time_series_binner.DistilTimeSeriesBinner",
        "source": {
            "name":
            "Distil",
            "contact":
            "mailto:[email protected]",
            "uris": [
                "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/time_series_binner.py",
                "https://gitlab.com/uncharted-distil/distil-primitives-contrib",
            ],
        },
        "installation": [
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "algorithm_types":
        [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION],
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        if inputs.shape[0] == 0:
            return base.CallResult(inputs)
        # cols = distil_utils.get_operating_columns(inputs, self.hyperparams['binning_columns'], self._semantic_types)
        init_index = inputs.index
        d3m_index = inputs.columns.get_loc("d3mIndex")
        d3m_col = inputs["d3mIndex"]
        group_key_index = self._get_grouping_key_index(inputs.metadata)
        time_index = self._get_time_index(inputs.metadata)
        value_indices = self._get_value_indices(inputs.metadata)
        self.time_col_name = inputs.columns[time_index]
        self.group_col_name = inputs.columns[group_key_index]

        self.time_col_dtype = inputs.dtypes[self.time_col_name]
        self.value_columns = inputs.columns[list(value_indices)]
        usable_cols = [self.group_col_name, self.time_col_name] + list(
            self.value_columns)
        inputs = inputs[usable_cols]

        groups = inputs.groupby(self.group_col_name, sort=False)

        outputs = pd.DataFrame()
        binned_groups = [None] * len(groups)
        group_col_values = []
        i = 0
        for group_name, group in groups:
            timeseries_group = group.drop(columns=[self.group_col_name])

            timeseries_group = self._applyBinningOperation(timeseries_group)

            group_col_values += [group_name] * len(timeseries_group)
            binned_groups[i] = timeseries_group
            i += 1
        outputs = container.DataFrame(pd.concat(binned_groups))

        is_datetime_index = isinstance(outputs.index, pd.DatetimeIndex)
        if is_datetime_index:
            datetime_index = outputs.index
        if len(outputs) <= len(init_index):
            outputs = outputs.set_index(init_index[0:len(outputs)])
            outputs.insert(loc=d3m_index,
                           column="d3mIndex",
                           value=d3m_col[0:len(outputs)])
            outputs.metadata = outputs.metadata.update_column(
                metadata=inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, d3m_index)),
                column_index=d3m_index,
            )
        else:  # assume index and d3mIndex are int
            outputs = outputs.set_index(pd.Index(range(0, len(outputs), 1)))
            d3m_new_col = container.DataFrame(
                {"d3mIndex": range(0, len(outputs), 1)})
            outputs.insert(loc=d3m_index, column="d3mIndex", value=d3m_new_col)
        outputs.insert(loc=group_key_index,
                       column=self.group_col_name,
                       value=group_col_values)
        if is_datetime_index:
            outputs.insert(loc=time_index,
                           column=self.time_col_name,
                           value=datetime_index)
        outputs.metadata = inputs.metadata.select_columns(
            [d3m_index, group_key_index, time_index] + list(value_indices))
        return base.CallResult(outputs)

    def _get_grouping_key_index(self, inputs_metadata):
        group_key_index = self.hyperparams["grouping_key_col"]
        if group_key_index:
            return group_key_index
        grouping_key_indices = inputs_metadata.list_columns_with_semantic_types(
            self._grouping_key_semantic)
        if len(grouping_key_indices) > 0:
            return grouping_key_indices[0]
        raise exceptions.InvalidArgumentValueError(
            "no column with grouping key")

    def _get_time_index(self, inputs_metadata):
        time_index = self.hyperparams["time_col"]
        if time_index:
            return time_index
        time_indices = inputs_metadata.list_columns_with_semantic_types(
            self._time_semantic)
        if len(time_indices) > 0:
            return time_indices[0]
        raise exceptions.InvalidArgumentValueError("no column with time")

    def _get_value_indices(self, inputs_metadata):
        value_indices = self.hyperparams["value_cols"]
        if value_indices and len(value_indices) > 0:
            return value_indices
        value_indices = inputs_metadata.list_columns_with_semantic_types(
            self._target_semantic)
        if len(value_indices) > 0:
            return value_indices
        raise exceptions.InvalidArgumentValueError("no columns with target")

    def _granularityToRule(self):
        granularity = self.hyperparams["granularity"]
        if granularity == "seconds":
            return "S"
        elif granularity == "minutes":
            return "T"
        elif granularity == "hours":
            return "H"
        elif granularity == "days":
            return "D"
        elif granularity == "weeks":
            return "W"
        elif granularity == "months":
            return "M"
        elif granularity == "years":
            return "A"
        raise exceptions.InvalidArgumentValueError(
            "Given granularity argument not supported")

    def _applyBinningOperation(self, timeseries_group):
        if is_numeric_dtype(self.time_col_dtype):
            return self._applyIntegerNumericBinning(timeseries_group)
        timeseries_group = timeseries_group.set_index(
            pd.DatetimeIndex(timeseries_group[self.time_col_name]))
        df = timeseries_group.resample(self._granularityToRule())
        bin_oper = self.hyperparams["binning_operation"]
        return getattr(df, bin_oper)()

    def _applyIntegerNumericBinning(self, timeseries_group):
        bin_oper = self.hyperparams["binning_operation"]
        binning_size = self.hyperparams["binning_size"]
        (
            firstTime,
            right,
        ) = self._get_starting_bin_value(
            timeseries_group)  # timeseries_group[self.time_col_name][0]
        lastTime = timeseries_group[self.time_col_name].iloc[
            len(timeseries_group) - 1]
        amount_of_binning_numbers = int(
            (lastTime - firstTime) / binning_size) + 1
        amount_of_binning_intervals = amount_of_binning_numbers + 1
        binning_intervals = [
            i * binning_size + firstTime
            for i in range(amount_of_binning_intervals)
        ]
        binning_intervals[0] = binning_intervals[0] - int(right)
        timeseries_group["binned"] = pd.cut(
            x=timeseries_group[self.time_col_name],
            bins=binning_intervals,
            right=right)

        columnsToOperation = {}
        columnsToOperation[self.time_col_name] = "max"
        for value in self.value_columns:
            columnsToOperation[value] = bin_oper
        return (timeseries_group.groupby("binned").agg(
            columnsToOperation).reset_index(drop=True))

    def _get_starting_bin_value(self, df):
        if self.hyperparams["binning_starting_value"] == "zero":
            return (
                0,
                True,
            )
        else:
            return (
                df[self.time_col_name].iloc[0],
                False,
            )
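
The datetime branch of _applyBinningOperation amounts to a pandas groupby/resample/agg chain. The sketch below is a minimal standalone version with assumed column names, weekly granularity, and a 'sum' binning operation.

# Minimal sketch of per-group downsampling with pandas resample (assumed columns and operation).
import pandas as pd

df = pd.DataFrame({
    'series_id': ['a'] * 4 + ['b'] * 4,
    'time': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-08', '2020-01-09'] * 2),
    'value': [1, 2, 3, 4, 10, 20, 30, 40],
})

binned = (df.set_index('time')
            .groupby('series_id')
            .resample('W')          # weekly bins, analogous to _granularityToRule()
            .agg({'value': 'sum'})
            .reset_index())
print(binned)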
Example n. 13
0
class IsolationForestPrimitive(
        unsupervised_learning.UnsupervisedLearnerPrimitiveBase[
            container.DataFrame, container.DataFrame, Params, Hyperparams]):
    """
    Uses scikit-learn's Isolation Forest estimator to detect and label anomalies.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id":
            "793f0b17-7413-4962-9f1d-0b285540b21f",
            "version":
            version.__version__,
            "name":
            "Isolation Forest",
            "python_path":
            "d3m.primitives.classification.isolation_forest.IsolationForestPrimitive",
            "source": {
                "name":
                "Distil",
                "contact":
                "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/isolation_forest.py",
                    "https://github.com/uncharted-distil/distil-primitives-contrib",
                ],
            },
            "installation": [
                {
                    "type":
                    metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri":
                    "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.BINARY_CLASSIFICATION,
            ],
            "primitive_family":
            metadata_base.PrimitiveFamily.CLASSIFICATION,
        }, )

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = IsolationForest(
            n_estimators=self.hyperparams["n_estimators"],
            random_state=np.random.RandomState(random_seed),
        )

    def set_training_data(self, *, inputs: container.DataFrame) -> None:
        self._inputs = inputs
        self._needs_fit = True

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        logger.debug(f"Fitting {__name__}")

        if self._needs_fit:
            self._model.fit(self._inputs)
            self._needs_fit = False

        return base.CallResult(None)

    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        if self._needs_fit:
            self.fit()

        result = self._model.predict(inputs)

        result_df = container.DataFrame(
            {
                "outlier_label": result,
            },
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)

    def get_params(self) -> Params:
        return Params(
            model=self._model,
            needs_fit=self._needs_fit,
        )

    def set_params(self, *, params: Params) -> None:
        self._model = params["model"]
        self._needs_fit = params["needs_fit"]
        return
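
Outside the D3M wrapper, the fit/produce flow above is ordinary scikit-learn usage: fit an IsolationForest and map its 1/-1 predictions into a labeled column. A minimal sketch with illustrative data and column names:

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.normal(size=(100, 2)), columns=["a", "b"])
X_test = pd.DataFrame([[0.0, 0.0], [8.0, 8.0]], columns=["a", "b"])

model = IsolationForest(n_estimators=10, random_state=rng)
model.fit(X_train)
# predict() returns 1 for inliers and -1 for outliers
result_df = pd.DataFrame({"outlier_label": model.predict(X_test)})
print(result_df)
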
class TimeSeriesLoaderPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    Reads the time series files from a given column in an input dataframe into a new M x N dataframe,
    where each timeseries occupies one of the M rows, and each of a row's N entries holds the
    series value at one of N shared timestamps (the timestamps become the column headers).
    The loading process assumes that each series file has an identical set of timestamps.
    """

    _semantic_types = (
        'https://metadata.datadrivendiscovery.org/types/FileName',
        'https://metadata.datadrivendiscovery.org/types/Timeseries')
    _media_types = ('text/csv', )

    __author__ = 'Uncharted Software'
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '1689aafa-16dc-4c55-8ad4-76cadcf46086',
        'version':
        '0.1.0',
        'name':
        'Time series loader',
        'python_path':
        'd3m.primitives.data_preprocessing.time_series_to_list.TimeSeriesLoader',
        'keywords': ['series', 'reader', 'csv'],
        'source': {
            'name':
            'Uncharted Software',
            'contact':
            'mailto:[email protected]',
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/sloth-d3m-wrapper",
            ],
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/unchartedsoftware/distil-timeseries-loader.git@'
            + '{git_commit}#egg=distil-timeseries-loader'.format(
                git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        'supported_media_types':
        _media_types,
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _find_csv_file_column(
            cls, inputs_metadata: metadata_base.DataMetadata
    ) -> typing.Optional[int]:
        indices = utils.list_columns_with_semantic_types(
            inputs_metadata, cls._semantic_types)
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int) -> bool:
        # check to see if a given column is a file pointer that points to a csv file
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata['structural_type'] != str:
            return False

        semantic_types = column_metadata.get('semantic_types', [])
        media_types = column_metadata.get('media_types', [])

        return set(cls._semantic_types).issubset(semantic_types) and set(
            cls._media_types).issubset(media_types)

    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) + ' from ' +
                    str(inputs.columns) + ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column from ' + str(inputs.columns) +
                    ' contains csv file names')
        value_index = self.hyperparams['value_col_index']
        time_index = self.hyperparams['time_col_index']

        base_path = inputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0]
        timeseries_dataframe: pd.DataFrame
        for idx, file_path in enumerate(inputs.iloc[:, file_index]):
            csv_path = os.path.join(base_path, file_path)
            timeseries_row = pd.read_csv(csv_path).transpose()
            # use the time values as the column headers
            if idx == 0:
                timeseries_dataframe = pd.DataFrame(
                    columns=timeseries_row.iloc[time_index])

            timeseries_dataframe = timeseries_dataframe.append(
                timeseries_row.iloc[value_index])

        # get the index to use a range of ints rather than the value col name
        timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(container.DataFrame(data=timeseries_dataframe))

    @classmethod
    def can_accept(
        cls, *, method_name: str,
        arguments: typing.Dict[str,
                               typing.Union[metadata_base.Metadata,
                                            type]], hyperparams: Hyperparams
    ) -> typing.Optional[metadata_base.DataMetadata]:
        output_metadata = super().can_accept(method_name=method_name,
                                             arguments=arguments,
                                             hyperparams=hyperparams)

        # If structural types didn't match, don't bother.
        if output_metadata is None:
            return None

        if method_name != 'produce':
            return output_metadata

        if 'inputs' not in arguments:
            return output_metadata

        inputs_metadata = typing.cast(metadata_base.DataMetadata,
                                      arguments['inputs'])

        # make sure there's a file column that points to a csv (search if unspecified)
        file_col_index = hyperparams['file_col_index']
        if file_col_index is not None:
            can_use_column = cls._is_csv_file_column(inputs_metadata,
                                                     file_col_index)
            if not can_use_column:
                return None
        else:
            inferred_index = cls._find_csv_file_column(inputs_metadata)
            if inferred_index is None:
                return None
        # we don't have access to the data at this point so there's not much that we can
        # do to figure out the resulting shape etc
        return inputs_metadata
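
The produce() loop above reads one CSV per input row, transposes it, uses the time row as column headers, and stacks the value rows. A self-contained sketch of that logic, writing two tiny series files first and collecting rows into a list instead of the deprecated DataFrame.append:

import pandas as pd

# write two tiny series files so the sketch is self-contained
pd.DataFrame({"time": [0, 1, 2], "value": [1.0, 2.0, 3.0]}).to_csv("series_0.csv", index=False)
pd.DataFrame({"time": [0, 1, 2], "value": [4.0, 5.0, 6.0]}).to_csv("series_1.csv", index=False)

csv_paths = ["series_0.csv", "series_1.csv"]
time_index, value_index = 0, 1          # row positions after transposing

rows, columns = [], None
for path in csv_paths:
    series = pd.read_csv(path).transpose()
    if columns is None:
        # time values of the first file become the column headers
        columns = list(series.iloc[time_index])
    rows.append(series.iloc[value_index])

timeseries_dataframe = pd.DataFrame(rows)
timeseries_dataframe.columns = columns
timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)
print(timeseries_dataframe)             # 2 rows, one column per shared timestamp
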
Example n. 15
class GaussianClustering(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,Hyperparams]):
    """
    Expectation-Maximization algorithm for clustering
    """
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '5194ef94-3683-319a-9d8d-5c3fdd09de24',
        'version': "0.1.0",
        'name': "jhu.gclust",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.graph_clustering.gaussian_clustering.JHU',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['graph', 'gaussian clustering'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/gclust/gclust.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
            'contact': 'mailto:[email protected]',
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type': 'UBUNTU',
                'package': 'libxml2-dev',
                'version': '2.9.4'
            },
            {
                'type': 'UBUNTU',
                'package': 'libpcre3-dev',
                'version': '2.9.4'
            },
            {
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
        }],
        'description': 'Expectation-Maximization algorithm for clustering',
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris':
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            "EXPECTATION_MAXIMIZATION_ALGORITHM"
        ],
        'primitive_family': "GRAPH_CLUSTERING",
        'preconditions': ['NO_MISSING_VALUES']
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._embedding: container.ndarray = None

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        TODO: YP description

        **Positional Arguments:**

        inputs:
            - A matrix

        **Optional Arguments:**

        dim:
            - The number of clusters in which to assign the data
        """

        if self._embedding is None:
            self._embedding = inputs[0]

        nodeIDs = inputs[1]
        nodeIDs = np.array([int(i) for i in nodeIDs])

        max_clusters = self.hyperparams['max_clusters']

        if max_clusters < self._embedding.shape[1]:
            self._embedding = self._embedding[:, :max_clusters].copy()

        gclust_object = graspyGCLUST(min_components=max_clusters, covariance_type="all")
        gclust_object.fit(self._embedding)
        model = gclust_object.model_

        predictions = model.predict(self._embedding)

        testing = inputs[2]

        testing_nodeIDs = np.asarray(testing['G1.nodeID'])
        testing_nodeIDs = np.array([int(i) for i in testing_nodeIDs])
        final_labels = np.zeros(len(testing))

        for i in range(len(testing_nodeIDs)):
            label = predictions[i]
            final_labels[i] = int(label) + 1

        testing['classLabel'] = final_labels
        outputs = container.DataFrame(testing[['d3mIndex', 'classLabel']])
        outputs[['d3mIndex', 'classLabel']] = outputs[['d3mIndex', 'classLabel']].astype(int)
        
        return base.CallResult(outputs)


    def set_training_data(self, *, inputs: Inputs) -> None:
        self._training_inputs = inputs

    def get_params(self) -> Params:
        return Params(embedding = self._embedding)

    def set_params(self, *, params: Params) -> None:
        self._embedding = params['embedding']

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return base.CallResult(None)


        # clf.fit(self._embedding)
        # BIC_max = -clf.bic(self._embedding)
        # cluster_likelihood_max = 1
        # cov_type_likelihood_max = "spherical"

        # for i in range(1, max_clusters):
        #     for k in cov_types:
        #         clf = GaussianMixture(n_components=i,
        #                             covariance_type=k)

        #         clf.fit(self._embedding)

        #         current_bic = -clf.bic(self._embedding)

        #         if current_bic > BIC_max:
        #             BIC_max = current_bic
        #             cluster_likelihood_max = i
        #             cov_type_likelihood_max = k

        # clf = GaussianMixture(n_components = cluster_likelihood_max,
        #                 covariance_type = cov_type_likelihood_max)
        # clf.fit(self._embedding)
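
The commented-out block above outlines a BIC-driven search over GaussianMixture settings. The same idea in runnable form on a toy embedding (minimizing BIC is equivalent to maximizing -BIC as written above):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
embedding = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(5, 1, (50, 2))])

best_bic, best_model = np.inf, None
for n_components in range(1, 5):
    for cov_type in ("full", "tied", "diag", "spherical"):
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cov_type,
                              random_state=0).fit(embedding)
        bic = gmm.bic(embedding)        # lower BIC is better
        if bic < best_bic:
            best_bic, best_model = bic, gmm

predictions = best_model.predict(embedding)
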
class SpectralClustering(TransformerPrimitiveBase[Inputs, Outputs,
                                                  Hyperparams]):
    '''
        Primitive that applies the sklearn spectral clustering algorithm to unsupervised,
        supervised or semi-supervised datasets.

        Training inputs: D3M dataframe with features and labels, and D3M indices

        Outputs: D3M dataframe with cluster predictions and D3M indices. Cluster labels carry the
        "PredictedTarget" semantic type if the task_type hyperparameter is clustering, and the
        "Attribute" semantic type if the task_type is classification.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2",
        'version':
        __version__,
        'name':
        "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering', 'Graph Clustering'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.14',
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.clustering.spectral_graph_clustering.SpectralClustering',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self.sc = SC(n_clusters=self.hyperparams['n_clusters'],
                     n_init=self.hyperparams['n_init'],
                     n_neighbors=self.hyperparams['n_neighbors'],
                     affinity=self.hyperparams['affinity'],
                     random_state=self.random_seed)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """

        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.drop(columns=list(inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(
            pandas.DataFrame(self.sc.fit_predict(X_test),
                             columns=['cluster_labels']))

        # add column metadata for the new 'cluster_labels' column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0),
                                               col_dict)

        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ),
                                               df_dict)

        return CallResult(utils_cp.append_columns(inputs, sc_df))
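
Stripped of the D3M metadata bookkeeping, produce() is a single sklearn SpectralClustering fit_predict call on the feature matrix. A minimal sketch with illustrative hyperparameter values:

import numpy as np
from sklearn.cluster import SpectralClustering as SC

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 0.5, (30, 2)), rng.normal(4, 0.5, (30, 2))])

sc = SC(n_clusters=2, n_init=10, n_neighbors=10,
        affinity="nearest_neighbors", random_state=0)
cluster_labels = sc.fit_predict(X)      # one integer label per row, as in 'cluster_labels'
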
class Sent2Vec(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
        Produce numerical representations (features) for short texts or sentences.

        Parameters
        ----------
        inputs : Input pandas dataframe

        Returns
        -------
        Outputs
            The output is a pandas dataframe
        """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "cf450079-9333-4a3f-aed4-b77a4e8c7be7",
        "version":
        __version__,
        "name":
        "sent2vec_wrapper",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords":
        ["Sent2Vec", "Embedding", "NLP", "Natural Language Processing"],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper"
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper.git@{git_commit}#egg=sent2vec_wrapper"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__))),
            },
            {
                "type":
                "FILE",
                "key":
                "sent2vec_model",
                "file_uri":
                "http://public.datadrivendiscovery.org/twitter_bigrams.bin",
                "file_digest":
                "9e8ccfea2aaa4435ca61b05b11b60e1a096648d56fff76df984709339f423dd6",
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types":
        [metadata_base.PrimitiveAlgorithmType.VECTORIZATION],
        "primitive_family":
        metadata_base.PrimitiveFamily.FEATURE_EXTRACTION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         volumes=volumes)

        self.volumes = volumes

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce numerical representations (features) for short texts or sentences.

        Parameters
        ----------
        inputs : Input pandas dataframe

        Returns
        -------
        Outputs
            The output is a pandas dataframe
        """

        # extract sentences stored in nested media files
        text_columns = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        base_paths = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in text_columns
        ]
        txt_paths = [[
            os.path.join(base_path, filename)
            for filename in inputs.iloc[:, col]
        ] for base_path, col in zip(base_paths, text_columns)]
        txt = [[
            open(path, 'r').read().replace('\n', '') for path in path_list
        ] for path_list in txt_paths]
        txt_df = pd.DataFrame(np.array(txt).T)

        # concatenate with text columns that aren't stored in nested files
        local_text_columns = inputs.metadata.get_columns_with_semantic_type(
            'http://schema.org/Text')
        local_text_columns = [
            col for col in local_text_columns if col not in text_columns
        ]
        frame = pd.concat((txt_df, inputs[local_text_columns]), axis=1)

        # delete columns with path names of nested media files
        outputs = inputs.remove_columns(text_columns)

        try:
            vectorizer = _Sent2Vec(path=self.volumes["sent2vec_model"])
            #print('loaded sent2vec model', file = sys.__stdout__)
            output_vectors = []
            for col in range(frame.shape[1]):
                text = frame.iloc[:, col].tolist()
                embedded_sentences = vectorizer.embed_sentences(sentences=text)
                output_vectors.append(embedded_sentences)
            embedded_df = pd.DataFrame(
                np.array(output_vectors).reshape(len(embedded_sentences), -1))
        except ValueError:
            # just return inputs with file names deleted if vectorizing fails
            return CallResult(outputs)

        #print('successfully vectorized text\n', file = sys.__stdout__)

        # create df with vectorized columns and append to input df
        embedded_df = d3m_DataFrame(embedded_df)
        for col in range(embedded_df.shape[1]):
            col_dict = dict(
                embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = "vector_" + str(col)
            col_dict["semantic_types"] = (
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            embedded_df.metadata = embedded_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, col), col_dict)
        df_dict = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = embedded_df.shape[1]
        embedded_df.metadata = embedded_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)
        return CallResult(outputs.append_columns(embedded_df))
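
The vectorization step above needs the pretrained Sent2Vec model volume; the way its output is shaped into float "vector_i" columns can be sketched with a hypothetical stand-in embed() function (not the real Sent2Vec API):

import numpy as np
import pandas as pd

def embed(sentences, dim=4, seed=0):
    # hypothetical stand-in for Sent2Vec.embed_sentences(): one dim-length vector per sentence
    rng = np.random.RandomState(seed)
    return rng.rand(len(sentences), dim)

sentences = ["a short text", "another sentence"]
vectors = embed(sentences)
embedded_df = pd.DataFrame(vectors,
                           columns=["vector_" + str(i) for i in range(vectors.shape[1])])
# in the primitive these columns are appended to the input frame with Float/Attribute metadata
print(embedded_df)
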
Example n. 18
class StatisticalAbsEnergyPrimitive(
        transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Primitive to find abs_energy of time series
    """

    __author__ = "DATA Lab at Texas A&M University",
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '73299ffe-d8bb-43c6-a6cc-9261f5e17a5e',
        'version':
        '0.1.0',
        'name':
        'Time Series Statistical Abs Energy',
        'python_path':
        'd3m.primitives.tods.feature_analysis.statistical_abs_energy',
        'keywords': ['Time Series', 'AbsEnergy'],
        "hyperparams_to_tune": ['window_size'],
        'source': {
            'name':
            'DATA Lab at Texas A&M University',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalAbsEnergy.py'
            ],
            'contact':
            'mailto:[email protected]'
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.
            format(git_commit=d3m_utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
    })

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """

        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame containing abs_energy of time series
        """
        self.logger.info('Statistical AbsEnergy Primitive called')

        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        statistical_abs_energy_input = inputs
        if self.hyperparams['use_semantic_types']:
            statistical_abs_energy_input = inputs.iloc[:,
                                                       self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            statistical_abs_energy_output = self._abs_energy(
                statistical_abs_energy_input, self.hyperparams["window_size"])

            if sparse.issparse(statistical_abs_energy_output):
                statistical_abs_energy_output = statistical_abs_energy_output.toarray(
                )
            outputs = self._wrap_predictions(inputs,
                                             statistical_abs_energy_output)

            #if len(outputs.columns) == len(self._input_column_names):
            # outputs.columns = self._input_column_names

            output_columns = [outputs]

        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        self.logger.info('Statistical AbsEnergy Primitive returned')

        return base.CallResult(outputs)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            list
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index,
                                           hyperparams)

        use_columns = hyperparams['use_columns']
        exclude_columns = hyperparams['exclude_columns']

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=use_columns,
            exclude_columns=exclude_columns,
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        # NOTE: this early return bypasses the semantic-type check below, so any
        # column with an accepted structural type is produced.
        return True
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _update_predictions_metadata(
        cls, inputs_metadata: metadata_base.DataMetadata,
        outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(
                target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(
                column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(
            outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)

        return outputs

    @classmethod
    def _add_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            # column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            # column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs: Inputs):
        inputs.to_csv(str(time.time()) + '.csv')

    def _abs_energy(self, X, window_size):
        """ Statistical abs_energy of a time series sequence.
        Args:
            X : DataFrame
                Time series.
        Returns:
            DataFrame
                An object with abs_energy
        """
        if (window_size == -1):
            window_size = len(X)
        transformed_X = utils.pandas.DataFrame()
        for column in X.columns:
            column_value = X[column].values
            column_abs_energy = np.zeros(len(column_value))
            for iter in range(window_size - 1, len(column_value)):
                sequence = column_value[iter - window_size + 1:iter + 1]
                column_abs_energy[iter] = np.round(np.sum(sequence * sequence),
                                                   4)
            column_abs_energy[:window_size -
                              1] = column_abs_energy[window_size - 1]
            transformed_X[column + "_abs_energy"] = column_abs_energy

        return transformed_X
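
_abs_energy() is a sliding-window sum of squares; the same computation can be expressed with pandas rolling(), as in this sketch with an illustrative column name:

import numpy as np
import pandas as pd

X = pd.DataFrame({"value": [1.0, 2.0, 3.0, 4.0]})
window_size = 2

transformed = pd.DataFrame()
for column in X.columns:
    energy = X[column].rolling(window_size).apply(lambda s: np.round(np.sum(s * s), 4), raw=True)
    # back-fill the first window_size - 1 positions, as the primitive does
    energy.iloc[:window_size - 1] = energy.iloc[window_size - 1]
    transformed[column + "_abs_energy"] = energy
print(transformed)
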
class IVectorExtractor(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                        Params, Hyperparams]):
    """
    BBN D3M I-vector extractor extracts i-vectors for variable-length input sequences of feature vectors.
    Input: List of arrays with feature vectors extracted for frames [ num_frames, num_features ]
    Output: Array of i-vectors of shape [ num_inputs, ivec_dim ]
    Applications include: audio, time-series classification
    """

    __git_commit__ = utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id':
        '1c5080bd-7b2f-4dbb-ac5f-0a65b59526a7',
        'version':
        __version__,
        'name':
        "I-vector extractor",
        'description':
        """BBN D3M I-vector extractor extracts i-vectors for variable-length input sequences of feature vectors.\n
                        Input: List of arrays with feature vectors extracted for frames [ num_frames, num_features ]\n
                        Output: Array of i-vectors of shape [ num_inputs, ivec_dim ]\n
                        Applications include: audio, time-series classification""",
        'keywords': [],
        'source': {
            'name':
            __author__,
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/ivector_extraction.py'
                .format(git_commit=__git_commit__),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'
            .format(git_commit=__git_commit__, egg='bbn_primitives'),
        }],
        'python_path':
        'd3m.primitives.data_transformation.i_vector_extractor.IVectorExtractor',  #'d3m.primitives.bbn.time_series.IVectorExtractor', #'d3m.primitives.data_transformation.ivector_extractor.BBN',
        'algorithm_types':
        [metadata_module.PrimitiveAlgorithmType.DATA_CONVERSION],
        'primitive_family':
        metadata_module.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str,
                                           DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._training_inputs = None
        self._gmm = GaussianMixture(
            n_components=self.hyperparams['num_gauss'],
            covariance_type=self.hyperparams['gmm_covariance_type'],
            max_iter=self.hyperparams['max_gmm_iter'])
        self._v = None
        self._fitted: bool = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._training_inputs = inputs
        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None:
            raise Exception('Missing training data')

        with stopit.ThreadingTimeout(timeout) as timer:
            # Train GMM
            _logger.info('Training GMM')
            num_data = len(self._training_inputs)
            #for idx in range(num_data):
            #    X = self._training_inputs[idx]
            #    print(X.shape)
            self._gmm.fit(
                np.vstack(
                    [x for x in self._training_inputs if len(x.shape) == 2]))

            # Train i-vector extractor
            self._v = np.random.randn(
                self._gmm.n_components * self._gmm.means_.shape[1],
                self.hyperparams['ivec_dim'])
            _logger.info('Training i-vector extractor')
            N = np.zeros((num_data, self._gmm.n_components))
            F = np.zeros(
                (num_data, self._gmm.n_components * self._gmm.means_.shape[1]))
            # TODO: Do the E-step in mini-batches to prevent memory overflow
            for idx in range(num_data):
                X = self._training_inputs[idx]
                if len(X.shape) != 2:
                    continue
                gamma = self._gmm.predict_proba(X)
                N0 = gamma.T.sum(axis=1)
                F0 = gamma.T.dot(X)
                N0, F0 = normalize_stats(N0, F0, self._gmm.means_,
                                         self._gmm.precisions_cholesky_)
                N[idx, :] = N0
                F[idx, :] = F0.flatten()

            for ivec_iter in range(self.hyperparams['num_ivec_iter']):
                _logger.info('Training i-vector extractor - iteration %d' %
                             ivec_iter)
                num_data = len(self._training_inputs)
                A, C, Amd, Cmd, Nmd = None, None, None, None, None
                VtV, I = None, None

                A, C, Amd, Cmd, Nmd = E_step_with_MD(N, F, self._v, VtV, I, A,
                                                     C, Amd, Cmd, Nmd)
                em_v = M_step(A, C)
                md_v = M_step_MD(Amd, Cmd, Nmd, em_v)
                self._v = md_v.reshape(
                    (self._gmm.n_components * self._gmm.means_.shape[1],
                     self.hyperparams['ivec_dim']))

            self._fitted = True

        if timer.state == timer.EXECUTED:
            return CallResult(None)
        else:
            raise TimeoutError('IVectorExtractor exceeded time limit')

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        with stopit.ThreadingTimeout(timeout) as timer:
            num_data = len(inputs)
            outputs = np.empty((num_data, self.hyperparams['ivec_dim']),
                               dtype=self._v.dtype)
            VtV = compute_VtV(self._v, self._gmm.n_components)
            I = np.eye(self.hyperparams['ivec_dim'], dtype=self._v.dtype)
            for idx in range(num_data):
                X = inputs[idx]
                if len(X.shape) != 2:
                    outputs[idx] = np.zeros((self.hyperparams['ivec_dim']))
                    continue
                gamma = self._gmm.predict_proba(X)
                N0 = gamma.T.sum(axis=1)
                F0 = gamma.T.dot(X)
                N0, F0 = normalize_stats(N0, F0, self._gmm.means_,
                                         self._gmm.precisions_cholesky_)
                ivec = estimate_i(row(N0.astype(self._v.dtype)),
                                  row(F0.astype(self._v.dtype)), self._v, VtV,
                                  I)
                outputs[idx] = ivec.flatten()
            # optionally apply L2 length-normalization to the i-vectors
            if (self.hyperparams['ivec_normalize']):
                outputs = preprocessing.normalize(outputs, norm='l2')
            outputs = d3m_dataframe(outputs, generate_metadata=False)

            metadata = inputs.metadata.clear(
                {
                    'schema':
                    metadata_module.CONTAINER_SCHEMA_VERSION,
                    'structural_type':
                    type(outputs),
                    'semantic_types':
                    ['https://metadata.datadrivendiscovery.org/types/Table'],
                    'dimension': {
                        'length':
                        outputs.shape[0],
                        'name':
                        'rows',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularRow'
                        ]
                    }
                },
                for_value=outputs
            ).update(
                ((metadata_base.ALL_ELEMENTS, )), {
                    'dimension': {
                        'length':
                        outputs.shape[1],
                        'name':
                        'columns',
                        'semantic_types': [
                            'https://metadata.datadrivendiscovery.org/types/TabularColumn'
                        ]
                    }
                }
            ).update(
                ((metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS)),
                {
                    #'structural_type': self._v.dtype,
                    'semantic_types': [
                        'https://metadata.datadrivendiscovery.org/types/Attribute'
                    ],
                })

            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state == timer.EXECUTED:
            return CallResult(outputs)
        else:
            raise TimeoutError('IVectorExtractor exceeded time limit')

    def get_params(self) -> Params:
        return Params(weights=self._gmm.weights_,
                      means=self._gmm.means_,
                      covs=self._gmm.covariances_,
                      cov_type=self._gmm.covariance_type,
                      v=self._v)

    def set_params(self, *, params: Params) -> None:
        assert self._gmm.covariance_type == params['cov_type']
        # Consider adding additional assertions regarding dims

        self._gmm.weights_ = params['weights']
        self._gmm.means_ = params['means']
        self._gmm.covariances_ = params['covs']
        self._gmm.precisions_cholesky_ = sklearn.mixture.gaussian_mixture._compute_precision_cholesky(
            params['covs'], params['cov_type'])
        self._v = params['v']
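
Both fit() and produce() above start from the same zeroth- and first-order sufficient statistics of the GMM responsibilities. A standalone sketch of that step with scikit-learn on toy frame data:

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
frames = rng.normal(size=(200, 3))      # one utterance: [ num_frames, num_features ]

gmm = GaussianMixture(n_components=4, covariance_type="diag",
                      max_iter=20, random_state=0).fit(frames)

gamma = gmm.predict_proba(frames)       # responsibilities [ num_frames, num_components ]
N0 = gamma.sum(axis=0)                  # zeroth-order stats [ num_components ]
F0 = gamma.T.dot(frames)                # first-order stats  [ num_components, num_features ]
# the primitive then centers/whitens (N0, F0) with normalize_stats() before i-vector estimation
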
Example n. 20
class ForecastingNBEATSPrimitive(
        SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                       ForecastingNBEATSParams,
                                       ForecastingNBEATSHyperparams]):
    """
    N-BEATS for time series forecasting
    """
    metadata = metadata_base.PrimitiveMetadata(
        {
            'id':
            'bd925663-aeaf-4240-9748-cd77dce33819',
            'version':
            '0.1.0',
            "name":
            "N-BEATS models for time series forecasting",
            'description':
            "Pytorch Implementation of N-BEATS. The model is doing local projections to basis "
            "functions. These functions include \"trends\" (with polynomial functions) and "
            "\"seasonalities\" (with harmonic functions). The prediction will consist of adding the "
            "local projections to these basis functions to the last available value in the ts (Naive "
            "1). The model decomposes the signals successively through different \"blocks\" of a fully "
            "connected residual NN.",
            'python_path':
            'd3m.primitives.time_series_forecasting.nbeats.DeepNeuralNetwork',
            'source': {
                'name': nbeats.__author__,
                'uris': ['https://github.com/autonlab/nbeats'],
                'contact': 'mailto:[email protected]'
            },
            'installation': [{
                'type':
                metadata_base.PrimitiveInstallationType.PIP,
                'package_uri':
                'git+https://github.com/autonlab/nbeats.git@{git_commit}#egg=nbeats'
                .format(git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            }],
            'algorithm_types': [
                metadata_base.PrimitiveAlgorithmType.DEEP_NEURAL_NETWORK,
            ],
            'primitive_family':
            metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
        }, )

    def __init__(self,
                 *,
                 hyperparams: ForecastingNBEATSHyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._is_fitted = False

        self._device = 'cpu' if not torch.cuda.is_available(
        ) or hyperparams['device'] == 'cpu' else hyperparams['device']
        print("Use " + self._device)
        self.logger.info("Use " + self._device)

        self._nbeats = Nbeats(
            input_size_multiplier=hyperparams['input_size_multiplier'],
            window_sampling_limit_multiplier=hyperparams[
                'window_sampling_limit_multiplier'],
            shared_weights=hyperparams['shared_weights'],
            output_size=hyperparams['output_size'],
            stack_types=hyperparams['stack_types'],
            n_blocks=hyperparams['n_blocks'],
            n_layers=hyperparams['n_layers'],
            n_hidden=hyperparams['n_hidden'],
            n_harmonics=hyperparams['n_harmonics'],
            n_polynomials=hyperparams['n_polynomials'],
            n_iterations=hyperparams['n_iterations'],
            learning_rate=hyperparams['learning_rate'],
            lr_decay=hyperparams['lr_decay'],
            n_lr_decay_steps=hyperparams['n_lr_decay_steps'],
            batch_size=hyperparams['batch_size'],
            loss=hyperparams['loss'],
            seasonality=hyperparams['seasonality'],
            # random_seed=random_seed,  # FIXME pipelines are tuned on NBeats default seed
            random_seed=1,
            device=self._device)
        self._time_column = None
        self._integer_time = False
        self.filter_idxs = []
        self._year_column = None
        self._constant = 0  # the constant term to avoid nan
        self._y_mean = 0  # the mean of the target variable in the training data

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._is_fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        inputs_copy = inputs.copy()

        # if datetime columns are integers, parse as # of days
        if self._integer_time:
            inputs_copy[self._time_column] = pd.to_datetime(
                inputs_copy[self._time_column], unit="D")
        else:
            inputs_copy[self._time_column] = pd.to_datetime(
                inputs_copy[self._time_column], unit="s")

        # find marked 'GroupingKey' or 'SuggestedGroupingKey'
        grouping_keys = inputs_copy.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/GroupingKey")
        suggested_grouping_keys = inputs_copy.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey"
        )
        if len(grouping_keys) == 0:
            grouping_keys = suggested_grouping_keys
        else:
            inputs_copy = inputs_copy.drop(columns=[
                list(inputs_copy)[i] for i in suggested_grouping_keys
            ])

        # check whether no grouping keys are labeled
        if len(grouping_keys) == 0:
            concat = pd.concat([inputs_copy[self._time_column]], axis=1)
            concat.columns = ['ds']
            concat['unique_id'] = 'series1'  # We have only one series
        else:
            # concatenate columns in `grouping_keys` to unique_id column
            concat = inputs_copy.loc[:, self.filter_idxs].apply(
                lambda x: ' '.join([str(v) for v in x]), axis=1)
            concat = pd.concat([concat, inputs_copy[self._time_column]],
                               axis=1)
            concat.columns = ['unique_id', 'ds']

        X_test = concat[['unique_id', 'ds']]

        predictions = self._nbeats.predict(X_test)
        predictions['y_hat'] -= self._constant
        predictions['y_hat'] = self._fillna(predictions['y_hat'])
        output = container.DataFrame(predictions['y_hat'],
                                     generate_metadata=True)
        return base.CallResult(output)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        data = inputs.horizontal_concat(outputs)
        data = data.copy()

        # mark datetime column
        times = data.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/DateTime",
        ))
        if len(times) != 1:
            raise ValueError(
                f"There are {len(times)} indices marked as datetime values. Please only specify one"
            )
        self._time_column = list(data)[times[0]]

        # if datetime columns are integers, parse as # of days
        if ("http://schema.org/Integer"
                in inputs.metadata.query_column(times[0])["semantic_types"]):
            self._integer_time = True
            data[self._time_column] = pd.to_datetime(data[self._time_column],
                                                     unit="D")
        else:
            data[self._time_column] = pd.to_datetime(data[self._time_column],
                                                     unit="s")

        # sort by time column
        data = data.sort_values(by=[self._time_column])

        # mark key and grp variables
        self.key = data.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey")

        # mark target variables
        self._targets = data.metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/Target",
        ))
        self._target_types = [
            "i" if "http://schema.org/Integer"
            in data.metadata.query_column(t)["semantic_types"] else "c"
            if "https://metadata.datadrivendiscovery.org/types/CategoricalData"
            in data.metadata.query_column(t)["semantic_types"] else "f"
            for t in self._targets
        ]
        self._targets = [list(data)[t] for t in self._targets]

        self.target_column = self._targets[0]

        # see if 'GroupingKey' has been marked
        # otherwise fall through to use 'SuggestedGroupingKey'
        grouping_keys = data.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/GroupingKey")
        suggested_grouping_keys = data.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey"
        )
        if len(grouping_keys) == 0:
            grouping_keys = suggested_grouping_keys
            drop_list = []
        else:
            drop_list = suggested_grouping_keys

        grouping_keys_counts = [
            data.iloc[:, key_idx].nunique() for key_idx in grouping_keys
        ]
        grouping_keys = [
            group_key for count, group_key in sorted(
                zip(grouping_keys_counts, grouping_keys))
        ]
        self.filter_idxs = [list(data)[key] for key in grouping_keys]

        # drop index
        data.drop(columns=[list(data)[i] for i in drop_list + self.key],
                  inplace=True)

        # check whether no grouping keys are labeled
        if len(grouping_keys) == 0:
            concat = pd.concat(
                [data[self._time_column], data[self.target_column]], axis=1)
            concat.columns = ['ds', 'y']
            concat['unique_id'] = 'series1'  # We have only one series
        else:
            # concatenate columns in `grouping_keys` to unique_id column
            concat = data.loc[:, self.filter_idxs].apply(
                lambda x: ' '.join([str(v) for v in x]), axis=1)
            concat = pd.concat(
                [concat, data[self._time_column], data[self.target_column]],
                axis=1)
            concat.columns = ['unique_id', 'ds', 'y']

        if len(grouping_keys):
            # Infer frequency
            freq = self._nbeats.frequency
            if not freq:
                freq = pd.infer_freq(concat.head()['ds'])
                if freq is None and len(concat['unique_id']) > 0:
                    freq = pd.infer_freq(concat[concat['unique_id'] ==
                                                concat['unique_id'][0]]['ds'])
                if freq is None:
                    freq = 'D'
                    self.logger.warning('Cannot infer frequency; falling back to "D".')
                else:
                    self.logger.info('Inferred frequency: {}'.format(freq))

            # Series must be complete in the frequency
            concat = ForecastingNBEATSPrimitive._ffill_missing_dates_per_serie(
                concat, freq)

        # remove duplicates
        concat = concat.drop_duplicates(['unique_id', 'ds'])

        self._data = concat

        self._y_mean = self._data['y'].mean()

        # if the minimum of y is non-positive, add a constant so the minimum becomes 1
        # (the same constant is subtracted from the predictions in produce)
        if self._data['y'].min() <= 0:
            self._constant = 1 - self._data['y'].min()

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        y_train = self._data[['unique_id', 'ds', 'y']].copy()
        y_train['y'] += self._constant
        self._nbeats.fit(y_train, verbose=False)
        self._is_fitted = True

        return base.CallResult(None)

    def get_params(self) -> Params:
        return ForecastingNBEATSParams(is_fitted=self._is_fitted,
                                       time_column=self._time_column,
                                       integer_time=self._integer_time,
                                       filter_idxs=self.filter_idxs,
                                       y_mean=self._y_mean,
                                       nbeats=self._nbeats)

    def set_params(self, *, params: Params) -> None:
        self._is_fitted = params['is_fitted']
        self._time_column = params['time_column']
        self._integer_time = params['integer_time']
        self.filter_idxs = params['filter_idxs']
        self._y_mean = params['y_mean']
        self._nbeats = params['nbeats']

    @staticmethod
    def _ffill_missing_dates_particular_serie(serie, min_date, max_date, freq):
        date_range = pd.date_range(start=min_date, end=max_date, freq=freq)
        unique_id = serie['unique_id'].unique()
        df_balanced = pd.DataFrame({
            'ds': date_range,
            'key': [1] * len(date_range),
            'unique_id': unique_id[0]
        })

        # Check balance
        check_balance = df_balanced.groupby(
            ['unique_id']).size().reset_index(name='count')
        assert len(set(check_balance['count'].values)) <= 1
        df_balanced = df_balanced.merge(serie,
                                        how="left",
                                        on=['unique_id', 'ds'])

        df_balanced['y'] = df_balanced['y'].ffill()

        return df_balanced

    @staticmethod
    def _ffill_missing_dates_per_serie(df, freq="D", fixed_max_date=None):
        """Receives a DataFrame with a date column and forward fills the missing gaps in dates, not filling dates before


        Parameters
        ----------
        df: DataFrame
            Input DataFrame
        key: str or list
            Name(s) of the column(s) which make a unique time series
        date_col: str
            Name of the column that contains the time column
        freq: str
            Pandas time frequency standard strings, like "W-THU" or "D" or "M"
        numeric_to_fill: str or list
            Name(s) of the columns with numeric values to fill "fill_value" with
        """
        if fixed_max_date is None:
            df_max_min_dates = df[['unique_id',
                                   'ds']].groupby('unique_id').agg(
                                       ['min', 'max']).reset_index()
        else:
            df_max_min_dates = df[['unique_id',
                                   'ds']].groupby('unique_id').agg(
                                       ['min']).reset_index()
            df_max_min_dates['max'] = fixed_max_date

        df_max_min_dates.columns = df_max_min_dates.columns.droplevel()
        df_max_min_dates.columns = ['unique_id', 'min_date', 'max_date']

        df_list = []
        for index, row in df_max_min_dates.iterrows():
            df_id = df[df['unique_id'] == row['unique_id']]
            df_id = ForecastingNBEATSPrimitive._ffill_missing_dates_particular_serie(
                df_id, row['min_date'], row['max_date'], freq)
            df_list.append(df_id)

        df_dates = pd.concat(df_list).reset_index(drop=True).drop(
            'key', axis=1)[['unique_id', 'ds', 'y']]

        return df_dates

    def _fillna(self, series):
        if series.isnull().any():
            # self.logger.warning("The prediction contains NAN. Fill with mean of prediction.")
            # use the prediction mean if possible; otherwise use the mean of the training data
            tofill = series.mean()
            if pd.isna(tofill):
                # self.logger.warn('The predictions are all NAN')
                tofill = self._y_mean
            return series.fillna(tofill)
        return series
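A minimal standalone sketch, using only pandas, of the date-gap handling that _ffill_missing_dates_per_serie performs above: for each unique_id, build a complete date_range at the target frequency between the series' first and last timestamp, left-merge the observed values onto it, and forward-fill y. The helper name and the toy data are illustrative assumptions; only the unique_id/ds/y column convention comes from the primitive.

import pandas as pd


def ffill_missing_dates(df: pd.DataFrame, freq: str = "D") -> pd.DataFrame:
    """Forward fill gaps in 'ds' per 'unique_id', never extending before a series' first date."""
    filled = []
    for uid, serie in df.groupby("unique_id"):
        full_index = pd.date_range(serie["ds"].min(), serie["ds"].max(), freq=freq)
        balanced = pd.DataFrame({"unique_id": uid, "ds": full_index})
        balanced = balanced.merge(serie, how="left", on=["unique_id", "ds"])
        balanced["y"] = balanced["y"].ffill()
        filled.append(balanced)
    return pd.concat(filled, ignore_index=True)


# Toy series with a missing day (2020-01-03); after filling, that day repeats y=2.0.
toy = pd.DataFrame({
    "unique_id": ["s1", "s1", "s1"],
    "ds": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-04"]),
    "y": [1.0, 2.0, 4.0],
})
print(ffill_missing_dates(toy, freq="D"))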
Esempio n. 21
0
class RuleBasedFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs,
                                                           Hyperparams]):
    """
    Filter the selected columns according to the rule.

    Parameters
    ---------- 
    rule: String
        The rule to follow when performing the filter. Write it like how we write 'if' in python. And wrap column index with two '#': #col_num#.
        e.g. "#1# > 10" means that the numbers in column 1 must be greater than 10.
        The indicies of columns should be same with those in 'use_columns'. 

    use_columns: Set
        A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.
        The indicies of columns should be same with those in 'rule'.
    
    exclude_columns: Set
        A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
    
    return_result: Enumeration
        Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned?

    add_index_columns: Bool
        Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
    
    error_on_no_input: Bool(
        Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.
    
    return_semantic_type: Enumeration[str](
        Decides what semantic type to attach to generated attributes'
    """

    __author__: "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "name":
        "Rule-Based Filtering",
        "python_path":
        "d3m.primitives.tods.reinforcement.rule_filter",
        "source": {
            'name': 'DATA Lab at Texas A&M University',
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
            ]
        },
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.RULE_BASED_FILTER,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.REINFORCEMENT,
        "id":
        "42744c37-8879-4785-9f18-6de9d612ea93",
        "hyperparams_to_tune": [
            'rule',
        ],
        "version":
        "0.0.1",
    })

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after rule-based filtering.
        """
        # Get cols to fit.

        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        operated_col = [
            int(x.strip('#'))
            for x in re.findall(r'#\d*#', self.hyperparams['rule'])
        ]
        if set(operated_col) != set(self._training_indices):
            # print(operated_col, self._training_indices)
            raise RuntimeError(
                "Column indices in 'rule' and 'use_columns' do not match.")

        if len(self._training_indices) > 0:
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")

        # if not self._fitted:
        #     raise PrimitiveNotFittedError("Primitive not fitted.")
        # sk_inputs = inputs
        # if self.hyperparams['use_semantic_types']:
        #     sk_inputs = inputs.iloc[:, self._training_indices]

        output_columns = []

        if len(self._training_indices) > 0:
            sk_output = self._rule_based_filter(inputs,
                                                self.hyperparams['rule'])
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)

            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]

        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        outputs = base_utils.combine_columns(
            return_result=self.hyperparams['return_result'],
            add_index_columns=self.hyperparams['add_index_columns'],
            inputs=inputs,
            column_indices=self._training_indices,
            columns_list=output_columns)

        # self._write(outputs)
        # self.logger.warning('produce was called3')
        return CallResult(outputs)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            list
        """
        # if not hyperparams['use_semantic_types']:
        #     return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index,
                                           hyperparams)

        use_columns = []
        exclude_columns = []

        # if hyperparams['columns_using_method'] == 'name':
        #     inputs_cols = inputs.columns.values.tolist()
        #     for i in range(len(inputs_cols)):
        #         if inputs_cols[i] in hyperparams['use_columns_name']:
        #             use_columns.append(i)
        #         elif inputs_cols[i] in hyperparams['exclude_columns_name']:
        #             exclude_columns.append(i)
        # else:
        use_columns = hyperparams['use_columns']
        exclude_columns = hyperparams['exclude_columns']

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=use_columns,
            exclude_columns=exclude_columns,
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _update_predictions_metadata(
        cls, inputs_metadata: metadata_base.DataMetadata,
        outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(
                target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(
                column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(
            outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            # column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            # column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs: Inputs):
        inputs.to_csv(str(time.time()) + '.csv')

    def _rule_based_filter(self, X, rule):
        """
        Filter the selected columns according to the rule.
        Args:
            X: DataFrame whose rows are evaluated against the rule
            rule: The rule to follow when performing the filter

        Returns:
            Dataframe, results of Rule-Based Filter
        """
        list_result = [0] * X.shape[0]

        # Rewrite '#N#' placeholders to 'row[N]' so the rule can be evaluated per row,
        # e.g. '#1# > 10' becomes 'row[1] > 10'.
        rule = re.sub(r'#\d*#', lambda x: 'row[' + x.group(0).strip('#') + ']',
                      rule)

        for pos, (_, row) in enumerate(X.iterrows()):
            if not eval(rule):
                list_result[pos] = 1

        return utils.pandas.DataFrame({'result': list_result})
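For illustration, a self-contained sketch of the rule mechanism documented above: '#col_num#' placeholders are rewritten to row[...] expressions and evaluated once per row, and rows that violate the rule are flagged with 1. The helper name and toy frame are hypothetical; eval is used only because the primitive itself relies on it, and integer column labels stand in for the positional indices the rule syntax refers to.

import re

import pandas as pd


def rule_based_flags(X: pd.DataFrame, rule: str) -> pd.DataFrame:
    """Return a one-column frame: 0 where the rule holds for a row, 1 where it is violated."""
    # '#1# > 10'  ->  'row[1] > 10'
    expr = re.sub(r'#\d*#', lambda m: 'row[' + m.group(0).strip('#') + ']', rule)
    flags = []
    for _, row in X.iterrows():
        flags.append(0 if eval(expr) else 1)
    return pd.DataFrame({'result': flags})


# Columns are labelled by their positional index, matching the '#index#' rule syntax.
toy = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [5, 20, 15]})
print(rule_based_flags(toy, '#1# > 10'))  # only the first row (value 5) is flagged with 1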
Esempio n. 22
0
class BKFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Filter a time series using the Baxter-King bandpass filter.

    Parameters
    ----------
    low: int
        Minimum period for oscillations, ie., Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data.
    
    high: int
        Maximum period for oscillations BK suggest that the U.S. business cycle has 32 for quarterly data and 8 for annual data.

    K: int
        Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.  

    use_columns: Set
        A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.
    
    exclude_columns: Set
        A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
    
    return_result: Enumeration
        Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
    
    use_semantic_types: Bool
        Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.
    
    add_index_columns: Bool
        Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
    
    error_on_no_input: Bool(
        Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.
    
    return_semantic_type: Enumeration[str](
        Decides what semantic type to attach to generated attributes'
    """

    __author__: "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({ 
         "name": "Baxter-King Filter Primitive",
         "python_path": "d3m.primitives.tods.feature_analysis.bk_filter",
         "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:[email protected]', 
         'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']},
         "algorithm_types": [metadata_base.PrimitiveAlgorithmType.BK_FILTER,],
         "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
         "id": "b2bfadc5-dbca-482c-b188-8585e5f245c4",
         "hyperparams_to_tune": ['low', 'high', 'K'],
         "version": "0.0.1",
    })


    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Process the testing data.
        Args:
            inputs: Container DataFrame.

        Returns:
            Container DataFrame after BKFilter.
        """
        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns


        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")



        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._bkfilter(sk_inputs, low=self.hyperparams['low'], high=self.hyperparams['high'], K=self.hyperparams['K'])
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)

            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]           
            
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                               add_index_columns=self.hyperparams['add_index_columns'],
                                               inputs=inputs, column_indices=self._training_indices,
                                               columns_list=output_columns)

        # self._write(outputs)
        # self.logger.warning('produce was called3')
        return CallResult(outputs)
        
    
    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            list
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        use_columns = []
        exclude_columns = []

        # if hyperparams['columns_using_method'] == 'name':
        #     inputs_cols = inputs.columns.values.tolist()
        #     for i in range(len(inputs_cols)):
        #         if inputs_cols[i] in hyperparams['use_columns_name']:
        #             use_columns.append(i)
        #         elif inputs_cols[i] in hyperparams['exclude_columns_name']:
        #             exclude_columns.append(i)      
        # else: 
        use_columns = hyperparams['use_columns']
        exclude_columns = hyperparams['exclude_columns']

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        
        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
    
    
    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs


    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs:Inputs):
        inputs.to_csv(str(time.time())+'.csv')

    def _bkfilter(self, X, low, high, K):
        """
        Perform BKFilter
        Args:
            X: DataFrame of selected columns to transform
            K, low, high: Parameters of BKFilter

        Returns:
            Dataframe, results of BKFilter
        """
        transformed_X = utils.pandas.DataFrame()
        for col in X.columns:
            cycle = sm.tsa.filters.bkfilter(X[col], low=low, high=high, K=K)
            cycle_df = utils.pandas.DataFrame(cycle)
            transformed_X = utils.pandas.concat([transformed_X,cycle_df], axis=1)

        return transformed_X
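For reference, a minimal standalone use of the statsmodels Baxter-King filter that _bkfilter wraps above, with the quarterly-data values mentioned in the docstring (low=6, high=32, K=12). The random-walk series is just a stand-in for a real quarterly indicator.

import numpy as np
import statsmodels.api as sm

# 80 quarters of a noisy toy series.
rng = np.random.default_rng(0)
x = rng.normal(size=80).cumsum()

# Band-pass filter: keep oscillations with periods between 6 and 32 quarters,
# with a lead-lag truncation of K=12, which drops 12 observations at each end.
cycle = sm.tsa.filters.bkfilter(x, low=6, high=32, K=12)
print(len(cycle))  # 56 = 80 - 2 * 12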
Esempio n. 23
0
class LinearRegressionPrimitive(
        ProbabilisticCompositionalityMixin[Inputs, Outputs, Params,
                                           Hyperparams],
        GradientCompositionalityMixin[Inputs, Outputs, Params, Hyperparams],
        SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams],
        SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    -------------
    Inputs:  DataFrame of features of shape: NxM, where N = samples and M = features.
    Outputs: DataFrame containing the target column of shape Nx1 or denormalized dataset.
    -------------
    """
    # Metadata
    __author__ = 'UBC DARPA D3M Team, Tony Joseph <*****@*****.**>'
    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "7288e169-5c2b-434a-96f8-cb2144e7f9cc",
        "version":
        config.VERSION,
        "name":
        "Bayesian Linear Regression",
        "description":
        "A bayesian linear regression",
        "python_path":
        "d3m.primitives.regression.linear_regression.UBC",
        "primitive_family":
        metadata_base.PrimitiveFamily.REGRESSION,
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        "source": {
            "name": config.D3M_PERFORMER_TEAM,
            "contact": config.D3M_CONTACT,
            "uris": [config.REPOSITORY],
        },
        "keywords": ['bayesian', 'regression'],
        "installation": [config.INSTALLATION],
        "hyperparams_to_tune": ['learning_rate', 'minibatch_size']
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 _verbose: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self.hyperparams = hyperparams
        self._random_state = random_seed
        self._verbose = _verbose
        self._training_inputs: Inputs = None
        self._training_outputs: Outputs = None
        self.label_name_columns = None

        self._batch_size = hyperparams['minibatch_size']
        self._use_gradient_fit = hyperparams['use_gradient_fit']
        self._num_iterations = hyperparams['num_iterations']
        self._learning_rate = hyperparams['learning_rate']
        self._analytic_fit_threshold = hyperparams['analytic_fit_threshold']
        self._weights_prior = hyperparams['weights_prior']
        self._tune_prior_end_to_end = hyperparams['tune_prior_end_to_end']

        self._fit_term_temperature = 0.0
        self._weights = None  # type: torch.autograd.Variable
        self._noise_variance = None
        self._weights_variance = None
        self._iterations_done = None  # type: int
        self._has_finished = False
        self._new_training_data = True
        self._inputs = None
        self._outputs = None
        self._use_analytic_form = False

        # Is the model fit on data
        self._fitted = False

    def _curate_data(self, training_inputs, training_outputs, get_labels):
        # if self._training_inputs is None or self._training_outputs is None:
        if training_inputs is None:
            raise ValueError("Missing data.")

        # Get training data and labels data
        try:
            feature_columns_1 = training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Attribute')
        except Exception:
            feature_columns_1 = []
        try:
            feature_columns_2 = training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/FileName')
        except Exception:
            feature_columns_2 = []
        # Remove columns if outputs present in inputs
        if len(feature_columns_2) >= 1:
            for fc_2 in feature_columns_2:
                try:
                    feature_columns_1.remove(fc_2)
                except ValueError:
                    pass

        # Get labels data if present in training input
        try:
            label_columns = training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        except Exception:
            label_columns = training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        # If no error but no label columns found, fall back to SuggestedTarget
        if label_columns is None or len(label_columns) == 0:
            label_columns = training_inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        # Remove columns if outputs present in inputs
        if len(label_columns) >= 1:
            for lbl_c in label_columns:
                try:
                    feature_columns_1.remove(lbl_c)
                except ValueError:
                    pass

        # Training Set
        feature_columns_1 = [int(fc) for fc in feature_columns_1]
        try:
            new_XTrain = ((
                training_inputs.iloc[:, feature_columns_1]).to_numpy()).astype(
                    float)
        except ValueError:
            # Most likely Numpy ndarray series
            XTrain = training_inputs.iloc[:, feature_columns_1]
            XTrain_shape = XTrain.shape[0]
            XTrain = ((XTrain.iloc[:, -1]).to_numpy())
            # Unpack
            new_XTrain = []
            for arr in range(XTrain_shape):
                new_XTrain.append(XTrain[arr])

            new_XTrain = np.array(new_XTrain)

            # del to save memory
            del XTrain

        # Training labels
        if get_labels:
            if training_outputs is None:
                raise ValueError("Missing data.")

            # Get labelled dataset
            try:
                label_columns = training_outputs.metadata.get_columns_with_semantic_type(
                    'https://metadata.datadrivendiscovery.org/types/TrueTarget'
                )
            except ValueError:
                label_columns = training_outputs.metadata.get_columns_with_semantic_type(
                    'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
                )
            # If no error but no label columns found, fall back to SuggestedTarget
            if label_columns is None or len(label_columns) == 0:
                label_columns = training_outputs.metadata.get_columns_with_semantic_type(
                    'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
                )
            YTrain = ((
                training_outputs.iloc[:, label_columns]).to_numpy()).astype(
                    float)

            # Get label column names
            label_name_columns = []
            label_name_columns_ = list(training_outputs.columns)
            for lbl_c in label_columns:
                label_name_columns.append(label_name_columns_[lbl_c])
            self.label_name_columns = label_name_columns

            return new_XTrain, YTrain, feature_columns_1

        return new_XTrain, feature_columns_1

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        inputs, outputs, _ = self._curate_data(training_inputs=inputs,
                                               training_outputs=outputs,
                                               get_labels=True)
        N, P = inputs.shape
        if self._use_gradient_fit:
            self._use_analytic_form = False
        elif P < N and N / P < self._analytic_fit_threshold:
            self._use_analytic_form = True

        inputs_with_ones = np.insert(inputs, P, 1, axis=1)

        self._training_inputs = to_variable(inputs_with_ones,
                                            requires_grad=True)
        self._training_outputs = to_variable(outputs, requires_grad=True)
        self._new_training_data = True
        self._has_finished = False
        self._iterations_done = 0
        self._converged_count = 0
        self._best_rmse = np.inf

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        inputs: (num_inputs,  D) numpy array
        outputs : numpy array of dimension (num_inputs)
        """
        # Curate data
        XTest, feature_columns = self._curate_data(training_inputs=inputs,
                                                   training_outputs=None,
                                                   get_labels=False)

        XTest = self._offset_input(inputs=XTest)

        self._weights = refresh_node(self._weights)
        self._noise_variance = refresh_node(self._noise_variance)
        self._weights_variance = refresh_node(self._weights_variance)

        self._inputs = to_variable(XTest, requires_grad=True)
        mu = torch.mm(self._inputs,
                      self._weights.unsqueeze(0).transpose(0, 1)).squeeze()

        reparameterized_normal = torch.distributions.normal.Normal(
            mu, self._noise_variance.expand(len(mu)))
        self._outputs = reparameterized_normal.rsample()
        # rsample() keeps the graph to mu and noise_variance, so _outputs already requires grad
        predictions = self._outputs.data.numpy()

        # Delete columns with path names of nested media files
        outputs = inputs.remove_columns(feature_columns)

        # Convert from ndarray to DataFrame
        predictions = container.DataFrame(predictions, generate_metadata=True)

        # Update Metadata for each feature vector column
        for col in range(predictions.shape[1]):
            col_dict = dict(
                predictions.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = self.label_name_columns[col]
            col_dict["semantic_types"] = (
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            )
            predictions.metadata = predictions.metadata.update(
                (metadata_base.ALL_ELEMENTS, col), col_dict)
        # Rename Columns to match label columns
        predictions.columns = self.label_name_columns

        # Append to outputs
        outputs = outputs.append_columns(predictions)

        return base.CallResult(outputs)

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult:
        """
        Runs gradient descent for ``timeout`` seconds or ``iterations``
        iterations, whichever comes sooner, on log normal_density(self.weights * self.input
        - output, identity*self.noise_variance) +
        parameter_prior_primitives["weights"].score(self.weights) +
        parameter_prior_primitives["noise_variance"].score(noise_variance).
        """
        if self._fitted:
            return base.CallResult(None)

        iterations = self._num_iterations
        if self._new_training_data:
            self._weights = torch.FloatTensor(
                np.random.randn(self._training_inputs.size()[1]) * 0.001)
            self._noise_variance = torch.ones(1)
            # this should be a matrix
            self._weights_variance = torch.ones(1)
            self._new_training_data = False
        elif self._has_finished:
            return CallResult(None,
                              has_finished=self._has_finished,
                              iterations_done=self._iterations_done)

        if self._use_analytic_form:
            self._analytic_fit(iterations=iterations)
        else:
            self._gradient_fit(timeout=timeout,
                               iterations=iterations,
                               batch_size=self._batch_size)

        self._fitted = True

        return CallResult(None)

    def _analytic_fit(self, *, iterations):
        train_x = self._training_inputs.data.numpy()
        train_y = self._training_outputs.data.numpy()

        cov_dim = self._training_inputs.shape[1]
        inv_covar = np.zeros([cov_dim, cov_dim])

        if self._weights_prior is not None:
            # just the prior on weights minus offset
            inv_covar[:cov_dim - 1, :cov_dim - 1] = np.linalg.inv(
                self._weights_prior.get_params()['covariance'])

        # this expression is (X^T*X + Lambda*I)^-1*X^T*Y
        # i.e. it is the solution to the problem argmin_w(E_D(w) + E_w(w))
        # where the first E_D(w) is the ML objective (least squares mvn)
        # and the second term E_w(w) is the regularizer, in this case Lambda/2*w^T*w
        w_sigma = np.dot(
            np.transpose(train_x),
            train_x) + inv_covar * float(self._noise_variance.data.numpy()[0])
        w_mu = np.dot(np.dot(np.linalg.inv(w_sigma), np.transpose(train_x)),
                      train_y)

        self._weights = torch.FloatTensor(w_mu.flatten())
        self._weights_variance = torch.FloatTensor(w_sigma)

        self._iterations_done = 1
        self._has_finished = True

    def _gradient_fit(self,
                      *,
                      timeout: float = None,
                      iterations: int = 100,
                      fit_threshold: float = 0,
                      batch_size: int) -> None:
        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        if timeout is None:
            timeout = np.inf

        if batch_size is None:
            batch_size = 1
        x_batches = []
        y_batches = []
        # optionally do sampling with replacement
        for i in range(0, len(self._training_inputs), batch_size):
            x_batches.append(self._training_inputs[i:i + batch_size])
            y_batches.append(self._training_outputs[i:i + batch_size])
        num_batches = len(x_batches)

        start = time.time()
        #  self._weights_variance = torch.Tensor()

        iter_count = 0
        has_converged = False
        while iter_count < iterations and has_converged == False:
            iter_count += 1
            batch_no = iter_count % num_batches

            grads = [
                self._gradient_params_log_likelihood(input=training_input,
                                                     output=training_output)
                for training_input, training_output in zip(
                    x_batches[batch_no], y_batches[batch_no])
            ]
            weights_grad = sum(grad[0] for grad in grads) * num_batches
            noise_grad = sum(grad[1] for grad in grads) * num_batches
            if self._weights_prior is not None:
                # TODO scale this by bz over total data
                weights_grad += torch.from_numpy(
                    self._weights_prior.gradient_output(outputs=np.array(
                        [self._weights.data.numpy()]),
                                                        inputs=[]))
            self._weights.data += self._learning_rate * weights_grad * 1 / torch.norm(
                weights_grad)
            self._noise_variance.data += self._learning_rate * noise_grad * 1 / torch.norm(
                noise_grad)

            train_outputs = torch.mm(
                self._training_inputs,
                self._weights.unsqueeze(0).transpose(0, 1)).squeeze()
            train_y = self._training_outputs.data.numpy().flatten()
            rmse = np.sqrt(
                mean_squared_error(train_outputs.data.numpy(), train_y))

            if rmse < self._best_rmse:
                self._converged_count = 0
                self._best_rmse = rmse
            else:
                self._converged_count += 1
            if self._converged_count > 1000:
                self._has_finished = True
                break

        self._iterations_done += iter_count

    def log_likelihoods(self,
                        *,
                        outputs: Outputs,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> CallResult[ndarray]:
        """
        input : D-length numpy ndarray
        output : float
        Calculates
        log(normal_density(self.weights * self.input - output, identity * self.noise_variance))
        for a single input/output pair.
        """
        result = np.array([
            self._log_likelihood(output=to_variable(output),
                                 input=to_variable(input)).data.numpy()
            for input, output in zip(inputs, outputs)
        ])
        return CallResult(result)

    def log_likelihood(self,
                       *,
                       outputs: Outputs,
                       inputs: Inputs,
                       timeout: float = None,
                       iterations: int = None) -> CallResult[float]:

        inputs = self._offset_input(inputs=inputs)
        result = self.log_likelihoods(outputs=outputs,
                                      inputs=inputs,
                                      timeout=timeout,
                                      iterations=iterations)

        return CallResult(sum(result.value),
                          has_finished=result.has_finished,
                          iterations_done=result.iterations_done)

    def _log_likelihood(
            self, output: torch.autograd.Variable,
            input: torch.autograd.Variable) -> torch.autograd.Variable:
        """
        All inputs are torch tensors (or variables if grad desired).
        input : D-length torch to_variable
        output : float
        """
        expected_output = torch.dot(self._weights, input).unsqueeze(0)
        covariance = to_variable(self._noise_variance).view(1, 1)

        return log_mvn_likelihood(expected_output, covariance, output)

    def _gradient_params_log_likelihood(
        self, *, output: torch.autograd.Variable,
        input: torch.autograd.Variable
    ) -> Tuple[torch.autograd.Variable, torch.autograd.Variable,
               torch.autograd.Variable]:
        """
        Output is ( D-length torch variable, 1-length torch variable )
        """

        self._weights = refresh_node(self._weights)
        self._noise_variance = refresh_node(self._noise_variance)
        log_likelihood = self._log_likelihood(output=output, input=input)
        log_likelihood.backward()
        return (self._weights.grad.data, self._noise_variance.grad.data)

    def _gradient_output_log_likelihood(
            self, *, output: ndarray,
            input: torch.autograd.Variable) -> torch.autograd.Variable:
        """
        output is D-length torch variable
        """

        output_var = to_variable(output)
        log_likelihood = self._log_likelihood(output=output_var, input=input)
        log_likelihood.backward()
        return output_var.grad

    def gradient_output(self, *, outputs: Outputs,
                        inputs: Inputs) -> Gradients[Outputs]:  # type: ignore
        """
        Calculates grad_output log normal_density(self.weights * self.input - output, identity * self.noise_variance)
        for a single input/output pair.
        """
        inputs = self._offset_input(inputs=inputs)

        outputs_vars = [
            to_variable(output, requires_grad=True) for output in outputs
        ]
        inputs_vars = [to_variable(input) for input in inputs]
        grad = sum(
            self._gradient_output_log_likelihood(output=output, input=input)
            for (input, output) in zip(inputs_vars, outputs_vars))

        return grad.data.numpy()

    def gradient_params(self, *, outputs: Outputs,
                        inputs: Inputs) -> Gradients[Params]:  # type: ignore
        """
        Calculates grad_weights fit_term_temperature *
        log normal_density(self.weights * self.input - output, identity * self.noise_variance)
        for a single input/output pair.
        """
        outputs_vars = [
            to_variable(output, requires_grad=True) for output in outputs
        ]
        inputs_vars = [to_variable(input) for input in inputs]

        grads = [
            self._gradient_params_log_likelihood(output=output, input=input)
            for (input, output) in zip(inputs_vars, outputs_vars)
        ]
        grad_weights = sum(grad[0] for grad in grads)
        grad_noise_variance = sum(grad[1] for grad in grads)

        # the offset gradient is the last element of the augmented weight gradient
        return Params(weights=ndarray(grad_weights[:-1]),
                      offset=float(grad_weights[-1]),
                      noise_variance=float(grad_noise_variance))

    def _sample_once(self, *, inputs: Inputs) -> Outputs:
        """
        input : NxD numpy ndarray
        outputs : N-length numpy ndarray
        """
        if self._weights is None or self._noise_variance is None or self._weights_variance is None:
            raise ValueError("Params not set.")

        inputs = self._offset_input(inputs=inputs)

        weights = np.random.multivariate_normal(
            self._weights.detach().numpy(),
            self._weights_variance.detach().numpy())

        # sample outputs
        output_means = [np.dot(weights, input) for input in inputs]
        outputs = np.random.normal(output_means, self._noise_variance.data[0])

        return outputs

    def sample(self,
               *,
               inputs: Inputs,
               num_samples: int = 1,
               timeout: float = None,
               iterations: int = None) -> Sequence[Outputs]:
        """
        input : num_inputs x D numpy ndarray
        outputs : num_predictions x num_inputs numpy ndarray
        """

        return [self._sample_once(inputs=inputs) for _ in range(num_samples)]

    def backward(
        self,
        *,
        gradient_outputs: Gradients[Outputs],
        fine_tune: bool = False,
        fine_tune_learning_rate: float = 0.00001
    ) -> Tuple[Gradients[Inputs], Gradients[Params]]:  # type: ignore
        if self._inputs is None:
            raise Exception(
                'Cannot call backpropagation before forward propagation. Call "produce" before "backprop".'
            )
        else:
            if self._inputs.grad is not None:
                self._inputs.grad.data.zero_()

            self._outputs.backward(gradient=torch.Tensor(gradient_outputs))

            # this is the gradients given by end to end loss
            weights_grad = self._weights.grad.data
            noise_grad = self._noise_variance.grad.data

            if fine_tune:
                # this is gradients given by the annealed local loss
                if self._fit_term_temperature != 0:
                    # TODO use minibatches here
                    training_grads = [
                        self._gradient_params_log_likelihood(output=output,
                                                             input=input)
                        for (input, output) in zip(self._training_inputs,
                                                   self._training_outputs)
                    ]
                    weights_grad += self._fit_term_temperature * \
                        sum(grad[0] for grad in training_grads)
                    noise_grad += self._fit_term_temperature * \
                        sum(grad[1] for grad in training_grads)

                # make local update with temperature if required
                # TODO add the score from the prior primitive here
                self._weights.data += weights_grad * 1 / torch.norm(
                    weights_grad)
                self._noise_variance.data += noise_grad * 1 / torch.norm(
                    noise_grad)

                self._weights = refresh_node(self._weights)
                self._noise_variance = refresh_node(self._noise_variance)

            grad_inputs = self._inputs.grad
            grad_params = Params(weights=ndarray(weights_grad[:-1]),
                                 offset=float(weights_grad[-1]),
                                 noise_variance=float(noise_grad[0]),
                                 weights_variance=ndarray(
                                     np.zeros(self._weights_variance.shape)))

            if self._tune_prior_end_to_end:
                # update the prior's parameters here if specified
                self._weights_prior.backward(
                    gradient_outputs=grad_params['weights'], fine_tune=True)

            return grad_inputs, grad_params

    def set_fit_term_temperature(self, *, temperature: float = 0) -> None:
        self._fit_term_temperature = temperature

    def _offset_input(self, *, inputs: Inputs) -> Inputs:
        if inputs.shape[1] == self._weights.shape[0]:
            return inputs
        else:
            return np.insert(inputs, inputs.shape[1], 1, axis=1)

    def get_call_metadata(self) -> CallResult:
        return CallResult(None,
                          has_finished=self._has_finished,
                          iterations_done=self._iterations_done)

    def get_params(self) -> Params:
        return Params(
            weights=ndarray(self._weights[:-1].data.numpy()),
            offset=float(self._weights[-1].data.numpy()),
            noise_variance=float(self._noise_variance.data.numpy()[0]),
            weights_variance=ndarray(self._weights_variance.data.numpy()),
            target_names_=self.label_name_columns)

    def set_params(self, *, params: Params) -> None:
        full_weights = np.append(params['weights'], params['offset'])
        self._weights = to_variable(full_weights, requires_grad=True)
        self._weights.retain_grad()
        self._weights_variance = to_variable(params['weights_variance'],
                                             requires_grad=True)
        self._noise_variance = to_variable(params['noise_variance'],
                                           requires_grad=True)
        self.label_name_columns = params['target_names_']
        self._fitted = True

    def __getstate__(self) -> dict:
        state = super().__getstate__()

        state['random_state'] = self._random_state

        return state

    def __setstate__(self, state: dict) -> None:
        super().__setstate__(state)

        self._random_state = state['random_state']
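To make the closed form used in _analytic_fit concrete, here is a small numpy-only sketch of the same MAP/ridge solution, w = (X^T X + sigma^2 * Lambda)^-1 X^T y, on synthetic data. Every name and value below is an illustrative assumption; only the structure (weights augmented with an offset column, a prior inverse covariance that leaves the offset unregularised) mirrors the primitive.

import numpy as np

rng = np.random.default_rng(0)

# Toy data: y = 2*x0 - x1 + 3 + noise, with a column of ones appended as the offset term,
# mirroring how the primitive augments its inputs before fitting.
N, D = 200, 2
X = rng.normal(size=(N, D))
X_aug = np.hstack([X, np.ones((N, 1))])
true_w = np.array([2.0, -1.0, 3.0])
y = X_aug @ true_w + 0.1 * rng.normal(size=N)

# MAP / ridge closed form, as in _analytic_fit:
#   w = (X^T X + sigma^2 * Lambda)^-1 X^T y
# where Lambda is the prior's inverse covariance on the weights (offset left unregularised).
noise_variance = 0.01
inv_covar = np.zeros((D + 1, D + 1))
inv_covar[:D, :D] = np.eye(D)  # stand-in for the weights prior's inverse covariance

w_sigma = X_aug.T @ X_aug + noise_variance * inv_covar
w_map = np.linalg.solve(w_sigma, X_aug.T @ y)
print(w_map)  # close to [2, -1, 3]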
class ObjectDetectionRNPrimitive(PrimitiveBase[Inputs, Outputs, Params,
                                               Hyperparams]):
    """
    Primitive that utilizes RetinaNet, a convolutional neural network (CNN), for object
    detection. The methodology comes from "Focal Loss for Dense Object Detection" by
    Lin et al. 2017 (https://arxiv.org/abs/1708.02002). The code implementation is based
    on the library found at: https://github.com/fizyr/keras-retinanet.

    The primitive accepts a Dataset consisting of images, labels as input and returns
    a dataframe as output which include the bounding boxes for each object in each image.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id':
        'd921be1e-b158-4ab7-abb3-cb1b17f42639',
        'version':
        __version__,
        'name':
        'retina_net',
        'python_path':
        'd3m.primitives.object_detection.retina_net.ObjectDetectionRN',
        'keywords': [
            'object detection', 'convolutional neural network',
            'digital image processing', 'RetinaNet'
        ],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                'https://github.com/kungfuai/d3m-primitives',
            ],
        },
        "installation": [{
            "type": "PIP",
            "package": "cython",
            "version": "0.29.16"
        }, {
            "type":
            metadata_base.PrimitiveInstallationType.PIP,
            "package_uri":
            "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }, {
            'type':
            "FILE",
            'key':
            "resnet50",
            'file_uri':
            "http://public.datadrivendiscovery.org/ResNet-50-model.keras.h5",
            'file_digest':
            "0128cdfa3963288110422e4c1a57afe76aa0d760eb706cda4353ef1432c31b9c"
        }],
        'algorithm_types': [metadata_base.PrimitiveAlgorithmType.RETINANET],
        'primitive_family':
        metadata_base.PrimitiveFamily.OBJECT_DETECTION,
        'can_use_gpus':
        True
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 volumes: typing.Dict[str, str] = None,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams,
                         volumes=volumes,
                         random_seed=random_seed)
        self.image_paths = None
        self.annotations = None
        self.base_dir = None
        self.classes = None
        self.backbone = None
        self.y_true = None

    def get_params(self) -> Params:
        return Params(base_dir=self.base_dir,
                      image_paths=self.image_paths,
                      annotations=self.annotations,
                      classes=self.classes)

    def set_params(self, *, params: Params) -> None:
        self.base_dir = params['base_dir']
        self.image_paths = params['image_paths']
        self.annotations = params['annotations']
        self.classes = params['classes']

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Sets the primitive's training data and preprocesses the files for RetinaNet format.

        Parameters
        ----------
            inputs: numpy ndarray of size (n_images, dimension) containing the d3m Index, image name,
                    and bounding box for each image.

        Returns
        -------
            No returns. Function is called by pipeline at runtime.
        """

        # Prepare annotation file
        ## Generate image paths
        image_cols = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        self.base_dir = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in image_cols
        ]
        self.image_paths = np.array([[
            os.path.join(self.base_dir, filename)
            for filename in inputs.iloc[:, col]
        ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten()
        self.image_paths = pd.Series(self.image_paths)

        ## Arrange proper bounding coordinates
        bounding_coords = inputs.bounding_box.str.split(',', expand=True)
        bounding_coords = bounding_coords.drop(
            bounding_coords.columns[[2, 5, 6, 7]], axis=1)
        bounding_coords.columns = ['x1', 'y1', 'y2', 'x2']
        bounding_coords = bounding_coords[['x1', 'y1', 'x2', 'y2']]

        ## Generate class names
        class_name = pd.Series(['class'] * inputs.shape[0])

        ## Assemble annotation file
        self.annotations = pd.concat(
            [self.image_paths, bounding_coords, class_name], axis=1)
        self.annotations.columns = [
            'img_file', 'x1', 'y1', 'x2', 'y2', 'class_name'
        ]

        # Prepare ID file
        self.classes = pd.DataFrame({'class_name': ['class'], 'class_id': [0]})

    def _create_callbacks(self, model, training_model, prediction_model):
        """
        Creates the callbacks to use during training.

        Parameters
        ----------
            model                : The base model.
            training_model       : The model that is used for training.
            prediction_model     : The model that should be used for validation.
            validation_generator : The generator for creating validation data.

        Returns
        -------
            callbacks            : A list of callbacks used for training.
        """
        callbacks = []

        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(monitor='loss',
                                              factor=0.1,
                                              patience=2,
                                              verbose=1,
                                              mode='auto',
                                              min_delta=0.0001,
                                              cooldown=0,
                                              min_lr=0))

        return callbacks

    def _create_models(self,
                       backbone_retinanet,
                       num_classes,
                       weights,
                       freeze_backbone=False,
                       lr=1e-5):
        """
        Creates three models (model, training_model, prediction_model).

        Parameters
        ----------
            backbone_retinanet : A function to call to create a retinanet model with a given backbone.
            num_classes        : The number of classes to train.
            weights            : The weights to load into the model.
            multi_gpu          : The number of GPUs to use for training.
            freeze_backbone    : If True, disables learning for the backbone.
            config             : Config parameters, None indicates the default configuration.

        Returns
        -------
            model              : The base model.
            training_model     : The training model. If multi_gpu=0, this is identical to model.
            prediction_model   : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS).
        """

        modifier = freeze_model if freeze_backbone else None
        anchor_params = None
        num_anchors = None

        model = self._model_with_weights(backbone_retinanet(
            num_classes, num_anchors=num_anchors, modifier=modifier),
                                         weights=weights,
                                         skip_mismatch=True)
        training_model = model
        prediction_model = retinanet_bbox(model=model,
                                          anchor_params=anchor_params)
        training_model.compile(loss={
            'regression': losses.smooth_l1(),
            'classification': losses.focal()
        },
                               optimizer=keras.optimizers.adam(lr=lr,
                                                               clipnorm=0.001))

        return model, training_model, prediction_model

    def _num_classes(self):
        """
        Number of classes in the dataset.
        """
        return int(self.classes['class_id'].max()) + 1

    def _model_with_weights(self, model, weights, skip_mismatch):
        """
        Load weights for model.

        Parameters
        ----------
            model         : The model to load weights for.
            weights       : The weights to load.
            skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model.

        Returns
        -------
            model         : Model with loaded weights.
        """

        if weights is not None:
            model.load_weights(weights,
                               by_name=True,
                               skip_mismatch=skip_mismatch)
        return model

    def _create_generator(self, annotations, classes, shuffle_groups):
        """
        Create generator for evaluation.
        """

        validation_generator = CSVGenerator(self.annotations,
                                            self.classes,
                                            self.base_dir,
                                            self.hyperparams['batch_size'],
                                            self.backbone.preprocess_image,
                                            shuffle_groups=False)
        return validation_generator

    def _fill_empty_predictions(self, empty_predictions_image_names,
                                d3mIdx_image_mapping):
        """
        D3M metrics evaluator needs at least one prediction per image. If RetinaNet does not return
        predictions for an image, this method creates a dummy empty prediction row to add to results_df for that
        missing image.

        TODO: DUMMY CONFIDENCE SCORES LOWER AVERAGE PRECISION. FIND A FIX.
        """

        # Prepare D3M index
        empty_predictions_d3mIdx = [
            d3mIdx_image_mapping.get(key)
            for key in empty_predictions_image_names
        ]
        empty_predictions_d3mIdx = [
            item for sublist in empty_predictions_d3mIdx for item in sublist
        ]

        # Prepare dummy columns
        d3mIdx = empty_predictions_d3mIdx
        bounding_box = ["0,0,0,0,0,0,0,0"] * len(empty_predictions_d3mIdx)
        confidence = [float(0)] * len(empty_predictions_d3mIdx)

        empty_predictions_df = pd.DataFrame({
            'd3mIndex': d3mIdx,
            'bounding_box': bounding_box,
            'confidence': confidence
        })

        return empty_predictions_df

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Creates the image generators and then trains RetinaNet model on the image paths in the input
        dataframe column.

        Can choose to use validation generator.

        If no weight file is provided, the default is to use the ImageNet weights.
        """

        # Create object that stores backbone information
        self.backbone = models.backbone(self.hyperparams['backbone'])

        # Create the generators
        train_generator = CSVGenerator(self.annotations,
                                       self.classes,
                                       self.base_dir,
                                       self.hyperparams['batch_size'],
                                       self.backbone.preprocess_image,
                                       shuffle_groups=False)

        # Running the model
        ## Assign weights
        if self.hyperparams['weights'] is False:
            weights = None
        else:
            weights = self.volumes[self.hyperparams['backbone']]

        ## Create model
        logger.info('Creating model...')

        model, training_model, prediction_model = self._create_models(
            backbone_retinanet=self.backbone.retinanet,
            num_classes=train_generator.num_classes(),
            weights=weights,
            freeze_backbone=self.hyperparams['freeze_backbone'],
            lr=self.hyperparams['learning_rate'])

        ### !!! vgg AND densenet BACKBONES CURRENTLY NOT IMPLEMENTED !!!
        ## Let the generator compute the backbone layer shapes using the actual backbone model
        # if 'vgg' in self.hyperparams['backbone'] or 'densenet' in self.hyperparams['backbone']:
        #     train_generator.compute_shapes = make_shapes_callback(model)
        #     if validation_generator:
        #         validation_generator.compute_shapes = train_generator.compute_shapes

        ## Set up callbacks
        callbacks = self._create_callbacks(
            model,
            training_model,
            prediction_model,
        )

        start_time = time.time()
        logger.info('Starting training...')

        training_model.fit_generator(
            generator=train_generator,
            steps_per_epoch=self.hyperparams['n_steps'],
            epochs=self.hyperparams['n_epochs'],
            verbose=1,
            callbacks=callbacks,
        )

        training_model.save_weights(self.hyperparams['weights_path'] +
                                    'model_weights.h5')

        logger.info(
            f'Training complete. Training took {time.time()-start_time} seconds.'
        )
        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce image detection predictions.

        Parameters
        ----------
            inputs  : numpy ndarray of size (n_images, dimension) containing the d3m Index, image name,
                      and bounding box for each image.

        Returns
        -------
            outputs : A d3m dataframe container with the d3m index, image name, bounding boxes as
                      a string (8 coordinate format), and confidence scores.
        """
        iou_threshold = 0.5  # Bounding box overlap threshold for false positive or true positive
        score_threshold = 0.05  # The score confidence threshold to use for detections
        max_detections = 100  # Maximum number of detections to use per image

        # Create object that stores backbone information
        backbone = models.backbone(self.hyperparams['backbone'])

        # Create the generators
        train_generator = CSVGenerator(self.annotations,
                                       self.classes,
                                       self.base_dir,
                                       self.hyperparams['batch_size'],
                                       backbone.preprocess_image,
                                       shuffle_groups=False)

        # Assign weights
        if self.hyperparams['weights'] is False:
            weights = None
        else:
            weights = self.volumes[self.hyperparams['backbone']]

        # Instantiate model
        model, training_model, prediction_model = self._create_models(
            backbone_retinanet=backbone.retinanet,
            num_classes=train_generator.num_classes(),
            weights=weights,
            freeze_backbone=self.hyperparams['freeze_backbone'],
            lr=self.hyperparams['learning_rate'])

        # Load model weights saved in fit
        training_model.load_weights(self.hyperparams['weights_path'] +
                                    'model_weights.h5')

        # Convert training model to inference model
        inference_model = models.convert_model(training_model)

        # Generate image paths
        image_cols = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        self.base_dir = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in image_cols
        ]
        self.image_paths = np.array([[
            os.path.join(self.base_dir, filename)
            for filename in inputs.iloc[:, col]
        ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten()
        self.image_paths = pd.Series(self.image_paths)

        # Initialize output objects
        box_list = []
        score_list = []
        image_name_list = []

        # Predict bounding boxes and confidence scores for each image
        # Deduplicate image paths while preserving order
        image_list = list(dict.fromkeys(self.image_paths.tolist()))

        start_time = time.time()
        logger.info('Starting testing...')

        for i in image_list:
            image = read_image_bgr(i)

            # preprocess image for network
            image = preprocess_image(image)
            image, scale = resize_image(image)

            boxes, scores, labels = inference_model.predict_on_batch(
                tf.constant(np.expand_dims(image, axis=0), dtype=tf.float32))

            # correct for image scale
            boxes /= scale

            for box, score in zip(boxes[0], scores[0]):
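                # Detections are assumed to be returned sorted by descending
                # score, so the first low-confidence detection ends the loop.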
                if score < 0.5:
                    break

                b = box.astype(int)
                box_list.append(b)
                score_list.append(score)
                image_name_list.append(i)

        logger.info(
            f'Testing complete. Testing took {time.time()-start_time} seconds.'
        )

        ## Convert predicted boxes from a list of arrays to a list of strings
        boxes = np.array(box_list).tolist()
        boxes = list(
            map(lambda x: [x[0], x[1], x[0], x[3], x[2], x[3], x[2], x[1]],
                boxes))  # Convert to 8 coordinate format for D3M
        boxes = list(map(lambda x: ",".join(map(str, x)), boxes))

        # Create mapping between image names and D3M index
        input_df = pd.DataFrame({
            'd3mIndex':
            inputs.d3mIndex,
            'image': [os.path.basename(path) for path in self.image_paths]
        })

        d3mIdx_image_mapping = input_df.set_index('image').T.to_dict('list')
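        # d3mIdx_image_mapping maps each image file name to a list holding its
        # d3mIndex, e.g. {'img.png': [0], ...} (example keys illustrative only).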

        # Extract values for image name keys and get missing image predictions (if they exist)
        image_name_list = [os.path.basename(name) for name in image_name_list]
        d3mIdx = [d3mIdx_image_mapping.get(key) for key in image_name_list]
        empty_predictions_image_names = [
            k for k, v in d3mIdx_image_mapping.items() if v not in d3mIdx
        ]
        d3mIdx = [item for sublist in d3mIdx
                  for item in sublist]  # Flatten list of lists

        ## Assemble in a Pandas DataFrame
        results = pd.DataFrame({
            'd3mIndex': d3mIdx,
            'bounding_box': boxes,
            'confidence': score_list
        })

        # D3M metrics evaluator needs at least one prediction per image. If RetinaNet does not return
        # predictions for an image, create a dummy empty prediction row to add to results_df for that
        # missing image.
        if len(empty_predictions_image_names) != 0:
            # Create a data frame of empty predictions for each missing image and concat with results.
            # Sort results_df.
            empty_predictions_df = self._fill_empty_predictions(
                empty_predictions_image_names, d3mIdx_image_mapping)
            results_df = pd.concat([results, empty_predictions_df
                                    ]).sort_values('d3mIndex')
        else:
            results_df = results

        # Convert to DataFrame container
        results_df = d3m_DataFrame(results_df)

        ## Assemble first output column ('d3mIndex')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        ## Assemble second output column ('bounding_box')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'bounding_box'
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
            'https://metadata.datadrivendiscovery.org/types/BoundingPolygon')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        ## Assemble third output column ('confidence')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 2)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'confidence'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Score')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 2), col_dict)

        return CallResult(results_df)
Example n. 25
class EnrichDatesPrimitive(transformer.TransformerPrimitiveBase[Inputs,
                                                                Outputs,
                                                                Hyperparams]):
    """
    Enriches dates by converting them to seconds from a base time and computing Z scores.
    By default the results are appended to the existing dataset and the original column is
    left in place for additional downstream processing; when the `replace` hyperparameter
    is set, the original column is overwritten instead.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id":
            "b1367f5b-bab1-4dfc-a1a9-6a56430e516a",
            "version":
            version.__version__,
            "name":
            "Enrich dates",
            "python_path":
            "d3m.primitives.data_transformation.enrich_dates.DistilEnrichDates",
            "source": {
                "name":
                "Distil",
                "contact":
                "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/enrich_dates.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type":
                    metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri":
                    "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ENCODE_BINARY,
            ],
            "primitive_family":
            metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        }, )

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        logger.debug(f"Running {__name__}")

        outputs = inputs.copy()
        outputs = self._enrich_dates(outputs)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)

    def _enrich_dates(self, inputs: Inputs) -> Outputs:

        # determine columns we need to operate on
        cols = distil_utils.get_operating_columns(
            inputs, self.hyperparams["use_columns"],
            ("http://schema.org/DateTime", ))

        date_num = 0
        for c in cols:
            try:
                # compute z scores for column members
                inputs_seconds = (
                    (pd.to_datetime(inputs.iloc[:, c]) -
                     pd.to_datetime("2000-01-01")).dt.total_seconds().values)
                sec_mean = inputs_seconds.mean()
                sec_std = inputs_seconds.std()
                sec_val = 0.0
                if sec_std != 0.0:
                    sec_val = (inputs_seconds - sec_mean) / sec_std

                if self.hyperparams["replace"]:
                    inputs.metadata = inputs.metadata.add_semantic_type(
                        (metadata_base.ALL_ELEMENTS, c),
                        "http://schema.org/Float")
                    inputs.metadata = inputs.metadata.remove_semantic_type(
                        (metadata_base.ALL_ELEMENTS, c),
                        "http://schema.org/DateTime")
                    inputs.metadata = inputs.metadata.update(
                        (metadata_base.ALL_ELEMENTS, c),
                        {"structural_type": float})
                    inputs[inputs.columns[c]] = sec_val
                else:
                    # append the results and update semantic types
                    result = container.DataFrame(
                        {f"__date_{date_num}": sec_val},
                        generate_metadata=True)
                    result.metadata = result.metadata.add_semantic_type(
                        (metadata_base.ALL_ELEMENTS, 0),
                        "http://schema.org/Float")
                    inputs = inputs.append_columns(result)

                date_num += 1
            except Exception:
                # Skip columns that cannot be converted to datetimes
                continue

        return inputs
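

# The helper below is a minimal, illustrative sketch (not part of the original
# example) of the core transform used by EnrichDatesPrimitive._enrich_dates
# above: datetimes are converted to seconds since a fixed 2000-01-01 base and
# then z-scored. The function name and the sample dates are hypothetical.
def _enrich_dates_sketch():
    import pandas as pd

    dates = pd.Series(["2001-06-01", "2002-01-15", "2003-09-30"])
    seconds = (pd.to_datetime(dates) -
               pd.to_datetime("2000-01-01")).dt.total_seconds().values
    std = seconds.std()
    # Mirror the primitive: fall back to zeros when the column has no variance.
    return (seconds - seconds.mean()) / std if std != 0.0 else seconds * 0.0
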
class HighRankImputer(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
    This primitive imputes a dataset in which the data points are drawn from multiple subspaces, which in practice means the data contain multiple groups/classes. In such cases the data matrices are often of high rank, and Sparse Factorization based Matrix Completion (SFMC) can outperform classical low-rank matrix completion methods.
    The optimization is solved via accelerated proximal alternating minimization (APALM). NaNs in the input matrix are treated as missing entries; the algorithm recovers them and returns the completed matrix as output.
    The method can be used for collaborative filtering (recommender systems) and data preprocessing.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id':
        'e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93',
        'version':
        __version__,
        'name':
        "Matrix Completion via Sparse Factorization",
        'keywords': [
            'Matrix completion',
            'low-rank matrix',
            'high-rank matrix',
            'sparse factorization',
        ],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/cyangcornell/d3m-primitives.git',
            ],
        },
        'installation': [{
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/cyangcornell/d3m-primitives.git@{git_commit}#egg=pyglrm-d3m'
            .format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)))
        }],
        'python_path':
        'd3m.primitives.collaborative_filtering.high_rank_imputer.Cornell',
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.
            LOW_RANK_MATRIX_APPROXIMATIONS,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.COLLABORATIVE_FILTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 docker_containers: Dict[str, DockerContainer] = None,
                 _verbose: int = 0) -> None:

        super().__init__(hyperparams=hyperparams,
                         docker_containers=docker_containers)

        self.d: int = hyperparams['d']
        self.tol: float = hyperparams['tol']
        self.maxiter: int = hyperparams['maxiter']
        self.alpha: float = hyperparams['alpha']
        self.beta: float = hyperparams['beta']
        self._fitted = False
        self._CF = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False
        self._keys = list(outputs)
        self._CF = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return

        Vt = self._training_inputs.values
        if Vt.shape[1] > 3:
            X_incomplete = self._training_inputs.copy()
            X = self._training_inputs.values.copy()
            X = X.T
            m0 = 1
            n0 = 0
        else:
            self._CF = True
            x_rating = self._training_inputs.copy()
            x_rating[self._training_outputs.
                     columns[0]] = self._training_outputs.values
            X_incomplete = x_rating.pivot(index=x_rating.columns[0],
                                          columns=x_rating.columns[1],
                                          values=x_rating.columns[2])
            X = X_incomplete.values.copy()
            m0, n0 = X.shape
            if m0 > n0:
                X = X.T

        tol = self.tol
        maxiter = self.maxiter
        m, n = X.shape
        M = np.ones([m, n])
        M[np.isnan(X)] = 0
        sr = M.sum() / m / n
        X[np.isnan(X)] = 0

        if self.d == 0:
            if sr > 0.5:
                d = int(np.round(0.5 * min(m, n)))
            else:
                d = int(3 * np.round(sr * min(m, n)))
        else:
            d = self.d

        alpha = self.alpha * n / d
        beta = self.beta * np.sqrt(n / d)
        self._beta = beta
        self._d = d
        A = np.random.randn(m, d)
        Z = np.zeros((d, n))
        rho = max(1.5 * np.sqrt(M.mean()), 0.5)
        iter = 0
        cc = 0.5

        while iter < self.maxiter:
            iter = iter + 1
            # Z_new
            if iter == 1:
                Z = Z
            else:
                Z = Z_new + cc * (Z_new - Z_old)

            tau = rho * np.linalg.norm(np.dot(A.T, A), 2)
            G = Z - (-np.dot(A.T, np.multiply(M, X - np.dot(A, Z)))) / tau
            Z_new = np.maximum(0, G - beta / tau) + np.minimum(
                0, G + beta / tau)
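            # The line above is elementwise soft-thresholding, i.e. the proximal
            # operator of the l1 penalty: sign(G) * max(|G| - beta/tau, 0).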

            # A_new
            if iter == 1:
                A = A
            else:
                A = A_new + cc * (A_new - A_old)

            kai = rho * np.linalg.norm(np.dot(Z_new, Z_new.T), 2)
            H = A + np.dot(np.multiply(M, X - np.dot(A, Z_new)), Z_new.T) / kai
            A_new = 1 / (alpha + kai) * H * kai

            # check convergence
            stopC = max(
                np.linalg.norm(Z_new - Z, 'fro') /
                np.linalg.norm(Z_new, 'fro'),
                np.linalg.norm(A_new - A, 'fro') /
                np.linalg.norm(A_new, 'fro'))
            isstopC = stopC < tol

            if isstopC:
                Z = Z_new
                A = A_new
                break
            Z_old = Z
            A_old = A
            Z = Z_new
            A = A_new

        #X_temp=np.multiply(X,M)+np.multiply(np.dot(A,Z),1-M)
        X_temp = np.dot(A, Z)

        if m0 > n0:
            X_temp = X_temp.T

        self._A = A
        #self._X=pd.DataFrame(X_temp,X_incomplete.index,X_incomplete.columns)
        self._X = container.DataFrame(X_temp,
                                      index=X_incomplete.index,
                                      columns=X_incomplete.columns)
        self._fitted = True

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        testData = inputs
        if self._CF:
            Xp = self._X
            y_pred = np.zeros(testData.shape[0]) + Xp.values.mean()
            idr = testData[testData.columns[0]].isin(Xp.index)
            idc = testData[testData.columns[1]].isin(Xp.columns)
            dd = np.where(idr & idc)

            for i in dd[0]:
                tpc = testData.values[i, 1]

                #             id_col=Xp.columns.get_loc(np.str(tpc))
                id_col = Xp.columns.get_loc(tpc)
                tpr = testData.values[i, 0]
                id_row = Xp.index.get_loc(tpr)

                y_pred[i] = Xp.values[id_row, id_col]

            self._index = inputs.index
            outputs = container.DataFrame(y_pred,
                                          index=self._index,
                                          columns=self._keys)
        else:
            X = inputs.values.copy()
            tol = self.tol
            maxiter = self.maxiter
            X = X.T
            m, n = X.shape
            M = np.ones([m, n])
            M[np.isnan(X)] = 0
            sr = M.sum() / m / n
            X[np.isnan(X)] = 0
            beta = self._beta
            d = self._d
            A = self._A
            Z = np.zeros((d, n))
            rho = max(1.5 * np.sqrt(M.mean()), 0.5)
            iter = 0
            cc = 0.5
            while iter < self.maxiter:

                iter = iter + 1

                # Z_new
                if iter == 1:
                    Z = Z
                else:
                    Z = Z_new + cc * (Z_new - Z_old)

                tau = rho * np.linalg.norm(np.dot(A.T, A), 2)
                G = Z - (-np.dot(A.T, np.multiply(M, X - np.dot(A, Z)))) / tau
                Z_new = np.maximum(0, G - beta / tau) + np.minimum(
                    0, G + beta / tau)

                # check convergence
                stopC = np.linalg.norm(Z_new - Z, 'fro') / np.linalg.norm(
                    Z_new, 'fro')
                isstopC = stopC < tol

                if isstopC:
                    Z = Z_new
                    break

                Z_old = Z
                Z = Z_new

            #X_temp=np.multiply(X,M)+np.multiply(np.dot(A,Z),1-M)
            X_temp = np.dot(A, Z)

            X_temp = X_temp.T

            outputs = container.DataFrame(X_temp,
                                          index=testData.index,
                                          columns=testData.columns)

        outputs.metadata = inputs.metadata

        return CallResult(outputs)

    def get_params(self) -> Params:
        return Params(X=self._X)

    def set_params(self, *, params: Params) -> None:
        self._X = params['X']
Example n. 27
class ObjectDetectionRNPrimitive(PrimitiveBase[Inputs, Outputs, Params,
                                               Hyperparams]):
    """
    Primitive that uses RetinaNet, a convolutional neural network (CNN), for object
    detection. The methodology comes from "Focal Loss for Dense Object Detection" by
    Lin et al., 2017 (https://arxiv.org/abs/1708.02002). The implementation is based
    on the keras-retinanet library: https://github.com/fizyr/keras-retinanet.

    The primitive accepts a Dataset of images and labels as input and returns a
    dataframe containing the predicted bounding boxes for each object in each image.
    """

    metadata = metadata_base.PrimitiveMetadata({
        'id':
        'd921be1e-b158-4ab7-abb3-cb1b17f42639',
        'version':
        '0.1.0',
        'name':
        'retina_net',
        'python_path':
        'd3m.primitives.object_detection.retinanet_convolutional_neural_network',
        'keywords': [
            'object detection', 'convolutional neural network',
            'digital image processing', 'RetinaNet'
        ],
        'source': {
            'name': 'Sanjeev Namjoshi',
            'contact': 'mailto:[email protected]',
            'uris': ['https://github.com/NewKnowledge/object-detection'],
        },
        'installation': [
            {
                'type':
                'PIP',
                'package_uri':
                'git+https://github.com/NewKnowledge/object-detection.git@{git_commit}#egg=object-detection'
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), )
            },
            {
                'type':
                "FILE",
                'key':
                "resnet50",
                'file_uri':
                "http://public.datadrivendiscovery.org/ResNet-50-model.keras.h5",
                'file_digest':
                "0128cdfa3963288110422e4c1a57afe76aa0d760eb706cda4353ef1432c31b9c"  # TBD 
            }
        ],
        #'algorithm_types': [metadata_base.PrimitiveAlgorithmType.RETINANET_CONVOLUTIONAL_NEURAL_NETWORK],
        'algorithm_types':
        [metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK],
        #'primitive_family': metadata_base.PrimitiveFamily.OBJECT_DETECTION
        'primitive_family':
        metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, volumes=volumes)
        self.image_paths = None
        self.annotations = None
        self.base_dir = None
        self.classes = None
        self.backbone = None
        self.y_true = None
        self.workers = 1
        self.multiprocessing = 1
        self.max_queue_size = 10

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ 
        Sets the primitive's training data and preprocesses the files for RetinaNet format.

        Parameters
        ----------
            inputs: numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, 
                    and bounding box for each image.

        Returns
        -------
            No returns. Function is called by pipeline at runtime.
        """

        # Prepare annotation file
        ## Generate image paths
        image_cols = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        self.base_dir = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in image_cols
        ]
        self.image_paths = np.array([[
            os.path.join(self.base_dir, filename)
            for filename in inputs.iloc[:, col]
        ] for self.base_dir, col in zip(self.base_dir, image_cols)]).flatten()
        self.image_paths = pd.Series(self.image_paths)

        ## Arrange proper bounding coordinates
        bounding_coords = inputs.bounding_box.str.split(',', expand=True)
        bounding_coords = bounding_coords.drop(
            bounding_coords.columns[[2, 5, 6, 7]], axis=1)
        bounding_coords.columns = ['x1', 'y1', 'y2', 'x2']
        bounding_coords = bounding_coords[['x1', 'y1', 'x2', 'y2']]

        ## Generate class names
        class_name = pd.Series(['class'] * inputs.shape[0])

        ## Assemble annotation file
        self.annotations = pd.concat(
            [self.image_paths, bounding_coords, class_name], axis=1)
        self.annotations.columns = [
            'img_file', 'x1', 'y1', 'x2', 'y2', 'class_name'
        ]

        # Prepare ID file
        self.classes = pd.DataFrame({'class_name': ['class'], 'class_id': [0]})

    def _create_callbacks(self, model, training_model, prediction_model):
        """
        Creates the callbacks to use during training.

        Parameters
        ----------
            model                : The base model.
            training_model       : The model that is used for training.
            prediction_model     : The model that should be used for validation.
            validation_generator : The generator for creating validation data.
        
        Returns
        -------
            callbacks            : A list of callbacks used for training.
        """
        callbacks = []

        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(monitor='loss',
                                              factor=0.1,
                                              patience=2,
                                              verbose=1,
                                              mode='auto',
                                              min_delta=0.0001,
                                              cooldown=0,
                                              min_lr=0))

        return callbacks

    def _create_models(self,
                       backbone_retinanet,
                       num_classes,
                       weights,
                       freeze_backbone=False,
                       lr=1e-5):
        """ 
        Creates three models (model, training_model, prediction_model).

        Parameters
        ----------
            backbone_retinanet : A function to call to create a retinanet model with a given backbone.
            num_classes        : The number of classes to train.
            weights            : The weights to load into the model.
            multi_gpu          : The number of GPUs to use for training.
            freeze_backbone    : If True, disables learning for the backbone.
            config             : Config parameters, None indicates the default configuration.

        Returns
        -------
            model              : The base model. 
            training_model     : The training model. If multi_gpu=0, this is identical to model.
            prediction_model   : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS).
        """

        modifier = freeze_model if freeze_backbone else None
        anchor_params = None
        num_anchors = None

        model = self._model_with_weights(backbone_retinanet(
            num_classes, num_anchors=num_anchors, modifier=modifier),
                                         weights=weights,
                                         skip_mismatch=True)
        training_model = model
        prediction_model = retinanet_bbox(model=model,
                                          anchor_params=anchor_params)
        training_model.compile(loss={
            'regression': losses.smooth_l1(),
            'classification': losses.focal()
        },
                               optimizer=keras.optimizers.adam(lr=lr,
                                                               clipnorm=0.001))

        return model, training_model, prediction_model

    def _num_classes(self):
        """ 
        Number of classes in the dataset.
        """
        return int(self.classes['class_id'].max()) + 1

    def _model_with_weights(self, model, weights, skip_mismatch):
        """ 
        Load weights for model.

        Parameters
        ----------
            model         : The model to load weights for.
            weights       : The weights to load.
            skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model.

        Returns
        -------
            model         : Model with loaded weights.
        """

        if weights is not None:
            model.load_weights(weights,
                               by_name=True,
                               skip_mismatch=skip_mismatch)
        return model

    def _create_generator(self, annotations, classes, shuffle_groups):
        """
        Create generator for evaluation.
        """

        validation_generator = CSVGenerator(self.annotations,
                                            self.classes,
                                            self.base_dir,
                                            self.hyperparams['batch_size'],
                                            self.backbone.preprocess_image,
                                            shuffle_groups=False)
        return validation_generator

    def _evaluate_model(self, generator, model, iou_threshold, score_threshold,
                        max_detections, save_path):
        """ 
        Evaluate a given dataset using a given model.

        Parameters
        ----------
        generator       : The generator that represents the dataset to evaluate.
        model           : The model to evaluate.
        iou_threshold   : The threshold used to consider when a detection is positive or negative.
        score_threshold : The score confidence threshold to use for detections.
        max_detections  : The maximum number of detections to use per image.
        save_path       : The path to save images with visualized detections to.
        
        Returns
        -------
        all_detections  : A list containing the predicted boxes for each image in the generator.
        """

        box_list = []
        score_list = []
        for i in range(generator.size()):
            raw_image = generator.load_image(i)
            image = generator.preprocess_image(raw_image.copy())
            image, scale = generator.resize_image(image)

            if keras.backend.image_data_format() == 'channels_first':
                image = image.transpose((2, 0, 1))

            # run network
            boxes, scores, labels = model.predict_on_batch(
                np.expand_dims(image, axis=0))[:3]

            # correct boxes for image scale
            boxes /= scale

            for box, score in zip(boxes[0], scores[0]):
                if score < 0.5:
                    break

                b = box.astype(int)
                box_list.append(b)
                score_list.append(score)

            ### !!! SAVEPATH CURRENTLY NOT IMPLEMENTED !!!
            ### This optional feature can be added later, maybe for TA3, allowing images to be output with
            ### bounding boxes to a specified directory after evaluation.
            # if save_path is True:
            #     draw_annotations(raw_image, generator.load_annotations(i), label_to_name = generator.label_to_name)
            #     draw_detections(raw_image, image_boxes, image_scores, image_labels, label_to_name = generator.label_to_name, score_threshold = score_threshold)

            #     cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), raw_image)

        return box_list, score_list

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Creates the image generators and then trains RetinaNet model on the image paths in the input 
        dataframe column.

        Can choose to use validation generator. 
        
        If no weight file is provided, the default is to use the ImageNet weights.
        """

        # Create object that stores backbone information
        self.backbone = models.backbone(self.hyperparams['backbone'])

        # Set up specific GPU
        # if self.hyperparams['gpu_id'] is not None:
        #     setup_gpu(self.hyperparams['gpu_id'])

        # Create the generators
        train_generator = CSVGenerator(self.annotations, self.classes,
                                       self.base_dir,
                                       self.hyperparams['batch_size'],
                                       self.backbone.preprocess_image)

        # Running the model
        ## Assign weights
        if self.hyperparams['weights'] is False:
            weights = None
        else:
            weights = self.volumes[self.hyperparams['backbone']]

        ## Create model
        print('Creating model...', file=sys.__stdout__)

        model, self.training_model, prediction_model = self._create_models(
            backbone_retinanet=self.backbone.retinanet,
            num_classes=train_generator.num_classes(),
            weights=weights,
            freeze_backbone=self.hyperparams['freeze_backbone'],
            lr=self.hyperparams['learning_rate'])

        #print(model.summary(), file = sys.__stdout__)
        model.summary()

        ### !!! vgg AND densenet BACKBONES CURRENTLY NOT IMPLEMENTED !!!
        ## Let the generator compute the backbone layer shapes using the actual backbone model
        # if 'vgg' in self.hyperparams['backbone'] or 'densenet' in self.hyperparams['backbone']:
        #     train_generator.compute_shapes = make_shapes_callback(model)
        #     if validation_generator:
        #         validation_generator.compute_shapes = train_generator.compute_shapes

        ## Set up callbacks
        callbacks = self._create_callbacks(
            model,
            self.training_model,
            prediction_model,
        )

        start_time = time.time()
        print('Starting training...', file=sys.__stdout__)

        self.training_model.fit_generator(
            generator=train_generator,
            steps_per_epoch=self.hyperparams['n_steps'],
            epochs=self.hyperparams['n_epochs'],
            verbose=1,
            callbacks=callbacks,
            workers=self.workers,
            use_multiprocessing=self.multiprocessing,
            max_queue_size=self.max_queue_size)

        print(
            f'Training complete. Training took {time.time()-start_time} seconds.',
            file=sys.__stdout__)
        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce image detection predictions.

        Parameters
        ----------
            inputs  : numpy ndarray of size (n_images, dimension) containing the d3m Index, image name, 
                      and bounding box for each image.

        Returns
        -------
            outputs : A d3m dataframe container with the d3m index, image name, bounding boxes as 
                      a string (8 coordinate format), and confidence scores.
        """
        iou_threshold = 0.5  # Bounding box overlap threshold for false positive or true positive
        score_threshold = 0.05  # The score confidence threshold to use for detections
        max_detections = 100  # Maximum number of detections to use per image

        # create the generator
        generator = self._create_generator(self.annotations,
                                           self.classes,
                                           shuffle_groups=False)

        # Convert training model to inference model
        inference_model = models.convert_model(self.training_model)

        # Assemble output lists
        ## Generate predicted bounding boxes (8-coordinate format, list)
        boxes, scores = self._evaluate_model(generator, inference_model,
                                             iou_threshold, score_threshold,
                                             max_detections,
                                             self.hyperparams['output'])

        ## Convert predicted boxes from a list of arrays to a list of strings
        boxes = np.array(boxes).tolist()
        boxes = list(map(lambda x: ",".join(map(str, x)), boxes))

        ## Generate list of image names and d3m indices corresponding to predicted bounding boxes
        img_list = [
            os.path.basename(path)
            for path in self.annotations['img_file'].tolist()
        ]
        d3m_idx = inputs.d3mIndex.tolist()

        print(len(d3m_idx), file=sys.__stdout__)
        print(len(img_list), file=sys.__stdout__)
        print(len(boxes), file=sys.__stdout__)
        print(len(scores), file=sys.__stdout__)

        ## Assemble in a Pandas DataFrame
        results = pd.DataFrame({
            'd3mIndex': d3m_idx,
            'image': img_list,
            'bounding_box': boxes,
            'confidence': scores
        })

        # Convert to DataFrame container
        results_df = d3m_DataFrame(results)

        ## Assemble first output column ('d3mIndex')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        ## Assemble second output column ('image')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'image'
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/Attribute')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        ## Assemble third output column ('bounding_box')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 2)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'bounding_box'
        col_dict['semantic_types'] = (
            'http://schema.org/Text',
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
            'https://metadata.datadrivendiscovery.org/types/BoundingPolygon')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 2), col_dict)

        ## Assemble fourth output column ('confidence')
        col_dict = dict(
            results_df.metadata.query((metadata_base.ALL_ELEMENTS, 3)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'confidence'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Score')
        results_df.metadata = results_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 3), col_dict)

        return CallResult(results_df)
Example n. 28
class CSVReader(FeaturizationTransformerPrimitiveBase[Inputs, Outputs,
                                                      Hyperparams]):
    """
    BBN D3M CSV Reader Primitive.

    Reads the CSV file referenced by each row of the input dataset and returns
    the loaded tables as the output collection.
    """

    __git_commit__ = utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id':
        'a771e153-67d7-4f69-b5f9-1a764e502a23',
        'version':
        __version__,
        'name':
        "CSV Reader",
        'description':
        "BBN D3M CSV Reader Primitive.",
        'keywords': [],
        'source': {
            'name':
            __author__,
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/csv_reader.py'
                .format(git_commit=__git_commit__),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type': 'UBUNTU',
            'package': 'ffmpeg',
            'version': '7:2.8.11-0',
        }, {
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'
            .format(git_commit=__git_commit__, egg='bbn_primitives'),
        }],
        'python_path':
        'd3m.primitives.data_preprocessing.csv_reader.CSVReader',
        'algorithm_types':
        [metadata_module.PrimitiveAlgorithmType.DATA_CONVERSION
         ],  #['DATA_CONVERSION'], #  replaced 'AUDIO_MIXING'
        'primitive_family':
        metadata_module.PrimitiveFamily.DATA_PREPROCESSING,
        #'algorithm_types': ['DATA_CONVERSION'], # TODO: replace by a new algorithm_type, e.g. ?
        #'primitive_family': 'DATA_PREPROCESSING',
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str,
                                           DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)
        self._metadata_lookup = None
        return

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Arguments:
            - inputs: a dataset container whose primary resource references one CSV file per row

        Returns:
            - a list-like container with one loaded table per input row
        """

        with stopit.ThreadingTimeout(timeout) as timer:
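            # All work happens inside a stopit.ThreadingTimeout guard: when the
            # optional timeout elapses, timer.state is not EXECUTED and a
            # TimeoutError is raised after this block.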
            metadata_lookup = self.__class__._parse_metadata(
                metadata=inputs.metadata)
            if not metadata_lookup:
                return None

            outputs = Outputs()
            metadata = self.__class__._can_accept(self=self,
                                                  method_name='produce',
                                                  arguments={
                                                      'inputs':
                                                      inputs.metadata,
                                                  },
                                                  hyperparams=self.hyperparams,
                                                  outputs=outputs)

            csv_location_base_uris = inputs.metadata.query(
                metadata_lookup['location_base_uris']
                ['selector'])['location_base_uris'][0]
            for idx, row in inputs[metadata_lookup['primary_resource_id']
                                   ['selector'][0]].iterrows():
                #for idx in range(len(inputs[metadata_lookup['primary_resource_id']['selector'][0]])):
                #row = inputs[metadata_lookup['primary_resource_id']['selector'][0]][idx]
                #d3mIndex = row[metadata_lookup['primary_key']['selector'][-1]]
                d3mIndex = row['d3mIndex']
                csv_fn = row[metadata_lookup['csv_fn']['selector'][-1]]
                filename = os.path.join(csv_location_base_uris, csv_fn)
                filename = re.sub('^file://', '', filename)
                #csv_file= csv.load(filename)
                csv_file = pd.read_csv(filename, index_col=0)
                start = 0
                end = len(csv_file)

                outputs.append(csv_file)
                metadata = metadata.update((idx, ), {'sampling_rate': 1})

            metadata = metadata.update((),
                                       {'dimension': {
                                           'length': len(outputs)
                                       }})
            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state == timer.EXECUTED:
            return CallResult(outputs)
        else:
            raise TimeoutError('Reader exceeded time limit')

    @classmethod
    def can_accept(
        cls, *, method_name: str,
        arguments: typing.Dict[str, typing.Union[metadata_module.Metadata,
                                                 type]],
        hyperparams: Hyperparams
    ) -> typing.Optional[metadata_module.DataMetadata]:
        # hyperparams is taken as an explicit argument; it is not otherwise
        # defined in this classmethod's scope
        output_metadata = super().can_accept(method_name=method_name,
                                             arguments=arguments,
                                             hyperparams=hyperparams)

        return cls._can_accept(self=cls,
                               method_name=method_name,
                               arguments=arguments,
                               hyperparams=hyperparams,
                               outputs=Outputs())

    @classmethod
    def _can_accept(
            cls, *, self, method_name: str,
            arguments: typing.Dict[str, typing.Union[metadata_module.Metadata,
                                                     type]],
            hyperparams: Hyperparams,
            outputs: Outputs) -> typing.Optional[metadata_module.DataMetadata]:
        #output_metadata = super().can_accept(method_name=method_name, arguments=arguments, hyperparams=hyperparams)

        if 'inputs' not in arguments:
            # nothing to inspect; no output metadata can be built here
            return None

        inputs_metadata = typing.cast(metadata_module.DataMetadata,
                                      arguments['inputs'])

        metadata_lookup = cls._parse_metadata(metadata=inputs_metadata)

        num_data = inputs_metadata.query(metadata_lookup['primary_resource_id']
                                         ['selector'])['dimension']['length']

        metadata = inputs_metadata.clear(
            {
                'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                'structural_type': Outputs,
                'dimension': {
                    'length': num_data,
                }
            },
            for_value=outputs,
            source=self
        ).update(
            (metadata_module.ALL_ELEMENTS, ), {
                'structural_type':
                d3m_ndarray,
                'semantic_types':
                ('https://metadata.datadrivendiscovery.org/types/Timeseries', )
            },
            source=self)

        return metadata

    @classmethod
    def _update_metadata_lookup(cls, metadata_lookup, key, selector):
        if key not in metadata_lookup:
            raise Exception('Updating unknown key %s' % key)

        metadata_lookup[key]['found'] = True
        metadata_lookup[key]['selector'] = selector

    @classmethod
    def _valid_metadata_lookup(cls, metadata_lookup):
        for k in metadata_lookup.keys():
            if metadata_lookup[k][
                    'required'] and not metadata_lookup[k]['found']:
                return False
        return True

    @classmethod
    def _init_metadata_lookup(cls):
        metadata_lookup = dict()
        metadata_lookup['primary_key'] = {
            'required': True,
            'found': False,
            'selector': None,
        }
        metadata_lookup['primary_resource_id'] = {
            'required': True,
            'found': False,
            'selector': None,
        }
        metadata_lookup['csv_fn'] = {
            'required': True,
            'found': False,
            'selector': None,
        }
        metadata_lookup['location_base_uris'] = {
            'required': True,
            'found': False,
            'selector': None,
        }

        return metadata_lookup

    @classmethod
    def _parse_metadata(cls, *, metadata: metadata_module.DataMetadata):
        flatten = lambda l: [item for sublist in l for item in sublist]

        mdlu = cls._init_metadata_lookup()

        num_res = metadata.query(())['dimension']['length']
        resources = [str(x) for x in range(num_res - 1)]
        resources.append('learningData')
        primary_key = [[
            (res_id, metadata_module.ALL_ELEMENTS, col_id) for col_id in range(
                metadata.query((
                    res_id,
                    metadata_module.ALL_ELEMENTS))['dimension']['length'])
            if 'd3mIndex' == metadata.query((res_id,
                                             metadata_module.ALL_ELEMENTS,
                                             col_id))['name']
        ] for res_id in resources]
        primary_key = flatten(primary_key)
        if len(primary_key) != 1:
            raise Exception('Exactly one primary key is supported, found %d' %
                            len(primary_key))
        cls._update_metadata_lookup(mdlu, 'primary_key', primary_key[0])
        cls._update_metadata_lookup(mdlu, 'primary_resource_id',
                                    (primary_key[0][0], ))

        csv_res_type = 'https://metadata.datadrivendiscovery.org/types/Timeseries'
        primary_resource_cols = metadata.query(
            (mdlu['primary_resource_id']['selector'][0],
             metadata_module.ALL_ELEMENTS))
        for col_id in range(primary_resource_cols['dimension']['length']):
            cmd = metadata.query((mdlu['primary_resource_id']['selector'][0],
                                  metadata_module.ALL_ELEMENTS, col_id))
            if 'semantic_types' in cmd:
                st = cmd['semantic_types']
                if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in st:
                    # we already found primary key
                    pass
                elif 'https://metadata.datadrivendiscovery.org/types/Attribute' in st:
                    if 'foreign_key' in cmd and cmd['foreign_key'][
                            'type'] == 'COLUMN':
                        foreign_resource_id = cmd['foreign_key']['resource_id']
                        foreign_resource_md = metadata.query(
                            (foreign_resource_id, ))
                        foreign_col_selector = (
                            foreign_resource_id, metadata_module.ALL_ELEMENTS,
                            cmd['foreign_key']['column_index'])
                        foreign_col_md = metadata.query(foreign_col_selector)
                        if csv_res_type in foreign_col_md['semantic_types'] and \
                            'https://metadata.datadrivendiscovery.org/types/FileName' in foreign_col_md['semantic_types']:
                            cls._update_metadata_lookup(
                                mdlu, 'csv_fn',
                                (mdlu['primary_resource_id']['selector'][0],
                                 metadata_module.ALL_ELEMENTS, col_id))
                            cls._update_metadata_lookup(
                                mdlu, 'location_base_uris',
                                foreign_col_selector)
                        else:
                            _logger.warning(
                                'Expected foreign column with semantic types %s and FileName'
                                % (csv_res_type))
                    else:
                        _logger.warning('Unexpected semantic type Attribute')
                elif 'https://metadata.datadrivendiscovery.org/types/InstanceWeight' in st:
                    _logger.warning(
                        'Semantic type InstanceWeight recognized but unused in the current implementation'
                    )
                elif 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in st:
                    _logger.info(
                        'Semantic type SuggestedTarget is ignored by this primitive'
                    )
                # else:
                #     raise Exception('Semantic type(s) %s do not match any supported types' % (st))

        return mdlu if cls._valid_metadata_lookup(mdlu) else None
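
The sketch below is illustrative only: it reproduces, in plain Python, the required/found/selector bookkeeping that _init_metadata_lookup, _update_metadata_lookup and _valid_metadata_lookup implement above; the key names and selectors here are made up for the example.

lookup = {
    'primary_key': {'required': True, 'found': False, 'selector': None},
    'csv_fn': {'required': True, 'found': False, 'selector': None},
}

def update(lookup, key, selector):
    # mirrors _update_metadata_lookup: record that the key was located
    lookup[key]['found'] = True
    lookup[key]['selector'] = selector

def valid(lookup):
    # mirrors _valid_metadata_lookup: every required key must have been found
    return all(not v['required'] or v['found'] for v in lookup.values())

update(lookup, 'primary_key', ('learningData', 0))
assert not valid(lookup)   # 'csv_fn' has not been located yet
update(lookup, 'csv_fn', ('learningData', 1))
assert valid(lookup)       # all required keys found; _parse_metadata would return the lookup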
Example n. 29
class DistilVertexNominationPrimitive(PrimitiveBase[container.List,
                                                    container.DataFrame,
                                                    Params, Hyperparams]):
    """
    A primitive that uses a random forest to solve vertex nomination
    problems.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id":
            "0130828c-1ac0-47a9-a167-f05bae5a3146",
            "version":
            version.__version__,
            "name":
            "VertexNomination",
            "python_path":
            "d3m.primitives.vertex_nomination.seeded_graph_matching.DistilVertexNomination",
            "source": {
                "name":
                "Distil",
                "contact":
                "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/vertex_nomination.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type":
                    metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri":
                    "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST,
            ],
            "primitive_family":
            metadata_base.PrimitiveFamily.VERTEX_NOMINATION,
        }, )

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = VertexNominationCV(
            target_metric=self.hyperparams["metric"], random_seed=random_seed)

    def set_training_data(self, *, inputs: container.List,
                          outputs: container.DataFrame) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._target_col = outputs.columns[0]

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        logger.debug(f"Fitting {__name__}")

        X_train, y_train, U_train = self._inputs
        X_train = X_train.value
        y_train = y_train.squeeze()
        self._model.fit(X_train, y_train, U_train)

        return CallResult(None)

    def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        X_train, _, U = inputs
        X_train = X_train.value
        result = self._model.predict(X_train, U)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame({
            X_train.index.name: X_train.index,
            self._target_col: result
        })

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)

    def get_params(self) -> Params:
        return Params(model=self._model, target_col=self._target_col)

    def set_params(self, *, params: Params) -> None:
        self._target_col = params["target_col"]
        self._model = params["model"]
        return
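
Illustrative only: a hedged sketch, in plain pandas rather than the d3m container types, of the two-column frame that produce assembles above; the index values and the 'target' column name are assumptions standing in for X_train.index and self._target_col.

import pandas as pd

# hypothetical predictions keyed by a d3mIndex-style index
index = pd.Index([0, 1, 2], name="d3mIndex")
result = ["yes", "no", "yes"]

result_df = pd.DataFrame({index.name: index, "target": result})
# column 0 carries the PrimaryKey values, column 1 the PredictedTarget values,
# matching the semantic types added in produce() above
print(result_df)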
Example n. 30
class DeepAR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
        Primitive that applies a deep autoregressive forecasting algorithm for time series
        prediction. The implementation is based on this paper: https://arxiv.org/pdf/1704.04110.pdf,
        a method that is also implemented in AWS's SageMaker interface.

        Training inputs: 1) Feature dataframe, 2) Target dataframe
        Outputs: Dataframe with predictions for specific time series at specific future time instances 
    
        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object
        
        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
            "id": "3410d709-0a13-4187-a1cb-159dd24b584b",
            "version": __version__,
            "name": "DeepAR",
            # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
            "keywords": [
                "time series",
                "forecasting",
                "recurrent neural network",
                "autoregressive",
            ],
            "source": {
                "name": __author__,
                "contact": __contact__,
                "uris": [
                    # Unstructured URIs.
                    "https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers",
                ],
            },
            # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
            # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
            # install a Python package first to be even able to run setup.py of another package. Or you have
            # a dependency which is not on PyPi.
            "installation": [
                {"type": "PIP", "package": "cython", "version": "0.29.14"},
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers.git@{git_commit}#egg=TimeSeriesD3MWrappers".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            # The same path the primitive is registered with entry points in setup.py.
            "python_path": "d3m.primitives.time_series_forecasting.lstm.DeepAR",
            # Choose these from a controlled vocabulary in the schema. If anything is missing which would
            # best describe the primitive, make a merge request.
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
        }
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        # set seed for reproducibility
        tf.random.set_seed(random_seed)

        self._is_fit = False
        self._new_train_data = False

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def _drop_multiple_special_cols(self, col_list, col_type):
        """
            private util function that creates list of duplicated special columns (for deletion)

            Arguments:
                col_list {List[int]} -- list of column indices 
                col_type {str} -- D3M semantic type

            Returns:
                int or None -- first column idx in col_list if any column idxs are marked (else None)
        """

        if len(col_list) == 0:
            return None
        elif len(col_list) > 1:
            logger.warning(
                f"""More than one {col_type} is marked. This primitive will use the first and drop the other {col_type}s."""
            )
            self._drop_cols += col_list[1:]
            if col_type != "target column":
                self._drop_cols_no_tgt += col_list[1:]
        return col_list[0]
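    # Illustrative only: with col_list == [3, 5, 7] and col_type == "timestamp column",
    # this returns 3 and schedules columns 5 and 7 for removal via self._drop_cols
    # (and self._drop_cols_no_tgt, because the type is not the target column).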

    def _get_cols(self, input_metadata):
        """ private util function: get indices of important columns from metadata 

            Arguments:
                input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

            Raises:
                ValueError: If Target column is not of type 'Integer' or 'Float'
        """

        self._drop_cols = []
        self._drop_cols_no_tgt = []

        # get target idx (first column by default)
        target_columns = input_metadata.list_columns_with_semantic_types(
            (
                "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
                "https://metadata.datadrivendiscovery.org/types/TrueTarget",
                "https://metadata.datadrivendiscovery.org/types/Target",
            )
        )
        if len(target_columns) == 0:
            raise ValueError("At least one column must be marked as a target")
        self._target_column = self._drop_multiple_special_cols(
            target_columns, "target column"
        )

        # get timestamp idx (first column by default)
        timestamp_columns = input_metadata.list_columns_with_semantic_types(
            (
                "https://metadata.datadrivendiscovery.org/types/Time",
                "http://schema.org/DateTime",
            )
        )
        self._timestamp_column = self._drop_multiple_special_cols(
            timestamp_columns, "timestamp column"
        )

        # get grouping idx and add suggested grouping keys to drop_cols list
        grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey",)
        )
        self._grouping_column = self._drop_multiple_special_cols(
            grouping_columns, "grouping column"
        )
        suggested_grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",)
        )
        self._drop_cols += suggested_grouping_columns
        self._drop_cols_no_tgt += suggested_grouping_columns

        # get index_col (first index column by default)
        index_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/PrimaryKey",)
        )
        self._index_column = self._drop_multiple_special_cols(
            index_columns, "index column"
        )

        # determine whether targets are count data
        target_semantic_types = input_metadata.query_column_field(
            self._target_column, "semantic_types"
        )
        if self.hyperparams["count_data"] is not None:
            self._count_data = self.hyperparams["count_data"]
        elif "http://schema.org/Integer" in target_semantic_types:
            if np.min(self._ts_frame.iloc[:, self._target_column]) > 0:
                self._count_data = True
            else:
                self._count_data = False
        elif "http://schema.org/Float" in target_semantic_types:
            self._count_data = False
        else:
            raise ValueError("Target column is not of type 'Integer' or 'Float'")
        #logger.info(f"count data: {self._count_data}")

    def _update_indices(self):
        """ private util function: 
            subtract length of drop cols from each marked idx to account for smaller df 
        """

        length = len(self._drop_cols)
        if self._target_column is not None:
            self._target_column -= length
        if self._timestamp_column is not None:
            self._timestamp_column -= length
        if self._grouping_column is not None:
            self._grouping_column -= length
        if self._index_column is not None:
            self._index_column -= length
        self._cols_after_drop = self._ts_frame.shape[1]  # column count after dropping
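        # Illustrative only: with two dropped columns, a target originally at
        # index 5 and a timestamp at index 3 shift to indices 3 and 1.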

    def _create_data_object_and_learner(self, val_split):
        """ private util function:
            creates (or updates) train ds object and learner 

            Arguments:
                val_split {float} -- proportion of training data to withhold for validation

        """

        # Create TimeSeries dataset objects
        #logger.info(self._ts_frame.head())
        self._ts_object = TimeSeriesTrain(
            self._ts_frame,
            target_idx=self._target_column,
            timestamp_idx=self._timestamp_column,
            grouping_idx=self._grouping_column,
            index_col=self._index_column,
            count_data=self._count_data,
            negative_obs=self.hyperparams["negative_obs"],
            val_split=val_split,
            integer_timestamps=self._integer_timestamps,
            freq=self.freq,
        )
        #logger.info(self._ts_object.data.head())

        # Create learner
        self._learner = DeepARLearner(
            self._ts_object,
            emb_dim=self.hyperparams["emb_dim"],
            lstm_dim=self.hyperparams["lstm_dim"],
            dropout=self.hyperparams["dropout_rate"],
            lr=self.hyperparams["learning_rate"],
            batch_size=self.hyperparams["batch_size"],
            train_window=self.hyperparams["window_size"],
            verbose=0,
        )

        # save weights so we can restart fitting from scratch (if desired by caller)
        self._learner.save_weights("model_initial_weights.h5")

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data
        
            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
            
            Raises:
                ValueError: If multiple columns are annotated with 'Time' or 'DateTime' metadata
        """

        # save copy of train data so we don't predict for each row in training
        self._output_columns = outputs.columns
        self._train_data = inputs.copy()

        # combine inputs and outputs for internal TimeSeries object
        self._ts_frame = inputs.append_columns(outputs)

        # Parse cols needed for ts object
        self._get_cols(self._ts_frame.metadata)

        # drop cols if multiple special type columns
        if len(self._drop_cols) > 0:
            self._ts_frame = self._ts_frame.remove_columns(self._drop_cols)
            self._update_indices()

        # assumption is that integer timestamps are days (treated this way by DeepAR objects)
        if "http://schema.org/Integer" in self._ts_frame.metadata.query_column_field(
            self._timestamp_column, "semantic_types"
        ):
            self._integer_timestamps = True
        else:
            self._integer_timestamps = False

        # calculate frequency of time series
        # avoid indexing columns with None when there is no grouping column
        t_col = self._ts_frame.columns[self._timestamp_column]
        if self._grouping_column is None:
            time_col_sorted = np.sort(self._ts_frame[t_col])
            self._min_train = time_col_sorted[0]
            self.freq = calculate_time_frequency(time_col_sorted[1] - self._min_train)
            # self._train_diff = int(
            #     np.diff(np.sort(self._ts_frame.iloc[:, self._timestamp_column]))[0]
            # )
        else:
            # assume frequency is the same across all time series
            g_col = self._ts_frame.columns[self._grouping_column]
            self.freq = calculate_time_frequency(
                int(
                    self._ts_frame.groupby(g_col)[t_col]
                    .apply(lambda x: np.diff(np.sort(x)))
                    .iloc[0][0]
                )
            )
            self._min_train = self._ts_frame.groupby(g_col)[t_col].agg("min").min()

        # Create TimeSeries dataset object and learner
        self._create_data_object_and_learner(self.hyperparams["val_split"])

        # mark that new training data has been set
        self._new_train_data = True
        self._in_sample_preds = None

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """ Fits DeepAR model using training data from set_training_data and hyperparameters
            
            Keyword Arguments:
                timeout {float} -- timeout, considered (default: {None})
                iterations {int} -- iterations, considered (default: {None})
            
            Returns:
                CallResult[None]
        """

        # restore initial model weights if new training data
        if self._new_train_data:

            # only create new dataset object / model (w/out val) if new training data
            if iterations is not None:
                self._create_data_object_and_learner(0)
            self._learner.load_weights("model_initial_weights.h5")

        if iterations is None:
            iterations_set = False
            iterations = self.hyperparams["epochs"]
            validation = self.hyperparams["val_split"] > 0
        else:
            iterations_set = True
            validation = False

        # time training for 1 epoch so we can consider timeout argument thoughtfully
        if timeout:
            logger.info(
                """Timing the fitting procedure for one epoch so we
                can consider timeout thoughtfully"""
            )
            start_time = time.time()
            _, iterations_completed = self._learner.fit(
                validation=validation,
                steps_per_epoch=self.hyperparams["steps_per_epoch"],
                epochs=1,
                stopping_patience=self.hyperparams["early_stopping_patience"],
                stopping_delta=self.hyperparams["early_stopping_delta"],
                tensorboard=False,
            )
            epoch_time_estimate = time.time() - start_time
            # subtract 1 for the epoch that already happened and 1 more to be safe
            timeout_epochs = timeout // epoch_time_estimate - 2
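            # e.g. a 600s timeout with a ~100s epoch allows 600 // 100 - 2 = 4
            # further epochs (illustrative numbers only)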
            iters = min(timeout_epochs, iterations)
        else:
            iters = iterations

        # normal fitting
        logger.info(f"Fitting for {iters} iterations")
        start_time = time.time()

        _, iterations_completed = self._learner.fit(
            validation=validation,
            steps_per_epoch=self.hyperparams["steps_per_epoch"],
            epochs=iters,
            stopping_patience=self.hyperparams["early_stopping_patience"],
            stopping_delta=self.hyperparams["early_stopping_delta"],
            tensorboard=False,
        )
        logger.info(
            f"Fit for {iterations_completed} epochs, took {time.time() - start_time}s"
        )

        # maintain primitive state (mark that training data has been used)
        self._new_train_data = False
        self._is_fit = True

        # use fitting history to set CallResult return values
        if iterations_set:
            has_finished = False
        elif iters < iterations:
            has_finished = False
        else:
            has_finished = self._is_fit

        return CallResult(
            None, has_finished=has_finished, iterations_done=iterations_completed
        )

    def _get_pred_intervals(self, df, keep_all=False):
        """ private util function that retrieves unevenly spaced prediction intervals from data frame 

            Arguments:
                df {pandas df} -- df of predictions from which to extract prediction intervals

            Keyword Arguments:
                keep_all {bool} -- if True, take every interval slice, otherwise only take
                    those given by the df

            Returns:
                pd Series -- series of intervals, indexed by group, granularity of 1 interval 

        """

        # no grouping column
        if self._grouping_column is None:
            interval = discretize_time_difference(
                df.iloc[:, self._timestamp_column],
                self._min_train,
                self.freq,
                self._integer_timestamps,
            )
            if keep_all:
                interval = np.arange(min(interval), max(interval) + 1)
            return pd.Series([interval])

        # grouping column
        else:
            g_col, t_col = (
                df.columns[self._grouping_column],
                df.columns[self._timestamp_column],
            )
            all_intervals, groups = [], []
            for (group, vals) in df.groupby(g_col)[t_col]:
                interval = discretize_time_difference(
                    vals, self._min_train, self.freq, self._integer_timestamps
                )
                if keep_all:
                    interval = np.arange(min(interval), max(interval) + 1)
                all_intervals.append(interval)
                groups.append(group)
            return pd.Series(all_intervals, index=groups)
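        # Illustrative only: with groups 'a' and 'b', the return value looks like
        # pd.Series([[0, 1, 2], [0, 1]], index=['a', 'b']) -- one list of
        # discretized prediction indices per series.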

    def produce(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """ Produce primitive's predictions for specific time series at specific future time instances
            * these specific timesteps / series are specified implicitly by input dataset

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit
            
            Returns:
                CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
                    prediction slice = specific horizon idx for specific series in specific regression 
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
            include_all_training = True
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)
            include_all_training = self.hyperparams['seed_predictions_with_all_data']

        # get prediction slices
        pred_intervals = self._get_pred_intervals(test_frame)

        # make predictions with learner
        start_time = time.time()
        logger.info(f"Making predictions...")
        preds = self._learner.predict(ts_test_object, include_all_training=include_all_training)
        logger.info(
            f"Prediction took {time.time() - start_time}s. Predictions array shape: {preds.shape}"
        )

        # append saved in-sample predictions to test predictions if not seeding with all context
        if self._in_sample_preds is None:
            self._in_sample_preds = preds
        elif not self.hyperparams['seed_predictions_with_all_data']:
            preds = np.concatenate((self._in_sample_preds, preds), axis=1)

        # slice predictions with learned intervals
        all_preds = []
        for p, idxs in zip(preds, pred_intervals.values):
            # all_preds.extend(p[: len(idxs)])  # this takes first n predictions
            all_preds.extend(
                [p[i] for i in idxs]
            )  # this takes predictions at actual indices
        flat_list = np.array([p for pred_list in all_preds for p in pred_list])

        # if np.isinf(all_preds).any():
        #     logger.debug(f'There are {np.isinf(all_preds).sum()} inf preds')
        # if np.isnan(all_preds).any():
        #     logger.debug(f'There are {np.isnan(all_preds).sum()} nan preds')
        # logger.debug(f'Max: {preds.max()}, Min: {preds.min()}')

        # fill nans with 0s in case the model predicted some (shouldn't be needed; guards an edge case)
        flat_list = np.nan_to_num(flat_list)

        # create output frame
        result_df = container.DataFrame(
            {self._ts_frame.columns[self._target_column]: flat_list},
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=self._is_fit)

    def produce_confidence_intervals(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """ produce confidence intervals for each series 'confidence_interval_horizon' periods into
                the future
        
        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
        
        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, considered (default: {None})
        
        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe of mean predictions and lower/upper
                confidence bounds for each series at each horizon step

            Ex.
                series | timestep | mean | 0.05 | 0.95
                --------------------------------------
                a      |    0     |  5   |   3  |   7
                a      |    1     |  6   |   4  |   8
                b      |    0     |  5   |   3  |   7
                b      |    1     |  6   |   4  |   8
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        alpha = self.hyperparams["confidence_interval_alpha"]

        if len(self._drop_cols_no_tgt) > 0 and inputs.shape[1] != self._cols_after_drop:
            test_frame = inputs.remove_columns(self._drop_cols_no_tgt)
        else:
            test_frame = inputs.copy()

        # Create TimeSeriesTest object
        if self._train_data.equals(inputs):
            ts_test_object = TimeSeriesTest(self._ts_object)
            include_all_training = True
            horizon = 0
        # test
        else:
            ts_test_object = TimeSeriesTest(self._ts_object, test_frame)
            include_all_training = self.hyperparams['seed_predictions_with_all_data']
            horizon = self.hyperparams["confidence_interval_horizon"]

        # make predictions with learner
        start_time = time.time()
        logger.info(f"Making predictions...")
        preds = self._learner.predict(
            ts_test_object,
            horizon=horizon,
            samples=self.hyperparams["confidence_interval_samples"],
            include_all_training=include_all_training,
            point_estimate=False,
        )
        logger.info(
            f"Prediction took {time.time() - start_time}s. Predictions array shape: {preds.shape}"
        )

        # convert samples to percentiles
        means = np.percentile(preds, 50, axis=2).reshape(-1, 1)
        lowers = np.percentile(preds, alpha / 2 * 100, axis=2).reshape(-1, 1)
        uppers = np.percentile(preds, (1 - alpha / 2) * 100, axis=2).reshape(-1, 1)
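        # e.g. confidence_interval_alpha == 0.1 places the bounds at the 5th and
        # 95th percentiles of the sampled predictions (illustrative value only)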

        assert (lowers < means).all()
        assert (means < uppers).all()

        # convert to df
        if self._grouping_column is None:
            indices = np.repeat(self._output_columns[0], preds.shape[1])
        else:
            indices = np.repeat(
                test_frame[test_frame.columns[self._grouping_column]].unique(), preds.shape[1]
            )
        interval_df = pd.DataFrame(
            np.concatenate((means, lowers, uppers), axis=1),
            columns=["mean", str(alpha / 2), str(1 - alpha / 2)],
            index=indices,
        )        

        # add index column
        interval_df["horizon_index"] = np.tile(
            np.arange(preds.shape[1]), len(interval_df.index.unique())
        )
        
        logger.debug(interval_df.head())

        # structure return df
        return CallResult(
            container.DataFrame(interval_df, generate_metadata=True),
            has_finished=self._is_fit,
        )