Example #1
            "logger": "logging.Logger",
            "metadata": "d3m.metadata.base.PrimitiveMetadata"
        },
        "instance_attributes": {
            "hyperparams": "d3m.metadata.hyperparams.Hyperparams",
            "random_seed": "int",
            "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]",
            "volumes": "typing.Dict[str, str]",
            "temporary_directory": "typing.Union[NoneType, str]"
        }
    },
    "structural_type": "test_primitives.random.RandomPrimitive",
    "description": "A primitive which draws random samples from a normal distribution.\n\nAttributes\n----------\nmetadata:\n    Primitive's metadata. Available as a class attribute.\nlogger:\n    Primitive's logger. Available as a class attribute.\nhyperparams:\n    Hyperparams passed to the constructor.\nrandom_seed:\n    Random seed passed to the constructor.\ndocker_containers:\n    A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n    container's address under which the container is accessible by the primitive, and a\n    dict mapping exposed ports to ports on that address.\nvolumes:\n    A dict mapping volume keys from primitive's metadata to file and directory paths\n    where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n    An absolute path to a temporary directory a primitive can use to store any files\n    for the duration of the current pipeline run phase. Directory is automatically\n    cleaned up after the current pipeline run phase finishes.",
    "digest": "__DIGEST__"
}
""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', RandomPrimitive.metadata.query()['digest'])


class TestRandomPrimitive(unittest.TestCase):
    def call_primitive(self, primitive, method_name, **kwargs):
        return getattr(primitive, method_name)(**kwargs)

    def test_basic(self):
        hyperparams_class = RandomPrimitive.metadata.get_hyperparams()

        primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults())

        inputs = container.List(list(range(4)), generate_metadata=True)

        call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
class DistilLinkPredictionPrimitive(PrimitiveBase[container.List,
                                                  container.DataFrame, Params,
                                                  Hyperparams]):
    """
    A primitive that uses RESCAL to predict links in graphs.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id":
            "fc138210-c317-4528-81ae-5eed3a1a0267",
            "version":
            version.__version__,
            "name":
            "LinkPrediction",
            "python_path":
            "d3m.primitives.link_prediction.link_prediction.DistilLinkPrediction",
            "source": {
                "name":
                "Distil",
                "contact":
                "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/link_prediction.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type":
                    metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri":
                    "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
            ],
            "primitive_family":
            metadata_base.PrimitiveFamily.LINK_PREDICTION,
        }, )

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = RescalLinkPrediction(
            target_metric=self.hyperparams["metric"], random_seed=random_seed)
        self._target_col = ""

    def set_training_data(self, *, inputs: container.List,
                          outputs: container.DataFrame) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._target_col = outputs.columns[0]

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        logger.debug(f"Fitting {__name__}")

        X_train, y_train, U_train = self._inputs
        X_train = X_train.value
        y_train = y_train.squeeze()
        self._model.fit(X_train, y_train, U_train)

        return CallResult(None)

    def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        X_train, _, _ = inputs
        X_train = X_train.value
        result = self._model.predict(X_train).astype(int)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame({
            X_train.index.name: X_train.index,
            self._target_col: result
        })

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)

    def get_params(self) -> Params:
        return Params(model=self._model, target_col=self._target_col)

    def set_params(self, *, params: Params) -> None:
        self._model = params["model"]
        self._target_col = params["target_col"]
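A minimal, hypothetical driver for the primitive above, showing the usual D3M lifecycle (set_training_data, fit, produce, get_params/set_params). The training and test containers are assumed to be the lists that the upstream Distil loader primitives emit; the function and its data are illustrative, not taken from the repository.

# Hypothetical driver sketch for DistilLinkPredictionPrimitive; assumes the
# Hyperparams class defines usable defaults and that the inputs match what the
# Distil graph loaders produce.
from d3m import container


def run_link_prediction(train_inputs: container.List,
                        train_outputs: container.DataFrame,
                        test_inputs: container.List) -> container.DataFrame:
    hyperparams_class = DistilLinkPredictionPrimitive.metadata.get_hyperparams()
    primitive = DistilLinkPredictionPrimitive(
        hyperparams=hyperparams_class.defaults(), random_seed=42)

    primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
    primitive.fit()

    # produce() returns a CallResult; the predictions dataframe is in .value
    predictions = primitive.produce(inputs=test_inputs).value

    # The fitted state can be round-tripped through Params if needed.
    params = primitive.get_params()
    primitive.set_params(params=params)

    return predictions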
class SpectralClustering(TransformerPrimitiveBase[Inputs, Outputs,
                                                  Hyperparams]):
    '''
        Primitive that applies the sklearn spectral clustering algorithm to unsupervised,
        supervised or semi-supervised datasets.

        Training inputs: D3M dataframe with features and labels, and D3M indices

        Outputs: D3M dataframe with cluster predictions and D3M indices. Cluster labels carry the "PredictedTarget" semantic type if
        the task_type hyperparameter is clustering, and "Attribute" if the task_type is classification.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2",
        'version':
        __version__,
        'name':
        "SpectralClustering",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering', 'Graph Clustering'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.7',
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.clustering.spectral_graph_clustering.SpectralClustering',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self.sc = SC(n_clusters=self.hyperparams['n_clusters'],
                     n_init=self.hyperparams['n_init'],
                     n_neighbors=self.hyperparams['n_neighbors'],
                     affinity=self.hyperparams['affinity'],
                     random_state=self.random_seed)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe 

        Returns
        ----------
        Outputs
            The output is a transformed dataframe of X fit into an embedded space, n feature columns will equal n_components hyperparameter
            For timeseries datasets the output is the dimensions concatenated to the timeseries filename dataframe
        """

        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget'
            )
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]

        X_test = inputs.drop(columns=list(inputs)[index[0]])
        X_test = X_test.drop(columns=target_names).values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            inputs = dataframe_utils.select_rows(inputs,
                                                 np.flatnonzero(series))
            X_test = X_test[np.flatnonzero(series)]

        sc_df = d3m_DataFrame(
            pandas.DataFrame(self.sc.fit_predict(X_test),
                             columns=['cluster_labels']))

        # update metadata for the appended cluster label column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type(1)
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
            col_dict['name'] = target_names[0]
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0),
                                               col_dict)

        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ),
                                               df_dict)

        return CallResult(utils_cp.append_columns(inputs, sc_df))
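For reference, a standalone sketch of the underlying scikit-learn call that produce() above wraps; the hyperparameter values are illustrative rather than the primitive's defaults.

# Standalone sketch of the scikit-learn call wrapped by produce() above; on a
# toy set this small scikit-learn may warn that the k-NN graph is not fully
# connected, but it still runs.
import numpy as np
from sklearn.cluster import SpectralClustering as SC

X = np.array([[1.0, 1.1], [0.9, 1.0], [1.1, 0.9],
              [8.0, 8.2], [8.1, 7.9], [7.9, 8.1]])

sc = SC(n_clusters=2, n_init=10, n_neighbors=3,
        affinity='nearest_neighbors', random_state=0)
cluster_labels = sc.fit_predict(X)
# cluster_labels groups the first three and last three points together,
# e.g. array([0, 0, 0, 1, 1, 1]) up to label permutation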
class VectorBoundsFilterPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    A primitive that filters rows of a column with FloatVector semantics: the i'th entry of the mins/maxs
    lists supplies the min/max bounds applied to the i'th set of row indices. Note that the number
    of row index sets must match the number of mins and maxs provided, otherwise the excess indices won't
    have any filter applied to them.

    The filter assumes the mins and maxs are the same type of data. They can be of type int, list, or two
    dimensional list.

    If row_indices_list is empty, it filters on all indices.
    If the mins/maxs are an int, all values in all vectors will be filtered with those bounds.
    If the mins/maxs are a list, it is expected to be the same length as the number of index sets given,
    i.e. each scalar in the mins/maxs corresponds to one set of indices in row_indices_list to filter.
    If the mins/maxs are a two dimensional list, then each vector of filters in the list corresponds to
    one set in row_indices_list, and the i'th value in a filter vector corresponds to the i'th
    column of the vectors being filtered.
    e.g. given the dataframe:
    d3mIndex | values
    0        | 10, 20, 30
    1        | 15, 25, 35
    2        | 40, 20, 50
    and row_indices_list = [[0, 1], [2]],
    mins = [[12, 18, 31], [20, 25, 50]], maxs = [[20, 30, 40], [30, 25, 60]],
    only the row with index 1 is returned: row 0 fails because 10 < 12 and 30 < 31, and
    row 2 fails because 40 > 30 and 20 < 25. (A standalone numpy sketch of this bound
    check follows the class definition.)
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "c2fa34c0-2d1b-42af-91d2-515da4a27752",
        "version":
        version.__version__,
        "name":
        "Vector bound filter",
        "python_path":
        "d3m.primitives.data_transformation.vector_bounds_filter.DistilVectorBoundsFilter",
        "source": {
            "name":
            "Distil",
            "contact":
            "mailto:[email protected]",
            "uris": [
                "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/vector_filter.py",
                "https://github.com/uncharted-distil/distil-primitives-contrib",
            ],
        },
        "installation": [
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    _floatvector_semantic = (
        "https://metadata.datadrivendiscovery.org/types/FloatVector", )

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        if self.hyperparams["strict"]:
            self._min_comparison_op = lambda x, y: x > y
            self._max_comparision_op = lambda x, y: x < y
        else:
            self._min_comparison_op = lambda x, y: x >= y
            self._max_comparision_op = lambda x, y: x <= y

    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        vector_column = self._get_floatvector_column(inputs.metadata)
        if vector_column is None:
            return base.CallResult(inputs)

        maxs = self.hyperparams["maxs"]
        mins = self.hyperparams["mins"]

        if type(mins) == float or type(mins) == int:
            return base.CallResult(self._scalar_filter(inputs, vector_column))

        indices = inputs.index.tolist()

        mins = [float("-inf") if i == None else i for i in mins]
        maxs = [float("inf") if i == None else i for i in maxs]

        indices_to_keep = np.empty((inputs.shape[0], ))

        try:
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)

            filter_length = rows.shape[1]

            rows = np.logical_and(
                self._min_comparison_op(
                    rows[:, :filter_length],
                    mins,
                ),
                self._max_comparision_op(rows[:, :filter_length], maxs),
            )
            rows_to_keep = rows.sum(axis=1) == filter_length
        except ValueError as error:
            # rows had uneven length
            rows = inputs.iloc[:, vector_column]
            # get length of each vector
            vector_lengths = rows.apply(np.shape).apply(np.take, args=([0]))

            filter_lengths = vector_lengths.values
            # need this to loop over lengths array while keeping vectorised
            # apply function over rows
            count_for_ref = [0]

            def _filter_r(row, filter_lengths, mins, maxs, counter):
                # in case fewer filters than row length
                filterable_range = min(filter_lengths[counter[0]], len(mins))

                mins_for_filter = np.array(mins[:filterable_range])
                maxs_for_filter = np.array(maxs[:filterable_range])

                filtered_row = np.logical_and(
                    self._min_comparison_op(row[:filterable_range],
                                            mins_for_filter),
                    self._max_comparision_op(
                        row[:filterable_range],
                        maxs_for_filter,
                    ),
                )
                counter[0] += 1
                return filtered_row

            rows = rows.apply(
                _filter_r,
                args=(filter_lengths, mins, maxs, count_for_ref),
            )
            rows_to_keep = rows.apply(np.sum).values == filter_lengths

        if self.hyperparams["inclusive"]:
            indices_to_keep = [
                indices[j] for j in range(len(indices)) if rows_to_keep[j]
            ]
        else:
            indices_to_keep = [
                indices[j] for j in range(len(indices)) if not rows_to_keep[j]
            ]

        outputs = dataframe_utils.select_rows(inputs, indices_to_keep)

        return base.CallResult(outputs)

    def _scalar_filter(self, inputs, vector_column):
        max_value = self.hyperparams["maxs"]
        min_value = self.hyperparams["mins"]
        indices = inputs.index.tolist()

        if min_value is None:
            min_value = float("-inf")
        if max_value is None:
            max_value = float("inf")

        try:
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)

            rows = np.logical_and(
                self._min_comparison_op(
                    rows,
                    min_value,
                ),
                self._max_comparision_op(rows, max_value),
            )
            rows_to_keep = rows.sum(axis=1) == rows.shape[1]
        except ValueError as error:
            rows = inputs.iloc[:, vector_column]

            def _filter_r(row, min_val, max_val):
                return np.logical_and(
                    self._min_comparison_op(
                        row,
                        min_val,
                    ),
                    self._max_comparision_op(
                        row,
                        max_val,
                    ),
                )

            rows = rows.apply(
                _filter_r,
                args=(min_value, max_value),
            )
            rows_to_keep = rows.apply(np.sum) == rows.apply(np.shape).apply(
                np.take, args=([0]))
        if self.hyperparams["inclusive"]:
            rows_to_keep = [
                indices[j] for j in range(len(indices)) if rows_to_keep[j]
            ]
        else:
            rows_to_keep = [
                indices[j] for j in range(len(indices)) if not rows_to_keep[j]
            ]
        return dataframe_utils.select_rows(inputs, rows_to_keep)

    def _get_floatvector_column(self,
                                inputs_metadata: metadata_base.DataMetadata):
        fv_column = self.hyperparams["column"]
        if fv_column is not None:
            return fv_column
        fv_columns = inputs_metadata.list_columns_with_semantic_types(
            self._floatvector_semantic)
        if len(fv_columns) > 0:
            return fv_columns[0]
        logger.warning(
            "inputs provided contains no specified FloatVector column and lacks columns with FloatVector semantic"
        )
        return None
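A standalone numpy sketch of the elementwise bound check performed in produce() for equal-length vectors, applying the docstring's first filter to all three example rows under the strict (exclusive) comparison.

# Standalone numpy sketch of the bound check used in produce() above, with the
# docstring's example vectors and its first filter (mins = [12, 18, 31],
# maxs = [20, 30, 40]) under strict comparisons.
import numpy as np

rows = np.array([[10, 20, 30],
                 [15, 25, 35],
                 [40, 20, 50]])
mins = np.array([12, 18, 31])
maxs = np.array([20, 30, 40])

within_bounds = np.logical_and(rows > mins, rows < maxs)
rows_to_keep = within_bounds.sum(axis=1) == rows.shape[1]
# rows_to_keep -> array([False,  True, False]): only the row with d3mIndex 1 survives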
Example #5
import os
from d3m import utils

D3M_API_VERSION = '2018.1.26'
VERSION = "0.1.0"
TAG_NAME = "{git_commit}".format(git_commit=utils.current_git_commit(
    os.path.dirname(__file__)), )

REPOSITORY = "https://github.com/rooshenas/dsbox-spen"
PACAKGE_NAME = "dsbox-spen"

D3M_PERFORMER_TEAM = 'UMASS'

if TAG_NAME:
    PACKAGE_URI = "git+" + REPOSITORY + "@" + TAG_NAME
else:
    PACKAGE_URI = "git+" + REPOSITORY

PACKAGE_URI = PACKAGE_URI + "#egg=" + PACAKGE_NAME

INSTALLATION_TYPE = 'GIT'
if INSTALLATION_TYPE == 'PYPI':
    INSTALLATION = {"type": "PIP", "package": PACAKGE_NAME, "version": VERSION}
else:
    # INSTALLATION_TYPE == 'GIT'
    INSTALLATION = {
        "type": "PIP",
        "package_uri": PACKAGE_URI,
    }
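For illustration, a sketch of the values the module above exports (with the commit hash as a placeholder) and of how the INSTALLATION dict would typically be consumed in a primitive's metadata; the fragment is hypothetical, not taken from the dsbox-spen source.

# Sketch of the values the module above builds; the commit hash is whatever
# utils.current_git_commit() returns for the checked-out repository:
#
#   PACKAGE_URI == "git+https://github.com/rooshenas/dsbox-spen@<git_commit>#egg=dsbox-spen"
#   INSTALLATION == {"type": "PIP", "package_uri": PACKAGE_URI}
#
# A primitive in the package would then reference these when declaring its
# metadata, roughly like this (hypothetical fragment):
example_metadata_fields = {
    "installation": [INSTALLATION],
    "source": {"name": D3M_PERFORMER_TEAM, "uris": [REPOSITORY]},
}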
class unicorn(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "475c26dc-eb2e-43d3-acdb-159b80d9f099",
        'version': __version__,
        'name': "unicorn",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Image Clustering', 'fast fourier transform', 'Image'],
        'source': {
            'name': __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/unicorn-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package_uri": "git+https://github.com/NewKnowledge/d3m_unicorn.git@97b24ce39c3a26c1d753104c80012c352efd6920#egg=d3m_unicorn"
            },
            {
                "type": "PIP",
                "package_uri": "git+https://github.com/NewKnowledge/unicorn-d3m-wrapper.git@{git_commit}#egg=UNICORNd3mWrapper".format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__))
                ),
            },
            {
                "type": "TGZ",
                "key": "croc_weights",
                "file_uri": "http://public.datadrivendiscovery.org/croc.tar.gz",
                "file_digest": "0be3e8ab1568ec8225b173112f4270d665fb9ea253093cd9ea98c412c9053c92"
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.distil.unicorn',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.MULTILABEL_CLASSIFICATION # TODO
        ],
        "primitive_family": metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, volumes=volumes)
        
        self.volumes = volumes

    def _get_column_base_path(self, inputs: Inputs, column_name: str) -> str:
        # fetches the base path associated with a column given a name if it exists
        column_metadata = inputs.metadata.query((metadata_base.ALL_ELEMENTS,))
        if not column_metadata or len(column_metadata) == 0:
            return None

        num_cols = column_metadata['dimension']['length']
        for i in range(0, num_cols):
            col_data = inputs.metadata.query((metadata_base.ALL_ELEMENTS, i))
            if col_data['name'] == column_name and 'location_base_uris' in col_data:
                return col_data['location_base_uris'][0]

        return None

    def produce(self, *, inputs: Inputs) -> CallResult[Outputs]:
        """
            Produce image object classification predictions and OCR for an
            image provided as an URI or filepath

        Parameters
        ----------
        inputs : pandas dataframe where a column is a pd.Series of image paths/URLs

        Returns
        -------
        output : A dataframe with image labels/classifications/cluster assignments
        """

        target_columns = self.hyperparams['target_columns']
        output_labels = self.hyperparams['output_labels']

        imagepath_df = inputs
        image_analyzer = Unicorn(weights_path=self.volumes["croc_weights"]+"/inception_v3_weights_tf_dim_ordering_tf_kernels.h5")

        for i, ith_column in enumerate(target_columns):
            # initialize an empty dataframe
            result_df = pd.DataFrame()
            output_label = output_labels[i]

            # get the base uri from the column metadata and remove the
            # scheme portion
            base_path = self._get_column_base_path(inputs, ith_column)
            if base_path:
                base_path = base_path.split('://')[1]

            # update the paths with the base if necessary
            col_paths = imagepath_df.loc[:, ith_column]
            if base_path:
                for i in range(0, len(col_paths)):
                    col_paths[i] = os.path.join(base_path, col_paths[i])

            result_df = image_analyzer.cluster_images(col_paths)

            imagepath_df = pd.concat(
                [imagepath_df.reset_index(drop=True), result_df], axis=1)

        K.clear_session()
        
        # create metadata for the unicorn output dataframe
        unicorn_df = d3m_DataFrame(imagepath_df)
        # first column (d3mIndex)
        col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        unicorn_df.metadata = unicorn_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column (filename)
        col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("it is a string")
        col_dict['name'] = "filename"
        col_dict['semantic_types'] = ('http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        unicorn_df.metadata = unicorn_df.metadata.update((metadata_base.ALL_ELEMENTS, 1), col_dict)
        # third column (bounding_box)
        col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, 2)))
        col_dict['structural_type'] = type("it is a string")
        col_dict['name'] = "bounding_box"
        col_dict['semantic_types'] = ('http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        unicorn_df.metadata = unicorn_df.metadata.update((metadata_base.ALL_ELEMENTS, 2), col_dict)
        # fourth column (label)
        col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, 3)))
        col_dict['structural_type'] = type("it is a string")
        col_dict['name'] = "label"
        col_dict['semantic_types'] = ('http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        unicorn_df.metadata = unicorn_df.metadata.update((metadata_base.ALL_ELEMENTS, 3), col_dict)
        # fifth column (pred_class)
        col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, 4)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'pred_class'
        col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
        unicorn_df.metadata = unicorn_df.metadata.update((metadata_base.ALL_ELEMENTS, 4), col_dict)
        
        return CallResult(unicorn_df)
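A hypothetical construction sketch for the primitive above; the volume path and column names are placeholders, since the d3m runtime normally downloads and extracts the croc_weights archive listed in the installation metadata and supplies its location through volumes.

# Hypothetical construction sketch for the unicorn primitive above; the path
# and column names are assumptions, not values from the original repository.
hyperparams_class = unicorn.metadata.get_hyperparams()
unicorn_hp = hyperparams_class(
    hyperparams_class.defaults(),
    target_columns=['filename'],   # assumed: column holding image paths
    output_labels=['pred_class'],  # assumed: name for the output label column
)
primitive = unicorn(hyperparams=unicorn_hp,
                    volumes={'croc_weights': '/static/croc'})
# result_df = primitive.produce(inputs=image_path_df).value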
Example #7
class GoatReversePrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Accepts a set of lat/long pairs, processes them and returns a set of corresponding geographic location names
    
    Parameters
    ----------
    inputs : pandas dataframe containing 2 coordinate float values, i.e., [longitude,latitude] 
                representing each geographic location of interest - a pair of values
                per location/row in the specified target column

    Returns
    -------
    Outputs
        Pandas dataframe containing one location per longitude/latitude pair (if reverse
        geocoding possible, otherwise NaNs) appended as new columns
    """

    # Make sure to populate this with JSON annotations...
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_base.PrimitiveMetadata(
        {
            # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
            "id": "f6e4880b-98c7-32f0-b687-a4b1d74c8f99",
            "version": __version__,
            "name": "Goat_reverse",
            # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
            "keywords": ["Reverse Geocoder"],
            "source": {
                "name": __author__,
                "contact": __contact__,
                "uris": [
                    # Unstructured URIs.
                    "https://github.com/NewKnowledge/goat-d3m-wrapper"
                ],
            },
            # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
            # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
            # install a Python package first to be even able to run setup.py of another package. Or you have
            # a dependency which is not on PyPi.
            "installation": [
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/NewKnowledge/goat-d3m-wrapper.git@{git_commit}#egg=GoatD3MWrapper".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__))
                    ),
                },
                {
                    "type": "UBUNTU",
                    "package": "default-jre",
                    "version": "2:1.8-56ubuntu2",
                },
                {
                    "type": "TGZ",
                    "key": "photon-db-latest",
                    "file_uri": "http://public.datadrivendiscovery.org/photon.tar.gz",
                    "file_digest": "d7e3d5c6ae795b5f53d31faa3a9af63a9691070782fa962dfcd0edf13e8f1eab",
                },
            ],
            # The same path the primitive is registered with entry points in setup.py.
            "python_path": "d3m.primitives.data_cleaning.geocoding.Goat_reverse",
            # Choose these from a controlled vocabulary in the schema. If anything is missing which would
            # best describe the primitive, make a merge request.
            "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD],
            "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING,
        }
    )

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        volumes: typing.Dict[str, str] = None,
    ) -> None:
        super().__init__(
            hyperparams=hyperparams,
            random_seed=random_seed,
            volumes=volumes,
        )

        self._decoder = JSONDecoder()
        self.volumes = volumes
        self.goat_cache = LRUCache(self.hyperparams["cache_size"])

    def produce(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """
        Accepts a set of lat/long pairs, processes them and returns a set of corresponding geographic location names
        
        Parameters
        ----------
        inputs : pandas dataframe containing 2 coordinate float values, i.e., [longitude,latitude] 
                 representing each geographic location of interest - a pair of values
                 per location/row in the specified target column

        Returns
        -------
        Outputs
            Pandas dataframe containing one location per longitude/latitude pair (if reverse
            geocoding possible, otherwise NaNs)
        """

        # confirm that server is responding before proceeding
        address = "http://localhost:2322/"
        PopenObj = check_geocoding_server(
            address, self.volumes, self.hyperparams["rampup_timeout"]
        )

        # find location columns, real columns, and real-vector columns
        targets = inputs.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/Location"
        )
        real_values = inputs.metadata.get_columns_with_semantic_type(
            "http://schema.org/Float"
        )
        real_values += inputs.metadata.get_columns_with_semantic_type(
            "http://schema.org/Integer"
        )
        real_values = list(set(real_values))
        real_vectors = inputs.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/FloatVector"
        )
        target_column_idxs = []
        target_columns = []

        # convert target columns to list if they have single value and are adjacent in the df
        for target, target_col in zip(targets, [list(inputs)[idx] for idx in targets]):
            if target in real_vectors:
                target_column_idxs.append(target)
                target_columns.append(target_col)
            # pair of individual lat / lon columns already in list
            elif list(inputs)[target - 1] in target_columns:
                continue
            elif target in real_values:
                if target + 1 in real_values:
                    # convert to single column with list of [lat, lon]
                    col_name = "new_col_" + target_col
                    inputs[col_name] = inputs.iloc[
                        :, target : target + 2
                    ].values.tolist()
                    target_columns.append(col_name)
                    target_column_idxs.append(target)
                    target_column_idxs.append(target + 1)
                    target_column_idxs.append(inputs.shape[1] - 1)

        # make sure columns are structured as 1) lat , 2) lon pairs
        for col in target_columns:
            if inputs[col].apply(lambda x: x[0]).max() > 90:
                inputs[col] = inputs[col].apply(lambda x: x[::-1])

        # drop the original coordinate columns from the output
        outputs = inputs.remove_columns(target_column_idxs)

        # reverse-geocode each requested location
        output_data = []
        for i, ith_column in enumerate(target_columns):
            j = 0
            for longlat in inputs[ith_column]:
                cache_ret = self.goat_cache.get(longlat)
                row_data = []
                if cache_ret == -1:
                    r = requests.get(
                        address
                        + "reverse?lat="
                        + str(longlat[0])
                        + "&lon="
                        + str(longlat[1])
                    )
                    tmp = self._decoder.decode(r.text)
                    if len(tmp["features"]) == 0:
                        if self.hyperparams["geocoding_resolution"] == "postcode":
                            row_data = float("nan")
                        else:
                            row_data = ""
                    elif (
                        self.hyperparams["geocoding_resolution"]
                        not in tmp["features"][0]["properties"].keys()
                    ):
                        if self.hyperparams["geocoding_resolution"] == "postcode":
                            row_data = float("nan")
                        else:
                            row_data = ""
                    else:
                        row_data = tmp["features"][0]["properties"][
                            self.hyperparams["geocoding_resolution"]
                        ]
                    self.goat_cache.set(longlat, row_data)
                else:
                    row_data = cache_ret

                if len(output_data) <= j:
                    output_data.append(row_data)
                else:
                    output_data[j] = output_data[j] + row_data
                j = j + 1

        # need to cleanup by closing the server when done...
        PopenObj.kill()

        # Build d3m-type dataframe
        out_df = pd.DataFrame(index=range(inputs.shape[0]),columns=target_columns)
        d3m_df = d3m_DataFrame(out_df)
        for i, ith_column in enumerate(target_columns):
            # for every column
            col_dict = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS, i)))
            if self.hyperparams["geocoding_resolution"] == "postcode":
                col_dict["structural_type"] = type(1)
                col_dict["semantic_types"] = (
                    "http://schema.org/Integer",
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )
            else:
                col_dict["structural_type"] = type("it is a string")
                col_dict["semantic_types"] = (
                    "http://schema.org/Text",
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )
            col_dict["name"] = target_columns[i]
            d3m_df.metadata = d3m_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, i), col_dict
            )
        df_dict = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
        df_dict_1 = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
        df_dict["dimension"] = df_dict_1
        df_dict_1["name"] = "columns"
        df_dict_1["semantic_types"] = (
            "https://metadata.datadrivendiscovery.org/types/TabularColumn",
        )
        df_dict_1["length"] = d3m_df.shape[1]
        d3m_df.metadata = d3m_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)
        return CallResult(outputs.append_columns(d3m_df))
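A hypothetical call sketch for the primitive above; it assumes a d3m runtime environment that supplies the extracted photon-db-latest volume, and the path, column names, and coordinates are placeholders.

# Hypothetical call sketch for GoatReversePrimitive; produce() starts the local
# photon server itself from the supplied volume, so this only runs where that
# volume is available. In a real pipeline the 'coordinates' column would also
# carry the Location / FloatVector semantic types that produce() looks for.
import pandas as pd
from d3m import container

coords = container.DataFrame(
    pd.DataFrame({'d3mIndex': [0, 1],
                  'coordinates': [[38.8977, -77.0365], [48.8584, 2.2945]]}),
    generate_metadata=True)

hyperparams_class = GoatReversePrimitive.metadata.get_hyperparams()
primitive = GoatReversePrimitive(hyperparams=hyperparams_class.defaults(),
                                 volumes={'photon-db-latest': '/static/photon'})
# locations = primitive.produce(inputs=coords).value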
Example #8
class Tsne(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
        Primitive that applies the T-distributed stochastic neighbour embedding algorithm to unsupervised, supervised or semi-supervised datasets.

        Training inputs: D3M dataset with features and labels, and D3M indices

        Outputs: D3M dataframe with t-SNE dimensions and D3M indices
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "15586787-80d5-423e-b232-b61f55a117ce",
        'version': __version__,
        'name': "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Dimensionality Reduction'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package': 'cython',
                'version': '0.29.14',
            },
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.dimensionality_reduction.t_distributed_stochastic_neighbor_embedding.Tsne',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.T_DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DIMENSIONALITY_REDUCTION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0)-> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
 
        self.clf = TSNE(n_components = self.hyperparams['n_components'],random_state=self.random_seed)


    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data 

        Returns
        ----------
        Outputs
            D3M dataframe with t-SNE dimensions and D3M indices
        """ 
       
        
        # store information on target, index variable
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [list(inputs)[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [list(inputs)[i] for i in index]
        
        n_ts = len(inputs.d3mIndex.unique())
        if n_ts == inputs.shape[0]:
            X_test = inputs.drop(columns = list(inputs)[index[0]])
            X_test = X_test.drop(columns = target_names).values
        else:
            ts_sz = int(inputs.shape[0] / n_ts)
            X_test = np.array(inputs.value).reshape(n_ts, ts_sz)

        # fit_transform data and create new dataframe
        n_components = self.hyperparams['n_components']
        col_names = ['Dim'+ str(c) for c in range(0,n_components)]

        tsne_df = d3m_DataFrame(pandas.DataFrame(self.clf.fit_transform(X_test), columns = col_names))
        
        tsne_df = pandas.concat([inputs.d3mIndex, tsne_df], axis=1)
            
        # add index column metadata
        col_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type('1')
        col_dict['name'] = index_names[0]
        col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)

        # add dimension columns metadata
        for c in range(1,n_components+1):
            col_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = 'Dim'+str(c-1)
            col_dict['semantic_types'] = ('http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS, c), col_dict)

        df_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, ))) 
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
        df_dict_1['length'] = n_components+1      
        tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

        return CallResult(tsne_df)
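A standalone sketch of the scikit-learn call that produce() above wraps; the data and number of components are illustrative only.

# Standalone sketch of the scikit-learn call wrapped by produce() above.
import numpy as np
from sklearn.manifold import TSNE

X = np.random.RandomState(0).rand(50, 10)
embedding = TSNE(n_components=2, random_state=0).fit_transform(X)
# embedding.shape == (50, 2); these columns become 'Dim0' and 'Dim1' in the
# dataframe returned by the primitive, alongside the original d3mIndex.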
Example #9
class LinkPredictionRankClassifier(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    A primitive that predicts the existence of a link if it falls within the interquartile range of
    inner products.
    """

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '25e97696-b96f-4f5c-8620-b340fe83414d',
        'version': "0.1.0",
        'name': "jhu.link_pred_rc",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.link_prediction.rank_classification.JHU',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['graph', 'inner product'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/link_pred_rc/link_pred_rc.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
            'contact': 'mailto:[email protected]'
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
            'type': 'UBUNTU',
            'package': 'libxml2-dev',
            'version': '2.9.4'
            },
            {
            'type': 'UBUNTU',
            'package': 'libpcre3-dev',
            'version': '2.9.4'
            },
            {
            'type': 'PIP',
            'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),),
            },
            ],
        'algorithm_types': [
            "HEURISTIC"
        ],
        'primitive_family': "LINK_PREDICTION",
        'preconditions': ['NO_MISSING_VALUES']
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._fitted: bool = False
        self._inner_products: container.List = []
        self._embeddings: container.List = []

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            raise ValueError("Not fitted")
            
        random_state = np.random.RandomState(seed=self.random_seed)
        
        csv = inputs[0]
        

        # print(csv, file=sys.stderr)
        csv_headers = csv.columns
        for header in csv_headers:
            if header[:6] == "source":
                SOURCE = header
            elif header[:6] == "target":
                TARGET = header
        
        source_nodeID = np.array(csv[SOURCE]).astype(int)
        target_nodeID = np.array(csv[TARGET]).astype(int)
        
        try:
            int(np.array(csv['linkType'])[0])
        except:
            csv['linkType'] = np.zeros(len(source_nodeID))
        
        link_types = np.array(csv['linkType']).astype(int)

        n_links = len(self._inner_products) - 1
        n_nodes = int(self._embeddings.shape[0] / n_links)

        n_preds = csv.shape[0]

        predictions = np.zeros(n_preds)

        global_noexists = self._inner_products[-1][0]
        global_exists = self._inner_products[-1][1]

        # The following code is used for "global" classification only; i.e. we ignore edge type training data
        for i in range(n_preds):
            temp_source = source_nodeID[i]
            temp_target = target_nodeID[i]
            temp_link = link_types[i]
            temp_inner_product = self._embeddings[temp_link*n_nodes + temp_source-1] @ self._embeddings[temp_link*n_nodes + temp_target-1]
            temp_noexists = self._inner_products[temp_link][0]
            temp_exists = self._inner_products[temp_link][1]

            # There are three 'degenerate' cases --
            # 1) Both the exists and no exists lists are empty (first 'if')
            # 2/3) One but not the other is empty ('elif')
            # if len(temp_noexists) == 0 and len(temp_exists) == 0:
            rank_noexists = np.sum(temp_inner_product > global_noexists)
            quantile_noexists = rank_noexists / len(global_noexists)

            rank_exists = np.sum(temp_inner_product > global_exists)
            quantile_exists = rank_exists / len(global_exists)                  

            if abs(quantile_noexists - 1/2) < abs(quantile_exists - 1/2):
                predictions[i] = int(0)
            elif abs(quantile_noexists - 1/2) > abs(quantile_exists - 1/2):
                predictions[i] = int(1)
            else:
                predictions[i] = int(random_state.binomial(1, 0.5))
            
        csv['linkExists'] = predictions.astype(int)
        outputs = container.DataFrame(csv[['d3mIndex', 'linkExists']])

        return base.CallResult(outputs)

    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        embeddings = self._training_inputs[1][0]
        csv = self._training_inputs[0]
        n_nodes, n_links = self._training_inputs[3]

        n_info = csv.shape[0]
        ranks = [[[], []] for i in range(n_links + 1)]

        try:
            int(np.array(csv['linkType'])[0])
        except:
            csv['linkType'] = np.zeros(n_info)

        # print(csv, file=sys.stderr)
        csv_headers = csv.columns
        for header in csv_headers:
            if header[:6] == "source":
                SOURCE = header
            elif header[:6] == "target":
                TARGET = header

        for i in range(n_info):
            temp_link = int(np.array(csv['linkType'])[i])
            temp_exists = int(np.array(csv['linkExists'])[i])
            temp_source = int(np.array(csv[SOURCE])[i])
            temp_target = int(np.array(csv[TARGET])[i])
            temp_dot = embeddings[temp_link*n_nodes + temp_source - 1] @ embeddings[temp_link*n_nodes + temp_target - 1]
            ranks[temp_link][temp_exists].append(temp_dot)
            ranks[-1][temp_exists].append(temp_dot)

        for i in range(len(ranks)):
            ranks[i][0] = np.sort(ranks[i][0])
            ranks[i][1] = np.sort(ranks[i][1])

        self._embeddings = container.ndarray(embeddings)
        self._inner_products = container.List(ranks)

        self._fitted = True

        return base.CallResult(None)

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._training_inputs = inputs

    def get_params(self) -> Params:
        if not self._fitted:
            raise ValueError("Fit not performed.")

        return Params(
            inner_products = self._inner_products,
            embeddings = self._embeddings
        )

    def set_params(self, *, params: Params) -> None:
        self._fitted = True
        self._inner_products = params['inner_products']

        self._embeddings = params['embeddings']
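A toy numpy sketch of the rank/quantile decision rule used in produce() above: a candidate edge's inner product is ranked against the empirical "exists" and "no-exists" distributions gathered in fit(), and the class whose quantile sits closer to 1/2 wins (the primitive breaks exact ties randomly). The numbers are made up for illustration.

# Toy sketch of the rank/quantile rule from produce() above; values are made up.
import numpy as np

noexists = np.sort(np.array([0.1, 0.2, 0.3, 0.4]))   # dot products of non-edges
exists = np.sort(np.array([0.6, 0.7, 0.8, 0.9]))     # dot products of edges
candidate = 0.75

q_noexists = np.sum(candidate > noexists) / len(noexists)  # 1.0: extreme for non-edges
q_exists = np.sum(candidate > exists) / len(exists)        # 0.5: typical for edges
prediction = int(abs(q_noexists - 0.5) > abs(q_exists - 0.5))  # -> 1 (link exists)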
class MIRankingPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    Feature ranking based on a mutual information between features and a selected
    target.  Will rank any feature column with a semantic type of Float, Boolean,
    Integer or Categorical, and a corresponding structural type of int, float or str.
    Features that could not be ranked are excluded from the returned set.
    Parameters
    ----------
    inputs : A container.DataFrame with columns containing numeric or string data.
    Returns
    -------
    output : A DataFrame containing (col_idx, col_name, score) tuples for each ranked feature.
    """

    # allowable target column types
    _discrete_types = (
        'http://schema.org/Boolean', 'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/CategoricalData')

    _continous_types = ('http://schema.org/Float', )

    _roles = (
        'https://metadata.datadrivendiscovery.org/types/Attribute',
        'https://metadata.datadrivendiscovery.org/types/Target',
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
    )

    _structural_types = set((int, float))

    _semantic_types = set(_discrete_types).union(_continous_types)

    _random_seed = 100

    __author__ = 'Uncharted Software'
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        'a31b0c26-cca8-4d54-95b9-886e23df8886',
        'version':
        '0.2.1',
        'name':
        'Mutual Information Feature Ranking',
        'python_path':
        'd3m.primitives.feature_selection.mi_ranking.DistilMIRanking',
        'keywords': ['vector', 'columns', 'dataframe'],
        'source': {
            'name': 'Uncharted Software',
            'contact': 'mailto:[email protected]',
            'uris': ['http://github.com/uncharted-distil/distil-mi-ranking']
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/uncharted-distil/distil-mi-ranking.git@' +
            '{git_commit}#egg=distil-mi-ranking'.format(
                git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.MUTUAL_INFORMATION,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata,
                        column_index: typing.Optional[int]) -> bool:

        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        valid_struct_type = column_metadata.get('structural_type',
                                                None) in cls._structural_types
        semantic_types = column_metadata.get('semantic_types', [])
        valid_semantic_type = len(
            set(cls._semantic_types).intersection(semantic_types)) > 0
        valid_role_type = len(set(cls._roles).intersection(semantic_types)) > 0

        return valid_struct_type and valid_semantic_type and valid_role_type

    @classmethod
    def _append_rank_info(
            cls, inputs: container.DataFrame,
            result: typing.List[typing.Tuple[int, str, float]],
            rank_np: np.ndarray, rank_df: pd.DataFrame
    ) -> typing.List[typing.Tuple[int, str, float]]:
        for i, rank in enumerate(rank_np):
            col_name = rank_df.columns.values[i]
            result.append((inputs.columns.get_loc(col_name), col_name, rank))
        return result

    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        cols = ['idx', 'name', 'rank']

        # Make sure the target column is of a valid type and return no ranked features if it isn't.
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            inputs.metadata.list_columns_with_semantic_types(
                self._semantic_types))
        role_indices = set(
            inputs.metadata.list_columns_with_semantic_types(self._roles))
        feature_indices = feature_indices.intersection(role_indices)
        feature_indices.remove(target_idx)

        # return an empty result if all features were incompatible
        if len(feature_indices) == 0:
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = inputs.metadata.list_columns_with_semantic_types(
            self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                # only mark columns with at least 1 duplicate value as discrete when predicting
                # a continuous target - there's a check in the bowels of MI code that will throw
                # an exception otherwise
                if feature_df[col_name].duplicated().any() and not discrete:
                    col_idx = feature_df.columns.get_loc(col_name)
                    discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = None
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np,
                target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df)

        # wrap as a D3M container - metadata should be auto generated
        results = container.DataFrame(data=data,
                                      columns=cols,
                                      generate_metadata=True)
        results = results.sort_values(by=['rank'],
                                      ascending=False).reset_index(drop=True)

        return base.CallResult(results)
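

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original primitive): one way the ranking
# primitive above could be exercised on a small synthetic frame. The column
# names, values and metadata set below are illustrative assumptions; the only
# requirements taken from the code are numeric columns tagged with the semantic
# types and roles checked in _can_use_column() and produce().
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np
    from d3m import container
    from d3m.metadata import base as metadata_base

    rng = np.random.RandomState(0)
    df = container.DataFrame(
        {'a': rng.rand(100), 'b': rng.rand(100), 'target': rng.randint(0, 2, 100)},
        generate_metadata=True)

    # mark the two features as Float attributes
    for idx in (0, 1):
        df.metadata = df.metadata.update(
            (metadata_base.ALL_ELEMENTS, idx),
            {'structural_type': float,
             'semantic_types': (
                 'http://schema.org/Float',
                 'https://metadata.datadrivendiscovery.org/types/Attribute')})
    # mark the last column as an integer target
    df.metadata = df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 2),
        {'structural_type': int,
         'semantic_types': (
             'http://schema.org/Integer',
             'https://metadata.datadrivendiscovery.org/types/Target',
             'https://metadata.datadrivendiscovery.org/types/TrueTarget')})

    hyperparams_class = MIRankingPrimitive.metadata.get_hyperparams()
    ranker = MIRankingPrimitive(
        hyperparams=hyperparams_class.defaults().replace({'target_col_index': 2}))
    # produces one (idx, name, rank) row per rankable feature, best ranked first
    print(ranker.produce(inputs=df).value)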
Example #11
class FairnessInProcessing(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
        Primitive that applies an in-processing algorithm to training data while fitting a learning algorithm.
        The algorithm is 'Adversarial_Debiasing', which learns a classifier (a TensorFlow neural network) that
        maximizes prediction accuracy while simultaneously reducing an adversary's ability to determine the
        protected attribute from the predictions.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "f9822847-d19f-40f9-8e23-3fdcd5dcb847",
        'version': __version__,
        'name': "In-processing Fairness Techniques",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['fairness', 'bias', 'debias', 'data inprocessing', 'data augmentation'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Fairness-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/D3M-Fairness-Primitives.git@{git_commit}#egg=FairnessPrimitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.data_augmentation.data_conversion.FairnessInProcessing',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DATA_AUGMENTATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self.label_names = None
        self.protected_attributes = None
        self.idx = None
        self.attribute_names = None
        self.unfavorable_label = None
        self.train_dataset = None
        self.clf = None

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Sets primitive's training data

        Parameters
        ----------
        inputs : features
        outputs : labels
        '''
                                                
        # only select attributes from training data
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Target')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        self.label_names = [list(inputs)[t] for t in targets]
        
        # calculate protected attributes 
        self.protected_attributes = [list(inputs)[c] for c in self.hyperparams['protected_attribute_cols']]

        # save index and metadata
        idx = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        self.idx = [list(inputs)[i] for i in idx]
        
        # mark attributes that are not privileged data
        attributes = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Attribute')
        privileged_data = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrivilegedData')
        attributes = list(set(attributes) - set(privileged_data))
        self.attribute_names = [list(inputs)[a] for a in attributes]
        
        # transform dataframe to IBM AIF360 compliant dataset
        #   1. assume data cleaning primitive has been applied so there are no NAs
        #   2. assume categorical columns have been converted to unique numeric values
        #   3. assume the label column is numeric
        self.unfavorable_label = 0. if self.hyperparams['favorable_label'] == 1. else 1.
        self.train_dataset = datasets.BinaryLabelDataset(df = inputs[self.attribute_names + self.label_names],
                                                label_names = self.label_names,
                                                protected_attribute_names = self.protected_attributes,
                                                favorable_label=self.hyperparams['favorable_label'],
                                                unfavorable_label=self.unfavorable_label)

        # apply in-processing algorithm
        self.clf = inprocessing.AdversarialDebiasing(unprivileged_groups = [{self.protected_attributes[0]: self.train_dataset.unprivileged_protected_attributes}],
                                                                privileged_groups = [{self.protected_attributes[0]: self.train_dataset.privileged_protected_attributes}],
                                                                scope_name = 'adversarial_debiasing', sess = tf.Session())

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit primitive using adversarial debiasing algorithm

        Parameters
        ----------
        inputs : None

        Returns
        ----------
        Outputs : None
        """
        
        self.clf = self.clf.fit(self.train_dataset)
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce predictions using fit adversarial debiasing algorithm

        Parameters
        ----------
        inputs : D3M dataframe

        Returns
        ----------
        Outputs : D3M dataframe -> predictions from fit debiasing algorithm
            
        """
        # transform test dataframe to IBM AIF360 compliant dataset
        inputs[self.label_names] = self.train_dataset.convert_to_dataframe()[0][self.label_names].values[:inputs.shape[0]].astype(int)
        test_dataset = datasets.BinaryLabelDataset(df = inputs[self.attribute_names + self.label_names],
                                                label_names = self.label_names,
                                                protected_attribute_names = self.protected_attributes,
                                                favorable_label=self.hyperparams['favorable_label'],
                                                unfavorable_label=self.unfavorable_label)

        transformed_dataset = self.clf.predict(test_dataset)
        
        # transform IBM dataset back to D3M dataset
        df = transformed_dataset.convert_to_dataframe()[0][self.label_names].astype(int)
        df = d3m_DataFrame(pandas.concat([inputs[self.idx].reset_index(drop=True), df.reset_index(drop=True)], axis = 1))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 0), inputs.metadata.query_column(0))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 1), inputs.metadata.query_column(1))
        print(df.head(), file = sys.__stdout__)
        return CallResult(df)
class Hdbscan(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Produce primitive's best guess for the cluster number of each series.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "ca014488-6004-4b54-9403-5920fbe5a834",
        'version':
        __version__,
        'name':
        "hdbscan",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Time Series'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.7',
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers.git@{git_commit}#egg=TimeSeriesD3MWrappers'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.clustering.hdbscan.Hdbscan',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DBSCAN,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        hp_class = TimeSeriesFormatterPrimitive.metadata.query(
        )['primitive_code']['class_type_arguments']['Hyperparams']
        self._hp = hp_class.defaults().replace({
            'file_col_index':
            1,
            'main_resource_index':
            'learningData'
        })

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Parameters
        ----------
        inputs : D3M Dataset containing the time series to cluster (converted internally to a
            dataframe of shape (number_of_time_series, time_series_length))

        Returns
        ----------
        Outputs
            A dataframe with a 'd3mIndex' column and a 'label' column giving each series' cluster number.
        """

        # temporary (until Uncharted adds conversion primitive to repo)
        if not self.hyperparams['long_format']:
            inputs = TimeSeriesFormatterPrimitive(
                hyperparams=self._hp).produce(inputs=inputs).value['0']
        else:
            hyperparams_class = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query(
            )['primitive_code']['class_type_arguments']['Hyperparams']
            ds2df_client = DatasetToDataFrame.DatasetToDataFramePrimitive(
                hyperparams=hyperparams_class.defaults().replace(
                    {"dataframe_resource": "learningData"}))
            inputs = d3m_DataFrame(ds2df_client.produce(inputs=inputs).value)

        # parse values from output of time series formatter
        n_ts = len(inputs.d3mIndex.unique())
        ts_sz = int(inputs.shape[0] / n_ts)
        input_vals = np.array(inputs.value).reshape(n_ts, ts_sz)

        # use HP to produce DBSCAN clustering
        if self.hyperparams['algorithm'] == 'DBSCAN':
            #SimilarityMatrix = cluster.GenerateSimilarityMatrix(input_vals)
            _, labels, _ = cluster.ClusterSimilarityMatrix(
                input_vals, self.hyperparams['eps'],
                self.hyperparams['min_samples'])
        else:
            #SimilarityMatrix = cluster.GenerateSimilarityMatrix(input_vals)
            _, labels, _ = cluster.HClusterSimilarityMatrix(
                input_vals, self.hyperparams['min_cluster_size'],
                self.hyperparams['min_samples'])

        # transform labels for D3M classification task
        labels = [x + 1 if x >= 0 else x + 2 for x in labels]

        # add metadata to output
        labels = pandas.DataFrame(labels)
        out_df = pandas.concat(
            [pandas.DataFrame(inputs.d3mIndex.unique()), labels], axis=1)
        # get column names from metadata
        out_df.columns = ['d3mIndex', 'label']
        hdbscan_df = d3m_DataFrame(out_df)

        # first column ('d3mIndex')
        col_dict = dict(
            hdbscan_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        # confirm that this metadata still exists
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        hdbscan_df.metadata = hdbscan_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column ('labels')
        col_dict = dict(
            hdbscan_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        #index = inputs['0'].metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        #col_dict['name'] = inputs.metadata.query_column(index[0])['name']
        col_dict['name'] = 'label'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
            'https://metadata.datadrivendiscovery.org/types/TrueTarget',
            'https://metadata.datadrivendiscovery.org/types/Target')
        hdbscan_df.metadata = hdbscan_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(hdbscan_df)
Example #13
class simon(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "d2fa8df2-6517-3c26-bafc-87b701c4043a",
        'version':
        __version__,
        'name':
        "simon",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Data Type Predictor'],
        'source': {
            'name':
            __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/simon-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@{git_commit}#egg=SimonD3MWrapper'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.distil.simon',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._decoder = JSONDecoder()
        self._params = {}

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return CallResult(None)

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        pass

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's best guess for the structural type of each input column.
        
        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs
            The outputs is two lists of lists, each has length equal to number of columns in input pandas frame. 
            Each entry of the first one is a list of strings corresponding to each column's multi-label classification.
            Each entry of the second one is a list of floats corresponding to prediction probabilities.
        """
        """ Accept a pandas data frame, predicts column types in it
        frame: a pandas data frame containing the data to be processed
        -> a list of two lists of lists of 1) column labels and then 2) prediction probabilities
        """

        frame = inputs

        try:
            # setup model as you typically would in a Simon main file
            maxlen = 20
            max_cells = 500
            p_threshold = 0.5

            DEBUG = True  # boolean to specify whether or not to print DEBUG information

            checkpoint_dir = "pretrained_models/"

            with open('Categories.txt', 'r') as f:
                Categories = f.read().splitlines()

            # orient the user a bit
            print("fixed categories are: ")
            Categories = sorted(Categories)
            print(Categories)
            category_count = len(Categories)

            execution_config = "Base.pkl"

            # load specified execution configuration
            if execution_config is None:
                raise TypeError
            Classifier = Simon(encoder={})  # dummy text classifier

            config = Classifier.load_config(execution_config, checkpoint_dir)
            encoder = config['encoder']
            checkpoint = config['checkpoint']

            X = encoder.encodeDataFrame(frame)

            # build classifier model
            model = Classifier.generate_model(maxlen, max_cells,
                                              category_count)
            Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
            model_compile = lambda m: m.compile(loss='binary_crossentropy',
                                                optimizer='adam',
                                                metrics=['binary_accuracy'])
            model_compile(model)
            y = model.predict(X)
            # discard empty column edge case
            y[np.all(frame.isnull(), axis=0)] = 0

            out = encoder.reverse_label_encode(y, p_threshold)

            return CallResult(pd.DataFrame.from_records(
                out, columns=['semantic types', 'probabilities']))
        except Exception:
            # Should probably do some more sophisticated error logging here
            return CallResult("Failed predicting data frame")
Example #14
class TimeSeriesLoaderPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    Reads the time series files from a given column in an input dataframe into a new M x N dataframe,
    where each timeseries occupies one of M rows, and each of the row's N entries represents a timestamp.
    The loading process assumes that each series file has an identical set of timestamps.
    """

    _semantic_types = (
        'https://metadata.datadrivendiscovery.org/types/FileName',
        'https://metadata.datadrivendiscovery.org/types/Timeseries')
    _media_types = ('text/csv', )

    __author__ = 'Uncharted Software'
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '1689aafa-16dc-4c55-8ad4-76cadcf46086',
        'version':
        '0.2.0',
        'name':
        'Time series loader',
        'python_path':
        'd3m.primitives.data_preprocessing.timeseries_loader.DistilTimeSeriesLoader',
        'keywords': ['series', 'reader', 'csv'],
        'source': {
            'name':
            'Uncharted Software',
            'contact':
            'mailto:[email protected]',
            'uris':
            ['https://gitlab.com/uncharted-distil/distil-timeseries-loader']
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/uncharted-distil/distil-timeseries-loader.git@'
            + '{git_commit}#egg=DistilTimeSeriesLoader-0.2.0'.format(
                git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        'supported_media_types':
        _media_types,
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _find_csv_file_column(
            cls, inputs_metadata: metadata_base.DataMetadata
    ) -> typing.Optional[int]:
        indices = utils.list_columns_with_semantic_types(
            inputs_metadata, cls._semantic_types)
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int) -> bool:
        # check to see if a given column is a file pointer that points to a csv file
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata['structural_type'] != str:
            return False

        semantic_types = column_metadata.get('semantic_types', [])
        media_types = column_metadata.get('media_types', [])

        return set(cls._semantic_types).issubset(semantic_types) and set(
            cls._media_types).issubset(media_types)

    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) + ' from ' +
                    str(inputs.columns) + ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column from ' + str(inputs.columns) +
                    ' contains csv file names')

        value_index = self.hyperparams['value_col_index']
        time_index = self.hyperparams['time_col_index']

        # load each time series file, transpose, and append
        base_path = inputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0]
        timeseries_dataframe: pd.DataFrame
        for idx, file_path in enumerate(inputs.iloc[:, file_index]):
            csv_path = os.path.join(base_path, file_path)
            timeseries_row = pd.read_csv(csv_path).transpose()
            # use the time values as the column headers
            if idx == 0:
                timeseries_dataframe = pd.DataFrame(
                    columns=timeseries_row.iloc[time_index])

            timeseries_dataframe = timeseries_dataframe.append(
                timeseries_row.iloc[value_index])

        # get the index to use a range of ints rather than the value col name
        timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(container.DataFrame(data=timeseries_dataframe))
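

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original primitive): writes two tiny CSV
# time series to a temporary directory and loads them with the primitive above.
# The file names, directory layout and metadata values are illustrative
# assumptions; the semantic types, media type and 'location_base_uris' entry
# mirror what _is_csv_file_column() and produce() expect. Real D3M datasets
# typically reference the files with file:// URIs instead of plain paths, and
# produce() relies on the pre-2.0 pandas DataFrame.append API.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import os
    import tempfile

    import pandas as pd
    from d3m import container
    from d3m.metadata import base as metadata_base

    tmp_dir = tempfile.mkdtemp()
    for name, values in [('series_0.csv', [1.0, 2.0, 3.0]),
                         ('series_1.csv', [4.0, 5.0, 6.0])]:
        pd.DataFrame({'time': [0, 1, 2], 'value': values}).to_csv(
            os.path.join(tmp_dir, name), index=False)

    files_df = container.DataFrame(
        {'timeseries_file': ['series_0.csv', 'series_1.csv']},
        generate_metadata=True)
    files_df.metadata = files_df.metadata.update(
        (metadata_base.ALL_ELEMENTS, 0),
        {'structural_type': str,
         'semantic_types': TimeSeriesLoaderPrimitive._semantic_types,
         'media_types': TimeSeriesLoaderPrimitive._media_types,
         'location_base_uris': [tmp_dir + os.sep]})

    hyperparams_class = TimeSeriesLoaderPrimitive.metadata.get_hyperparams()
    loader = TimeSeriesLoaderPrimitive(
        hyperparams=hyperparams_class.defaults().replace(
            {'file_col_index': 0, 'time_col_index': 0, 'value_col_index': 1}))
    # one row per series, one column per timestamp (0, 1, 2)
    print(loader.produce(inputs=files_df).value)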
class SSC_ADMM(
        clustering.ClusteringDistanceMatrixMixin[Inputs, Outputs,
                                                 type(None),
                                                 SSC_ADMMHyperparams,
                                                 DistanceMatrixOutput],
        clustering.ClusteringTransformerPrimitiveBase[Inputs, Outputs,
                                                      SSC_ADMMHyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        'id':
        '83083e82-088b-47f4-9c0b-ba29adf5a51d',
        'version':
        "0.0.5",
        'name':
        'SSC_ADMM',
        'description':
        """Does sparse subspace clustering, using the Alternating Direction Method of Multipliers framework for optimization.""",
        'keywords': [
            'clustering', 'subspace', 'sparse',
            'Alternating Direction Method of Multipliers'
        ],
        'source': {
            'name':
            'Michigan',
            'contact':
            'mailto:[email protected]',
            'uris': [
                #link to file and repo
                'https://github.com/dvdmjohnson/d3m_michigan_primitives/blob/master/spider/cluster/ssc_admm/ssc_admm.py',
                'https://github.com/dvdmjohnson/d3m_michigan_primitives'
            ],
            'citation':
            """@article{elhamifar2013sparse,
  title={Sparse subspace clustering: Algorithm, theory, and applications},
  author={Elhamifar, Ehsan and Vidal, Rene},
  journal={IEEE transactions on pattern analysis and machine intelligence},
  volume={35},
  number={11},
  pages={2765--2781},
  year={2013},
  publisher={IEEE}}"""
        },
        'installation': [{
            'type':
            metadata_module.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/dvdmjohnson/d3m_michigan_primitives.git@{git_commit}#egg=spider'
            .format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)))
        }, {
            'type': metadata_module.PrimitiveInstallationType.UBUNTU,
            'package': 'ffmpeg',
            'version': '7:2.8.11-0ubuntu0.16.04.1'
        }],
        'python_path':
        'd3m.primitives.clustering.ssc_admm.Umich',
        'hyperparams_to_tune': ['n_clusters', 'alpha'],
        'algorithm_types':
        [metadata_module.PrimitiveAlgorithmType.SUBSPACE_CLUSTERING],
        'primitive_family':
        metadata_module.PrimitiveFamily.CLUSTERING
    })

    def __init__(
        self,
        *,
        hyperparams: SSC_ADMMHyperparams,
        random_seed: int = 0,
        docker_containers: typing.Dict[str,
                                       base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)
        self._use_affine = hyperparams['use_affine']
        self._use_outliers = hyperparams['use_outliers']
        self._alpha = hyperparams['alpha'] if hyperparams['alpha'] != -1 else (
            20 if self._use_outliers else 800)
        self._epsilon = 0.0002
        self._k = hyperparams['n_clusters']
        self._random_state = np.random.RandomState(random_seed)

    def set_training_data(self, *, inputs: Inputs) -> None:
        pass

    ##  computes regularization parameter lambda to be used in ADMM algorithm
    #   @param Y DxN data matrix
    #   @param P Dx? modified data matrix
    #   @return regularization parameter lambda for ADMM algorithm
    def _compute_lambda(self, Y, P):
        T = P.T * Y
        np.fill_diagonal(T, 0.0)
        T = np.absolute(T)
        l = np.min(np.amax(T, axis=0))
        return l

    ##  shrinkage threshold operator
    #   @param eta number
    #   @param M NumPy matrix
    #   @return NumPy matrix resulting from applying shrinkage threshold operator to each entry of M
    def _shrinkage_threshold(self, eta, M):
        ST = np.matrix(
            np.maximum(np.zeros(M.shape),
                       np.array(np.absolute(M)) - eta) * np.array(np.sign(M)))
        return ST
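
    #   In closed form this is the entrywise soft-thresholding (shrinkage) operator used in
    #   the ADMM updates below:  ST_eta(m) = sign(m) * max(|m| - eta, 0).
    #   Illustrative example (values not from the original source): with eta = 0.5 the
    #   entries (1.2, -0.3, -2.0) map to (0.7, 0.0, -1.5).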

    ##  computes maximum L2-norm error among columns of residual of linear system
    #   @param P DxN NumPy matrix
    #   @param Z NxN NumPy matrix
    #   @return maximum L2-norm of columns of P-P*Z
    def _error_linear_system(self, P, Z):
        R, N = Z.shape
        Y = P[:, :N] if R > N else P
        Y0 = Y - P[:, N:] * Z[N:, :] if R > N else P
        C = Z[:N, :] if R > N else Z
        n = np.linalg.norm(Y0, 2, axis=0)
        S = np.array((Y0 / n) - Y * (C / n))
        err = np.sqrt(np.max(sum(S * S)))
        return err

    ##  computes adjacency matrix given coefficient matrix
    #   @param C NxN coefficient matrix (NumPy matrix)
    #   @return NxN adjacency matrix (NumPy matrix)
    def _build_adjacency_matrix(self, C):
        eps = 2.220446049250313e-16
        N = C.shape[0]
        CAbs = np.absolute(C)
        for i in range(N):
            CAbs[:, i] = CAbs[:, i] / (np.amax(CAbs[:, i]) + eps)
        A = CAbs + np.transpose(CAbs) + eps
        np.fill_diagonal(A, 0.0)
        return A

    ##  spectral clustering algorithm
    #   @param W NxN adjacency matrix (NumPy matrix)
    #   @param n_clusters number of clusters
    #   @param max_iter maximum number of iterations for KMeans
    #   @param n_init number of replications for KMeans
    #   @return labels for N points
    def _spectral_clustering(self, W, n_clusters=10, max_iter=1000, n_init=20):
        N, _ = W.shape
        eps = 2.220446049250313e-16
        DN = np.diag(1 / np.sqrt(np.sum(W, axis=0) + eps))
        LapN = np.identity(N) - np.matmul(np.matmul(DN, W), DN)
        _, _, VN = np.linalg.svd(LapN)
        kerN = VN.T[:, (N - n_clusters):N]
        normN = np.sqrt(np.sum(np.square(kerN), axis=1))
        kerNS = (kerN.T / (normN + eps).T).T
        l = KMeans(n_clusters,
                   n_init=n_init,
                   max_iter=max_iter,
                   random_state=self._random_state).fit(kerNS)
        labels = l.labels_.reshape((N, ))
        return labels

    ##  ADMM algorithm with outliers
    #   @param X DxN NumPy array/matrix representing N points in D-dimensional space
    #   @param use_affine whether or not data points come from union of affine subspaces instead of linear subspaces
    #   @param alpha constant used in calculating updates
    #   @param epsilon termination constant
    #   @param max_iter maximum number of iterations
    #   @return sparse coefficient matrix (NumPy array)
    def _outlier_admm(self,
                      X,
                      use_affine=False,
                      alpha=20.0,
                      epsilon=0.0002,
                      max_iter=200):

        Y = np.matrix(X)
        D, N = Y.shape
        gamma = alpha / np.linalg.norm(Y, 1)
        P = np.concatenate((Y, np.matlib.eye(D) / gamma), axis=1)
        mu1 = alpha / self._compute_lambda(Y, P)
        mu2 = alpha
        C = np.matlib.zeros((N + D, N))

        if not use_affine:

            # initializations
            k = 1
            A = np.linalg.pinv(mu1 * P.T * P + mu2 * np.matlib.eye(N + D))
            Lambda1 = np.matlib.zeros((D, N))
            Lambda2 = np.matlib.zeros((N + D, N))
            err1 = 10.0 * epsilon
            err2 = 10.0 * epsilon

            # main loop
            while k < max_iter and (err1 > epsilon or err2 > epsilon):
                Z = A * (mu1 * P.T * (Y + Lambda1 / mu1) + mu2 *
                         (C - Lambda2 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda1 = Lambda1 + mu1 * (Y - P * Z)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                err1 = np.amax(np.absolute(Z - C))
                err2 = self._error_linear_system(P, Z)
                k += 1

        else:

            # initializations
            k = 1
            delta = np.matrix([[float(i < N)] for i in range(N + D)])
            A = np.linalg.pinv(mu1 * P.T * P + mu2 * np.matlib.eye(N + D) +
                               mu2 * delta * delta.T)
            Lambda1 = np.matlib.zeros((D, N))
            Lambda2 = np.matlib.zeros((N + D, N))
            lambda3 = np.matlib.zeros((1, N))
            err1 = 10.0 * epsilon
            err2 = 10.0 * epsilon
            err3 = 10.0 * epsilon

            # main loop
            while k < max_iter and (err1 > epsilon or err2 > epsilon
                                    or err3 > epsilon):
                Z = A * (mu1 * P.T * (Y + Lambda1 / mu1) + mu2 *
                         (C - Lambda2 / mu2) + mu2 * delta *
                         (1.0 - lambda3 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda1 = Lambda1 + mu1 * (Y - P * Z)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                lambda3 = lambda3 + mu2 * (delta.T * Z - 1.0)
                err1 = np.amax(np.absolute(Z - C))
                err2 = self._error_linear_system(P, Z)
                err3 = np.amax(np.absolute(delta.T * Z - 1.0))
                k += 1

        C = np.array(C[:N, :])
        return C

    ##  ADMM algorithm without outliers
    #   @param X DxN NumPy array/matrix representing N points in D-dimensional space
    #   @param use_affine whether or not data points come from union of affine subspaces instead of linear subspaces
    #   @param alpha constant used in calculating updates
    #   @param epsilon termination constant
    #   @param max_iter maximum number of iterations
    #   @return sparse coefficient matrix (NumPy array)
    def _lasso_admm(self,
                    X,
                    use_affine=False,
                    alpha=800.0,
                    epsilon=0.0002,
                    max_iter=200):

        Y = np.matrix(X)
        N = Y.shape[1]
        mu1 = alpha / self._compute_lambda(Y, Y)
        mu2 = alpha
        C = np.matlib.zeros((N, N))

        if not use_affine:

            # initializations
            k = 1
            A = np.linalg.pinv(mu1 * Y.T * Y + mu2 * np.matlib.eye(N))
            Lambda2 = np.matlib.zeros((N, N))
            err1 = 10.0 * epsilon

            # main loop
            while k < max_iter and err1 > epsilon:
                Z = A * (mu1 * Y.T * Y + mu2 * (C - Lambda2 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                err1 = np.amax(np.absolute(Z - C))
                k += 1

        else:

            # initializations
            k = 1
            A = np.linalg.pinv(mu1 * Y.T * Y + mu2 * np.matlib.eye(N) + mu2)
            Lambda2 = np.matlib.zeros((N, N))
            lambda3 = np.matlib.zeros((1, N))
            err1 = 10.0 * epsilon
            err3 = 10.0 * epsilon

            # main loop
            while k < max_iter and (err1 > epsilon or err3 > epsilon):
                Z = A * (mu1 * Y.T * Y + mu2 *
                         (C - Lambda2 / mu2) + mu2 * np.matlib.ones(
                             (N, 1)) * (1.0 - lambda3 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                lambda3 = lambda3 + mu2 * (np.matlib.ones((1, N)) * Z - 1.0)
                err1 = np.amax(np.absolute(Z - C))
                err3 = np.amax(np.absolute(np.matlib.ones((1, N)) * Z - 1.0))
                k += 1

        C = np.array(C)
        return C

    ##  computes sparse coefficient matrix using SSC algorithm with ADMM
    #   @param X NxD NumPy array/matrix representing N points in D-dimensional space
    #   @return sparse coefficient matrix (NumPy array)
    def _compute_sparse_coefficient_matrix(self, X, max_iter):
        XX = np.transpose(X)
        a = self._alpha
        C = self._outlier_admm(
            XX, self._use_affine, a, self._epsilon,
            max_iter) if self._use_outliers else self._lasso_admm(
                XX, self._use_affine, a, self._epsilon, max_iter)
        return C

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        assert inputs.ndim == 2, "Inputs are not in the right shape"

        if iterations is None or iterations < 5:
            iterations = 200

        C = self._compute_sparse_coefficient_matrix(inputs, iterations)
        W = self._build_adjacency_matrix(C)
        labels = self._spectral_clustering(W, self._k)
        labels = np.array(labels)

        return base.CallResult(Outputs(labels))

    def produce_distance_matrix(self,
                                *,
                                inputs: Inputs,
                                timeout: float = None,
                                iterations: int = None
                                ) -> base.CallResult[DistanceMatrixOutput]:
        """
            Returns 1 - the affinity matrix generated from the subspace-transformed data
        """
        assert inputs.ndim == 2, "Inputs are not in the right shape"

        if iterations is None or iterations < 5:
            iterations = 200

        C = self._compute_sparse_coefficient_matrix(inputs, iterations)
        W = self._build_adjacency_matrix(C)

        return base.CallResult(DistanceMatrixOutput(1 - W))

    def __getstate__(self) -> dict:
        return {
            'constructor': {
                'hyperparams': self.hyperparams,
                'random_seed': self.random_seed,
                'docker_containers': self.docker_containers,
            },
            'random_state': self._random_state,
        }

    def __setstate__(self, state: dict) -> None:
        self.__init__(**state['constructor'])  # type: ignore
        self._random_state = state['random_state']

    #placeholder for now, just calls base version.
    @classmethod
    def can_accept(
        cls, *, method_name: str,
        arguments: typing.Dict[str, typing.Union[metadata_module.Metadata,
                                                 type]],
        hyperparams: SSC_ADMMHyperparams
    ) -> typing.Optional[metadata_module.DataMetadata]:
        return super().can_accept(method_name=method_name,
                                  arguments=arguments,
                                  hyperparams=hyperparams)
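

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original primitive): clusters points
# drawn from two random low-dimensional linear subspaces. The data generation
# and hyperparameter values are illustrative assumptions; the hyperparameter
# names are the ones referenced in __init__() above, and produce() expects an
# N x D array of N points in D-dimensional space.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np
    from d3m import container

    rng = np.random.RandomState(0)
    dim, sub_dim, n_per_cluster = 30, 3, 40
    # each cluster lies in its own random 3-dimensional subspace of R^30
    points = np.vstack([
        rng.randn(n_per_cluster, sub_dim) @ rng.randn(sub_dim, dim)
        for _ in range(2)])

    hyperparams = SSC_ADMMHyperparams.defaults().replace({
        'n_clusters': 2,
        'use_affine': False,
        'use_outliers': False,
        'alpha': -1,  # -1 lets __init__ pick the default alpha for the chosen mode
    })
    ssc = SSC_ADMM(hyperparams=hyperparams, random_seed=0)
    labels = ssc.produce(inputs=container.ndarray(points)).value
    print(labels)  # one cluster label per input point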
Example #16
class FailPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs,
                                                         Hyperparams]):
    """
    A primitive which fails on the requested method (given as hyper-parameter).

    Moreover, the primitive does not correctly preserve state, so if you pickle
    and unpickle it, it does not see itself as fitted anymore.
    """

    metadata: typing.ClassVar[
        metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({
            'id':
            'd6dfbefa-0fb8-11e9-ab14-d663bd873d93',
            'version':
            __version__,
            'name':
            "Failure Tester",
            'keywords': ['test primitive'],
            'source': {
                'name':
                __author__,
                'contact':
                'mailto:[email protected]',
                'uris': [
                    'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/fail.py',
                    'https://gitlab.com/datadrivendiscovery/tests-data.git',
                ],
            },
            'installation': [{
                'type':
                metadata_base.PrimitiveInstallationType.PIP,
                'package_uri':
                'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            }],
            'location_uris': [
                'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/fail.py'
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            ],
            'python_path':
            'd3m.primitives.operator.null.FailTest',
            'algorithm_types': [
                metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION,
            ],
            'primitive_family':
            metadata_base.PrimitiveFamily.OPERATOR,
        })

    def __init__(self, *, hyperparams: Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)
        self._conditional_fail('__init__')
        self._fitted = False

    def _conditional_fail(self, method_name: str) -> None:
        if self.hyperparams['method_to_fail'] == method_name:
            raise IntentionalError(self.__class__.__name__, method_name)

    def set_training_data(self) -> None:  # type: ignore
        self._conditional_fail('set_training_data')
        self._fitted = False
        super().set_training_data()

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        self._conditional_fail('fit')
        self._fitted = True
        return super().fit(timeout=timeout, iterations=iterations)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        self._conditional_fail('produce')
        if not self._fitted:
            raise exceptions.PrimitiveNotFittedError(
                "Primitive is not fitted.")
        return base.CallResult(inputs)
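

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original primitive): configures the test
# primitive above to fail in produce() and shows the failure being raised. The
# inputs value is an arbitrary placeholder, since produce() raises before
# touching it; 'method_to_fail' is the hyper-parameter checked in
# _conditional_fail() above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    hyperparams_class = FailPrimitive.metadata.get_hyperparams()
    primitive = FailPrimitive(
        hyperparams=hyperparams_class.defaults().replace({'method_to_fail': 'produce'}))
    primitive.set_training_data()
    primitive.fit()
    try:
        primitive.produce(inputs=None)
    except Exception as error:  # IntentionalError is defined alongside the primitive
        print(error)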
class DistilSeededGraphMatchingPrimitive(
    PrimitiveBase[container.List, container.DataFrame, Params, Hyperparams]
):
    """
    A primitive that matches seeded graphs.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "8baea8e6-9d3a-46d7-acf1-04fd593dcd37",
            "version": version.__version__,
            "name": "SeededGraphMatcher",
            "python_path": "d3m.primitives.graph_matching.seeded_graph_matching.DistilSeededGraphMatcher",
            "source": {
                "name": "Distil",
                "contact": "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/seeded_graph_matching.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.GRAPH_MATCHING,
        },
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = SGMGraphMatcher(target_metric="accuracy")

    def set_training_data(
        self, *, inputs: container.List, outputs: container.DataFrame
    ) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._target_col = outputs.columns[0]

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        logger.debug(f"Fitting {__name__}")

        X_train, y_train, U_train = self._inputs
        X_train = X_train.value
        self._model.fit(X_train, y_train, U_train)

        return CallResult(None)

    def produce(
        self, *, inputs: container.List, timeout: float = None, iterations: int = None
    ) -> CallResult[container.DataFrame]:
        logger.debug(f"Producing {__name__}")

        X_train, _, _ = inputs
        X_train = X_train.value
        result = self._model.predict(X_train).astype(int)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame(
            {X_train.index.name: X_train.index, self._target_col: result}
        )

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        return base.CallResult(result_df)

    def get_params(self) -> Params:
        return Params(model=self._model, target_col=self._target_col)

    def set_params(self, *, params: Params) -> None:
        self._model = params["model"]
        self._target_col = params["target_col"]
        return
class NonParametricClustering(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '2e3cda2b-ce4a-39ae-ae02-22dc33affd17',
        'version': "0.1.0",
        'name': "jhu.nonpar",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.NonParametricClustering',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['nonparametric'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/jhu_primitives/nonpar/nonpar.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            "HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"
        ],
        'primitive_family': "DATA_TRANSFORMATION"
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Non-parametric clustering

        **Positional Arguments:**

        xhat1:
            - A numpy.ndarray type "matrix"
        xhat2:
            - A numpy.ndarray type "matrix"

        **Optional Arguments:**

        sigma:
            - a sigma for the Gaussian kernel
        """

        #xhat1 = inputs[0,:,:]
        #xhat2 = inputs[1,:,:]

        xhat1 = inputs[0]
        xhat2 = inputs[1]

        sigma = self.hyperparams['sigma']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                "nonpar.interface.R")

        cmd = """
        source("%s")
        fn <- function(xhat1, xhat2, sigma) {
            nonpar.interface(xhat1, xhat2, sigma)
        }
        """ % path

        result =  np.array(robjects.r(cmd)(xhat1, xhat2, sigma))

        outputs = container.ndarray(result)

        return base.CallResult(outputs)
Example #19
class TextEncoderPrimitive(base.PrimitiveBase[Inputs, Outputs, Params,
                                              Hyperparams]):
    """
    Encodes string fields using TFIDF scoring combined with a linear SVC classifier.  The original string field is removed
    and replaced with encoding columns.
    """

    _attribute_semantic = "https://metadata.datadrivendiscovery.org/types/Attribute"
    metadata = metadata_base.PrimitiveMetadata(
        {
            "id":
            "09f252eb-215d-4e0b-9a60-fcd967f5e708",
            "version":
            version.__version__,
            "name":
            "Text encoder",
            "python_path":
            "d3m.primitives.data_transformation.encoder.DistilTextEncoder",
            "source": {
                "name":
                "Distil",
                "contact":
                "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/text_encoder.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type":
                    metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri":
                    "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives"
                    .format(git_commit=utils.current_git_commit(
                        os.path.dirname(__file__)), ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ENCODE_BINARY,
            ],
            "primitive_family":
            metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        }, )

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._encoders: List[SVMTextEncoder] = []
        self._cols: List[int] = []

    def __getstate__(self) -> dict:
        state = base.PrimitiveBase.__getstate__(self)
        state["models"] = self._encoders
        state["columns"] = self._cols
        return state

    def __setstate__(self, state: dict) -> None:
        base.PrimitiveBase.__setstate__(self, state)
        self._encoders = state["models"]
        self._cols = state["columns"]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._inputs = inputs

        # https://github.com/scikit-learn/scikit-learn/issues/14429#issuecomment-513887163
        if type(outputs) == container.pandas.DataFrame and outputs.shape[1] == 1:
            outputs = outputs.values.reshape(outputs.shape[0], )
        else:
            outputs = outputs.iloc[:, 0].values
        self._outputs = pd.Series(outputs)

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        logger.debug(f"Fitting {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            self._inputs, self.hyperparams["use_columns"],
            ("http://schema.org/Text", ))

        logger.debug(f"Found {len(cols)} columns to encode")

        self._cols = list(cols)
        self._encoders: List[SVMTextEncoder] = []
        if len(cols) == 0:
            return base.CallResult(None)

        for i, c in enumerate(self._cols):
            if self.hyperparams["encoder_type"] == "svm":
                self._encoders.append(
                    SVMTextEncoder(self.hyperparams["metric"],
                                   self.random_seed))
            elif self.hyperparams["encoder_type"] == "tfidf":
                self._encoders.append(TfidifEncoder())
            else:
                raise Exception(
                    f"{self.hyperparams['encoder_type']} is not a valid encoder type"
                )
            text_inputs = self._inputs.iloc[:, c]
            try:
                self._encoders[i].fit_transform(
                    text_inputs, self._outputs
                )  # requires fit transform to fit SVM on vectorizer results
            except Exception:
                text_inputs[:] = "avoiding a bug"
                self._encoders[i].fit_transform(text_inputs, self._outputs)

        return base.CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        outputs = inputs.copy()
        encoded_cols = container.DataFrame()
        encoded_cols_source = []
        # encode columns into a new dataframe
        for i, c in enumerate(self._cols):
            text_inputs = outputs.iloc[:, c]
            result = self._encoders[i].transform(text_inputs)
            for j in range(result.shape[1]):
                encoded_idx = i * result.shape[1] + j
                encoded_cols[(f"__text_{encoded_idx}")] = result[:, j]
                encoded_cols_source.append(c)
        # generate metadata for encoded columns
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)
        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic)
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            col_dict["source_column"] = outputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, encoded_cols_source[c]))["name"]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict)

        # append the encoded columns and remove the source columns
        outputs = outputs.append_columns(encoded_cols)
        outputs = outputs.remove_columns(self._cols)

        logger.debug(f"\n{outputs}")

        return base.CallResult(outputs)

    def get_params(self) -> Params:
        return Params(_encoders=self._encoders, _cols=self._cols)

    def set_params(self, *, params: Params) -> None:
        self._encoders = params["_encoders"]
        self._cols = params["_cols"]
Exemple #20
class StatisticalMeanAbsTemporalDerivativePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Primitive to find mean_abs_temporal_derivative of time series
    """
    __author__ = "DATA Lab at Texas A&M University",
    metadata = metadata_base.PrimitiveMetadata(
        {
            'id': 'eb571238-6229-4fe4-94b3-684f043e4dbf',
            'version': '0.1.0',
            'name': 'Time Series Decompostional',
            'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
            'keywords': ['Time Series','MeanAbsTemporalDerivative'],
            "hyperparams_to_tune": ['window_size'],
            'source': {
                'name': 'DATA Lab at Texas A&M University',
                'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py'],
                'contact': 'mailto:[email protected]'

            },
            'installation': [
                {'type': metadata_base.PrimitiveInstallationType.PIP,
                 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format(
                     git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
                 ),
                 }

            ],
            'algorithm_types': [
                metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
            ],
            'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,

        }
    )

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """

        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame containing mean_abs_temporal_derivative of  time series
        """
        self.logger.info('Statistical MeanAbsTemporalDerivative  Primitive called')

        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")
        statistical_mean_abs_temporal_derivative_input = inputs
        if self.hyperparams['use_semantic_types']:
            statistical_mean_abs_temporal_derivative_input = inputs.iloc[:, self._training_indices]
        output_columns = []
        if len(self._training_indices) > 0:
            statistical_mean_abs_temporal_derivative_output = self._mean_abs_temporal_derivative(statistical_mean_abs_temporal_derivative_input,self.hyperparams["window_size"])

            if sparse.issparse(statistical_mean_abs_temporal_derivative_output):
                statistical_mean_abs_temporal_derivative_output = statistical_mean_abs_temporal_derivative_output.toarray()
            outputs = self._wrap_predictions(inputs, statistical_mean_abs_temporal_derivative_output)

            #if len(outputs.columns) == len(self._input_column_names):
               # outputs.columns = self._input_column_names

            output_columns = [outputs]


        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)

        self.logger.info('Statistical MeanAbsTemporalDerivative  Primitive returned')

        return base.CallResult(outputs)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            list
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        use_columns = hyperparams['use_columns']
        exclude_columns = hyperparams['exclude_columns']

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=use_columns,
                                                                                   exclude_columns=exclude_columns,
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)

        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            # column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)

            # column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs: Inputs):
        inputs.to_csv(str(time.time()) + '.csv')


    def _mean_abs_temporal_derivative(self,X,window_size):
        """ statistical mean_abs_temporal_derivative of time series sequence
           Args:
            X : DataFrame
               Time series.
        Returns:
            DataFrame
            A object with mean_abs_temporal_derivative
        """
        if(window_size==-1):
            window_size = len(X)
        transformed_X = utils.pandas.DataFrame()
        for column in X.columns:
            column_value = X[column].values
            column_mean_abs_temporal_derivative = np.zeros(len(column_value))
            for iter in range(window_size-1,len(column_value)):
                sequence = column_value[iter-window_size+1:iter+1]
                column_mean_abs_temporal_derivative[iter] = np.mean(np.abs(np.diff(sequence)))
            column_mean_abs_temporal_derivative[:window_size-1] = column_mean_abs_temporal_derivative[window_size-1]
            transformed_X[column + "_mean_abs_temporal_derivative"] = column_mean_abs_temporal_derivative

        return transformed_X
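
# --- Hedged sketch (not part of the original primitive) ----------------------
# The same rolling statistic as _mean_abs_temporal_derivative above, written
# with pandas rolling windows: for each window of window_size values, take the
# mean of the absolute first differences, then backfill the initial positions
# the way the primitive does.
import numpy as np
import pandas as pd

def mean_abs_temporal_derivative(X: pd.DataFrame, window_size: int) -> pd.DataFrame:
    if window_size == -1:
        window_size = len(X)
    out = pd.DataFrame(index=X.index)
    for column in X.columns:
        stat = (X[column]
                .rolling(window_size)
                .apply(lambda w: np.mean(np.abs(np.diff(w))), raw=True))
        out[column + "_mean_abs_temporal_derivative"] = stat.bfill()
    return out
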
Exemple #21
class NBEATSPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
    This primitive applies the Neural basis expansion analysis for interpretable time
    series forecasting (NBEATS) method. The implementation is based on
    this paper: https://arxiv.org/abs/1905.10437 and this repository: https://gluon-ts.mxnet.io/index.html
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "3952a074-145e-406d-9cee-80232ae8f3ae",
        "version":
        __version__,
        "name":
        "NBEATS",
        "keywords": [
            "time series",
            "forecasting",
            "deep neural network",
            "fully-connected",
            "residual network",
            "interpretable",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path":
        "d3m.primitives.time_series_forecasting.feed_forward_neural_net.NBEATS",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.DEEP_NEURAL_NETWORK,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
        "can_use_gpus":
        True,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._freq = None
        self._is_fit = False
        self.preds = None

    def get_params(self) -> Params:
        return Params(
            nbeats_dataset=self._nbeats_dataset,
            timestamp_column=self._timestamp_column,
            group_cols=self._grouping_columns,
            output_column=self._output_column,
            target_column=self._target_column,
            freq=self._freq,
            reind_freq=self._reind_freq,
            is_fit=self._is_fit,
            min_trains=self._min_trains,
        )

    def set_params(self, *, params: Params) -> None:
        self._nbeats_dataset = params["nbeats_dataset"]
        self._timestamp_column = params["timestamp_column"]
        self._grouping_columns = params["group_cols"]
        self._output_column = params["output_column"]
        self._target_column = params["target_column"]
        self._freq = params["freq"]
        self._reind_freq = params["reind_freq"]
        self._is_fit = params["is_fit"]
        self._min_trains = params["min_trains"]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets

        Raises:
            ValueError: If multiple columns are annotated with 'Time' or 'DateTime' metadata
        """

        self._output_column = outputs.columns[0]

        frame = inputs.append_columns(outputs)
        self._get_cols(frame)
        self._set_freq(frame)
        frame, self._min_trains, max_train_length, _ = self._reindex(frame)
        self._check_window_support(max_train_length)

        self._nbeats_dataset = NBEATSDataset(
            frame,
            self._grouping_columns,
            self._timestamp_column,
            self._target_column,
            self._freq,
            self.hyperparams["prediction_length"],
            self.hyperparams["num_context_lengths"],
        )
        self._train_data = self._nbeats_dataset.get_data()

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits NBEATS model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, considered (default: {None})
            iterations {int} -- iterations, considered (default: {None})

        Returns:
            CallResult[None]
        """

        if iterations is None:
            iterations = self.hyperparams["epochs"]
            has_finished = True
        else:
            has_finished = False

        if self.hyperparams["interpretable"]:
            num_stacks = 2
            num_blocks = [1]
            widths = [256, 2048]
            sharing = [True]
            expansion_coefficient_lengths = [3]
            stack_types = ["T", "S"]
            estimator_class = NBEATSEnsembleEstimatorHook
        else:
            num_stacks = 30
            num_blocks = [3]
            widths = [512]
            sharing = [False]
            expansion_coefficient_lengths = [32]
            stack_types = ["G"]
            estimator_class = NBEATSEnsembleEstimator

        estimator = estimator_class(
            freq=self._freq,
            prediction_length=self.hyperparams["prediction_length"],
            meta_context_length=[
                i
                for i in range(2, self.hyperparams["num_context_lengths"] + 2)
            ],
            meta_loss_function=["sMAPE", "MASE", "MAPE"],
            meta_bagging_size=self.hyperparams["num_estimators"],
            num_stacks=num_stacks,
            num_blocks=num_blocks,
            widths=widths,
            sharing=sharing,
            expansion_coefficient_lengths=expansion_coefficient_lengths,
            stack_types=stack_types,
            trainer=Trainer(
                epochs=iterations,
                learning_rate=self.hyperparams["learning_rate"],
                batch_size=self.hyperparams["training_batch_size"],
                num_batches_per_epoch=self.hyperparams["steps_per_epoch"],
            ),
        )

        logger.info(f"Fitting for {iterations} iterations")
        start_time = time.time()
        predictor = estimator.train(self._train_data)
        predictor.batch_size = self.hyperparams["inference_batch_size"]
        predictor.set_aggregation_method("none")
        self._is_fit = True
        logger.info(
            f"Fit for {iterations} epochs, took {time.time() - start_time}s")

        if not os.path.isdir(self.hyperparams["weights_dir"]):
            os.mkdir(self.hyperparams["weights_dir"])
        predictor.serialize(Path(self.hyperparams["weights_dir"]))

        return CallResult(None, has_finished=has_finished)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's predictions for specific time series at specific future time instances
        * these specific timesteps / series are specified implicitly by input dataset

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- (N, 2) dataframe with d3m_index and value for each prediction slice requested.
                prediction slice = specific horizon idx for specific series in specific regression
        """
        all_preds, pred_intervals = self._produce(inputs)

        if self.hyperparams["interpretable"]:
            all_components = [[] for c in range(3)]
            for series, idxs in zip(all_preds, pred_intervals):
                for i, component in enumerate(series):
                    all_components[i].append(component[idxs])
            all_components = [
                np.concatenate(component) for component in all_components
            ]

            col_names = (
                self._output_column,
                "trend-component",
                "seasonality-component",
            )
            df_data = {
                col_name: component
                for col_name, component in zip(col_names, all_components)
            }

        else:
            point_estimates = np.concatenate([
                series[0][idxs]
                for series, idxs in zip(all_preds, pred_intervals)
            ])
            df_data = {self._output_column: point_estimates}

        result_df = container.DataFrame(
            df_data,
            generate_metadata=True,
        )

        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )
        return CallResult(result_df, has_finished=self._is_fit)

    def _get_col_names(self, col_idxs, all_col_names):
        """ transform column indices to column names """
        return [all_col_names[i] for i in col_idxs]

    def _process_special_col(self, col_list, col_type):
        """private util function that warns if multiple special columns"""

        if len(col_list) == 0:
            return None
        elif len(col_list) > 1:
            logger.warning(
                f"There is more than one {col_type} marked. This primitive will use the first."
            )
        return col_list[0]

    def _sort_by_timestamp(self, frame):
        """private util function: convert to pd datetime and sort"""

        time_name = frame.columns[self._timestamp_column]
        new_frame = frame.copy()

        if "http://schema.org/Integer" in frame.metadata.query_column_field(
                self._timestamp_column, "semantic_types"):
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column] - 1, unit="D")
            self._freq = "D"
            self._reind_freq = "D"
        else:
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column], unit="s")
        return new_frame.sort_values(by=time_name)

    def _set_freq(self, frame):
        """sets frequency using differences in timestamp column in data frame
        ASSUMPTION: frequency is the same across all grouped time series
        """

        if len(self._grouping_columns) == 0:
            if self._freq is None:
                diff = (frame.iloc[1, self._timestamp_column] -
                        frame.iloc[0, self._timestamp_column])
                self._freq, self._reind_freq = calculate_time_frequency(
                    diff, model="gluon")
        else:
            if self._freq is None:
                g_cols = self._get_col_names(self._grouping_columns,
                                             frame.columns)
                for g, df in frame.groupby(g_cols, sort=False):
                    diff = (df.iloc[1, self._timestamp_column] -
                            df.iloc[0, self._timestamp_column])
                    break
                self._freq, self._reind_freq = calculate_time_frequency(
                    diff, model="gluon")

    def _robust_reindex(self, frame):
        """ reindex dataframe IFF it has > 1 row, interpolate target column """

        frame = self._sort_by_timestamp(frame)
        original_times = frame.iloc[:, self._timestamp_column]
        frame = frame.drop_duplicates(
            subset=frame.columns[self._timestamp_column])
        frame.index = frame.iloc[:, self._timestamp_column]
        if frame.shape[0] > 1:
            frame = frame.reindex(
                pd.date_range(
                    frame.index[0],
                    frame.index[-1],
                    freq=self._reind_freq,
                ))

        # only interpolate when target exists during training
        if self._target_column < frame.shape[1]:
            frame.iloc[:, self._target_column] = frame.iloc[:, self._target_column].interpolate()
        frame.iloc[:, self._grouping_columns] = frame.iloc[:, self._grouping_columns].ffill()

        return frame, original_times

    def _reindex(self, frame):
        """reindex data, interpolating target columns"""

        if len(self._grouping_columns) == 0:
            df, original_times = self._robust_reindex(frame)
            return df, [df.index[0]], df.shape[0], original_times
        else:
            all_dfs, min_trains, original_times = [], {}, OrderedDict()
            max_train_length = 0
            g_cols = self._get_col_names(self._grouping_columns, frame.columns)
            for grp, df in frame.groupby(g_cols, sort=False):
                df, orig_times = self._robust_reindex(df)
                if df.shape[0] > max_train_length:
                    max_train_length = df.shape[0]
                all_dfs.append(df)
                min_trains[grp] = df.index[0]
                original_times[grp] = orig_times
            return pd.concat(all_dfs), min_trains, max_train_length, original_times

    def _get_cols(self, frame):
        """private util function: get indices of important columns from metadata"""

        input_metadata = frame.metadata

        # get target idx (first column by default)
        target_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/Target",
        ))
        if len(target_columns) == 0:
            raise ValueError("At least one column must be marked as a target")

        self._target_column = self._process_special_col(
            target_columns, "target column")

        # get timestamp idx (first column by default)
        timestamp_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/DateTime",
        ))
        self._timestamp_column = self._process_special_col(
            timestamp_columns, "timestamp column")

        # get grouping idx
        self._grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        suggested_group_cols = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",
        ))
        if len(self._grouping_columns) == 0:
            self._grouping_columns = suggested_group_cols

    def _check_window_support(self, max_train_length):
        """ ensures that at least one series of target series is >= context_length """

        if max_train_length < self.hyperparams["prediction_length"]:
            raise ValueError(
                f"This training set does not support a prediction length of {self.hyperparams['prediction_length']} "
                + f"because its longest series has only {max_train_length} observations. Please "
                + "choose a shorter prediction length.")

    def _get_pred_intervals(self, original_times):
        """private util function that retrieves unevenly spaced prediction intervals from data frame"""

        if len(self._grouping_columns) == 0:
            intervals = discretize_time_difference(original_times,
                                                   self._min_trains[0],
                                                   self._freq,
                                                   zero_index=True)
            all_intervals = [np.array(intervals) + 1]
        else:
            all_intervals = []
            for grp, times in original_times.items():
                if grp in self._min_trains.keys():
                    intervals = discretize_time_difference(
                        times,
                        self._min_trains[grp],
                        self._freq,
                        zero_index=True)
                else:
                    logger.info(
                        f"Series with category {grp} did not exist in training data; "
                        + "these predictions will be returned as np.nan.")
                    intervals = np.zeros(times.shape[0]).astype(int)
                all_intervals.append(np.array(intervals) + 1)
        return all_intervals

    def _produce(self, inputs: Inputs):
        """ internal produce method to support produce() and produce_confidence_intervals() methods """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        test_frame = inputs.copy()
        nbeats_forecast = NBEATSForecast(
            self._nbeats_dataset,
            self.hyperparams["weights_dir"],
            self.hyperparams["interpretable"],
            self.hyperparams["output_mean"],
            self.hyperparams["nan_padding"],
        )
        test_frame, _, _, original_times = self._reindex(test_frame)
        pred_intervals = self._get_pred_intervals(original_times)

        st = time.time()
        preds = nbeats_forecast.predict(test_frame, pred_intervals)
        logger.info(f"Making predictions took {time.time() - st}s")
        return preds, pred_intervals
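
# --- Hedged sketch (not part of the original primitive) ----------------------
# The core of _robust_reindex above shown on a plain pandas DataFrame for one
# group: sort by time, drop duplicate timestamps, reindex onto a regular date
# range at the series frequency, and interpolate the target over the gaps.
# The primitive infers the frequency via calculate_time_frequency; "D" here is
# only an example value.
import pandas as pd

def reindex_and_interpolate(frame: pd.DataFrame, time_col: str, target_col: str,
                            freq: str = "D") -> pd.DataFrame:
    frame = frame.sort_values(by=time_col).drop_duplicates(subset=time_col)
    frame = frame.set_index(pd.to_datetime(frame[time_col]))
    full_index = pd.date_range(frame.index[0], frame.index[-1], freq=freq)
    frame = frame.reindex(full_index)
    frame[target_col] = frame[target_col].interpolate()
    return frame
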
class SignalFramer(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    BBN D3M Signal Framing Primitive divides the audio signal into number of frames.
    Input: List of arrays with samples of shape [ num_samples ]
    Output: List of arrays with frames of shape [ num_frames, frame_length ]
    Applications include: audio, time-series classification
    """

    __git_commit__=utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id': '4d7160ef-ca70-4150-b513-36b90817ba45',
        'version': __version__,
        'name': "Signal Framing",
        'description': """BBN D3M Signal Framing Primitive divides the audio signal into number of frames.\n
			Input: List of arrays with samples of shape [ num_samples ]\n
			Output: List of arrays with frames of shape [ num_frames, frame_length ]\n
			Applications include: audio, time-series classification""",
        'keywords': [],
        'source': {
            'name': __author__,
            'contact':'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/signal_framing.py'.format(
                    git_commit=__git_commit__
                ),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'.format(
                git_commit=__git_commit__, egg='bbn_primitives'
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.time_series_segmentation.signal_framer.SignalFramer',#'d3m.primitives.bbn.time_series.SignalFramer', #'d3m.primitives.time_series_segmentation.signal_framer.BBN',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [metadata_module.PrimitiveAlgorithmType.UNIFORM_TIME_SERIES_SEGMENTATION],
        'primitive_family': metadata_module.PrimitiveFamily.TIME_SERIES_SEGMENTATION,
    })

    def __init__(
        self, *, hyperparams: Hyperparams, random_seed: int = 0,
        docker_containers: typing.Dict[str, DockerContainer] = None
    ) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        return

    def _frame_length(self, sampling_rate: float) -> int:
        return int(self.hyperparams['frame_length_s'] * sampling_rate)

    def _frame_shift(self, sampling_rate: float) -> int:
        return max(int(self.hyperparams['frame_shift_s'] * sampling_rate), 1)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Arguments:
            - inputs: [ num_samples ]

        Returns:
            - [ num_windows, window_len ]
        """
        with stopit.ThreadingTimeout(timeout) as timer:
            outputs = Outputs()
            metadata = inputs.metadata.clear({
                'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                'structural_type': Outputs,
                'dimension': {
                    'length': len(outputs)
                }
            }, for_value=outputs).update((metadata_module.ALL_ELEMENTS,), {
                'structural_type': d3m_ndarray,
            })

            for input_id in range(len(inputs)):
                cinput = inputs[input_id]
                # TODO: review the following because it's hacky
                # It was done in the way to enable handling both audio (high sampling_rate) and frames
                sampling_rate = inputs.metadata.query((input_id,))['sampling_rate'] if 'sampling_rate' in inputs.metadata.query((input_id,)) else 1
                frame_length = self._frame_length(sampling_rate)
                frame_shift = self._frame_shift(sampling_rate)

                if cinput.size == 0:
                    outputs.append(d3m_ndarray(np.array([]), generate_metadata=False))
                    continue

                if cinput.shape[0] <= frame_length:
                    if len(cinput.shape) <= 2:
                        cinput = np.concatenate((cinput,
                                   #np.matlib.repmat(cinput[-1], frame_length-cinput.shape[0], 1)
                                   np.zeros((frame_length-cinput.shape[0],)+cinput.shape[1:], dtype=cinput.dtype)
                                 ))
                shape = ((cinput.shape[0] - frame_length) // frame_shift + 1,
                        frame_length) + cinput.shape[1:]
                strides = (cinput.strides[0]*frame_shift,cinput.strides[0]) + cinput.strides[1:]
                coutput = np.lib.stride_tricks.as_strided(cinput, shape=shape, strides=strides)
                outputs.append(d3m_ndarray(
                    coutput.flatten() if self.hyperparams['flatten_output'] else coutput,
                    generate_metadata=False))

                if 'sampling_rate' in inputs.metadata.query((input_id,)):
                    metadata = metadata.update((input_id,), { 'sampling_rate': inputs.metadata.query((input_id,))['sampling_rate'] })

            #metadata = metadata.update((), { 'dimension': { 'length': len(outputs) } })
            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state == timer.EXECUTED:
            return CallResult(outputs)
        else:
            raise TimeoutError('SignalFramer exceeded time limit')
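
# --- Hedged sketch (not part of the original primitive) ----------------------
# Equivalent framing of a 1-D sample array using numpy's sliding_window_view
# (NumPy >= 1.20) instead of the manual as_strided arithmetic in produce()
# above: take every frame_shift-th window of length frame_length.
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

def frame_signal(samples: np.ndarray, frame_length: int, frame_shift: int) -> np.ndarray:
    if samples.shape[0] < frame_length:
        # zero-pad short signals up to one full frame, as the primitive does
        pad = frame_length - samples.shape[0]
        samples = np.concatenate((samples, np.zeros(pad, dtype=samples.dtype)))
    windows = sliding_window_view(samples, frame_length, axis=0)
    return windows[::frame_shift]

frames = frame_signal(np.arange(10.0), frame_length=4, frame_shift=2)
# frames.shape == (4, 4): windows starting at samples 0, 2, 4, 6
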
Exemple #23
class Sent2VecPrimitive(TransformerPrimitiveBase[Inputs, Outputs,
                                                 Hyperparams]):
    """
        Produce numerical representations (features) for short texts or sentences.

        Parameters
        ----------
        inputs : Input pandas dataframe

        Returns
        -------
        Outputs
            The output is a pandas dataframe
        """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "cf450079-9333-4a3f-aed4-b77a4e8c7be7",
        "version":
        __version__,
        "name":
        "sent2vec_wrapper",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords":
        ["Sent2Vec", "Embedding", "NLP", "Natural Language Processing"],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper"
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper.git@{git_commit}#egg=sent2vec_wrapper"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__))),
            },
            {
                "type":
                "FILE",
                "key":
                "sent2vec_model",
                "file_uri":
                "http://public.datadrivendiscovery.org/twitter_bigrams.bin",
                "file_digest":
                "9e8ccfea2aaa4435ca61b05b11b60e1a096648d56fff76df984709339f423dd6",
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types":
        [metadata_base.PrimitiveAlgorithmType.VECTORIZATION],
        "primitive_family":
        metadata_base.PrimitiveFamily.FEATURE_EXTRACTION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         volumes=volumes)

        self.volumes = volumes

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce numerical representations (features) for short texts or sentences.

        Parameters
        ----------
        inputs : Input pandas dataframe

        Returns
        -------
        Outputs
            The output is a pandas dataframe
        """

        # extract sentences from stored in nested media files
        text_columns = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        base_paths = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in text_columns
        ]
        txt_paths = [[
            os.path.join(base_path, filename)
            for filename in inputs.iloc[:, col]
        ] for base_path, col in zip(base_paths, text_columns)]
        txt = [[
            open(path, 'r').read().replace('\n', '') for path in path_list
        ] for path_list in txt_paths]
        txt_df = pd.DataFrame(np.array(txt).T)

        # concatenate with text columns that aren't stored in nested files
        local_text_columns = inputs.metadata.get_columns_with_semantic_type(
            'http://schema.org/Text')
        local_text_columns = [
            col for col in local_text_columns if col not in text_columns
        ]
        frame = pd.concat((txt_df, inputs[local_text_columns]), axis=1)

        # delete columns with path names of nested media files
        outputs = inputs.remove_columns(text_columns)

        try:
            vectorizer = _Sent2Vec(path=self.volumes["sent2vec_model"])
            #print('loaded sent2vec model', file = sys.__stdout__)
            output_vectors = []
            for col in range(frame.shape[1]):
                text = frame.iloc[:, col].tolist()
                embedded_sentences = vectorizer.embed_sentences(sentences=text)
                output_vectors.append(embedded_sentences)
            embedded_df = pd.DataFrame(
                np.array(output_vectors).reshape(len(embedded_sentences), -1))
        except ValueError:
            # just return inputs with file names deleted if vectorizing fails
            return CallResult(outputs)

        #print('successfully vectorized text\n', file = sys.__stdout__)

        # create df with vectorized columns and append to input df
        embedded_df = d3m_DataFrame(embedded_df)
        for col in range(embedded_df.shape[1]):
            col_dict = dict(
                embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = "vector_" + str(col)
            col_dict["semantic_types"] = (
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            embedded_df.metadata = embedded_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, col), col_dict)
        df_dict = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = embedded_df.shape[1]
        embedded_df.metadata = embedded_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)
        return CallResult(outputs.append_columns(embedded_df))
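
# --- Hedged sketch (not part of the original primitive) ----------------------
# Minimal usage of the wrapped vectorizer as it appears in produce() above:
# load the pretrained model from the static volume and embed a list of
# sentences into fixed-length vectors. The import path and model location are
# assumptions for illustration; only path= and embed_sentences(sentences=...)
# are taken from the code above.
import pandas as pd
from nk_sent2vec import Sent2Vec as _Sent2Vec  # assumed import path

vectorizer = _Sent2Vec(path="/static/twitter_bigrams.bin")  # volumes["sent2vec_model"]
sentences = ["first short text", "second short text"]
embeddings = vectorizer.embed_sentences(sentences=sentences)
embedded_df = pd.DataFrame(embeddings)  # one row per sentence, one column per dimension
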
class SSC_OMP(clustering.ClusteringDistanceMatrixMixin[Inputs, Outputs, type(None), SSC_OMPHyperparams, DistanceMatrixOutput],
          clustering.ClusteringTransformerPrimitiveBase[Inputs, Outputs, SSC_OMPHyperparams]):
    """
    This code implements the subspace clustering algorithm described in
    Chong You, Daniel Robinson, Rene Vidal,
    "Scalable Sparse Subspace Clustering by Orthogonal Matching Pursuit", CVPR 2016.

    It performs the OMP algorithm on every column of X using all other columns as a
    dictionary

    :param data: A dxN numpy array
    :param K: The maximum subspace dimension
    :param thres: termination condition
    :return: the SSC-OMP representation of the data
    """

    metadata = metadata_module.PrimitiveMetadata({
        'id': '50f89f90-7cef-4bb6-b56f-642f85bd1d58',
        'version': "0.0.5",
        'name': 'SSC_OMP',
        'description': """Does sparse subspace clustering using orthogonal matching pursuit.""",
        'keywords': ['clustering', 'subspace', 'sparse', 'orthogonal matching pursuit'],
        'source': {
            'name': 'Michigan',
            'contact': 'mailto:[email protected]',
            'uris': [
                #link to file and repo
                'https://github.com/dvdmjohnson/d3m_michigan_primitives/blob/master/spider/cluster/ssc_omp/ssc_omp.py',
                'https://github.com/dvdmjohnson/d3m_michigan_primitives'],
            'citation': """@inproceedings{you2016scalable,
  title={Scalable sparse subspace clustering by orthogonal matching pursuit},
  author={You, Chong and Robinson, Daniel and Vidal, Ren{\'e}},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={3918--3927},
  year={2016}}"""
            },
        'installation': [
            {'type': metadata_module.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://github.com/dvdmjohnson/d3m_michigan_primitives.git@{git_commit}#egg=spider'.format(
             git_commit=utils.current_git_commit(os.path.dirname(__file__)))
            },
            {'type': metadata_module.PrimitiveInstallationType.UBUNTU,
                 'package': 'ffmpeg',
                 'version': '7:2.8.11-0ubuntu0.16.04.1'}],
        'python_path': 'd3m.primitives.clustering.ssc_omp.Umich',
        'hyperparams_to_tune': ['n_clusters', 'sparsity_level'],
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.SUBSPACE_CLUSTERING],
        'primitive_family': metadata_module.PrimitiveFamily.CLUSTERING
        })
    
    def __init__(self, *, hyperparams: SSC_OMPHyperparams, random_seed: int = 0, docker_containers: typing.Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._k = hyperparams['n_clusters']
        self._max_subspace_dim = hyperparams['sparsity_level']
        self._thres = hyperparams['thresh']
        self._random_state = np.random.RandomState(random_seed)

    def set_training_data(self, *, inputs: Inputs) -> None:
        pass

    @staticmethod
    def _cNormalize(data, norm=2):
        """
        This method performs the column wise normalization of the input data
        :param data: A dxN numpy array
        :param norm: the desired norm value (This has to be in accordance with the accepted numpy
         norm values
        :return: Returns the column wise normalised data
        """
        return data / (np.linalg.norm(data, ord=norm, axis = 0) + 2.220446049250313e-16)

    @staticmethod
    def _OMPMatFunction(data, K, thres):

        memory_total = 0.1 * 10**9
        _, n = data.shape
        data_normalised = SSC_OMP._cNormalize(data)
        support_set = np.ones((n, K), dtype=np.int64)
        indices = np.arange(n, dtype=np.int64).reshape(n, 1) * np.ones((1, K))
        values = np.zeros((n, K))
        t_vector = np.ones((n, 1), dtype=np.int64) * K
        residual = np.copy(data_normalised)

        for t in range(K):
            counter = 0
            block_size = int(np.ceil(memory_total / n))
            while True:
                mask = np.arange(counter, min(counter+block_size, n))
                iMat = np.abs(np.matmul(data.T, residual[:, mask]))
                np.fill_diagonal(iMat, 0.0)
                jMat = np.argmax(iMat, axis=0)
                support_set[mask, t] = jMat
                counter = counter + block_size
                if counter >= n:
                    break

            if t+1 != K:
                for iN in range(n):
                    if t_vector[iN] == K:
                        B = data_normalised[:, support_set[iN, 0:(t+1)]]
                        mat_tmp, _, _, _ = lstsq(B, data_normalised[:, iN])

                        residual[:, iN] = data_normalised[:, iN] - np.matmul(B, mat_tmp)

                        if np.sum(residual[:, iN]**2) < thres:
                            t_vector[iN] = t

            if not np.any(K == t_vector):
                break

        for iN in range(n):
            tmp, _, _, _ = lstsq(data[:, support_set[iN, 0:t_vector[iN].item() + 1]], (data[:, iN]))
            values[iN, 0:t_vector[iN].item()] = tmp.T

        sparse_mat = sps.coo_matrix((values.flat, (support_set.flat, indices.flat)), shape=(n, n))
        sparse_mat = sparse_mat.toarray()
        return sparse_mat

    def _spectral_clustering(self, W, n_clusters = 10, max_iter = 1000, n_init = 20):
        N,_ = W.shape
        eps = 2.220446049250313e-16
        DN = np.diag(1/np.sqrt(np.sum(W, axis = 0) + eps))
        LapN = np.identity(N) - np.matmul(np.matmul(DN, W), DN)
        _, _, VN = np.linalg.svd(LapN)
        kerN = VN.T[:,(N - n_clusters):N]
        normN = np.sqrt(np.sum(np.square(kerN), axis = 1))
        kerNS = (kerN.T / (normN + eps).T).T
        l = KMeans(n_clusters, n_init = n_init, max_iter = max_iter, random_state = self._random_state).fit(kerNS)
        labels = l.labels_.reshape((N,))
        return labels

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        assert inputs.ndim == 2, "Data is not in the right shape"
        assert self._max_subspace_dim <= inputs.shape[1], "max_subspace dim can't be greater than the " + \
            "input feature space"

        if iterations is None or iterations < 5:
            iterations = 200

        data = inputs.T
        R = SSC_OMP._OMPMatFunction(data, self._max_subspace_dim, self._thres)
        np.fill_diagonal(R, 0)
        A = np.abs(R) + np.abs(R.T)
        labels = self._spectral_clustering(A, n_clusters=self._k, max_iter=iterations, n_init=20)

        return base.CallResult(Outputs(labels))

    def produce_distance_matrix(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[DistanceMatrixOutput]:
        """
            Returns 1 - the affinity matrix generated from the subspace-transformed data
        """
        assert inputs.ndim == 2, "Data is not in the right shape"
        assert self._max_subspace_dim <= inputs.shape[1], "max_subspace dim can't be greater than the " + \
            "input feature space"

        data = inputs.T
        R = SSC_OMP._OMPMatFunction(data, self._max_subspace_dim, self._thres)
        np.fill_diagonal(R, 0)
        A = np.abs(R) + np.abs(R.T)

        return base.CallResult(DistanceMatrixOutput(1 - A))


    def __getstate__(self) -> dict:
        return {
            'constructor': {
                'hyperparams': self.hyperparams,
                'random_seed': self.random_seed,
                'docker_containers': self.docker_containers,
            },
            'random_state': self._random_state,
        }

    def __setstate__(self, state: dict) -> None:
        self.__init__(**state['constructor'])  # type: ignore
        self._random_state = state['random_state']

    #placeholder for now, just calls base version.
    @classmethod
    def can_accept(cls, *, method_name: str, arguments: typing.Dict[str, typing.Union[metadata_module.Metadata, type]], hyperparams: SSC_OMPHyperparams) -> typing.Optional[metadata_module.DataMetadata]:
        return super().can_accept(method_name=method_name, arguments=arguments, hyperparams=hyperparams)
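
# --- Hedged sketch (not part of the original primitive) ----------------------
# The symmetrized affinity A = |R| + |R.T| built in produce() above can also
# be clustered with scikit-learn's SpectralClustering on a precomputed
# affinity, as an alternative to the hand-rolled _spectral_clustering; results
# may differ in detail from the primitive's normalized-Laplacian variant.
import numpy as np
from sklearn.cluster import SpectralClustering

def cluster_affinity(A: np.ndarray, n_clusters: int, random_state: int = 0) -> np.ndarray:
    model = SpectralClustering(n_clusters=n_clusters,
                               affinity="precomputed",
                               random_state=random_state)
    return model.fit_predict(A)
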
class TimeSeriesFormatterPrimitive(
        transformer.TransformerPrimitiveBase[container.Dataset,
                                             container.Dataset, Hyperparams]):
    """
    Reads the time series files from a given column in an input dataset resource into a new M x N data resource,
    where each value in timeseries occupies one of M rows. Each row has N columns, representing the union of
    the fields found in the timeseries files and in the main data resource.
    The loading process assumes that each series file has an identical set of timestamps.
    """

    _semantic_types = (
        'https://metadata.datadrivendiscovery.org/types/FileName',
        'https://metadata.datadrivendiscovery.org/types/Timeseries',
        'http://schema.org/Text',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    _media_types = ('text/csv', )

    __author__ = 'Uncharted Software'
    metadata = metadata_base.PrimitiveMetadata({
        'id':
        '24b09066-836f-4b8f-9773-8c86a5eee26c',
        'version':
        '0.2.0',
        'name':
        'Time series formatter',
        'python_path':
        'd3m.primitives.data_preprocessing.timeseries_formatter.DistilTimeSeriesFormatter',
        'keywords': ['series', 'reader', 'csv'],
        'source': {
            'name':
            'Uncharted Software',
            'contact':
            'mailto:[email protected]',
            'uris':
            ['https://gitlab.com/uncharted-distil/distil-timeseries-loader']
        },
        'installation': [{
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/uncharted-distil/distil-timeseries-loader.git@'
            + '{git_commit}#egg=DistilTimeSeriesLoader-0.2.0'.format(
                git_commit=d3m_utils.current_git_commit(
                    os.path.dirname(__file__)), ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        'supported_media_types':
        _media_types,
        'primitive_family':
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                              res_id: int) -> typing.Optional[int]:
        indices = inputs_metadata.list_columns_with_semantic_types(
            cls._semantic_types, at=(res_id, ))
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, res_id, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            res_id: int, column_index: int) -> bool:
        # check to see if a given column is a file pointer that points to a csv file
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata['structural_type'] != str:
            return False

        # check if a foreign key exists
        if column_metadata.get('foreign_key') is None:
            return False

        ref_col_index = column_metadata['foreign_key']['column_index']
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return cls._is_csv_file_reference(inputs_metadata, ref_res_id,
                                          ref_col_index)

    @classmethod
    def _is_csv_file_reference(cls,
                               inputs_metadata: metadata_base.DataMetadata,
                               res_id: int, column_index: int) -> bool:
        # check to see if the column is a csv resource
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata['structural_type'] != str:
            return False

        semantic_types = column_metadata.get('semantic_types', [])
        media_types = column_metadata.get('media_types', [])

        semantic_types_set = set(semantic_types)
        _semantic_types_set = set(cls._semantic_types)

        return bool(
            semantic_types_set.intersection(_semantic_types_set)) and set(
                cls._media_types).issubset(media_types)

    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) +
                    ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata,
                                                    main_resource_index)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)
        output_data = []
        timeseries_dataframe = pd.DataFrame()
        for idx, tRow in inputs[main_resource_index].iterrows():
            # read the timeseries data
            csv_path = os.path.join(base_path, tRow[file_index])
            timeseries_row = pd.read_csv(csv_path)

            # add the timeseries id
            tRow = tRow.append(pd.Series({'series_id': int(idx)}))

            # combine the timeseries data with the value row
            output_data.extend([
                pd.concat([tRow, vRow])
                for vIdx, vRow in timeseries_row.iterrows()
            ])

        # add the timeseries index
        timeseries_dataframe = timeseries_dataframe.append(output_data,
                                                           ignore_index=True)

        # join the metadata from the 2 data resources
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)

        # wrap as a D3M container
        #return base.CallResult(container.Dataset({'0': timeseries_dataframe}, metadata))
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))

    def _get_base_path(self, inputs_metadata: metadata_base.DataMetadata,
                       res_id: str, column_index: int) -> str:
        # get the base uri from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        ref_col_index = column_metadata['foreign_key']['column_index']
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return inputs_metadata.query((ref_res_id, metadata_base.ALL_ELEMENTS,
                                      ref_col_index))['location_base_uris'][0]

    def _get_ref_resource(self, inputs_metadata: metadata_base.DataMetadata,
                          res_id: str, column_index: int) -> str:
        # get the referenced resource from the referenced column
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return ref_res_id
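

# A hedged, standalone sketch of the long-form expansion that
# TimeSeriesFormatterPrimitive.produce performs above, written in plain pandas.
# The function name, column names and file layout are illustrative only.
import os

import pandas as pd


def flatten_timeseries(main_df: pd.DataFrame, file_column: str, base_path: str) -> pd.DataFrame:
    # Join every row of the main resource with the rows of the CSV file it references,
    # tagging each expanded row with the originating series id.
    rows = []
    for idx, main_row in main_df.iterrows():
        series = pd.read_csv(os.path.join(base_path, main_row[file_column]))
        main_row = pd.concat([main_row, pd.Series({'series_id': int(idx)})])
        rows.extend(pd.concat([main_row, series_row]) for _, series_row in series.iterrows())
    return pd.DataFrame(rows).reset_index(drop=True)

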
Exemple #26
0
class Parrot(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
    Primitive that produces an ARIMA forecast for future time series data. The output
    is a list of length 'n_periods' that contains a prediction for each of the 'n_periods'
    future time periods. 'n_periods' is a hyperparameter that must be set before making the prediction.
    (A standalone sketch of this fit/forecast pattern follows the class definition below.)
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "d473d487-2c32-49b2-98b5-a2b48571e07c",
        'version':
        __version__,
        'name':
        "parrot",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Time Series'],
        'source': {
            'name':
            __author__,
            'contact':
            __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/parrot-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.28.5',
        }, {
            "type":
            "PIP",
            "package_uri":
            "git+https://github.com/NewKnowledge/sloth.git@82a1e08049531270256f38ca838e6cc7d1119223#egg=Sloth-2.0.3"
        }, {
            'type':
            metadata_base.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/parrot-d3m-wrapper.git@{git_commit}#egg=ParrotD3MWrapper'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.time_series_forecasting.arima.Parrot',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.
            AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE,
        ],
        'primitive_family':
        metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._params = {}
        self._X_train = None  # training inputs
        self._sloth = Sloth()  # Sloth model
        self._arima = None  # ARIMA classifier

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Fits ARIMA model using training data from set_training_data and hyperparameters
        """

        # fits ARIMA model using training data from set_training_data and hyperparameters
        self._arima = self._sloth.FitSeriesARIMA(
            self._X_train, self.hyperparams['seasonal'],
            self.hyperparams['seasonal_differencing'])
        return CallResult(None)

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Set primitive's training data

        Parameters
        ----------
        inputs : pandas data frame containing training data where the first column contains dates and the second column contains values
        """

        # use column according to hyperparameter index
        self._X_train = inputs.iloc[:, self.hyperparams['index']].values.astype(
            float)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's prediction for future time series data

        Parameters
        ----------
        inputs : pandas data frame containing the d3m index of the test set

        Returns
        ----------
        Outputs
            The output is a data frame containing the d3m index and a forecast for each of the 'n_periods' future time periods
        """

        # add metadata to output
        # just take d3m index from input test set
        output_df = inputs['d3mIndex']
        # produce future forecast using ARIMA
        future_forecast = pandas.DataFrame(
            self._sloth.PredictSeriesARIMA(self._arima,
                                           self.hyperparams['n_periods']))
        output_df = pandas.concat([output_df, future_forecast], axis=1)
        parrot_df = d3m_DataFrame(output_df)

        # first column ('d3mIndex')
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('predictions')
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = type("1")
        col_dict['name'] = list(inputs)[self.hyperparams['index']]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(parrot_df)
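

# The Sloth helpers wrapped by Parrot above are not on PyPI, so the following is only
# a hedged, standalone sketch of the fit/forecast pattern they implement, using
# statsmodels' SARIMAX as a stand-in for Sloth.FitSeriesARIMA / PredictSeriesARIMA.
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

values = pd.Series(np.sin(np.linspace(0.0, 20.0, 100)))    # toy training series
fitted = SARIMAX(values, order=(1, 1, 1)).fit(disp=False)  # rough analogue of fit()
n_periods = 10                                             # analogue of the 'n_periods' hyperparameter
future_forecast = fitted.forecast(steps=n_periods)         # analogue of PredictSeriesARIMA in produce()
print(future_forecast.head())

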
class SegmentCurveFitter(FeaturizationTransformerPrimitiveBase[Inputs, Outputs,
                                                               Hyperparams]):
    """
    BBN D3M Segment Curve Fitter takes a segmented sequence of feature vectors as input and, for each segment and feature dimension separately, replaces the series of values by the coefficients of its polynomial approximation of a specified degree.
    Input: List of lists of segmented sequence of feature vectors, i.e. List( [ seg_length_1, num_features ], [ seg_length_2, num_features ], ...)
    Output: List of lists of segmented sequence of polynomial coefficients, i.e. List( [ poly_deg, num_features ], [ poly_deg, num_features ], ...)
    Applications include: audio, time-series classification

    For details, refer to Gish, H. and Ng, K., 1996, October. Parametric trajectory models for speech recognition. In Spoken Language, 1996. ICSLP 96. Proceedings., Fourth International Conference on (Vol. 1, pp. 466-469). IEEE.
    """
    __git_commit__ = utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id':
        '7c1d88a3-2388-4ba8-97c6-aa0aa2673024',
        'version':
        __version__,
        'name':
        "Segment Curve Fitter",
        'description':
        """BBN D3M Segment Curve Fitter takes segmented sequence of feature vectors as input and for each segment and feature dimension separately replaces the series of values by coefficients of its polynomial approximation of specified degree\n
                       Input: List of lists of segmented sequence of feature vectors, i.e. List( [ seg_length_1, num_features ], [ seg_length_2, num_features ], ...)\n
                       Output: List of lists of segmented sequence of polynomial coefficients, i.e. List( [ poly_deg, num_features ], [ poly_deg, num_features ], ...)\n
                       Applications include: audio, time-series classification""",
        'keywords': [],
        'source': {
            'name':
            __author__,
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/segment_curve_fitter.py'
                .format(git_commit=__git_commit__),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'
            .format(git_commit=__git_commit__, egg='bbn_primitives'),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.data_transformation.segment_curve_fitter.SegmentCurveFitter',  #'d3m.primitives.bbn.time_series.SegmentCurveFitter', #'d3m.primitives.data_transformation.segment_curve_fitter.BBN',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.
            PARAMETRIC_TRAJECTORY_MODELING
        ],
        'primitive_family':
        metadata_module.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str,
                                           DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)
        return

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Arguments:
            - inputs: List(
                        List([ num_frames, num_feats ], [ num_frames, num_feats], ...)
                      )


        Returns:
            - List( # Data
                List( # Segments
                  [ deg, num_feats ], ...
                )
              )
        """
        with stopit.ThreadingTimeout(timeout) as timer:
            outputs = Outputs()

            metadata = inputs.metadata.clear(
                {
                    'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                    'structural_type': Outputs,
                    'dimension': {
                        'length': len(outputs)
                    }
                },
                for_value=outputs).update((metadata_module.ALL_ELEMENTS, ), {
                    'structural_type': List,
                })

            for cinput in inputs:
                coutput = List()
                for segment in cinput:
                    if segment.ndim != 2 or segment.shape[
                            0] < self.hyperparams['deg']:
                        raise ValueError('Incompatible shape ' +
                                         str(segment.shape) + ' of cinput.')
                    n = segment.shape[0]

                    x = np.linspace(0., 1., n)
                    p = np.polyfit(x, segment, deg=self.hyperparams['deg'])
                    E = segment - applyFitting(n, p)
                    #                    for d in range(segment.shape[1]):
                    #                        pfcn = np.poly1d(p[:, d])
                    #                        E[:, d] = segment[:, d]-pfcn(x)
                    Sigma = np.dot(E.T, E) / n
                    #segment_output = CurveFitting(deg = self.deg,
                    #        beta = p, sigma = Sigma, N = n)
                    coutput.append(d3m_ndarray(p, generate_metadata=False))
                outputs.append(coutput)

            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state == timer.EXECUTED:
            return CallResult(outputs)
        else:
            raise TimeoutError('SegmentCurveFitter exceeded time limit')
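

# A minimal standalone sketch of the per-segment fitting performed by
# SegmentCurveFitter.produce above. The random segment is purely illustrative;
# fit_segment mirrors the np.polyfit call used in the primitive.
import numpy as np


def fit_segment(segment: np.ndarray, deg: int) -> np.ndarray:
    # Replace a [num_frames, num_feats] segment by its [deg + 1, num_feats]
    # matrix of polynomial coefficients, one column per feature dimension.
    n = segment.shape[0]
    x = np.linspace(0.0, 1.0, n)
    return np.polyfit(x, segment, deg=deg)


segment = np.random.randn(50, 3)       # 50 frames, 3 feature dimensions
coefficients = fit_segment(segment, deg=2)
print(coefficients.shape)              # (3, 3): (deg + 1) rows, num_feats columns

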
class simon(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                             Hyperparams]):
    """ Simon uses a LSTM-FCN neural network trained on 18 different semantic types to infer the semantic
        type of each column. A hyperparameter `return_result` controls whether Simon's inferences replace existing metadata, 
        append new columns with inferred metadata, or return a new dataframe with only the inferred columns. 

        Simon can append multiple annotations if the hyperparameter `multi_label_classification` is set to 'True'. 
        If `statistical_classification` is set to True, Simon will use rule-based heuristics to label categorical and ordinal columns. 
        Finally, the `p_threshold` hyperparameter varies the prediction probability threshold for adding annotations. 

        The following annotations will only be considered if `statistical_classification` is set to False:
            "https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber",
            "http://schema.org/addressCountry", "http://schema.org/Country",
            "http://schema.org/longitude", "http://schema.org/latitude",
            "http://schema.org/postalCode", "http://schema.org/City",
            "http://schema.org/State", "http://schema.org/address", "http://schema.org/email", 
            "https://metadata.datadrivendiscovery.org/types/FileName"
        
        The following annotations will only be considered if `statistical_classification` is set to True:
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",

        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object

        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
            volumes {Dict[str, str]} -- large file dictionary containing model weights (default: {None})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "d2fa8df2-6517-3c26-bafc-87b701c4043a",
        "version":
        __version__,
        "name":
        "simon",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "Data Type Predictor",
            "Semantic Classification",
            "Text",
            "NLP",
            "Tabular",
        ],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/simon-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@{git_commit}#egg=SimonD3MWrapper"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
            {
                "type":
                "TGZ",
                "key":
                "simon_models_1",
                "file_uri":
                "http://public.datadrivendiscovery.org/simon_models_1.tar.gz",
                "file_digest":
                "d071106b823ab1168879651811dd03b829ab0728ba7622785bb5d3541496c45f",
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.data_cleaning.column_type_profiler.Simon",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        volumes: typing.Dict[str, str] = None,
    ) -> None:

        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         volumes=volumes)
        self._volumes = volumes
        self._X_train: Inputs = None
        self._add_semantic_types: typing.List[typing.List[str]] = None
        self._remove_semantic_types: typing.List[typing.List[str]] = None
        self._is_fit = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        """ Sets primitive's training data

            Arguments:
                inputs {Inputs} -- D3M dataframe
        """
        self._X_train = inputs
        self._is_fit = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """ Learns column annotations using training data. Saves to apply to testing data.

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Returns:
                CallResult[None]
        """

        true_target_columns = self._X_train.metadata.list_columns_with_semantic_types(
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
        index_columns = self._X_train.metadata.get_index_columns()

        # Target and index columns should be set only once, if they are set.
        self.has_set_target_columns = False
        self.has_set_index_column = False

        columns_to_use = self._get_columns(self._X_train.metadata)

        self._add_semantic_types = []
        self._remove_semantic_types = []

        # compute SIMON annotations
        self.simon_annotations = self._produce_annotations(
            inputs=self._X_train)
        logger.debug(f"simon annotations: {self.simon_annotations}")

        for col_idx in columns_to_use:

            input_column = self._X_train.select_columns([col_idx])
            column_metadata = self._X_train.metadata.query_column(col_idx)
            column_name = column_metadata.get('name', str(col_idx))
            column_semantic_types = list(
                column_metadata.get('semantic_types', []))

            # We might be here because column has a known type, but it has "https://metadata.datadrivendiscovery.org/types/SuggestedTarget" set.
            has_unknown_type = not column_semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in column_semantic_types

            # A normalized copy of semantic types, which always includes unknown type.
            normalized_column_semantic_types = copy.copy(column_semantic_types)

            # If we are processing this column and it does not have semantic type then it has missing semantic types,
            # we first set it, to normalize the input semantic types. If we will add any other semantic type,
            # we will then remove this semantic type.
            if has_unknown_type \
                    and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in self.hyperparams['detect_semantic_types'] \
                    and 'https://metadata.datadrivendiscovery.org/types/UnknownType' not in normalized_column_semantic_types:
                normalized_column_semantic_types.append(
                    'https://metadata.datadrivendiscovery.org/types/UnknownType'
                )

            # A working copy of semantic types.
            new_column_semantic_types = copy.copy(
                normalized_column_semantic_types)

            # append simon labels
            if has_unknown_type:
                new_column_semantic_types = self._append_simon_annotations(
                    new_column_semantic_types, col_idx)

            # handle target columns
            new_column_semantic_types = self._set_target_column(
                new_column_semantic_types, true_target_columns)

            if has_unknown_type:

                # handle index columns
                if not index_columns and not self.has_set_index_column:
                    new_column_semantic_types = self._set_index_column(
                        new_column_semantic_types, column_name)

                # handle attribute columns
                new_column_semantic_types = self._set_attribute_column(
                    new_column_semantic_types)

                # handle additional time label
                new_column_semantic_types = self._set_additional_time_label(
                    new_column_semantic_types)

                # Have we added any other semantic type besides unknown type?
                if new_column_semantic_types != normalized_column_semantic_types:
                    if self.hyperparams[
                            'remove_unknown_type'] and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in new_column_semantic_types:
                        new_column_semantic_types.remove(
                            'https://metadata.datadrivendiscovery.org/types/UnknownType'
                        )

            new_column_semantic_types_set = set(new_column_semantic_types)
            column_semantic_types_set = set(column_semantic_types)

            self._add_semantic_types.append(
                sorted(new_column_semantic_types_set -
                       column_semantic_types_set))
            self._remove_semantic_types.append(
                sorted(column_semantic_types_set -
                       new_column_semantic_types_set))

        assert len(self._add_semantic_types) == len(columns_to_use)
        assert len(self._remove_semantic_types) == len(columns_to_use)
        self._is_fit = True
        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Inputs]:
        """ Add SIMON annotations 

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Inputs] -- input data frame with augmented metadata

        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        ## BEGIN: originally from d3m.primitives.schema_discovery.profiler.Common
        assert self._add_semantic_types is not None
        assert self._remove_semantic_types is not None

        columns_to_use, output_columns = self._produce_columns(
            inputs, self._add_semantic_types, self._remove_semantic_types)

        if self.hyperparams['replace_index_columns'] and self.hyperparams[
                'return_result'] == 'append':
            assert len(columns_to_use) == len(output_columns)

            index_columns = inputs.metadata.get_index_columns()

            index_columns_to_use = []
            other_columns_to_use = []
            index_output_columns = []
            other_output_columns = []
            for column_to_use, output_column in zip(columns_to_use,
                                                    output_columns):
                if column_to_use in index_columns:
                    index_columns_to_use.append(column_to_use)
                    index_output_columns.append(output_column)
                else:
                    other_columns_to_use.append(column_to_use)
                    other_output_columns.append(output_column)

            outputs = base_utils.combine_columns(
                inputs,
                index_columns_to_use,
                index_output_columns,
                return_result='replace',
                add_index_columns=self.hyperparams['add_index_columns'])
            outputs = base_utils.combine_columns(
                outputs,
                other_columns_to_use,
                other_output_columns,
                return_result='append',
                add_index_columns=self.hyperparams['add_index_columns'])
        else:
            outputs = base_utils.combine_columns(
                inputs,
                columns_to_use,
                output_columns,
                return_result=self.hyperparams['return_result'],
                add_index_columns=self.hyperparams['add_index_columns'])
        ## END: originally from d3m.primitives.schema_discovery.profiler.Common

        return CallResult(outputs, has_finished=self._is_fit)

    def produce_metafeatures(self,
                             *,
                             inputs: Inputs,
                             timeout: float = None,
                             iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's best guess for the structural type of each input column.

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- dataframe with two columns: "semantic type classifications" and "probabilities"
                    Each row represents a column in the original dataframe. The column "semantic type
                    classifications" contains a list of all semantic type labels and the column
                    "probabilities" contains a list of the model's confidence in assigning each
                    respective semantic type label
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        out_df = self._produce_annotations(inputs=inputs)

        # add metadata to output data frame
        simon_df = d3m_DataFrame(out_df)
        # first column list of ('semantic types')
        col_dict = dict(
            simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict["structural_type"] = typing.List[str]
        col_dict["name"] = "semantic types"
        col_dict["semantic_types"] = (
            "http://schema.org/Text",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
        )
        simon_df.metadata = simon_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)
        # second column ('probabilities')
        col_dict = dict(
            simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict["structural_type"] = typing.List[float]
        col_dict["name"] = "probabilities"
        col_dict["semantic_types"] = (
            "http://schema.org/Text",
            "https://metadata.datadrivendiscovery.org/types/Attribute",
            "https://metadata.datadrivendiscovery.org/types/FloatVector")
        simon_df.metadata = simon_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(simon_df, has_finished=self._is_fit)

    def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata,
                        column_index: int) -> bool:
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        column_metadata = inputs_metadata.query_column(column_index)

        semantic_types = column_metadata.get('semantic_types', [])

        # We detect only on columns which have no semantic types or where it is explicitly set as unknown.
        if not semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in semantic_types:
            return True

        # A special case to handle setting "https://metadata.datadrivendiscovery.org/types/TrueTarget".
        if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in semantic_types:
            return True

        return False

    def _get_columns(
            self,
            inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]:
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """
        def can_use_column(column_index: int) -> bool:
            # if overwrite, we detect on all columns
            if self.hyperparams['overwrite']:
                return True

            return self._can_use_column(inputs_metadata, column_index)

        columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(
            inputs_metadata, self.hyperparams['use_columns'],
            self.hyperparams['exclude_columns'], can_use_column)

        # We are OK if no columns ended up being parsed.
        # "base_utils.combine_columns" will throw an error if it cannot work with this.

        if self.hyperparams['use_columns'] and columns_not_to_use:
            self.logger.warning(
                "Not all specified columns can be parsed. Skipping columns: %(columns)s",
                {
                    'columns': columns_not_to_use,
                })

        return columns_to_use

    def _append_simon_annotations(self, new_column_semantic_types, col_idx):

        simon_labels = self.simon_annotations["semantic types"][col_idx]
        simon_probabilities = self.simon_annotations["probabilities"][col_idx]

        # filter labels and probs by those specified in HP
        filtered_labels, filtered_probabilities = [], []
        for label, prob in zip(simon_labels, simon_probabilities):
            if SIMON_ANNOTATIONS_DICT[label] in self.hyperparams[
                    'detect_semantic_types']:
                filtered_labels.append(SIMON_ANNOTATIONS_DICT[label])
                filtered_probabilities.append(prob)

        if self.hyperparams["multi_label_classification"]:
            new_column_semantic_types.extend(filtered_labels)
        else:
            if len(filtered_labels) > 0:
                new_column_semantic_types.append(
                    filtered_labels[np.argmax(filtered_probabilities)])
        return new_column_semantic_types

    def _produce_annotations(self, inputs: Inputs) -> Outputs:
        """ generates dataframe with semantic type classifications and classification probabilities
            for each column of original dataframe

        Arguments:
            inputs {Inputs} -- D3M dataframe

        Returns:
            Outputs -- dataframe with two columns: "semantic type classifications" and "probabilities"
                       Each row represents a column in the original dataframe. The column "semantic type
                       classifications" contains a list of all semantic type labels and the column
                       "probabilities" contains a list of the model's confidence in assigning each
                       respective semantic type label
        """

        # load model checkpoint
        checkpoint_dir = (self._volumes["simon_models_1"] +
                          "/simon_models_1/pretrained_models/")
        if self.hyperparams["statistical_classification"]:
            execution_config = "Base.pkl"
            category_list = "/Categories.txt"
        else:
            execution_config = "Base_stat_geo.pkl"
            category_list = "/Categories_base_stat_geo.txt"
        with open(
                self._volumes["simon_models_1"] + "/simon_models_1" +
                category_list, "r") as f:
            Categories = f.read().splitlines()

        # create model object
        Classifier = Simon(encoder={})
        config = Classifier.load_config(execution_config, checkpoint_dir)
        encoder = config["encoder"]
        checkpoint = config["checkpoint"]
        model = Classifier.generate_model(20, self.hyperparams["max_rows"],
                                          len(Categories))
        Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
        model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["binary_accuracy"])

        # prepare data and make predictions
        frame = inputs.copy()
        prepped_data = encoder.encodeDataFrame(frame)
        preds = model.predict_on_batch(tf.constant(prepped_data))
        decoded_preds = encoder.reverse_label_encode(
            preds, self.hyperparams["p_threshold"])

        # apply statistical / ordinal classification if desired
        if self.hyperparams["statistical_classification"]:
            logger.debug(
                "Beginning guessing of categorical/ordinal classifications...")
            raw_data = frame.values
            guesses = [
                guess(raw_data[:, i], for_types="category")
                for i in np.arange(raw_data.shape[1])
            ]

            # probability of rule-based statistical / ordinal classifications = min probability of existing classifications
            for i, g in enumerate(guesses):
                if g[0] == "category":
                    if len(decoded_preds[1][i]) == 0:
                        guess_prob = self.hyperparams['p_threshold']
                    else:
                        guess_prob = min(decoded_preds[1][i])
                    decoded_preds[0][i] += ("categorical", )
                    decoded_preds[1][i].append(guess_prob)
                    if (("int" in decoded_preds[1][i])
                            or ("float" in decoded_preds[1][i])
                            or ("datetime" in decoded_preds[1][i])):
                        decoded_preds[0][i] += ("ordinal", )
                        decoded_preds[1][i].append(guess_prob)
            logger.debug("Done with statistical variable guessing")

        # clear tf session, remove unnecessary files
        Classifier.clear_session()
        os.remove('unencoded_chars.json')

        out_df = pd.DataFrame.from_records(list(decoded_preds)).T
        out_df.columns = ["semantic types", "probabilities"]
        return out_df

    def _set_target_column(self, new_column_semantic_types,
                           true_target_columns):
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        if not true_target_columns \
                and not self.has_set_target_columns \
                and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in self.hyperparams['detect_semantic_types'] \
                and 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in new_column_semantic_types:
            # It should not be set because there are no columns with this semantic type in whole DataFrame.
            assert 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types
            new_column_semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/TrueTarget')
            if 'https://metadata.datadrivendiscovery.org/types/Target' not in new_column_semantic_types:
                new_column_semantic_types.append(
                    'https://metadata.datadrivendiscovery.org/types/Target')
            if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types:
                new_column_semantic_types.remove(
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
            self.has_set_target_columns = True
        return new_column_semantic_types

    def _set_index_column(self, new_column_semantic_types, column_name):
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in self.hyperparams['detect_semantic_types'] \
                and column_name == 'd3mIndex' \
                and 'https://metadata.datadrivendiscovery.org/types/UniqueKey' in new_column_semantic_types:
            # It should not be set because there are no columns with this semantic type in whole DataFrame.
            assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types
            assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types
            new_column_semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
            new_column_semantic_types.remove(
                'https://metadata.datadrivendiscovery.org/types/UniqueKey')
            if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types:
                new_column_semantic_types.remove(
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
            self.has_set_index_column = True
        elif 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' in self.hyperparams['detect_semantic_types'] \
                and column_name == 'd3mIndex':
            assert 'https://metadata.datadrivendiscovery.org/types/UniqueKey' not in new_column_semantic_types
            # It should not be set because there are no columns with this semantic type in whole DataFrame.
            assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types
            assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types
            new_column_semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey'
            )
            if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types:
                new_column_semantic_types.remove(
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
            self.has_set_index_column = True
        return new_column_semantic_types

    def _set_attribute_column(self, new_column_semantic_types):
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        if 'https://metadata.datadrivendiscovery.org/types/Attribute' in self.hyperparams['detect_semantic_types'] \
                and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types \
                and 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types \
                and 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types \
                and 'https://metadata.datadrivendiscovery.org/types/Attribute' not in new_column_semantic_types:
            new_column_semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/Attribute')
        return new_column_semantic_types

    def _set_additional_time_label(self, new_column_semantic_types):
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """

        if 'https://metadata.datadrivendiscovery.org/types/Time' in self.hyperparams['detect_semantic_types'] \
                and 'http://schema.org/DateTime' in new_column_semantic_types \
                and 'https://metadata.datadrivendiscovery.org/types/Time' not in new_column_semantic_types:
            new_column_semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/Time')
        return new_column_semantic_types

    def _produce_columns(
        self,
        inputs: Inputs,
        add_semantic_types: typing.List[typing.List[str]],
        remove_semantic_types: typing.List[typing.List[str]],
    ) -> typing.Tuple[typing.List[int], typing.List[Outputs]]:
        """ originally from from d3m.primitives.schema_discovery.profiler.Common """
        columns_to_use = self._get_columns(inputs.metadata)

        assert len(add_semantic_types) == len(remove_semantic_types)

        if len(columns_to_use) != len(add_semantic_types):
            raise exceptions.InvalidStateError(
                "Producing on a different number of columns than fitting.")

        output_columns = []

        for col_index, column_add_semantic_types, column_remove_semantic_types in zip(
                columns_to_use, add_semantic_types, remove_semantic_types):
            output_column = inputs.select_columns([col_index])

            for remove_semantic_type in column_remove_semantic_types:
                output_column.metadata = output_column.metadata.remove_semantic_type(
                    (metadata_base.ALL_ELEMENTS, 0), remove_semantic_type)
            for add_semantic_type in column_add_semantic_types:
                output_column.metadata = output_column.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, 0), add_semantic_type)

            output_columns.append(output_column)

        assert len(output_columns) == len(columns_to_use)

        return columns_to_use, output_columns

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(
                add_semantic_types=None,
                remove_semantic_types=None,
            )

        return Params(
            add_semantic_types=self._add_semantic_types,
            remove_semantic_types=self._remove_semantic_types,
        )

    def set_params(self, *, params: Params) -> None:
        self._add_semantic_types = params['add_semantic_types']
        self._remove_semantic_types = params['remove_semantic_types']
        self._is_fit = all(param is not None for param in params.values())
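

# A hedged sketch of the metadata bookkeeping Simon's fit/produce pair performs:
# inferred semantic types are applied column by column through d3m's metadata API.
# The column data and the chosen type URIs below are illustrative only.
from d3m import container
from d3m.metadata import base as metadata_base

df = container.DataFrame({'amount': ['1.0', '2.5', '3.1']}, generate_metadata=True)
column = df.select_columns([0])
selector = (metadata_base.ALL_ELEMENTS, 0)
# Normalize to the unknown type, add the inferred type, then drop the unknown marker,
# mirroring the add/remove lists computed in fit() and applied in _produce_columns().
column.metadata = column.metadata.add_semantic_type(
    selector, 'https://metadata.datadrivendiscovery.org/types/UnknownType')
column.metadata = column.metadata.add_semantic_type(
    selector, 'http://schema.org/Float')
column.metadata = column.metadata.remove_semantic_type(
    selector, 'https://metadata.datadrivendiscovery.org/types/UnknownType')
print(column.metadata.query_column(0).get('semantic_types'))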
Exemple #29
0
            "logger": "logging.Logger",
            "metadata": "d3m.metadata.base.PrimitiveMetadata"
        },
        "instance_attributes": {
            "hyperparams": "d3m.metadata.hyperparams.Hyperparams",
            "random_seed": "int",
            "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]",
            "volumes": "typing.Dict[str, str]",
            "temporary_directory": "typing.Union[NoneType, str]"
        }
    },
    "structural_type": "test_primitives.increment.IncrementPrimitive",
    "description": "A primitive which increments each value by a fixed amount, by default 1."
}
""".replace('__INTERFACES_VERSION__', d3m.__version__).replace(
    '__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace(
        '__DIGEST__',
        IncrementPrimitive.metadata.query()['digest'])


class TestIncrementPrimitive(unittest.TestCase):
    def call_primitive(self, primitive, method_name, **kwargs):
        return getattr(primitive, method_name)(**kwargs)

    def test_basic(self):
        hyperparams_class = IncrementPrimitive.metadata.get_hyperparams()

        primitive = IncrementPrimitive(
            hyperparams=hyperparams_class.defaults())

        inputs = container.DataFrame(
Exemple #30
0
class NearestNeighborNomination(TransformerPrimitiveBase[Inputs, Outputs,
                                                         Hyperparams]):
    """
    Creates a similarity matrix from pairwise distances, and subsequently
    nominates the closest neighbor in the second graph to each vertex in the
    first graph.
    """
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever.
        # Generated using "uuid.uuid4()".
        'id':
        '66e09f5b-3538-4d9a-9397-e32230608a35',
        'version':
        "0.1.0",
        'name':
        "jhu.nearest_neighbor_nomination",
        # Keywords do not have a controlled vocabulary. Authors can put here
        # whatever they find suitable.
        'keywords': ['nearest', 'neighbor', 'nomination', 'matching'],
        'source': {
            'name':
            "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/nearest_neighbor_nomination/nearest_neighbor_nomination.py',
                'https://github.com/neurodata/primitives-interfaces',
            ],
            'contact':
            'mailto:[email protected]'
        },
        'description':
        'Creates a similarity matrix from pairwise distances, and subsequently nominates the closest neighbor in the second graph to each vertex in the first graph.',
        'hyperparams_configuration': {},
        # A list of dependencies in order. These can be Python packages, system
        # packages, or Docker images. Of course Python packages can also have
        # their own dependencies, but sometimes it is necessary to install a
        # Python package first to be even able to run setup.py of another
        # package. Or you have a dependency which is not on PyPi.
        'installation': [{
            'type': 'UBUNTU',
            'package': 'libxml2-dev',
            'version': '2.9.4'
        }, {
            'type': 'UBUNTU',
            'package': 'libpcre3-dev',
            'version': '2.9.4'
        }, {
            'type':
            'PIP',
            'package_uri':
            'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.graph_matching.nearest_neighbor_nomination.JHU',
        # Choose these from a controlled vocabulary in the schema. If anything
        # is missing which would best describe the primitive, make a merge
        # request.
        'algorithm_types': ["RANDOM_GRAPH"],
        'primitive_family':
        'GRAPH_MATCHING',
        'preconditions': ['NO_MISSING_VALUES']
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

    def produce(self,
                *,
                inputs_1: Inputs,
                inputs_2: Inputs,
                reference: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        xhat = inputs_1
        yhat = inputs_2

        # TODO: do this more carefully
        xhat_embedding = xhat.values[:, 1:].astype(np.float32)
        yhat_embedding = yhat.values[:, 1:].astype(np.float32)

        S = cdist(
            xhat_embedding,
            yhat_embedding,
        )
        match = np.argmin(S, axis=1)

        matches = np.zeros(len(reference), dtype=int)
        for i in range(len(reference)):
            e_id = xhat.index[xhat[xhat.columns[0]] == reference[
                reference.columns[1]].iloc[i]]
            g_id = yhat.index[yhat[yhat.columns[0]] == reference[
                reference.columns[2]].iloc[i]]
            matches[i] = 1 if g_id == match[e_id] else 0

        reference['match'] = matches

        results = reference[['d3mIndex', 'match']]

        predictions = {
            "d3mIndex": reference['d3mIndex'],
            "match": reference['match']
        }
        return base.CallResult(container.DataFrame(predictions),
                               has_finished=True,
                               iterations_done=1)

        # return base.CallResult(reference, #results,
        #                        has_finished=True,
        #                        iterations_done=1)

    def multi_produce(
            self,
            *,
            produce_methods: Sequence[str],
            inputs_1: Inputs,
            inputs_2: Inputs,
            reference: Inputs,
            timeout: float = None,
            iterations: int = None) -> base.MultiCallResult:  # type: ignore
        return self._multi_produce(produce_methods=produce_methods,
                                   timeout=timeout,
                                   iterations=iterations,
                                   inputs_1=inputs_1,
                                   inputs_2=inputs_2,
                                   reference=reference)

    def fit_multi_produce(
            self,
            *,
            produce_methods: Sequence[str],
            inputs_1: Inputs,
            inputs_2: Inputs,
            reference: Inputs,
            timeout: float = None,
            iterations: int = None) -> base.MultiCallResult:  # type: ignore
        return self._fit_multi_produce(produce_methods=produce_methods,
                                       timeout=timeout,
                                       iterations=iterations,
                                       inputs_1=inputs_1,
                                       inputs_2=inputs_2,
                                       reference=reference)
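

# A minimal standalone sketch of the nomination step in NearestNeighborNomination.produce
# above: pairwise distances between two embeddings, then the closest second-graph vertex
# for every first-graph vertex. The toy embeddings are random and purely illustrative.
import numpy as np
from scipy.spatial.distance import cdist

xhat_embedding = np.random.randn(5, 3).astype(np.float32)   # vertices of the first graph
yhat_embedding = np.random.randn(7, 3).astype(np.float32)   # vertices of the second graph

S = cdist(xhat_embedding, yhat_embedding)   # pairwise distance matrix
match = np.argmin(S, axis=1)                # nominated second-graph index per first-graph vertex
print(match)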