"logger": "logging.Logger", "metadata": "d3m.metadata.base.PrimitiveMetadata" }, "instance_attributes": { "hyperparams": "d3m.metadata.hyperparams.Hyperparams", "random_seed": "int", "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", "volumes": "typing.Dict[str, str]", "temporary_directory": "typing.Union[NoneType, str]" } }, "structural_type": "test_primitives.random.RandomPrimitive", "description": "A primitive which draws random samples from a normal distribution.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. 
Directory is automatically\n cleaned up after the current pipeline run phase finishes.", "digest": "__DIGEST__" } """.replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', RandomPrimitive.metadata.query()['digest']) class TestRandomPrimitive(unittest.TestCase): def call_primitive(self, primitive, method_name, **kwargs): return getattr(primitive, method_name)(**kwargs) def test_basic(self): hyperparams_class = RandomPrimitive.metadata.get_hyperparams() primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults()) inputs = container.List(list(range(4)), generate_metadata=True) call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
class DistilLinkPredictionPrimitive(PrimitiveBase[container.List, container.DataFrame, Params, Hyperparams]):
    """
    A primitive that uses RESCAL to predict links in graphs.

    Both ``fit`` and ``produce`` expect the input list to unpack into exactly
    three elements. The first element must expose its data through a
    ``value`` attribute; ``fit`` additionally squeezes the second element and
    forwards the third one unchanged to the underlying model.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "fc138210-c317-4528-81ae-5eed3a1a0267",
            "version": version.__version__,
            "name": "LinkPrediction",
            "python_path": "d3m.primitives.link_prediction.link_prediction.DistilLinkPrediction",
            "source": {
                "name": "Distil",
                "contact": "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/link_prediction.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.LINK_PREDICTION,
        },
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Build the RESCAL model from the ``metric`` hyperparameter and the seed."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = RescalLinkPrediction(
            target_metric=self.hyperparams["metric"],
            random_seed=random_seed,
        )
        self._target_col = ""
        # Initialised here so ``fit`` can detect missing training data instead
        # of failing with an AttributeError.
        self._inputs = None
        self._outputs = None

    def set_training_data(self, *, inputs: container.List, outputs: container.DataFrame) -> None:
        """Store the training data and remember the first output column as the target name."""
        self._inputs = inputs
        self._outputs = outputs
        self._target_col = outputs.columns[0]

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit the underlying model on the data given to ``set_training_data``.

        Raises
        ------
        ValueError
            If ``set_training_data`` has not been called yet.
        """
        logger.debug(f"Fitting {__name__}")

        if self._inputs is None:
            raise ValueError("Missing training data: call set_training_data() before fit().")

        X_train, y_train, U_train = self._inputs
        X_train = X_train.value
        y_train = y_train.squeeze()
        self._model.fit(X_train, y_train, U_train)

        return CallResult(None)

    def produce(self, *, inputs: container.List, timeout: float = None,
                iterations: int = None) -> CallResult[container.DataFrame]:
        """
        Predict links for ``inputs`` and return them keyed by the input index.

        Returns
        -------
        CallResult[container.DataFrame]
            A two-column dataframe: the index of the input data (named after
            that index) and the integer predictions (named after the target
            column seen during training).
        """
        logger.debug(f"Producing {__name__}")

        # Only the first element of the input list is used for prediction.
        X, _, _ = inputs
        X = X.value
        result = self._model.predict(X).astype(int)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame({
            X.index.name: X.index,
            self._target_col: result,
        })

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        # Use the same ``CallResult`` name as ``fit`` and the annotations.
        return CallResult(result_df)

    def get_params(self) -> Params:
        """Return the fitted model and the target column name."""
        return Params(model=self._model, target_col=self._target_col)

    def set_params(self, *, params: Params) -> None:
        """Restore the model and target column name produced by ``get_params``."""
        self._model = params["model"]
        self._target_col = params["target_col"]
class SpectralClustering(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Primitive that applies sklearn spectral clustering algorithm to unsupervised,
    supervised or semi-supervised datasets.

    Training inputs: D3M dataframe with features and labels, and D3M indices
    Outputs: D3M dataframe with cluster predictions and D3M indices. Cluster labels are of
    "PredictedTarget" semantic type if the task_type hyperparameter is clustering, and
    "Attribute" if the task_type is classification.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "d13a4529-f0ba-44ee-a867-e0fdbb71d6e2",
        'version': __version__,
        # NOTE(review): "tsne" looks like a copy/paste leftover from the t-SNE primitive;
        # left unchanged because the name is part of the published annotation - confirm.
        'name': "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Clustering', 'Graph Clustering'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.7',
        }, {
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.clustering.spectral_graph_clustering.SpectralClustering',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.SPECTRAL_CLUSTERING,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Configure the clustering estimator from the hyperparameters and the random seed."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self.sc = SC(n_clusters=self.hyperparams['n_clusters'],
                     n_init=self.hyperparams['n_init'],
                     n_neighbors=self.hyperparams['n_neighbors'],
                     affinity=self.hyperparams['affinity'],
                     random_state=self.random_seed)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Cluster the feature columns of ``inputs`` and append the cluster labels.

        Parameters
        ----------
        inputs : dataframe with a PrimaryKey column; target columns are looked up by the
            TrueTarget semantic type, falling back to SuggestedTarget

        Returns
        ----------
        Outputs
            The input dataframe (restricted to rows with a non-empty target value when at
            least one such row exists) with one extra integer column holding the cluster
            label of each row. The column is named 'cluster_labels' for the
            'classification' task type, otherwise it takes the name of the first target.
        """
        targets = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type(
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')

        column_names = list(inputs)
        target_names = [column_names[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey')

        # Features are everything except the primary key and the targets.
        X_test = inputs.drop(columns=column_names[index[0]])
        X_test = X_test.drop(columns=target_names).values

        # special semi-supervised case - during training, only produce rows with labels
        series = inputs[target_names] != ''
        if series.any().any():
            labeled_rows = np.flatnonzero(series)
            inputs = dataframe_utils.select_rows(inputs, labeled_rows)
            X_test = X_test[labeled_rows]

        sc_df = d3m_DataFrame(
            pandas.DataFrame(self.sc.fit_predict(X_test), columns=['cluster_labels']))

        # describe the single cluster-label column
        col_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = int
        if self.hyperparams['task_type'] == 'classification':
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            col_dict['name'] = 'cluster_labels'
        else:
            col_dict['semantic_types'] = (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            # Fall back to the generic name when the data has no target column
            # instead of failing with an IndexError.
            col_dict['name'] = target_names[0] if target_names else 'cluster_labels'
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)

        # describe the column dimension of the one-column dataframe
        df_dict = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(sc_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = 1
        sc_df.metadata = sc_df.metadata.update((metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(utils_cp.append_columns(inputs, sc_df))
class VectorBoundsFilterPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame, container.DataFrame, Hyperparams]):
    """
    A primitive to filter columns with FloatVector semantics. Based on the i'th value of the
    mins/maxs list will indicate the appropriate min/max to filter out the indicated row indices.
    Note that the amount of row indices must match the amount of mins and maxs provided,
    otherwise the excess given indices won't have any filter applied on them.

    The filter assumes the mins and maxs are the same type of data. They can be of type
    int, list, and two dimensional list.

    If row_indices_list is empty, it filters on all indices.
    If the mins/maxs are an int, all values in all vectors will be filtered with those bounds.
    If the mins/maxs are a list, then it expect it to be the same length as the amount of
    indice lists given. i.e each scalar in the mins/maxs will correspond to each set of
    indices in row_indices_list to filter.
    If the mins/maxs are a two dimensional list, then each vector of filters in the list will
    correspond to each set of row_indices_list. In there, each i'th value in the filter
    vector will correspond to each i'th column in the vector to be filtered.

    i.e if we have the dataframe:

        d3mIndex | values
        0        | 10, 20, 30
        1        | 15, 25, 35
        2        | 40, 20, 50

    And you provide row_indices_list = [[0, 1], [2]],
    mins = [[12, 18, 31], [20, 25, 50]], maxs = [[20, 30, 40], [30, 25, 60]]
    Only row with index 1 will be returned, as row 0 has 10 < 12, and 30 < 31.
    Row 2 was filtered out because 40 > 20 and 50 > 40, 20 < 25.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "c2fa34c0-2d1b-42af-91d2-515da4a27752",
        "version": version.__version__,
        "name": "Vector bound filter",
        "python_path": "d3m.primitives.data_transformation.vector_bounds_filter.DistilVectorBoundsFilter",
        "source": {
            "name": "Distil",
            "contact": "mailto:[email protected]",
            "uris": [
                "https://github.com/uncharted-distil/distil-primitives-contrib/blob/main/main/distil_primitives_contrib/vector_filter.py",
                "https://github.com/uncharted-distil/distil-primitives-contrib",
            ],
        },
        "installation": [
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri": "git+https://github.com/uncharted-distil/distil-primitives-contrib.git@{git_commit}#egg=distil-primitives-contrib".format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            },
        ],
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    _floatvector_semantic = (
        "https://metadata.datadrivendiscovery.org/types/FloatVector",
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Pick strict or non-strict comparison operators from the ``strict`` hyperparameter."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        if self.hyperparams["strict"]:
            self._min_comparison_op = lambda x, y: x > y
            self._max_comparision_op = lambda x, y: x < y
        else:
            self._min_comparison_op = lambda x, y: x >= y
            self._max_comparision_op = lambda x, y: x <= y

    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        """
        Keep (or, when ``inclusive`` is false, drop) the rows whose vector lies within bounds.

        Returns the inputs untouched when no vector column can be determined. Scalar
        ``mins`` are delegated to ``_scalar_filter``; otherwise ``mins``/``maxs`` are treated
        as per-element bounds, with ``None`` entries meaning "unbounded".
        """
        vector_column = self._get_floatvector_column(inputs.metadata)
        if vector_column is None:
            return base.CallResult(inputs)

        maxs = self.hyperparams["maxs"]
        mins = self.hyperparams["mins"]
        if isinstance(mins, (int, float)):
            return base.CallResult(self._scalar_filter(inputs, vector_column))

        indices = inputs.index.tolist()
        mins = [float("-inf") if bound is None else bound for bound in mins]
        maxs = [float("inf") if bound is None else bound for bound in maxs]

        try:
            # Fast path: all vectors have the same length and the bounds broadcast.
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)
            filter_length = rows.shape[1]
            rows = np.logical_and(
                self._min_comparison_op(rows[:, :filter_length], mins),
                self._max_comparision_op(rows[:, :filter_length], maxs),
            )
            rows_to_keep = rows.sum(axis=1) == filter_length
        except ValueError:
            # Vectors had uneven lengths, or the bounds did not broadcast
            # against them: fall back to filtering row by row.
            rows = inputs.iloc[:, vector_column]
            vector_lengths = rows.apply(len).values

            def _filter_row(row):
                # in case fewer filters than row length
                filterable_range = min(len(row), len(mins))
                return np.logical_and(
                    self._min_comparison_op(
                        row[:filterable_range], np.array(mins[:filterable_range])),
                    self._max_comparision_op(
                        row[:filterable_range], np.array(maxs[:filterable_range])),
                )

            passed = rows.apply(_filter_row).apply(np.sum).values
            # A row passes when every element that was actually compared is in
            # bounds. Comparing against the full vector length would reject
            # every row that has more elements than there are filters.
            rows_to_keep = passed == np.minimum(vector_lengths, len(mins))

        outputs = dataframe_utils.select_rows(
            inputs, self._select_indices(indices, rows_to_keep))
        return base.CallResult(outputs)

    def _select_indices(self, indices, rows_to_keep):
        """Return the index labels to keep, honouring the ``inclusive`` hyperparameter."""
        if self.hyperparams["inclusive"]:
            return [label for label, keep in zip(indices, rows_to_keep) if keep]
        return [label for label, keep in zip(indices, rows_to_keep) if not keep]

    def _scalar_filter(self, inputs, vector_column):
        """
        Filter rows using a single min and a single max applied to every vector element.

        A bound of ``None`` is treated as unbounded on that side.
        """
        max_value = self.hyperparams["maxs"]
        min_value = self.hyperparams["mins"]
        indices = inputs.index.tolist()

        # These replacements used to be bare expressions with no assignment,
        # so a ``None`` bound reached the comparison unchanged.
        if min_value is None:
            min_value = float("-inf")
        if max_value is None:
            max_value = float("inf")

        try:
            rows = np.stack(inputs.iloc[:, vector_column], axis=0)
            rows = np.logical_and(
                self._min_comparison_op(rows, min_value),
                self._max_comparision_op(rows, max_value),
            )
            rows_to_keep = rows.sum(axis=1) == rows.shape[1]
        except ValueError:
            # Vectors of uneven length cannot be stacked; filter row by row.
            rows = inputs.iloc[:, vector_column]

            def _filter_row(row):
                return np.logical_and(
                    self._min_comparison_op(row, min_value),
                    self._max_comparision_op(row, max_value),
                )

            passed = rows.apply(_filter_row).apply(np.sum)
            # Compare as plain arrays so the mask is positional; indexing the
            # Series with 0..n-1 would be a label lookup on the input index.
            rows_to_keep = passed.values == rows.apply(len).values

        return dataframe_utils.select_rows(
            inputs, self._select_indices(indices, rows_to_keep))

    def _get_floatvector_column(self, inputs_metadata: metadata_base.DataMetadata):
        """
        Return the column index to filter, or ``None`` when there is none.

        The ``column`` hyperparameter wins when set; otherwise the first column carrying the
        FloatVector semantic type is used.
        """
        fv_column = self.hyperparams["column"]
        # NOTE(review): a configured column index of 0 is falsy and therefore
        # falls through to the semantic-type lookup - confirm this is intended.
        if fv_column:
            return fv_column
        fv_columns = inputs_metadata.list_columns_with_semantic_types(
            self._floatvector_semantic)
        if len(fv_columns) > 0:
            return fv_columns[0]
        logger.warning(
            "inputs provided contains no specified FloatVector column and lacks columns with FloatVector semantic"
        )
        return None
import os

from d3m import utils

# Package-level constants describing how this primitive package is installed.
D3M_API_VERSION = '2018.1.26'
VERSION = "0.1.0"

# Git commit of the checkout this file lives in; used to pin the pip install.
TAG_NAME = "{git_commit}".format(
    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
)

REPOSITORY = "https://github.com/rooshenas/dsbox-spen"
PACAKGE_NAME = "dsbox-spen"
# Correctly spelled alias; the misspelled name above is kept because other
# modules may already import it.
PACKAGE_NAME = PACAKGE_NAME

D3M_PERFORMER_TEAM = 'UMASS'

# Pin the install URI to the current commit when one is available.
if TAG_NAME:
    PACKAGE_URI = "git+" + REPOSITORY + "@" + TAG_NAME
else:
    PACKAGE_URI = "git+" + REPOSITORY

PACKAGE_URI = PACKAGE_URI + "#egg=" + PACKAGE_NAME

# Switch to 'PYPI' to install a released version instead of the git checkout.
INSTALLATION_TYPE = 'GIT'
if INSTALLATION_TYPE == 'PYPI':
    INSTALLATION = {"type": "PIP", "package": PACKAGE_NAME, "version": VERSION}
else:
    # INSTALLATION_TYPE == 'GIT'
    INSTALLATION = {
        "type": "PIP",
        "package_uri": PACKAGE_URI,
    }
class unicorn(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Primitive that clusters the images referenced by the configured target columns.

    The image analyzer is loaded from the "croc_weights" static volume, each target column
    of image paths is clustered, and the results are appended to the input dataframe.
    """
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "475c26dc-eb2e-43d3-acdb-159b80d9f099",
        'version': __version__,
        'name': "unicorn",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Image Clustering', 'fast fourier transfom', 'Image'],
        'source': {
            'name': __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/unicorn-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package_uri": "git+https://github.com/NewKnowledge/d3m_unicorn.git@97b24ce39c3a26c1d753104c80012c352efd6920#egg=d3m_unicorn"
            },
            {
                "type": "PIP",
                "package_uri": "git+https://github.com/NewKnowledge/unicorn-d3m-wrapper.git@{git_commit}#egg=UNICORNd3mWrapper".format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__))
                ),
            },
            {
                "type": "TGZ",
                "key": "croc_weights",
                "file_uri": "http://public.datadrivendiscovery.org/croc.tar.gz",
                "file_digest": "0be3e8ab1568ec8225b173112f4270d665fb9ea253093cd9ea98c412c9053c92"
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.distil.unicorn',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.MULTILABEL_CLASSIFICATION  # TODO
        ],
        "primitive_family": metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 volumes: typing.Dict[str, str] = None) -> None:
        """Keep a reference to the static volumes; the weights path is read in ``produce``."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, volumes=volumes)
        self.volumes = volumes

    def _get_column_base_path(self, inputs: Inputs, column_name: str) -> typing.Optional[str]:
        """
        Return the first base URI recorded in the metadata of the column called
        ``column_name``, or ``None`` when the column or its base URI is missing.
        """
        column_metadata = inputs.metadata.query((metadata_base.ALL_ELEMENTS,))
        if not column_metadata:
            return None

        num_cols = column_metadata['dimension']['length']
        for position in range(num_cols):
            col_data = inputs.metadata.query((metadata_base.ALL_ELEMENTS, position))
            if col_data['name'] == column_name and 'location_base_uris' in col_data:
                return col_data['location_base_uris'][0]

        return None

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce image cluster assignments for images provided as an URI or filepath

        Parameters
        ----------
        inputs : pandas dataframe where a column is a pd.Series of image paths/URLs
        timeout : accepted for interface compatibility; not used
        iterations : accepted for interface compatibility; not used

        Returns
        -------
        output : A dataframe with image labels/classifications/cluster assignments
        """
        target_columns = self.hyperparams['target_columns']

        imagepath_df = inputs
        image_analyzer = Unicorn(
            weights_path=self.volumes["croc_weights"] + "/inception_v3_weights_tf_dim_ordering_tf_kernels.h5")

        for ith_column in target_columns:
            # get the base uri from the column metadata and remove the
            # scheme portion
            base_path = self._get_column_base_path(inputs, ith_column)
            if base_path:
                base_path = base_path.split('://')[1]

            # update the paths with the base if necessary
            col_paths = imagepath_df.loc[:, ith_column]
            if base_path:
                # NOTE(review): element-wise assignment is kept from the original;
                # depending on the pandas version it may also rewrite the paths in
                # the input dataframe - confirm whether that is relied upon.
                for row in range(0, len(col_paths)):
                    col_paths[row] = os.path.join(base_path, col_paths[row])

            result_df = image_analyzer.cluster_images(col_paths)

            imagepath_df = pd.concat(
                [imagepath_df.reset_index(drop=True), result_df], axis=1)

        K.clear_session()

        # create metadata for the unicorn output dataframe
        unicorn_df = d3m_DataFrame(imagepath_df)

        # (column name, schema.org type) for the first five output columns, in order
        described_columns = (
            ('d3mIndex', 'http://schema.org/Integer'),
            ("filename", 'http://schema.org/Text'),
            ("bounding_box", 'http://schema.org/Text'),
            ("label", 'http://schema.org/Text'),
            ('pred_class', 'http://schema.org/Integer'),
        )
        for position, (column_name, schema_type) in enumerate(described_columns):
            col_dict = dict(unicorn_df.metadata.query((metadata_base.ALL_ELEMENTS, position)))
            # every described column is declared with a string structural type
            col_dict['structural_type'] = str
            col_dict['name'] = column_name
            col_dict['semantic_types'] = (
                schema_type,
                'https://metadata.datadrivendiscovery.org/types/Attribute')
            unicorn_df.metadata = unicorn_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, position), col_dict)

        return CallResult(unicorn_df)
class GoatReversePrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Accept a set of lat/long pair, processes it and returns a set corresponding geographic location names

    Parameters
    ----------
    inputs : pandas dataframe containing 2 coordinate float values, i.e., [longitude,latitude]
        representing each geographic location of interest - a pair of values
        per location/row in the specified target column

    Returns
    -------
    Outputs
        Pandas dataframe containing one location per longitude/latitude pair (if reverse
        geocoding possible, otherwise NaNs) appended as new columns
    """

    # Make sure to populate this with JSON annotations...
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_base.PrimitiveMetadata(
        {
            # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
            "id": "f6e4880b-98c7-32f0-b687-a4b1d74c8f99",
            "version": __version__,
            "name": "Goat_reverse",
            # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
            "keywords": ["Reverse Geocoder"],
            "source": {
                "name": __author__,
                "contact": __contact__,
                "uris": [
                    # Unstructured URIs.
                    "https://github.com/NewKnowledge/goat-d3m-wrapper"
                ],
            },
            # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
            # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
            # install a Python package first to be even able to run setup.py of another package. Or you have
            # a dependency which is not on PyPi.
            "installation": [
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/NewKnowledge/goat-d3m-wrapper.git@{git_commit}#egg=GoatD3MWrapper".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__))
                    ),
                },
                {
                    "type": "UBUNTU",
                    "package": "default-jre",
                    "version": "2:1.8-56ubuntu2",
                },
                {
                    "type": "TGZ",
                    "key": "photon-db-latest",
                    "file_uri": "http://public.datadrivendiscovery.org/photon.tar.gz",
                    "file_digest": "d7e3d5c6ae795b5f53d31faa3a9af63a9691070782fa962dfcd0edf13e8f1eab",
                },
            ],
            # The same path the primitive is registered with entry points in setup.py.
            "python_path": "d3m.primitives.data_cleaning.geocoding.Goat_reverse",
            # Choose these from a controlled vocabulary in the schema. If anything is missing which would
            # best describe the primitive, make a merge request.
            "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD],
            "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING,
        }
    )

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        volumes: typing.Dict[str, str] = None,
    ) -> None:
        """Set up the JSON decoder, the static volumes and the lookup cache."""
        super().__init__(
            hyperparams=hyperparams,
            random_seed=random_seed,
            volumes=volumes,
        )

        self._decoder = JSONDecoder()
        self.volumes = volumes
        self.goat_cache = LRUCache(self.hyperparams["cache_size"])

    def _collect_target_columns(self, inputs: Inputs):
        """
        Find the coordinate columns to geocode.

        Adjacent numeric Location columns are merged into a new "new_col_<name>" list
        column that is added to ``inputs``. Returns ``(target_columns, target_column_idxs)``:
        the column names to geocode and the column indices to drop from the output.
        """
        # find location columns, real columns, and real-vector columns
        targets = inputs.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/Location"
        )
        real_values = inputs.metadata.get_columns_with_semantic_type(
            "http://schema.org/Float"
        )
        real_values += inputs.metadata.get_columns_with_semantic_type(
            "http://schema.org/Integer"
        )
        real_values = list(set(real_values))
        real_vectors = inputs.metadata.get_columns_with_semantic_type(
            "https://metadata.datadrivendiscovery.org/types/FloatVector"
        )

        target_column_idxs = []
        target_columns = []

        # convert target columns to list if they have single value and are adjacent in the df
        for target, target_col in zip(targets, [list(inputs)[idx] for idx in targets]):
            if target in real_vectors:
                target_column_idxs.append(target)
                target_columns.append(target_col)
            # pair of individual lat / lon columns already in list
            elif list(inputs)[target - 1] in target_columns:
                continue
            elif target in real_values:
                if target + 1 in real_values:
                    # convert to single column with list of [lat, lon]
                    col_name = "new_col_" + target_col
                    inputs[col_name] = inputs.iloc[:, target : target + 2].values.tolist()
                    target_columns.append(col_name)
                    target_column_idxs.append(target)
                    target_column_idxs.append(target + 1)
                    # the helper column just added is the last one; drop it too
                    target_column_idxs.append(inputs.shape[1] - 1)

        # make sure columns are structured as 1) lat , 2) lon pairs
        for col in target_columns:
            if inputs[col].apply(lambda x: x[0]).max() > 90:
                inputs[col] = inputs[col].apply(lambda x: x[::-1])

        return target_columns, target_column_idxs

    def _reverse_geocode(self, address: str, longlat):
        """
        Return the configured ``geocoding_resolution`` property for one coordinate pair.

        Results are served from the cache when present. When the server returns no feature,
        or the feature lacks the requested property, the result is NaN for the "postcode"
        resolution and an empty string otherwise.
        """
        cache_ret = self.goat_cache.get(longlat)
        # the cache reports a miss with -1
        if cache_ret != -1:
            return cache_ret

        r = requests.get(
            address + "reverse?lat=" + str(longlat[0]) + "&lon=" + str(longlat[1])
        )
        tmp = self._decoder.decode(r.text)
        resolution = self.hyperparams["geocoding_resolution"]
        features = tmp["features"]
        if len(features) == 0 or resolution not in features[0]["properties"].keys():
            row_data = float("nan") if resolution == "postcode" else ""
        else:
            row_data = features[0]["properties"][resolution]
        self.goat_cache.set(longlat, row_data)
        return row_data

    def produce(
        self, *, inputs: Inputs, timeout: float = None, iterations: int = None
    ) -> CallResult[Outputs]:
        """
        Accept a set of lat/long pair, processes it and returns a set corresponding geographic location names

        Parameters
        ----------
        inputs : pandas dataframe containing 2 coordinate float values, i.e., [longitude,latitude]
            representing each geographic location of interest - a pair of values
            per location/row in the specified target column

        Returns
        -------
        Outputs
            Pandas dataframe containing one location per longitude/latitude pair (if reverse
            geocoding possible, otherwise NaNs)
        """

        # confirm that server is responding before proceeding
        address = "http://localhost:2322/"
        PopenObj = check_geocoding_server(
            address, self.volumes, self.hyperparams["rampup_timeout"]
        )

        try:
            target_columns, target_column_idxs = self._collect_target_columns(inputs)

            # delete the original coordinate columns from the output
            outputs = inputs.remove_columns(target_column_idxs)

            # reverse-geocode each requested location, one result list per column
            column_results = [
                [self._reverse_geocode(address, longlat) for longlat in inputs[ith_column]]
                for ith_column in target_columns
            ]
        finally:
            # need to cleanup by closing the server when done, even on failure
            PopenObj.kill()

        # Build d3m-type dataframe from the geocoding results. The results used
        # to be discarded here, which left the appended columns entirely NaN.
        out_df = pd.DataFrame(
            dict(enumerate(column_results)), index=range(inputs.shape[0])
        )
        out_df.columns = target_columns
        d3m_df = d3m_DataFrame(out_df)
        for i, ith_column in enumerate(target_columns):
            # for every column
            col_dict = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS, i)))
            if self.hyperparams["geocoding_resolution"] == "postcode":
                col_dict["structural_type"] = int
                col_dict["semantic_types"] = (
                    "http://schema.org/Integer",
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )
            else:
                col_dict["structural_type"] = str
                col_dict["semantic_types"] = (
                    "http://schema.org/Text",
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )
            col_dict["name"] = ith_column
            d3m_df.metadata = d3m_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, i), col_dict
            )
        df_dict = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
        df_dict_1 = dict(d3m_df.metadata.query((metadata_base.ALL_ELEMENTS,)))
        df_dict["dimension"] = df_dict_1
        df_dict_1["name"] = "columns"
        df_dict_1["semantic_types"] = (
            "https://metadata.datadrivendiscovery.org/types/TabularColumn",
        )
        df_dict_1["length"] = d3m_df.shape[1]
        d3m_df.metadata = d3m_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

        return CallResult(outputs.append_columns(d3m_df))
class Tsne(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Primitive that applies the T-distributed stochastic neighbour embedding algorithm
    to unsupervised, supervised or semi-supervised datasets.

    Training inputs: D3M dataset with features and labels, and D3M indices
    Outputs: D3M dataframe with t-SNE dimensions and D3M indices
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "15586787-80d5-423e-b232-b61f55a117ce",
        'version': __version__,
        'name': "tsne",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Dimensionality Reduction'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Unsupervised",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package': 'cython',
                'version': '0.29.14',
            },
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://github.com/NewKnowledge/D3M-Unsupervised.git@{git_commit}#egg=D3MUnsupervised'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.dimensionality_reduction.t_distributed_stochastic_neighbor_embedding.Tsne',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.T_DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DIMENSIONALITY_REDUCTION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        # The estimator is seeded with the primitive's random seed.
        self.clf = TSNE(n_components=self.hyperparams['n_components'], random_state=self.random_seed)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Embed the input rows (or time series) with t-SNE.

        Parameters
        ----------
        inputs : dataframe with attached metadata for semi-supervised or unsupervised data

        Returns
        ----------
        Outputs
            D3M dataframe with one 'd3mIndex' column followed by 'Dim0'..'Dim<n_components-1>'
            columns holding the t-SNE coordinates.
        """
        column_names = list(inputs)

        # store information on target, index variable
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        target_names = [column_names[t] for t in targets]
        index = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        index_names = [column_names[i] for i in index]

        unique_index = inputs.d3mIndex.unique()
        n_ts = len(unique_index)
        if n_ts == inputs.shape[0]:
            # one row per index value: embed every column except the index and the targets
            X_test = inputs.drop(columns=column_names[index[0]])
            X_test = X_test.drop(columns=target_names).values
        else:
            # several rows share an index value: the 'value' column is reshaped into
            # one row per index value
            ts_sz = int(inputs.shape[0] / n_ts)
            X_test = np.array(inputs.value).reshape(n_ts, ts_sz)

        # fit_transform data and create new dataframe
        n_components = self.hyperparams['n_components']
        col_names = ['Dim' + str(c) for c in range(0, n_components)]
        embedding_df = pandas.DataFrame(self.clf.fit_transform(X_test), columns=col_names)
        # The embedding has one row per unique index value, so the index column must be
        # built from the unique values; concatenating the full-length d3mIndex column
        # misaligns the frame in the long-format branch.
        index_column = pandas.Series(unique_index, name='d3mIndex')
        tsne_df = d3m_DataFrame(pandas.concat([index_column, embedding_df], axis=1))

        # add index column metadata
        col_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = type('1')
        col_dict['name'] = index_names[0]
        col_dict['semantic_types'] = ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS, 0), col_dict)

        # add dimension columns metadata
        for c in range(1, n_components + 1):
            col_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            col_dict['structural_type'] = type(1.0)
            col_dict['name'] = 'Dim' + str(c - 1)
            col_dict['semantic_types'] = ('http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS, c), col_dict)

        # describe the column dimension of the output table
        df_dict = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(tsne_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = ('https://metadata.datadrivendiscovery.org/types/TabularColumn',)
        df_dict_1['length'] = n_components + 1
        tsne_df.metadata = tsne_df.metadata.update((metadata_base.ALL_ELEMENTS,), df_dict)

        return CallResult(tsne_df)
class LinkPredictionRankClassifier(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    A primitive that predicts the existence of a link if it falls within the interquartile range of inner products.
    """
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '25e97696-b96f-4f5c-8620-b340fe83414d',
        'version': "0.1.0",
        'name': "jhu.link_pred_rc",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.link_prediction.rank_classification.JHU',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['graph', 'inner product'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/link_pred_rc/link_pred_rc.py',
                # 'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
            'contact': 'mailto:[email protected]'
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type': 'UBUNTU',
                'package': 'libxml2-dev',
                'version': '2.9.4'
            },
            {
                'type': 'UBUNTU',
                'package': 'libpcre3-dev',
                'version': '2.9.4'
            },
            {
                'type': 'PIP',
                'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),),
            },
        ],
        'algorithm_types': [
            "HEURISTIC"
        ],
        'primitive_family': "LINK_PREDICTION",
        'preconditions': ['NO_MISSING_VALUES']
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._fitted: bool = False
        self._inner_products: container.List = []
        self._embeddings: container.List = []

    @staticmethod
    def _source_target_columns(csv):
        """Return the (source, target) column names: the last headers starting with 'source' / 'target'."""
        source_col = None
        target_col = None
        for header in csv.columns:
            if header[:6] == "source":
                source_col = header
            elif header[:6] == "target":
                target_col = header
        if source_col is None or target_col is None:
            raise ValueError("Input table must contain columns whose names start with 'source' and 'target'.")
        return source_col, target_col

    @staticmethod
    def _ensure_link_type(csv, n_rows):
        """Add an all-zero 'linkType' column when the existing one is missing or not integer-like."""
        try:
            int(np.array(csv['linkType'])[0])
        except Exception:
            # Best effort, as before: any failure to read an integer link type
            # falls back to a single link type 0.
            csv['linkType'] = np.zeros(n_rows)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Predict 'linkExists' for every row of the first input table.

        For each (source, target) pair the inner product of the two node embeddings is
        ranked within the fitted "no link" and "link" inner products (global lists, i.e.
        edge types are ignored); the class whose quantile is closer to 1/2 wins, ties are
        broken with a fair coin drawn from a generator seeded with the primitive's seed.

        Returns a dataframe with the 'd3mIndex' and 'linkExists' columns.
        Raises ValueError when the primitive is not fitted or the source/target columns are missing.
        Note: the input table is modified in place ('linkType' may be added, 'linkExists' is set).
        """
        if not self._fitted:
            raise ValueError("Not fitted")

        random_state = np.random.RandomState(seed=self.random_seed)

        csv = inputs[0]
        source_col, target_col = self._source_target_columns(csv)

        source_nodeID = np.array(csv[source_col]).astype(int)
        target_nodeID = np.array(csv[target_col]).astype(int)

        self._ensure_link_type(csv, len(source_nodeID))
        link_types = np.array(csv['linkType']).astype(int)

        # the last entry of the fitted ranks is the global (all link types) one
        n_links = len(self._inner_products) - 1
        n_nodes = int(self._embeddings.shape[0] / n_links)

        n_preds = csv.shape[0]
        predictions = np.zeros(n_preds)

        global_noexists = self._inner_products[-1][0]
        global_exists = self._inner_products[-1][1]
        n_global_noexists = len(global_noexists)
        n_global_exists = len(global_exists)

        # The following code is used for "global" classification only; i.e. we ignore edge type training data
        for i in range(n_preds):
            offset = link_types[i] * n_nodes
            # node ids are 1-based, embeddings are stacked per link type
            inner_product = self._embeddings[offset + source_nodeID[i] - 1] @ self._embeddings[offset + target_nodeID[i] - 1]

            rank_noexists = np.sum(inner_product > global_noexists)
            quantile_noexists = rank_noexists / n_global_noexists

            # rank within the *existing* links (previously ranked against the
            # non-existing ones, which made both quantiles share one numerator)
            rank_exists = np.sum(inner_product > global_exists)
            quantile_exists = rank_exists / n_global_exists

            dist_noexists = abs(quantile_noexists - 1/2)
            dist_exists = abs(quantile_exists - 1/2)
            if dist_noexists < dist_exists:
                predictions[i] = int(0)
            elif dist_noexists > dist_exists:
                predictions[i] = int(1)
            else:
                predictions[i] = int(random_state.binomial(1, 0.5))

        csv['linkExists'] = predictions.astype(int)
        outputs = container.DataFrame(csv[['d3mIndex', 'linkExists']])

        return base.CallResult(outputs)

    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        """
        Collect, per link type and globally, the sorted inner products of the training
        pairs, split by whether the link exists.

        Reads the training inputs set by ``set_training_data``: item 0 is the table of
        pairs, item 1 holds the embeddings at position 0, item 3 is (n_nodes, n_links).
        Does nothing when already fitted.
        """
        if self._fitted:
            return base.CallResult(None)

        embeddings = self._training_inputs[1][0]
        csv = self._training_inputs[0]
        n_nodes, n_links = self._training_inputs[3]

        n_info = csv.shape[0]
        # ranks[k] = [inner products of non-links, inner products of links] for link type k;
        # the extra last entry aggregates all link types
        ranks = [[[], []] for _ in range(n_links + 1)]

        self._ensure_link_type(csv, n_info)
        source_col, target_col = self._source_target_columns(csv)

        # convert each column once instead of once per row
        link_type_values = np.array(csv['linkType'])
        link_exists_values = np.array(csv['linkExists'])
        source_values = np.array(csv[source_col])
        target_values = np.array(csv[target_col])

        for i in range(n_info):
            temp_link = int(link_type_values[i])
            temp_exists = int(link_exists_values[i])
            temp_source = int(source_values[i])
            temp_target = int(target_values[i])
            temp_dot = embeddings[temp_link*n_nodes + temp_source - 1] @ embeddings[temp_link*n_nodes + temp_target - 1]
            ranks[temp_link][temp_exists].append(temp_dot)
            ranks[-1][temp_exists].append(temp_dot)

        for rank_pair in ranks:
            rank_pair[0] = np.sort(rank_pair[0])
            rank_pair[1] = np.sort(rank_pair[1])

        self._embeddings = container.ndarray(embeddings)
        self._inner_products = container.List(ranks)
        self._fitted = True

        return base.CallResult(None)

    def set_training_data(self, *, inputs: Inputs) -> None:
        """Store the training inputs consumed by ``fit``."""
        self._training_inputs = inputs

    def get_params(self) -> Params:
        """Return the fitted inner products and embeddings; raises ValueError when not fitted."""
        if not self._fitted:
            raise ValueError("Fit not performed.")
        return Params(
            inner_products = self._inner_products,
            embeddings = self._embeddings
        )

    def set_params(self, *, params: Params) -> None:
        """Restore fitted state from ``params`` and mark the primitive as fitted."""
        self._fitted = True
        self._inner_products = params['inner_products']
        self._embeddings = params['embeddings']
class MIRankingPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame, container.DataFrame, Hyperparams]):
    """
    Feature ranking based on a mutual information between features and a selected
    target.  Will rank any feature column with a semantic type of Float, Boolean,
    Integer or Categorical.  The target column must additionally have a structural
    type of int or float.  Features that could not be ranked are excluded from
    the returned set.

    Parameters
    ----------
    inputs : A container.Dataframe with columns containing numeric data.

    Returns
    -------
    output : A DataFrame containing (col_idx, col_name, score) tuples for each ranked feature.
    """

    # allowable target column types
    _discrete_types = (
        'http://schema.org/Boolean',
        'http://schema.org/Integer',
        'https://metadata.datadrivendiscovery.org/types/CategoricalData')

    _continous_types = ('http://schema.org/Float', )

    _roles = (
        'https://metadata.datadrivendiscovery.org/types/Attribute',
        'https://metadata.datadrivendiscovery.org/types/Target',
        'https://metadata.datadrivendiscovery.org/types/TrueTarget',
        'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
    )

    _structural_types = set((int, float))

    _semantic_types = set(_discrete_types).union(_continous_types)

    _random_seed = 100

    # NOTE: the trailing comma makes this a 1-tuple; kept as-is for compatibility.
    __author__ = 'Uncharted Software',
    metadata = metadata_base.PrimitiveMetadata({
        'id': 'a31b0c26-cca8-4d54-95b9-886e23df8886',
        'version': '0.2.1',
        'name': 'Mutual Information Feature Ranking',
        'python_path': 'd3m.primitives.feature_selection.mi_ranking.DistilMIRanking',
        'keywords': ['vector', 'columns', 'dataframe'],
        'source': {
            'name': 'Uncharted Software',
            'contact': 'mailto:[email protected]',
            'uris': ['http://github.com/uncharted-distil/distil-mi-ranking']
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/uncharted-distil/distil-mi-ranking.git@' +
                           '{git_commit}#egg=distil-mi-ranking'.format(
                               git_commit=d3m_utils.current_git_commit(
                                   os.path.dirname(__file__)),
                           ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.MUTUAL_INFORMATION,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata,
                        column_index: typing.Optional[int]) -> bool:
        """
        Return True when the column has both a supported structural type and at
        least one supported semantic type.  The column's role is not checked here.
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        valid_struct_type = column_metadata.get('structural_type', None) in cls._structural_types
        semantic_types = column_metadata.get('semantic_types', [])
        valid_semantic_type = len(
            set(cls._semantic_types).intersection(semantic_types)) > 0

        return valid_struct_type and valid_semantic_type

    @classmethod
    def _append_rank_info(
            cls, inputs: container.DataFrame,
            result: typing.List[typing.Tuple[int, str, float]],
            rank_np: np.ndarray, rank_df: pd.DataFrame
    ) -> typing.List[typing.Tuple[int, str, float]]:
        """
        Append one (index in ``inputs``, column name, rank) tuple per ranked feature
        to ``result`` and return it.  ``rank_np[i]`` is the score of column ``i`` of
        ``rank_df``.
        """
        for col_name, rank in zip(rank_df.columns.values, rank_np):
            result.append((inputs.columns.get_loc(col_name), col_name, rank))
        return result

    def produce(
            self, *, inputs: container.DataFrame, timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:
        """
        Rank the feature columns of ``inputs`` by mutual information with the column
        selected by the 'target_col_index' hyperparam.

        Returns a dataframe with 'idx', 'name' and 'rank' columns sorted by descending
        rank; it is empty when the target column is unusable or no feature qualifies.
        """
        cols = ['idx', 'name', 'rank']

        # Make sure the target column is of a valid type and return no ranked features if it isn't.
        target_idx = self.hyperparams['target_col_index']
        if not self._can_use_column(inputs.metadata, target_idx):
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)['semantic_types']
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:, target_idx]

        # drop features that are not compatible with ranking
        feature_indices = set(
            inputs.metadata.list_columns_with_semantic_types(
                self._semantic_types))
        role_indices = set(
            inputs.metadata.list_columns_with_semantic_types(self._roles))
        feature_indices = feature_indices.intersection(role_indices)
        # discard, not remove: the target passes _can_use_column without a role
        # type and is then absent from the set
        feature_indices.discard(target_idx)

        # return an empty result if all features were incompatible
        if not feature_indices:
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(feature_indices)
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        discrete_indices = inputs.metadata.list_columns_with_semantic_types(
            self._discrete_types)
        discrete_flags = [False] * feature_df.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in feature_df:
                # only mark columns with a least 1 duplicate value as discrete when predicting
                # a continuous target - there's a check in the bowels of MI code that will throw
                # an exception otherwise
                # NOTE(review): as written no feature is flagged discrete when the target
                # itself is discrete - confirm this is intended.
                if feature_df[col_name].duplicated().any() and not discrete:
                    col_idx = feature_df.columns.get_loc(col_name)
                    discrete_flags[col_idx] = True

        target_np = target_df.values
        feature_np = feature_df.values

        # compute mutual information for discrete or continuous target
        if discrete:
            ranked_features_np = mutual_info_classif(
                feature_np, target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)
        else:
            ranked_features_np = mutual_info_regression(
                feature_np, target_np,
                discrete_features=discrete_flags,
                random_state=self._random_seed)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np, feature_df)

        # wrap as a D3M container - metadata should be auto generated
        results = container.DataFrame(data=data, columns=cols, generate_metadata=True)
        results = results.sort_values(by=['rank'], ascending=False).reset_index(drop=True)

        return base.CallResult(results)
class FairnessInProcessing(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
    Primitive that applies an in-processing algorithm to training data while fitting a learning algorithm.
    Algorithm is 'Adversarial_Debiasing', which learns a classifier (tf nn based) that maximizes prediction
    accuracy, while simultaneously reducing an adversary’s ability to determine the protected attribute
    from the predictions.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "f9822847-d19f-40f9-8e23-3fdcd5dcb847",
        'version': __version__,
        'name': "In-processing Fairness Techniques",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['fairness, bias, debias, data inprocessing, data augmentation'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/D3M-Fairness-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://github.com/NewKnowledge/D3M-Fairness-Primitives.git@{git_commit}#egg=FairnessPrimitives'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.data_augmentation.data_conversion.FairnessInProcessing',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DATA_AUGMENTATION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self.label_names = None
        self.protected_attributes = None
        self.idx = None
        self.attribute_names = None
        self.unfavorable_label = None
        self.train_dataset = None
        self.clf = None
        # Initialised so get_params() works before set_params() has been called.
        self._params = {}

    def get_params(self) -> Params:
        """Return the params most recently passed to ``set_params``."""
        return self._params

    def set_params(self, *, params: Params) -> None:
        """Store ``params`` under the attribute that ``get_params`` reads."""
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        '''
        Sets primitive's training data

        Parameters
        ----------
        inputs : features
        outputs : labels
        '''
        column_names = list(inputs)

        # only select attributes from training data
        targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')
        if not len(targets):
            targets = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')
        self.label_names = [column_names[t] for t in targets]

        # calculate protected attributes
        self.protected_attributes = [column_names[c] for c in self.hyperparams['protected_attribute_cols']]

        # save index and metadata
        idx = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrimaryKey')
        self.idx = [column_names[i] for i in idx]

        # mark attributes that are not privileged data
        attributes = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/Attribute')
        priveleged_data = inputs.metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/PrivilegedData')
        attributes = list(set(attributes) - set(priveleged_data))
        self.attribute_names = [column_names[a] for a in attributes]

        # transform dataframe to IBM 360 compliant dataset
        # 1. assume datacleaning primitive has been applied so there are no NAs
        # 2. assume categorical columns have been converted to unique numeric values
        # 3. assume the label column is numeric
        self.unfavorable_label = 0. if self.hyperparams['favorable_label'] == 1. else 1.
        self.train_dataset = datasets.BinaryLabelDataset(df = inputs[self.attribute_names + self.label_names],
                                                         label_names = self.label_names,
                                                         protected_attribute_names = self.protected_attributes,
                                                         favorable_label=self.hyperparams['favorable_label'],
                                                         unfavorable_label=self.unfavorable_label)

        # apply in-processing algorithm
        self.clf = inprocessing.AdversarialDebiasing(unprivileged_groups = [{self.protected_attributes[0]: self.train_dataset.unprivileged_protected_attributes}],
                                                     privileged_groups = [{self.protected_attributes[0]: self.train_dataset.privileged_protected_attributes}],
                                                     scope_name = 'adversarial_debiasing',
                                                     sess = tf.Session())

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit primitive using adversarial debiasing algorithm

        Parameters
        ----------
        inputs : None

        Returns
        ----------
        Outputs : None
        """
        self.clf = self.clf.fit(self.train_dataset)
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce predictions using fit adversarial debiasing algorithm

        Parameters
        ----------
        inputs : D3M dataframe

        Returns
        ----------
        Outputs : D3M dataframe -> predictions from fit debiasing algorithm
        """
        # transform test dataframe to IBM 360 compliant dataset
        # The label columns of `inputs` are overwritten in place with the first
        # inputs.shape[0] training labels before the dataset is built.
        inputs[self.label_names] = self.train_dataset.convert_to_dataframe()[0][self.label_names].values[:inputs.shape[0]].astype(int)
        test_dataset = datasets.BinaryLabelDataset(df = inputs[self.attribute_names + self.label_names],
                                                   label_names = self.label_names,
                                                   protected_attribute_names = self.protected_attributes,
                                                   favorable_label=self.hyperparams['favorable_label'],
                                                   unfavorable_label=self.unfavorable_label)

        transformed_dataset = self.clf.predict(test_dataset)

        # transform IBM dataset back to D3M dataset
        df = transformed_dataset.convert_to_dataframe()[0][self.label_names].astype(int)
        df = d3m_DataFrame(pandas.concat([inputs[self.idx].reset_index(drop=True), df.reset_index(drop=True)], axis = 1))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 0), inputs.metadata.query_column(0))
        df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 1), inputs.metadata.query_column(1))
        # preview of the result written to the interpreter's original stdout
        print(df.head(), file = sys.__stdout__)
        return CallResult(df)
class Hdbscan(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    '''
    Produce primitive's best guess for the cluster number of each series.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "ca014488-6004-4b54-9403-5920fbe5a834",
        'version': __version__,
        'name': "hdbscan",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Time Series'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.29.7',
        }, {
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/TimeSeries-D3M-Wrappers.git@{git_commit}#egg=TimeSeriesD3MWrappers'
                           .format(git_commit=utils.current_git_commit(
                               os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.clustering.hdbscan.Hdbscan',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.DBSCAN,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.CLUSTERING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        # Hyperparams for the time series formatter used by produce() on
        # inputs that are not in long format.
        formatter_code = TimeSeriesFormatterPrimitive.metadata.query()['primitive_code']
        formatter_hp_class = formatter_code['class_type_arguments']['Hyperparams']
        formatter_overrides = {
            'file_col_index': 1,
            'main_resource_index': 'learningData'
        }
        self._hp = formatter_hp_class.defaults().replace(formatter_overrides)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Cluster the input time series and label each one with its cluster.

        Parameters
        ----------
        inputs : dataset that is converted to a dataframe with 'd3mIndex' and
            'value' columns, either by the time series formatter or, when the
            'long_format' hyperparam is set, by the dataset-to-dataframe primitive

        Returns
        ----------
        Outputs
            A dataframe with a 'd3mIndex' column (one row per unique index value)
            and a 'label' column holding the associated series' cluster number.
        """
        # temporary (until Uncharted adds conversion primitive to repo)
        if self.hyperparams['long_format']:
            ds2df_code = DatasetToDataFrame.DatasetToDataFramePrimitive.metadata.query()['primitive_code']
            ds2df_hp_class = ds2df_code['class_type_arguments']['Hyperparams']
            ds2df_hp = ds2df_hp_class.defaults().replace(
                {"dataframe_resource": "learningData"})
            converter = DatasetToDataFrame.DatasetToDataFramePrimitive(hyperparams=ds2df_hp)
            inputs = d3m_DataFrame(converter.produce(inputs=inputs).value)
        else:
            formatter = TimeSeriesFormatterPrimitive(hyperparams=self._hp)
            inputs = formatter.produce(inputs=inputs).value['0']

        # one row of values per unique index value
        series_ids = inputs.d3mIndex.unique()
        n_ts = len(series_ids)
        ts_sz = int(inputs.shape[0] / n_ts)
        input_vals = np.array(inputs.value).reshape(n_ts, ts_sz)

        # the 'algorithm' hyperparam selects DBSCAN; anything else uses the
        # hierarchical variant
        min_samples_of = self.hyperparams
        if self.hyperparams['algorithm'] == 'DBSCAN':
            clustering = cluster.ClusterSimilarityMatrix(
                input_vals, self.hyperparams['eps'], min_samples_of['min_samples'])
        else:
            clustering = cluster.HClusterSimilarityMatrix(
                input_vals, self.hyperparams['min_cluster_size'], min_samples_of['min_samples'])
        _, raw_labels, _ = clustering

        # transform labels for D3M classification task
        shifted_labels = []
        for raw in raw_labels:
            if raw >= 0:
                shifted_labels.append(raw + 1)
            else:
                shifted_labels.append(raw + 2)

        # assemble the two-column output table
        label_frame = pandas.DataFrame(shifted_labels)
        id_frame = pandas.DataFrame(series_ids)
        out_df = pandas.concat([id_frame, label_frame], axis=1)
        out_df.columns = ['d3mIndex', 'label']
        hdbscan_df = d3m_DataFrame(out_df)

        # per-column metadata: (position, name, semantic types); both columns
        # are declared with a str structural type
        column_specs = (
            (0, 'd3mIndex', (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
            )),
            (1, 'label', (
                'http://schema.org/Integer',
                'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
                'https://metadata.datadrivendiscovery.org/types/TrueTarget',
                'https://metadata.datadrivendiscovery.org/types/Target',
            )),
        )
        for position, column_name, semantic_types in column_specs:
            selector = (metadata_base.ALL_ELEMENTS, position)
            col_dict = dict(hdbscan_df.metadata.query(selector))
            col_dict['structural_type'] = str
            col_dict['name'] = column_name
            col_dict['semantic_types'] = semantic_types
            hdbscan_df.metadata = hdbscan_df.metadata.update(selector, col_dict)

        return CallResult(hdbscan_df)
class simon(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive that predicts, for each column of an input frame, its semantic type
    labels and their probabilities using a pretrained Simon model.
    """
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "d2fa8df2-6517-3c26-bafc-87b701c4043a",
        'version': __version__,
        'name': "simon",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Data Type Predictor'],
        'source': {
            'name': __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/simon-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@{git_commit}#egg=SimonD3MWrapper'
                           .format(git_commit=utils.current_git_commit(
                               os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.distil.simon',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._decoder = JSONDecoder()
        self._params = {}

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """No-op: the model is loaded from pretrained weights inside ``produce``."""
        return CallResult(None)

    def get_params(self) -> Params:
        """Return the params most recently passed to ``set_params`` (initially empty)."""
        return self._params

    def set_params(self, *, params: Params) -> None:
        """Store ``params`` under the attribute that ``get_params`` reads."""
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """No-op: this primitive is not trained."""
        pass

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's best guess for the structural type of each input column.

        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs
            A pandas frame with two columns, 'semantic types' and 'probabilities',
            and one row per column of the input frame: the column's multi-label
            classification and the matching prediction probabilities.
            On any failure the string "Failed predicting data frame" is returned
            instead and the error is logged.

        NOTE(review): the result is returned as-is rather than wrapped in a
        CallResult, despite the annotation; kept unchanged for existing callers.
        """
        import logging

        frame = inputs

        try:
            # setup model as you typically would in a Simon main file
            maxlen = 20
            max_cells = 500
            p_threshold = 0.5

            checkpoint_dir = "pretrained_models/"
            execution_config = "Base.pkl"

            with open('Categories.txt', 'r') as f:
                categories = f.read().splitlines()

            # orient the user a bit
            print("fixed categories are: ")
            categories = sorted(categories)
            print(categories)
            category_count = len(categories)

            # load specified execution configuration
            classifier = Simon(encoder={})  # dummy text classifier

            config = classifier.load_config(execution_config, checkpoint_dir)
            encoder = config['encoder']
            checkpoint = config['checkpoint']

            X = encoder.encodeDataFrame(frame)

            # build classifier model
            model = classifier.generate_model(maxlen, max_cells, category_count)
            classifier.load_weights(checkpoint, None, model, checkpoint_dir)
            model.compile(loss='binary_crossentropy',
                          optimizer='adam', metrics=['binary_accuracy'])
            y = model.predict(X)
            # discard empty column edge case
            y[np.all(frame.isnull(), axis=0)] = 0

            out = encoder.reverse_label_encode(y, p_threshold)
            return pd.DataFrame.from_records(
                out, columns=['semantic types', 'probabilities'])
        except Exception:
            # Best effort, as before: callers still get the failure string, but
            # the cause is now recorded instead of silently discarded.
            logging.getLogger(__name__).exception("Failed predicting data frame")
            return "Failed predicting data frame"
class TimeSeriesLoaderPrimitive(
        transformer.TransformerPrimitiveBase[container.DataFrame,
                                             container.DataFrame,
                                             Hyperparams]):
    """
    Reads the time series files from a given column in an input dataframe into a new M x N dataframe,
    where each timeseries occupies one of M rows, and each of the row's N entries represents a timestamp.
    The loading process assumes that each series file has an identical set of timestamps.
    """

    # A column qualifies as a "csv file column" only when its metadata carries
    # ALL of these semantic types and ALL of these media types.
    _semantic_types = (
        'https://metadata.datadrivendiscovery.org/types/FileName',
        'https://metadata.datadrivendiscovery.org/types/Timeseries')
    _media_types = ('text/csv', )

    # NOTE: the trailing comma makes this a 1-tuple; left unchanged on purpose.
    __author__ = 'Uncharted Software',
    metadata = metadata_base.PrimitiveMetadata({
        'id': '1689aafa-16dc-4c55-8ad4-76cadcf46086',
        'version': '0.2.0',
        'name': 'Time series loader',
        'python_path': 'd3m.primitives.data_preprocessing.timeseries_loader.DistilTimeSeriesLoader',
        'keywords': ['series', 'reader', 'csv'],
        'source': {
            'name': 'Uncharted Software',
            'contact': 'mailto:[email protected]',
            'uris': ['https://gitlab.com/uncharted-distil/distil-timeseries-loader']
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/uncharted-distil/distil-timeseries-loader.git@' +
                           '{git_commit}#egg=DistilTimeSeriesLoader-0.2.0'.format(
                               git_commit=d3m_utils.current_git_commit(
                                   os.path.dirname(__file__)),
                           ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        'supported_media_types': _media_types,
        'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _find_csv_file_column(
            cls, inputs_metadata: metadata_base.DataMetadata
    ) -> typing.Optional[int]:
        """
        Return the index of the first column that points to csv files.

        Candidate columns are those listed for ``cls._semantic_types``; each
        candidate is confirmed with ``_is_csv_file_column``.  Returns ``None``
        when no column qualifies.
        """
        indices = utils.list_columns_with_semantic_types(
            inputs_metadata, cls._semantic_types)
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int) -> bool:
        """
        Check whether a given column is a file pointer that points to a csv file.

        The column must have ``str`` structural type and its metadata must
        contain every entry of ``cls._semantic_types`` and ``cls._media_types``.
        """
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata['structural_type'] != str:
            return False

        semantic_types = column_metadata.get('semantic_types', [])
        media_types = column_metadata.get('media_types', [])

        return set(cls._semantic_types).issubset(semantic_types) and set(
            cls._media_types).issubset(media_types)

    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:
        """
        Load every referenced time series file and stack them row-wise.

        The file column is taken from the ``file_col_index`` hyperparameter, or
        auto-detected from metadata when that hyperparameter is ``None``.  For
        each file, the row selected by ``value_col_index`` of the transposed
        CSV is appended; the row selected by ``time_col_index`` of the first
        file supplies the column headers.

        Raises
        ------
        exceptions.InvalidArgumentValueError
            If the configured column is not a csv file column, or no csv file
            column can be found.
        """
        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) + ' from ' +
                    str(inputs.columns) + ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column from ' + str(inputs.columns) +
                    ' contains csv file names')

        value_index = self.hyperparams['value_col_index']
        time_index = self.hyperparams['time_col_index']

        # load each time series file, transpose, and append
        base_path = inputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0]

        # Start from an empty frame so that an input with zero rows yields an
        # empty result instead of an UnboundLocalError further down.
        timeseries_dataframe = pd.DataFrame()
        for idx, file_path in enumerate(inputs.iloc[:, file_index]):
            csv_path = os.path.join(base_path, file_path)
            timeseries_row = pd.read_csv(csv_path).transpose()
            # use the time values as the column headers
            # (value comparison: ``is 0`` relied on CPython's small-int cache)
            if idx == 0:
                timeseries_dataframe = pd.DataFrame(
                    columns=timeseries_row.iloc[time_index])

            # NOTE: DataFrame.append was deprecated in pandas 1.4 and removed
            # in pandas 2.0; a pandas upgrade requires moving to pd.concat.
            timeseries_dataframe = timeseries_dataframe.append(
                timeseries_row.iloc[value_index])

        # get the index to use a range of ints rather than the value col name
        timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(container.DataFrame(data=timeseries_dataframe))
class SSC_ADMM(
        clustering.ClusteringDistanceMatrixMixin[Inputs, Outputs, type(None),
                                                 SSC_ADMMHyperparams,
                                                 DistanceMatrixOutput],
        clustering.ClusteringTransformerPrimitiveBase[Inputs, Outputs,
                                                      SSC_ADMMHyperparams]):
    # Sparse subspace clustering primitive: computes a sparse self-expression
    # coefficient matrix with ADMM, turns it into an adjacency matrix and runs
    # spectral clustering (KMeans on the normalized Laplacian embedding).

    metadata = metadata_module.PrimitiveMetadata({
        'id': '83083e82-088b-47f4-9c0b-ba29adf5a51d',
        'version': "0.0.5",
        'name': 'SSC_ADMM',
        'description': """Does sparse subspace clustering, using the Alternating Direction Method of Multipliers framework for optimization.""",
        'keywords': [
            'clustering', 'subspace', 'sparse',
            'Alternating Direction Method of Multipliers'
        ],
        'source': {
            'name': 'Michigan',
            'contact': 'mailto:[email protected]',
            'uris': [
                # link to file and repo
                'https://github.com/dvdmjohnson/d3m_michigan_primitives/blob/master/spider/cluster/ssc_admm/ssc_admm.py',
                'https://github.com/dvdmjohnson/d3m_michigan_primitives'
            ],
            'citation': """@article{elhamifar2013sparse, title={Sparse subspace clustering: Algorithm, theory, and applications}, author={Elhamifar, Ehsan and Vidal, Rene}, journal={IEEE transactions on pattern analysis and machine intelligence}, volume={35}, number={11}, pages={2765--2781}, year={2013}, publisher={IEEE}}"""
        },
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/dvdmjohnson/d3m_michigan_primitives.git@{git_commit}#egg=spider'
            .format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)))
        }, {
            'type': metadata_module.PrimitiveInstallationType.UBUNTU,
            'package': 'ffmpeg',
            'version': '7:2.8.11-0ubuntu0.16.04.1'
        }],
        'python_path': 'd3m.primitives.clustering.ssc_admm.Umich',
        'hyperparams_to_tune': ['n_clusters', 'alpha'],
        'algorithm_types': [metadata_module.PrimitiveAlgorithmType.SUBSPACE_CLUSTERING],
        'primitive_family': metadata_module.PrimitiveFamily.CLUSTERING
    })

    ## stores the hyperparameters on the instance
    #  alpha == -1 means "use the default": 20 with outliers, 800 without
    def __init__(
            self,
            *,
            hyperparams: SSC_ADMMHyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._use_affine = hyperparams['use_affine']
        self._use_outliers = hyperparams['use_outliers']
        self._alpha = hyperparams['alpha'] if hyperparams['alpha'] != -1 else (
            20 if self._use_outliers else 800)
        self._epsilon = 0.0002
        self._k = hyperparams['n_clusters']
        self._random_state = np.random.RandomState(random_seed)

    ## no-op: nothing is stored from the training data
    def set_training_data(self, *, inputs: Inputs) -> None:
        pass

    ## computes regularization parameter lambda to be used in ADMM algorithm
    #  @param Y DxN data matrix
    #  @param P Dx? modified data matrix
    #  @return regularization parameter lambda for ADMM algorithm
    def _compute_lambda(self, Y, P):
        T = P.T * Y
        np.fill_diagonal(T, 0.0)
        T = np.absolute(T)
        lam = np.min(np.amax(T, axis=0))
        return lam

    ## shrinkage threshold operator
    #  @param eta number
    #  @param M NumPy matrix
    #  @return NumPy matrix resulting from applying shrinkage threshold operator to each entry of M
    def _shrinkage_threshold(self, eta, M):
        ST = np.matrix(
            np.maximum(np.zeros(M.shape),
                       np.array(np.absolute(M)) - eta) * np.array(np.sign(M)))
        return ST

    ## computes maximum L2-norm error among columns of residual of linear system
    #  @param P DxN NumPy matrix
    #  @param Z NxN NumPy matrix
    #  @return maximum L2-norm of columns of P-P*Z
    def _error_linear_system(self, P, Z):
        R, N = Z.shape
        # R > N happens in the outlier formulation, where Z has extra rows
        Y = P[:, :N] if R > N else P
        Y0 = Y - P[:, N:] * Z[N:, :] if R > N else P
        C = Z[:N, :] if R > N else Z
        n = np.linalg.norm(Y0, 2, axis=0)
        S = np.array((Y0 / n) - Y * (C / n))
        # builtin sum over the rows of S*S yields the per-column sums
        err = np.sqrt(np.max(sum(S * S)))
        return err

    ## computes adjacency matrix given coefficient matrix
    #  @param C NxN coefficient matrix (NumPy matrix)
    #  @return NxN adjacency matrix (NumPy matrix)
    def _build_adjacency_matrix(self, C):
        eps = 2.220446049250313e-16
        N = C.shape[0]
        CAbs = np.absolute(C)
        # scale every column by its largest absolute entry
        for i in range(N):
            CAbs[:, i] = CAbs[:, i] / (np.amax(CAbs[:, i]) + eps)
        A = CAbs + np.transpose(CAbs) + eps
        np.fill_diagonal(A, 0.0)
        return A

    ## spectral clustering algorithm
    #  @param W NxN adjacency matrix (NumPy matrix)
    #  @param n_clusters number of clusters
    #  @param max_iter maximum number of iterations for KMeans
    #  @param n_init number of replications for KMeans
    #  @return labels for N points
    def _spectral_clustering(self, W, n_clusters=10, max_iter=1000, n_init=20):
        N, _ = W.shape
        eps = 2.220446049250313e-16
        DN = np.diag(1 / np.sqrt(np.sum(W, axis=0) + eps))
        LapN = np.identity(N) - np.matmul(np.matmul(DN, W), DN)
        _, _, VN = np.linalg.svd(LapN)
        # keep the last n_clusters right singular vectors and normalize rows
        kerN = VN.T[:, (N - n_clusters):N]
        normN = np.sqrt(np.sum(np.square(kerN), axis=1))
        kerNS = (kerN.T / (normN + eps).T).T
        kmeans = KMeans(n_clusters,
                        n_init=n_init,
                        max_iter=max_iter,
                        random_state=self._random_state).fit(kerNS)
        labels = kmeans.labels_.reshape((N, ))
        return labels

    ## ADMM algorithm with outliers
    #  @param X DxN NumPy array/matrix representing N points in D-dimensional space
    #  @param use_affine whether or not data points come from union of affine subspaces instead of linear subspaces
    #  @param alpha constant used in calculating updates
    #  @param epsilon termination constant
    #  @param max_iter maximum number of iterations
    #  @return sparse coefficient matrix (NumPy array)
    def _outlier_admm(self,
                      X,
                      use_affine=False,
                      alpha=20.0,
                      epsilon=0.0002,
                      max_iter=200):
        Y = np.matrix(X)
        D, N = Y.shape
        gamma = alpha / np.linalg.norm(Y, 1)
        P = np.concatenate((Y, np.matlib.eye(D) / gamma), axis=1)
        mu1 = alpha / self._compute_lambda(Y, P)
        mu2 = alpha
        C = np.matlib.zeros((N + D, N))
        if not use_affine:
            # initializations
            k = 1
            A = np.linalg.pinv(mu1 * P.T * P + mu2 * np.matlib.eye(N + D))
            Lambda1 = np.matlib.zeros((D, N))
            Lambda2 = np.matlib.zeros((N + D, N))
            err1 = 10.0 * epsilon
            err2 = 10.0 * epsilon
            # main loop
            while k < max_iter and (err1 > epsilon or err2 > epsilon):
                Z = A * (mu1 * P.T * (Y + Lambda1 / mu1) + mu2 *
                         (C - Lambda2 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda1 = Lambda1 + mu1 * (Y - P * Z)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                err1 = np.amax(np.absolute(Z - C))
                err2 = self._error_linear_system(P, Z)
                k += 1
        else:
            # initializations
            k = 1
            # delta is 1 for the first N rows and 0 for the D outlier rows
            delta = np.matrix([[float(i < N)] for i in range(N + D)])
            A = np.linalg.pinv(mu1 * P.T * P + mu2 * np.matlib.eye(N + D) +
                               mu2 * delta * delta.T)
            Lambda1 = np.matlib.zeros((D, N))
            Lambda2 = np.matlib.zeros((N + D, N))
            lambda3 = np.matlib.zeros((1, N))
            err1 = 10.0 * epsilon
            err2 = 10.0 * epsilon
            err3 = 10.0 * epsilon
            # main loop
            while k < max_iter and (err1 > epsilon or err2 > epsilon
                                    or err3 > epsilon):
                Z = A * (mu1 * P.T * (Y + Lambda1 / mu1) + mu2 *
                         (C - Lambda2 / mu2) + mu2 * delta *
                         (1.0 - lambda3 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda1 = Lambda1 + mu1 * (Y - P * Z)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                lambda3 = lambda3 + mu2 * (delta.T * Z - 1.0)
                err1 = np.amax(np.absolute(Z - C))
                err2 = self._error_linear_system(P, Z)
                err3 = np.amax(np.absolute(delta.T * Z - 1.0))
                k += 1
        # drop the D outlier rows, keep the NxN coefficient block
        C = np.array(C[:N, :])
        return C

    ## ADMM algorithm without outliers
    #  @param X DxN NumPy array/matrix representing N points in D-dimensional space
    #  @param use_affine whether or not data points come from union of affine subspaces instead of linear subspaces
    #  @param alpha constant used in calculating updates
    #  @param epsilon termination constant
    #  @param max_iter maximum number of iterations
    #  @return sparse coefficient matrix (NumPy array)
    def _lasso_admm(self,
                    X,
                    use_affine=False,
                    alpha=800.0,
                    epsilon=0.0002,
                    max_iter=200):
        Y = np.matrix(X)
        N = Y.shape[1]
        mu1 = alpha / self._compute_lambda(Y, Y)
        mu2 = alpha
        C = np.matlib.zeros((N, N))
        if not use_affine:
            # initializations
            k = 1
            A = np.linalg.pinv(mu1 * Y.T * Y + mu2 * np.matlib.eye(N))
            Lambda2 = np.matlib.zeros((N, N))
            err1 = 10.0 * epsilon
            # main loop
            while k < max_iter and err1 > epsilon:
                Z = A * (mu1 * Y.T * Y + mu2 * (C - Lambda2 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                err1 = np.amax(np.absolute(Z - C))
                k += 1
        else:
            # initializations
            k = 1
            # the scalar mu2 is added to every entry of the NxN matrix
            A = np.linalg.pinv(mu1 * Y.T * Y + mu2 * np.matlib.eye(N) + mu2)
            Lambda2 = np.matlib.zeros((N, N))
            lambda3 = np.matlib.zeros((1, N))
            err1 = 10.0 * epsilon
            err3 = 10.0 * epsilon
            # main loop
            while k < max_iter and (err1 > epsilon or err3 > epsilon):
                Z = A * (mu1 * Y.T * Y + mu2 *
                         (C - Lambda2 / mu2) + mu2 * np.matlib.ones(
                             (N, 1)) * (1.0 - lambda3 / mu2))
                np.fill_diagonal(Z, 0.0)
                C = self._shrinkage_threshold(1.0 / mu2, Z + Lambda2 / mu2)
                np.fill_diagonal(C, 0.0)
                Lambda2 = Lambda2 + mu2 * (Z - C)
                lambda3 = lambda3 + mu2 * (np.matlib.ones((1, N)) * Z - 1.0)
                err1 = np.amax(np.absolute(Z - C))
                err3 = np.amax(np.absolute(np.matlib.ones((1, N)) * Z - 1.0))
                k += 1
        C = np.array(C)
        return C

    ## computes sparse coefficient matrix using SSC algorithm with ADMM
    #  @param X NxD NumPy array/matrix representing N points in D-dimensional space
    #  @param max_iter maximum number of ADMM iterations
    #  @return sparse coefficient matrix (NumPy array)
    def _compute_sparse_coefficient_matrix(self, X, max_iter):
        XX = np.transpose(X)
        a = self._alpha
        C = self._outlier_admm(
            XX, self._use_affine, a, self._epsilon,
            max_iter) if self._use_outliers else self._lasso_admm(
                XX, self._use_affine, a, self._epsilon, max_iter)
        return C

    ## clusters the rows of inputs
    #  @param inputs 2-D array, one point per row
    #  @param iterations ADMM iteration cap; None or values below 5 fall back to 200
    #  @return CallResult wrapping one cluster label per input row
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        assert inputs.ndim == 2, "Inputs are not in the right shape"

        if iterations is None or iterations < 5:
            iterations = 200

        C = self._compute_sparse_coefficient_matrix(inputs, iterations)
        W = self._build_adjacency_matrix(C)
        labels = self._spectral_clustering(W, self._k)
        labels = np.array(labels)

        return base.CallResult(Outputs(labels))

    def produce_distance_matrix(self,
                                *,
                                inputs: Inputs,
                                timeout: float = None,
                                iterations: int = None
                                ) -> base.CallResult[DistanceMatrixOutput]:
        """
            Returns 1 - the affinity matrix generated from the subspace-transformed data
        """
        assert inputs.ndim == 2, "Inputs are not in the right shape"

        # same iteration default as produce()
        if iterations is None or iterations < 5:
            iterations = 200

        C = self._compute_sparse_coefficient_matrix(inputs, iterations)
        W = self._build_adjacency_matrix(C)

        return base.CallResult(DistanceMatrixOutput(1 - W))

    ## pickling support: constructor arguments plus the current RNG state
    def __getstate__(self) -> dict:
        return {
            'constructor': {
                'hyperparams': self.hyperparams,
                'random_seed': self.random_seed,
                'docker_containers': self.docker_containers,
            },
            'random_state': self._random_state,
        }

    ## rebuilds the instance through __init__, then restores the saved RNG
    def __setstate__(self, state: dict) -> None:
        self.__init__(**state['constructor'])  # type: ignore

        self._random_state = state['random_state']

    # placeholder for now, just calls base version.
    @classmethod
    def can_accept(
            cls, *, method_name: str,
            arguments: typing.Dict[str, typing.Union[metadata_module.Metadata, type]],
            hyperparams: SSC_ADMMHyperparams
    ) -> typing.Optional[metadata_module.DataMetadata]:
        return super().can_accept(method_name=method_name,
                                  arguments=arguments,
                                  hyperparams=hyperparams)
class FailPrimitive(
        transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    A primitive which fails on the requested method (given as hyper-parameter).
    Moreover, primitive does not correctly preserve state so if you pickle
    and unpickle it, it does not seen itself as fitted anymore.
    """

    metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({
        'id': 'd6dfbefa-0fb8-11e9-ab14-d663bd873d93',
        'version': __version__,
        'name': "Failure Tester",
        'keywords': ['test primitive'],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/fail.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        'installation': [
            {
                'type': metadata_base.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            },
        ],
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/fail.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        'python_path': 'd3m.primitives.operator.null.FailTest',
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.OPERATOR,
    })

    def __init__(self, *, hyperparams: Hyperparams) -> None:
        """Construct the primitive; fails here when '__init__' is the method to fail."""
        super().__init__(hyperparams=hyperparams)

        # The failure check runs before the fitted flag exists.
        self._conditional_fail('__init__')

        self._fitted = False

    def _conditional_fail(self, method_name: str) -> None:
        """Raise IntentionalError when ``method_name`` is the configured ``method_to_fail``."""
        requested = self.hyperparams['method_to_fail']
        if requested != method_name:
            return

        raise IntentionalError(self.__class__.__name__, method_name)

    def set_training_data(self) -> None:  # type: ignore
        """Reset the fitted flag and delegate to the base implementation."""
        self._conditional_fail('set_training_data')

        self._fitted = False

        super().set_training_data()

    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        """Mark the primitive as fitted and return the base class's fit result."""
        self._conditional_fail('fit')

        # The flag is set before the base call, as in the original ordering.
        self._fitted = True

        base_result = super().fit(timeout=timeout, iterations=iterations)
        return base_result

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """Return ``inputs`` unchanged; raises PrimitiveNotFittedError when not fitted."""
        self._conditional_fail('produce')

        if self._fitted:
            return base.CallResult(inputs)

        raise exceptions.PrimitiveNotFittedError("Primitive is not fitted.")
class DistilSeededGraphMatchingPrimitive(
    PrimitiveBase[container.List, container.DataFrame, Params, Hyperparams]
):
    """
    A primitive that matches seeded graphs.
    """

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "8baea8e6-9d3a-46d7-acf1-04fd593dcd37",
            "version": version.__version__,
            "name": "SeededGraphMatcher",
            "python_path": "d3m.primitives.graph_matching.seeded_graph_matching.DistilSeededGraphMatcher",
            "source": {
                "name": "Distil",
                "contact": "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/seeded_graph_matching.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.GRAPH_MATCHING,
        },
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Create the primitive with a fresh ``SGMGraphMatcher`` targeting accuracy."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._model = SGMGraphMatcher(target_metric="accuracy")

    def set_training_data(
        self, *, inputs: container.List, outputs: container.DataFrame
    ) -> None:
        """
        Store the training inputs and outputs.

        The name of the first column of ``outputs`` is remembered and later
        used as the prediction column name in ``produce``.
        """
        self._inputs = inputs
        self._outputs = outputs
        self._target_col = outputs.columns[0]

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Fit the matcher on the stored training inputs.

        The stored inputs are unpacked into three elements; ``.value`` of the
        first one is passed to the model along with the other two.
        """
        logger.debug("Fitting %s", __name__)

        X_train, y_train, U_train = self._inputs
        X_train = X_train.value
        self._model.fit(X_train, y_train, U_train)

        return CallResult(None)

    def produce(
        self, *, inputs: container.List, timeout: float = None, iterations: int = None
    ) -> CallResult[container.DataFrame]:
        """
        Predict matches for ``inputs`` and return them as a two-column dataframe.

        Column 0 holds the index of the first input element's ``.value`` and is
        tagged PrimaryKey; column 1 holds the integer predictions under the
        stored target column name and is tagged PredictedTarget.
        """
        logger.debug("Producing %s", __name__)

        # Only the first of the three input elements is used here.
        X_train, _, _ = inputs
        X_train = X_train.value
        result = self._model.predict(X_train).astype(int)

        # create dataframe to hold d3mIndex and result
        result_df = container.DataFrame(
            {X_train.index.name: X_train.index, self._target_col: result}
        )

        # mark the semantic types on the dataframe
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            "https://metadata.datadrivendiscovery.org/types/PrimaryKey",
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        )

        # Use the directly imported CallResult, consistent with fit() and the
        # annotations of this class.
        return CallResult(result_df)

    def get_params(self) -> Params:
        """Return the fitted model and the target column name."""
        return Params(model=self._model, target_col=self._target_col)

    def set_params(self, *, params: Params) -> None:
        """Restore the model and the target column name from ``params``."""
        self._model = params["model"]
        self._target_col = params["target_col"]
class NonParametricClustering(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    # Only metadata that cannot be derived automatically from the code lives here.
    metadata = metadata_module.PrimitiveMetadata({
        # UUID generated once with "uuid.uuid4()" and never changed afterwards.
        'id': '2e3cda2b-ce4a-39ae-ae02-22dc33affd17',
        'version': "0.1.0",
        'name': "jhu.nonpar",
        # Must match the entry point the primitive is registered under in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.NonParametricClustering',
        # Free-form keywords; there is no controlled vocabulary for these.
        'keywords': ['nonparametric'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs: link to the file, then link to the repository.
                'https://github.com/neurodata/primitives-interfaces/jhu_primitives/nonpar/nonpar.py',
                # 'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/neurodata/primitives-interfaces.git',
            ],
        },
        # Ordered list of dependencies (Python packages, system packages or
        # Docker images). Order matters when one package has to be installed
        # before another package's setup.py can run, or when a dependency is
        # not available on PyPi.
        'installation': [
            {
                'type': metadata_module.PrimitiveInstallationType.PIP,
                'package_uri': 'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'.format(
                    git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
            },
        ],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Values come from the controlled vocabulary in the schema; open a merge
        # request if nothing there describes the primitive well.
        'algorithm_types': [
            "HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"
        ],
        'primitive_family': "DATA_TRANSFORMATION"
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, str] = None) -> None:
        # Nothing beyond the base-class initialisation is needed.
        super().__init__(
            hyperparams=hyperparams,
            random_seed=random_seed,
            docker_containers=docker_containers,
        )

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Non-parametric clustering

        Passes ``inputs[0]`` and ``inputs[1]`` together with the ``sigma``
        hyperparameter to the R function ``nonpar.interface``, which is loaded
        from ``nonpar.interface.R`` located next to this module.

        **Inputs:**

        inputs[0], inputs[1]:
            - the two objects handed to R as ``xhat1`` and ``xhat2``

        **Hyperparameters:**

        sigma:
            - forwarded to R as ``sigma``

        **Returns:**

        The R result converted to a numpy array and wrapped as
        ``container.ndarray`` inside a ``CallResult``.
        """
        first_sample, second_sample = inputs[0], inputs[1]
        kernel_sigma = self.hyperparams['sigma']

        module_dir = os.path.abspath(os.path.dirname(__file__))
        r_script_path = os.path.join(module_dir, "nonpar.interface.R")

        # Source the R script and expose a thin wrapper function to call.
        r_code = """
        source("%s")
        fn <- function(xhat1, xhat2, sigma) {
            nonpar.interface(xhat1, xhat2, sigma)
        }
        """ % r_script_path

        r_function = robjects.r(r_code)
        r_result = r_function(first_sample, second_sample, kernel_sigma)

        return base.CallResult(container.ndarray(np.array(r_result)))
class TextEncoderPrimitive(base.PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Encodes string fields using TFIDF scoring combined with a linear SVC classifier.  The original string field is removed
    and replaced with encoding columns.
    """

    _attribute_semantic = "https://metadata.datadrivendiscovery.org/types/Attribute"

    metadata = metadata_base.PrimitiveMetadata(
        {
            "id": "09f252eb-215d-4e0b-9a60-fcd967f5e708",
            "version": version.__version__,
            "name": "Text encoder",
            "python_path": "d3m.primitives.data_transformation.encoder.DistilTextEncoder",
            "source": {
                "name": "Distil",
                "contact": "mailto:[email protected]",
                "uris": [
                    "https://github.com/uncharted-distil/distil-primitives/blob/main/distil/primitives/text_encoder.py",
                    "https://github.com/uncharted-distil/distil-primitives",
                ],
            },
            "installation": [
                CYTHON_DEP,
                {
                    "type": metadata_base.PrimitiveInstallationType.PIP,
                    "package_uri": "git+https://github.com/uncharted-distil/distil-primitives.git@{git_commit}#egg=distil-primitives".format(
                        git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                    ),
                },
            ],
            "algorithm_types": [
                metadata_base.PrimitiveAlgorithmType.ENCODE_BINARY,
            ],
            "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        },
    )

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        """Create the primitive with no fitted encoders and no selected columns."""
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        # One encoder per entry of ``_cols``, in the same order.
        self._encoders: List[SVMTextEncoder] = []
        self._cols: List[int] = []

    def __getstate__(self) -> dict:
        """Extend the base pickling state with the encoders and column indices."""
        state = base.PrimitiveBase.__getstate__(self)
        state["models"] = self._encoders
        state["columns"] = self._cols
        return state

    def __setstate__(self, state: dict) -> None:
        """Restore the base state, then the encoders and column indices."""
        base.PrimitiveBase.__setstate__(self, state)
        self._encoders = state["models"]
        self._cols = state["columns"]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Store training inputs and flatten the first output column into a Series.
        """
        self._inputs = inputs

        # https://github.com/scikit-learn/scikit-learn/issues/14429#issuecomment-513887163
        # Both branches yield a 1-D array of the first column's values.
        if isinstance(outputs, container.pandas.DataFrame) and outputs.shape[1] == 1:
            outputs = outputs.values.reshape(outputs.shape[0], )
        else:
            outputs = outputs.iloc[:, 0].values
        self._outputs = pd.Series(outputs)

    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        """
        Fit one encoder per text column of the stored training inputs.

        Columns are selected through ``distil_utils.get_operating_columns``
        using the ``use_columns`` hyperparameter and the ``http://schema.org/Text``
        semantic type.  The encoder class is chosen by the ``encoder_type``
        hyperparameter (``"svm"`` or ``"tfidf"``).

        Raises
        ------
        ValueError
            If ``encoder_type`` is neither ``"svm"`` nor ``"tfidf"``.
        """
        logger.debug(f"Fitting {__name__}")

        # determine columns to operate on
        cols = distil_utils.get_operating_columns(
            self._inputs, self.hyperparams["use_columns"],
            ("http://schema.org/Text", ))

        logger.debug(f"Found {len(cols)} columns to encode")

        self._cols = list(cols)
        self._encoders = []
        # Value comparison; ``is 0`` relied on CPython's small-int cache.
        if len(cols) == 0:
            return base.CallResult(None)

        for i, c in enumerate(self._cols):
            if self.hyperparams["encoder_type"] == "svm":
                self._encoders.append(
                    SVMTextEncoder(self.hyperparams["metric"], self.random_seed))
            elif self.hyperparams["encoder_type"] == "tfidf":
                self._encoders.append(TfidifEncoder())
            else:
                # ValueError is an Exception subclass, so existing handlers
                # that catch Exception keep working.
                raise ValueError(
                    f"{self.hyperparams['encoder_type']} is not a valid encoder type"
                )
            text_inputs = self._inputs.iloc[:, c]
            try:
                # requires fit transform to fit SVM on vectorizer results
                self._encoders[i].fit_transform(text_inputs, self._outputs)
            except Exception:
                # Best-effort fallback kept from the original: retry with a
                # placeholder string in every row.  Narrowed from a bare
                # ``except`` so KeyboardInterrupt/SystemExit still propagate.
                logger.warning(
                    "Fitting text encoder for column %s failed; retrying with placeholder text",
                    c,
                    exc_info=True)
                text_inputs[:] = "avoiding a bug"
                self._encoders[i].fit_transform(text_inputs, self._outputs)

        return base.CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """
        Replace each fitted text column with its encoded numeric columns.

        When no columns were selected during ``fit`` the inputs are returned
        unchanged.  Encoded columns are named ``__text_<n>``, tagged as Float
        and Attribute, and record the name of their source column under the
        ``source_column`` metadata key.
        """
        logger.debug(f"Producing {__name__}")

        if len(self._cols) == 0:
            return base.CallResult(inputs)

        outputs = inputs.copy()
        encoded_cols = container.DataFrame()
        encoded_cols_source = []
        # encode columns into a new dataframe
        for i, c in enumerate(self._cols):
            text_inputs = outputs.iloc[:, c]
            result = self._encoders[i].transform(text_inputs)
            for j in range(result.shape[1]):
                encoded_idx = i * result.shape[1] + j
                encoded_cols[(f"__text_{encoded_idx}")] = result[:, j]
                encoded_cols_source.append(c)

        # generate metadata for encoded columns
        encoded_cols.metadata = encoded_cols.metadata.generate(encoded_cols)
        for c in range(encoded_cols.shape[1]):
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), "http://schema.org/Float")
            encoded_cols.metadata = encoded_cols.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, c), self._attribute_semantic)
            col_dict = dict(
                encoded_cols.metadata.query((metadata_base.ALL_ELEMENTS, c)))
            col_dict["source_column"] = outputs.metadata.query(
                (metadata_base.ALL_ELEMENTS, encoded_cols_source[c]))["name"]
            encoded_cols.metadata = encoded_cols.metadata.update(
                (metadata_base.ALL_ELEMENTS, c), col_dict)

        # append the encoded columns and remove the source columns
        outputs = outputs.append_columns(encoded_cols)
        outputs = outputs.remove_columns(self._cols)

        # Lazy formatting: the frame is only rendered when DEBUG is enabled.
        logger.debug("\n%s", outputs)

        return base.CallResult(outputs)

    def get_params(self) -> Params:
        """Return the fitted encoders and the indices of the encoded columns."""
        return Params(_encoders=self._encoders, _cols=self._cols)

    def set_params(self, *, params: Params) -> None:
        """Restore the encoders and column indices from ``params``."""
        self._encoders = params["_encoders"]
        self._cols = params["_cols"]
class StatisticalMeanAbsTemporalDerivativePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Primitive to find mean_abs_temporal_derivative of time series
    """
    # NOTE: the trailing comma makes this a 1-tuple; left unchanged on purpose.
    __author__ = "DATA Lab at Texas A&M University",
    metadata = metadata_base.PrimitiveMetadata(
        {
            'id': 'eb571238-6229-4fe4-94b3-684f043e4dbf',
            'version': '0.1.0',
            'name': 'Time Series Decompostional',
            'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
            'keywords': ['Time Series','MeanAbsTemporalDerivative'],
            "hyperparams_to_tune": ['window_size'],
            'source': {
                'name': 'DATA Lab at Texas A&M University',
                'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py'],
                'contact': 'mailto:[email protected]'

            },
            'installation': [
                {'type': metadata_base.PrimitiveInstallationType.PIP,
                 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format(
                     git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
                 ),
                 }

            ],
            'algorithm_types': [
                metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
            ],
            'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,

        }
    )

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        """

        Args:
            inputs: Container DataFrame
            timeout: Default
            iterations: Default

        Returns:
            Container DataFrame containing mean_abs_temporal_derivative of time series

        Raises:
            RuntimeError: no column was selected and ``error_on_no_input`` is set.
            PrimitiveNotFittedError: no column was selected and
                ``error_on_no_input`` is not set.
        """

        self.logger.info('Statistical MeanAbsTemporalDerivative Primitive called')

        # Get cols to fit.
        self._fitted = False
        self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            # self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("No input columns were selected")

        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # From here on at least one column is selected (otherwise one of the
        # exceptions above was raised), so the original "no columns" fallback
        # at this point was unreachable and has been dropped.
        statistical_mean_abs_temporal_derivative_input = inputs
        if self.hyperparams['use_semantic_types']:
            statistical_mean_abs_temporal_derivative_input = inputs.iloc[:, self._training_indices]

        statistical_mean_abs_temporal_derivative_output = self._mean_abs_temporal_derivative(
            statistical_mean_abs_temporal_derivative_input, self.hyperparams["window_size"])

        if sparse.issparse(statistical_mean_abs_temporal_derivative_output):
            statistical_mean_abs_temporal_derivative_output = statistical_mean_abs_temporal_derivative_output.toarray()
        outputs = self._wrap_predictions(inputs, statistical_mean_abs_temporal_derivative_output)

        #if len(outputs.columns) == len(self._input_column_names):
        #    outputs.columns = self._input_column_names

        output_columns = [outputs]

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)

        self.logger.info('Statistical MeanAbsTemporalDerivative Primitive returned')

        return base.CallResult(outputs)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        """
        Select columns to fit.
        Args:
            inputs: Container DataFrame
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            tuple of (DataFrame restricted to the selected columns, list of
            selected column indices). Without ``use_semantic_types`` this is
            the unmodified input and all of its column indices.
        """
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        use_columns = hyperparams['use_columns']
        exclude_columns = hyperparams['exclude_columns']

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=use_columns,
                                                                                   exclude_columns=exclude_columns,
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int,
                            hyperparams: Hyperparams) -> bool:
        """
        Output whether a column can be processed.
        Args:
            inputs_metadata: d3m.metadata.base.DataMetadata
            column_index: int

        Returns:
            bool: True when the column's structural type is an accepted
            numeric type.
        """
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)

        # NOTE(review): the original followed this check with an unconditional
        # ``return True``, which made its Attribute semantic-type filter
        # unreachable. The dead code is removed and the effective behaviour
        # (structural-type check only) is kept; confirm whether the semantic
        # filter should be re-enabled.
        return issubclass(column_metadata['structural_type'], accepted_structural_types)

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            # Drop any caller-supplied structural type before updating, so the
            # generated one is not overwritten.
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        """
        Wrap predictions into dataframe
        Args:
            inputs: Container Dataframe
            predictions: array-like data (n_samples, n_features)

        Returns:
            Dataframe
        """
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)

        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        """
        Add target columns metadata
        Args:
            outputs_metadata: metadata.base.DataMetadata
            hyperparams: d3m.metadata.hyperparams.Hyperparams

        Returns:
            List[OrderedDict]: one entry per output column, each carrying the
            ``return_semantic_type`` hyperparameter as its only semantic type.
        """
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            # column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            column_metadata['semantic_types'] = [hyperparams["return_semantic_type"]]

            # column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    def _write(self, inputs: Inputs):
        """Dump ``inputs`` to a CSV file named after the current timestamp."""
        inputs.to_csv(str(time.time()) + '.csv')

    def _mean_abs_temporal_derivative(self, X, window_size):
        """ statistical mean_abs_temporal_derivative of time series sequence
        Args:
            X : DataFrame
               Time series.
            window_size : int
               Length of the trailing window; -1 means the whole series.
        Returns:
            DataFrame
                A object with mean_abs_temporal_derivative, one
                ``<column>_mean_abs_temporal_derivative`` column per input column.
        """
        if window_size == -1:
            window_size = len(X)
        transformed_X = utils.pandas.DataFrame()
        for column in X.columns:
            column_value = X[column].values
            column_mean_abs_temporal_derivative = np.zeros(len(column_value))
            # ``end`` is the last position of the window (formerly named
            # ``iter``, which shadowed the builtin).
            for end in range(window_size - 1, len(column_value)):
                sequence = column_value[end - window_size + 1:end + 1]
                column_mean_abs_temporal_derivative[end] = np.mean(np.abs(np.diff(sequence)))
            # Positions before the first full window reuse its value.
            column_mean_abs_temporal_derivative[:window_size - 1] = column_mean_abs_temporal_derivative[window_size - 1]
            # f-string keeps the same name for str labels and also accepts
            # non-string column labels.
            transformed_X[f"{column}_mean_abs_temporal_derivative"] = column_mean_abs_temporal_derivative

        return transformed_X
class NBEATSPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    This primitive applies the Neural basis expansion analysis for interpretable time series
    forecasting (NBEATS) method for time series forecasting. The implementation is based off of
    this paper: https://arxiv.org/abs/1905.10437 and this repository:
    https://gluon-ts.mxnet.io/index.html
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id": "3952a074-145e-406d-9cee-80232ae8f3ae",
        "version": __version__,
        "name": "NBEATS",
        "keywords": [
            "time series",
            "forecasting",
            "deep neural network",
            "fully-connected",
            "residual network",
            "interpretable",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri": "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path": "d3m.primitives.time_series_forecasting.feed_forward_neural_net.NBEATS",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.DEEP_NEURAL_NETWORK,
        ],
        "primitive_family": metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
        "can_use_gpus": True,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)
        self._freq = None
        self._is_fit = False
        self.preds = None

    def get_params(self) -> Params:
        """Return the fitted state as a ``Params`` object.

        Reads attributes that are only assigned by ``set_training_data`` or
        ``set_params``, so one of those must have been called first.
        """
        return Params(
            nbeats_dataset=self._nbeats_dataset,
            timestamp_column=self._timestamp_column,
            group_cols=self._grouping_columns,
            output_column=self._output_column,
            target_column=self._target_column,
            freq=self._freq,
            reind_freq=self._reind_freq,
            is_fit=self._is_fit,
            min_trains=self._min_trains,
        )

    def set_params(self, *, params: Params) -> None:
        """Restore the state previously returned by ``get_params``."""
        self._nbeats_dataset = params["nbeats_dataset"]
        self._timestamp_column = params["timestamp_column"]
        self._grouping_columns = params["group_cols"]
        self._output_column = params["output_column"]
        self._target_column = params["target_column"]
        self._freq = params["freq"]
        self._reind_freq = params["reind_freq"]
        self._is_fit = params["is_fit"]
        self._min_trains = params["min_trains"]

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets

        Raises:
            ValueError: If no column is annotated as a target or as a timestamp,
                if no series has at least two observations (the frequency cannot
                be inferred), or if the longest series is shorter than the
                requested prediction length
        """
        self._output_column = outputs.columns[0]

        frame = inputs.append_columns(outputs)
        self._get_cols(frame)
        self._set_freq(frame)
        frame, self._min_trains, max_train_length, _ = self._reindex(frame)
        self._check_window_support(max_train_length)

        self._nbeats_dataset = NBEATSDataset(
            frame,
            self._grouping_columns,
            self._timestamp_column,
            self._target_column,
            self._freq,
            self.hyperparams["prediction_length"],
            self.hyperparams["num_context_lengths"],
        )
        self._train_data = self._nbeats_dataset.get_data()

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """Fits NBEATS model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- number of training epochs; falls back to the
                ``epochs`` hyperparameter when None (default: {None})

        Returns:
            CallResult[None] -- ``has_finished`` is True only when ``iterations``
                was not supplied by the caller
        """
        if iterations is None:
            iterations = self.hyperparams["epochs"]
            has_finished = True
        else:
            has_finished = False

        if self.hyperparams["interpretable"]:
            num_stacks = 2
            num_blocks = [1]
            widths = [256, 2048]
            sharing = [True]
            expansion_coefficient_lengths = [3]
            stack_types = ["T", "S"]
            estimator_class = NBEATSEnsembleEstimatorHook
        else:
            num_stacks = 30
            num_blocks = [3]
            widths = [512]
            sharing = [False]
            expansion_coefficient_lengths = [32]
            stack_types = ["G"]
            estimator_class = NBEATSEnsembleEstimator

        estimator = estimator_class(
            freq=self._freq,
            prediction_length=self.hyperparams["prediction_length"],
            meta_context_length=list(
                range(2, self.hyperparams["num_context_lengths"] + 2)),
            meta_loss_function=["sMAPE", "MASE", "MAPE"],
            meta_bagging_size=self.hyperparams["num_estimators"],
            num_stacks=num_stacks,
            num_blocks=num_blocks,
            widths=widths,
            sharing=sharing,
            expansion_coefficient_lengths=expansion_coefficient_lengths,
            stack_types=stack_types,
            trainer=Trainer(
                epochs=iterations,
                learning_rate=self.hyperparams["learning_rate"],
                batch_size=self.hyperparams["training_batch_size"],
                num_batches_per_epoch=self.hyperparams["steps_per_epoch"],
            ),
        )

        logger.info(f"Fitting for {iterations} iterations")
        start_time = time.time()
        predictor = estimator.train(self._train_data)
        predictor.batch_size = self.hyperparams["inference_batch_size"]
        predictor.set_aggregation_method("none")
        self._is_fit = True
        logger.info(
            f"Fit for {iterations} epochs, took {time.time() - start_time}s")

        # makedirs creates missing parent directories and tolerates the
        # directory already existing (no check-then-create race).
        os.makedirs(self.hyperparams["weights_dir"], exist_ok=True)
        predictor.serialize(Path(self.hyperparams["weights_dir"]))

        return CallResult(None, has_finished=has_finished)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's predictions for specific time series at specific future time instances
        * these specific timesteps / series are specified implicitly by input dataset

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with one prediction column (plus
                "trend-component" and "seasonality-component" columns when the
                ``interpretable`` hyperparameter is set), one row for each
                prediction slice requested.
                prediction slice = specific horizon idx for specific series in specific regression
        """
        all_preds, pred_intervals = self._produce(inputs)

        if self.hyperparams["interpretable"]:
            col_names = (
                self._output_column,
                "trend-component",
                "seasonality-component",
            )
            # One bucket per output column; each series contributes the
            # requested horizon indices of every component.
            all_components = [[] for _ in col_names]
            for series, idxs in zip(all_preds, pred_intervals):
                for i, component in enumerate(series):
                    all_components[i].append(component[idxs])
            all_components = [
                np.concatenate(component) for component in all_components
            ]
            df_data = dict(zip(col_names, all_components))
        else:
            point_estimates = np.concatenate([
                series[0][idxs]
                for series, idxs in zip(all_preds, pred_intervals)
            ])
            df_data = {self._output_column: point_estimates}

        result_df = container.DataFrame(
            df_data,
            generate_metadata=True,
        )
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=self._is_fit)

    def _get_col_names(self, col_idxs, all_col_names):
        """ transform column indices to column names """
        return [all_col_names[i] for i in col_idxs]

    def _process_special_col(self, col_list, col_type):
        """private util function that warns if multiple special columns

        Returns the first entry of ``col_list``, or None when it is empty.
        """
        if len(col_list) == 0:
            return None
        if len(col_list) > 1:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(
                f"There are more than one {col_type} marked. "
                "This primitive will use the first")
        return col_list[0]

    def _sort_by_timestamp(self, frame):
        """private util function: convert to pd datetime and sort

        Integer-typed timestamp columns are interpreted as 1-based day offsets
        and force both frequencies to "D"; anything else is parsed as seconds.
        """
        time_name = frame.columns[self._timestamp_column]
        new_frame = frame.copy()

        if "http://schema.org/Integer" in frame.metadata.query_column_field(
                self._timestamp_column, "semantic_types"):
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column] - 1, unit="D")
            self._freq = "D"
            self._reind_freq = "D"
        else:
            new_frame.iloc[:, self._timestamp_column] = pd.to_datetime(
                new_frame.iloc[:, self._timestamp_column], unit="s")
        return new_frame.sort_values(by=time_name)

    def _set_freq(self, frame):
        """sets frequency using differences in timestamp column in data frame
        ASSUMPTION: frequency is the same across all grouped time series

        The difference between the first two timestamps of the first series
        holding at least two rows is used. Does nothing when a frequency is
        already set.

        Raises:
            ValueError: if no series has at least two observations
        """
        if self._freq is not None:
            return

        if len(self._grouping_columns) == 0:
            candidates = [frame]
        else:
            g_cols = self._get_col_names(self._grouping_columns,
                                         frame.columns)
            candidates = (df for _, df in frame.groupby(g_cols, sort=False))

        for series in candidates:
            if series.shape[0] > 1:
                diff = (series.iloc[1, self._timestamp_column] -
                        series.iloc[0, self._timestamp_column])
                break
        else:
            raise ValueError(
                "Cannot infer the time series frequency: no series contains "
                "at least two observations")

        self._freq, self._reind_freq = calculate_time_frequency(
            diff, model="gluon")

    def _robust_reindex(self, frame):
        """ reindex dataframe IFF it has > 1 row, interpolate target column

        Returns the reindexed frame together with the sorted timestamps as they
        were before duplicates were dropped.
        """
        frame = self._sort_by_timestamp(frame)
        original_times = frame.iloc[:, self._timestamp_column]
        frame = frame.drop_duplicates(
            subset=frame.columns[self._timestamp_column])
        frame.index = frame.iloc[:, self._timestamp_column]

        if frame.shape[0] > 1:
            frame = frame.reindex(
                pd.date_range(
                    frame.index[0],
                    frame.index[-1],
                    freq=self._reind_freq,
                ))

        # only interpolate when target exists during training
        if self._target_column < frame.shape[1]:
            frame.iloc[:, self._target_column] = frame.iloc[
                :, self._target_column].interpolate()
        frame.iloc[:, self._grouping_columns] = frame.iloc[
            :, self._grouping_columns].ffill()

        return frame, original_times

    def _reindex(self, frame):
        """reindex data, interpolating target columns

        Returns a 4-tuple: the reindexed frame, the first timestamp of each
        series, the length of the longest series, and the original timestamps.
        With grouping columns the second and fourth items are keyed by group;
        without them the second item is a one-element list.
        """
        if len(self._grouping_columns) == 0:
            df, original_times = self._robust_reindex(frame)
            return df, [df.index[0]], df.shape[0], original_times

        all_dfs, min_trains, original_times = [], {}, OrderedDict()
        max_train_length = 0
        g_cols = self._get_col_names(self._grouping_columns, frame.columns)
        for grp, df in frame.groupby(g_cols, sort=False):
            df, orig_times = self._robust_reindex(df)
            max_train_length = max(max_train_length, df.shape[0])
            all_dfs.append(df)
            min_trains[grp] = df.index[0]
            original_times[grp] = orig_times
        return pd.concat(
            all_dfs), min_trains, max_train_length, original_times

    def _get_cols(self, frame):
        """private util function: get indices of important columns from metadata

        Raises:
            ValueError: if no target column or no timestamp column is annotated
        """
        input_metadata = frame.metadata

        # get target idx (first column by default)
        target_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/Target",
        ))
        if len(target_columns) == 0:
            raise ValueError("At least one column must be marked as a target")
        self._target_column = self._process_special_col(
            target_columns, "target column")

        # get timestamp idx (first column by default)
        timestamp_columns = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/DateTime",
        ))
        if len(timestamp_columns) == 0:
            # Every later step indexes the frame by this column, so fail here
            # with a clear message instead of an opaque indexing error.
            raise ValueError(
                "At least one column must be marked as a timestamp")
        self._timestamp_column = self._process_special_col(
            timestamp_columns, "timestamp column")

        # get grouping idx
        self._grouping_columns = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        suggested_group_cols = input_metadata.list_columns_with_semantic_types((
            "https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey",
        ))
        if len(self._grouping_columns) == 0:
            self._grouping_columns = suggested_group_cols

    def _check_window_support(self, max_train_length):
        """ ensures that the longest training series is >= prediction_length

        Raises:
            ValueError: if ``max_train_length`` is smaller than the
                ``prediction_length`` hyperparameter
        """
        if max_train_length < self.hyperparams["prediction_length"]:
            raise ValueError(
                f"This training set does not support a prediction length of {self.hyperparams['prediction_length']} "
                + f"because its longest series has length {max_train_length} observations. Please "
                + f"choose a shorter prediction length.")

    def _get_pred_intervals(self, original_times):
        """private util function that retrieves unevenly spaced prediction intervals from data frame"""
        if len(self._grouping_columns) == 0:
            intervals = discretize_time_difference(original_times,
                                                   self._min_trains[0],
                                                   self._freq,
                                                   zero_index=True)
            return [np.array(intervals) + 1]

        all_intervals = []
        for grp, times in original_times.items():
            if grp in self._min_trains:
                intervals = discretize_time_difference(
                    times, self._min_trains[grp], self._freq, zero_index=True)
            else:
                logger.info(
                    f"Series with category {grp} did not exist in training data, "
                    + f"These predictions will be returned as np.nan.")
                intervals = np.zeros(times.shape[0]).astype(int)
            all_intervals.append(np.array(intervals) + 1)
        return all_intervals

    def _produce(self, inputs: Inputs):
        """ internal produce method to support produce() and produce_confidence_intervals() methods

        Raises:
            PrimitiveNotFittedError: if primitive not fit
        """
        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        test_frame = inputs.copy()
        nbeats_forecast = NBEATSForecast(
            self._nbeats_dataset,
            self.hyperparams["weights_dir"],
            self.hyperparams["interpretable"],
            self.hyperparams["output_mean"],
            self.hyperparams["nan_padding"],
        )
        test_frame, _, _, original_times = self._reindex(test_frame)
        pred_intervals = self._get_pred_intervals(original_times)

        st = time.time()
        preds = nbeats_forecast.predict(test_frame, pred_intervals)
        logger.info(f"Making predictions took {time.time() - st}s")

        return preds, pred_intervals
class SignalFramer(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    BBN D3M Signal Framing Primitive divides the audio signal into number of frames.
    Input: List of arrays with samples of shape [ num_samples ]
    Output: List of arrays with frames of shape [ num_frames, frame_length ]
    Applications include: audio, time-series classification
    """

    __git_commit__ = utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id': '4d7160ef-ca70-4150-b513-36b90817ba45',
        'version': __version__,
        'name': "Signal Framing",
        'description': (
            "BBN D3M Signal Framing Primitive divides the audio signal into number of frames.\n"
            " Input: List of arrays with samples of shape [ num_samples ]\n"
            " Output: List of arrays with frames of shape [ num_frames, frame_length ]\n"
            " Applications include: audio, time-series classification"
        ),
        'keywords': [],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/signal_framing.py'.format(
                    git_commit=__git_commit__
                ),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'.format(
                git_commit=__git_commit__, egg='bbn_primitives'
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.time_series_segmentation.signal_framer.SignalFramer',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [metadata_module.PrimitiveAlgorithmType.UNIFORM_TIME_SERIES_SEGMENTATION],
        'primitive_family': metadata_module.PrimitiveFamily.TIME_SERIES_SEGMENTATION,
    })

    def __init__(
        self, *, hyperparams: Hyperparams, random_seed: int = 0,
        docker_containers: typing.Dict[str, DockerContainer] = None
    ) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

    def _frame_length(self, sampling_rate: float) -> int:
        """Frame length in samples: ``frame_length_s * sampling_rate``, truncated."""
        return int(self.hyperparams['frame_length_s'] * sampling_rate)

    def _frame_shift(self, sampling_rate: float) -> int:
        """Frame shift in samples: ``frame_shift_s * sampling_rate``, truncated, at least 1."""
        return max(int(self.hyperparams['frame_shift_s'] * sampling_rate), 1)

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Slice every input array into frames along its first axis.

        Arguments:
            - inputs: [ num_samples ]

        Returns:
            - [ num_windows, window_len ]

        Raises:
            - TimeoutError when the work did not complete within ``timeout``
        """
        with stopit.ThreadingTimeout(timeout) as timer:
            outputs = Outputs()
            # NOTE(review): 'length' is read from `outputs` right after it is
            # created and is not refreshed once the loop has appended to it --
            # confirm this is intended.
            container_metadata = {
                'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                'structural_type': Outputs,
                'dimension': {
                    'length': len(outputs)
                }
            }
            metadata = inputs.metadata.clear(
                container_metadata, for_value=outputs
            ).update((metadata_module.ALL_ELEMENTS,), {
                'structural_type': d3m_ndarray,
            })

            for input_id, cinput in enumerate(inputs):
                item_metadata = inputs.metadata.query((input_id,))
                has_rate = 'sampling_rate' in item_metadata
                # TODO: review the following because it's hacky
                # It was done in the way to enable handling both audio (high sampling_rate) and frames
                sampling_rate = item_metadata['sampling_rate'] if has_rate else 1
                frame_length = self._frame_length(sampling_rate)
                frame_shift = self._frame_shift(sampling_rate)

                if cinput.size == 0:
                    outputs.append(d3m_ndarray(np.array([]), generate_metadata=False))
                    continue

                # Inputs of rank <= 2 that are not longer than one frame are
                # zero-padded along the first axis up to exactly one frame.
                if cinput.shape[0] <= frame_length and len(cinput.shape) <= 2:
                    pad_shape = (frame_length - cinput.shape[0],) + cinput.shape[1:]
                    padding = np.zeros(pad_shape, dtype=cinput.dtype)
                    cinput = np.concatenate((cinput, padding))

                # Build the frames as a strided view over the samples: moving to
                # the next frame advances `frame_shift` samples, moving inside a
                # frame advances one sample.
                num_frames = (cinput.shape[0] - frame_length) // frame_shift + 1
                sample_stride = cinput.strides[0]
                framed_shape = (num_frames, frame_length) + cinput.shape[1:]
                framed_strides = (sample_stride * frame_shift, sample_stride) + cinput.strides[1:]
                coutput = np.lib.stride_tricks.as_strided(
                    cinput, shape=framed_shape, strides=framed_strides)

                if self.hyperparams['flatten_output']:
                    framed = coutput.flatten()
                else:
                    framed = coutput
                outputs.append(d3m_ndarray(framed, generate_metadata=False))

                if has_rate:
                    metadata = metadata.update((input_id,), {
                        'sampling_rate': item_metadata['sampling_rate']
                    })

            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state != timer.EXECUTED:
            raise TimeoutError('SignalFramer exceeded time limit')
        return CallResult(outputs)
class Sent2VecPrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Produce numerical representations (features) for short texts or sentences.

    Parameters
    ----------
    inputs : Input pandas dataframe

    Returns
    -------
    Outputs
        The output is a pandas dataframe
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id": "cf450079-9333-4a3f-aed4-b77a4e8c7be7",
        "version": __version__,
        "name": "sent2vec_wrapper",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": ["Sent2Vec", "Embedding", "NLP", "Natural Language Processing"],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper"
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": metadata_base.PrimitiveInstallationType.PIP,
                "package_uri": "git+https://github.com/NewKnowledge/nk-sent2vec-d3m-wrapper.git@{git_commit}#egg=sent2vec_wrapper"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__))),
            },
            {
                "type": "FILE",
                "key": "sent2vec_model",
                "file_uri": "http://public.datadrivendiscovery.org/twitter_bigrams.bin",
                "file_digest": "9e8ccfea2aaa4435ca61b05b11b60e1a096648d56fff76df984709339f423dd6",
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path": "d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.VECTORIZATION],
        "primitive_family": metadata_base.PrimitiveFamily.FEATURE_EXTRACTION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 volumes: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         volumes=volumes)
        self.volumes = volumes

    @staticmethod
    def _read_text(path: str) -> str:
        """Return the content of the text file at *path* with newlines removed."""
        # The context manager closes the handle even if read() raises.
        with open(path, 'r') as handle:
            return handle.read().replace('\n', '')

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce numerical representations (features) for short texts or sentences.

        Text is taken from two kinds of columns: columns typed as FileName
        (each cell names a text file that is read from disk) and columns typed
        as Text that are not FileName columns (cells are used directly). Every
        text column is embedded separately and the embeddings are placed side
        by side, so row i of the result describes row i of the input.

        Parameters
        ----------
        inputs : Input pandas dataframe

        Returns
        -------
        Outputs
            The input dataframe without its FileName columns, with one
            "vector_<k>" float column appended per embedding dimension. If
            there is no text to embed, or vectorizing raises ValueError, the
            dataframe is returned with only the FileName columns removed.
        """
        # extract sentences from stored in nested media files
        text_columns = inputs.metadata.get_columns_with_semantic_type(
            'https://metadata.datadrivendiscovery.org/types/FileName')
        base_paths = [
            inputs.metadata.query(
                (metadata_base.ALL_ELEMENTS,
                 t))['location_base_uris'][0].replace('file:///', '/')
            for t in text_columns
        ]
        txt_paths = [[
            os.path.join(base_path, filename)
            for filename in inputs.iloc[:, col]
        ] for base_path, col in zip(base_paths, text_columns)]
        txt = [[self._read_text(path) for path in path_list]
               for path_list in txt_paths]

        # text columns that aren't stored in nested files
        local_text_columns = inputs.metadata.get_columns_with_semantic_type(
            'http://schema.org/Text')
        local_text_columns = [
            col for col in local_text_columns if col not in text_columns
        ]

        # delete columns with path names of nested media files
        outputs = inputs.remove_columns(text_columns)

        # Collect only the pieces that exist: an empty `txt` would otherwise
        # turn into a spurious one-column, zero-row frame.
        pieces = []
        if len(txt) > 0:
            # Share the input's index so the column-wise concat aligns rows.
            pieces.append(pd.DataFrame(np.array(txt).T, index=inputs.index))
        if len(local_text_columns) > 0:
            # The metadata returns positional indices, hence iloc.
            pieces.append(inputs.iloc[:, local_text_columns])
        if not pieces:
            # nothing to embed: same result as the vectorizing-failed fallback
            return CallResult(outputs)
        frame = pd.concat(pieces, axis=1)

        try:
            vectorizer = _Sent2Vec(path=self.volumes["sent2vec_model"])
            output_vectors = []
            for col in range(frame.shape[1]):
                text = frame.iloc[:, col].tolist()
                embedded_sentences = vectorizer.embed_sentences(sentences=text)
                output_vectors.append(
                    np.asarray(embedded_sentences).reshape(
                        len(embedded_sentences), -1))
            # Stack per-column embeddings horizontally so each row keeps its
            # own sentences; reshaping the stacked (columns, rows, dim) array
            # mixes rows as soon as there is more than one text column.
            embedded_df = pd.DataFrame(np.hstack(output_vectors))
        except ValueError:
            # just return inputs with file names deleted if vectorizing fails
            return CallResult(outputs)

        # create df with vectorized columns and append to input df
        embedded_df = d3m_DataFrame(embedded_df)
        for col in range(embedded_df.shape[1]):
            col_dict = dict(
                embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, col)))
            col_dict['structural_type'] = float
            col_dict['name'] = "vector_" + str(col)
            col_dict["semantic_types"] = (
                "http://schema.org/Float",
                "https://metadata.datadrivendiscovery.org/types/Attribute",
            )
            embedded_df.metadata = embedded_df.metadata.update(
                (metadata_base.ALL_ELEMENTS, col), col_dict)

        df_dict = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict_1 = dict(
            embedded_df.metadata.query((metadata_base.ALL_ELEMENTS, )))
        df_dict['dimension'] = df_dict_1
        df_dict_1['name'] = 'columns'
        df_dict_1['semantic_types'] = (
            'https://metadata.datadrivendiscovery.org/types/TabularColumn', )
        df_dict_1['length'] = embedded_df.shape[1]
        embedded_df.metadata = embedded_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, ), df_dict)

        return CallResult(outputs.append_columns(embedded_df))
class SSC_OMP(clustering.ClusteringDistanceMatrixMixin[Inputs, Outputs, type(None), SSC_OMPHyperparams, DistanceMatrixOutput],
              clustering.ClusteringTransformerPrimitiveBase[Inputs, Outputs, SSC_OMPHyperparams]):
    """
    This code implements the subspace clustering algorithm described in
    Chong You, Daniel Robinson, Rene Vidal,
    "Scalable Sparse Subspace Clustering by Orthogonal Matching Pursuit", CVPR 2016.
    It performs the OMP algorithm on every column of X using all other columns as a dictionary
    :param data: A dxN numpy array
    :param K: The maximum subspace dimension
    :param thres: termination condition
    :return: the SSC-OMP representation of the data
    """

    metadata = metadata_module.PrimitiveMetadata({
        'id': '50f89f90-7cef-4bb6-b56f-642f85bd1d58',
        'version': "0.0.5",
        'name': 'SSC_OMP',
        'description': """Does sparse subspace clustering using orthogonal matching pursuit.""",
        'keywords': ['clustering', 'subspace', 'sparse', 'orthogonal matching pursuit'],
        'source': {
            'name': 'Michigan',
            'contact': 'mailto:[email protected]',
            'uris': [
                # link to file and repo
                'https://github.com/dvdmjohnson/d3m_michigan_primitives/blob/master/spider/cluster/ssc_omp/ssc_omp.py',
                'https://github.com/dvdmjohnson/d3m_michigan_primitives'],
            'citation': (
                "@inproceedings{you2016scalable, "
                "title={Scalable sparse subspace clustering by orthogonal matching pursuit}, "
                "author={You, Chong and Robinson, Daniel and Vidal, Ren{\'e}}, "
                "booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, "
                "pages={3918--3927}, "
                "year={2016}}"
            )
        },
        'installation': [
            {'type': metadata_module.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://github.com/dvdmjohnson/d3m_michigan_primitives.git@{git_commit}#egg=spider'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)))
             },
            {'type': metadata_module.PrimitiveInstallationType.UBUNTU,
             'package': 'ffmpeg',
             'version': '7:2.8.11-0ubuntu0.16.04.1'}],
        'python_path': 'd3m.primitives.clustering.ssc_omp.Umich',
        'hyperparams_to_tune': ['n_clusters', 'sparsity_level'],
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.SUBSPACE_CLUSTERING],
        'primitive_family': metadata_module.PrimitiveFamily.CLUSTERING
    })

    def __init__(self, *, hyperparams: SSC_OMPHyperparams, random_seed: int = 0,
                 docker_containers: typing.Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)
        self._k = hyperparams['n_clusters']
        self._max_subspace_dim = hyperparams['sparsity_level']
        self._thres = hyperparams['thresh']
        self._random_state = np.random.RandomState(random_seed)

    def set_training_data(self, *, inputs: Inputs) -> None:
        """No-op: this primitive keeps no training data."""
        pass

    @staticmethod
    def _cNormalize(data, norm=2):
        """
        This method performs the column wise normalization of the input data
        :param data: A dxN numpy array
        :param norm: the desired norm value (This has to be in accordance with the accepted numpy
            norm values
        :return: Returns the column wise normalised data
        """
        # The small constant keeps all-zero columns from dividing by zero.
        return data / (np.linalg.norm(data, ord=norm, axis=0) + 2.220446049250313e-16)

    @staticmethod
    def _OMPMatFunction(data, K, thres):
        """
        Run orthogonal matching pursuit on every column of ``data``, using the
        other columns as the dictionary.

        :param data: A dxN numpy array
        :param K: The maximum number of atoms selected per column
        :param thres: a column stops selecting atoms once the squared norm of
            its residual drops below this value
        :return: an NxN dense array; entry (j, i) is the coefficient of column j
            in the representation of column i
        """
        memory_total = 0.1 * 10**9
        _, n = data.shape
        data_normalised = SSC_OMP._cNormalize(data)

        # Unused slots keep index 1 with a zero value, which adds nothing to
        # the assembled matrix.
        support_set = np.ones((n, K), dtype=np.int64)
        indices = np.repeat(np.arange(n, dtype=np.int64).reshape(n, 1), K, axis=1)
        values = np.zeros((n, K))
        # Number of atoms finally used per column; K marks "still selecting".
        atom_count = np.full(n, K, dtype=np.int64)
        residual = np.copy(data_normalised)

        # Columns are processed in blocks to bound the size of the correlation
        # matrix. The block size must be an int: a float would produce float
        # index arrays as soon as more than one block is needed.
        block_size = max(1, int(np.ceil(memory_total / n)))

        for t in range(K):
            for start in range(0, n, block_size):
                stop = min(start + block_size, n)
                correlation = np.abs(np.matmul(data.T, residual[:, start:stop]))
                # A column must not select itself. Within a block, column
                # `start + j` sits at row `start + j`, column j -- which is the
                # main diagonal only for the first block.
                correlation[np.arange(start, stop), np.arange(stop - start)] = 0.0
                support_set[start:stop, t] = np.argmax(correlation, axis=0)

            if t + 1 != K:
                for iN in range(n):
                    if atom_count[iN] == K:
                        B = data_normalised[:, support_set[iN, 0:(t + 1)]]
                        mat_tmp, _, _, _ = lstsq(B, data_normalised[:, iN])
                        residual[:, iN] = data_normalised[:, iN] - np.matmul(B, mat_tmp)
                        if np.sum(residual[:, iN]**2) < thres:
                            # Converged with t + 1 atoms; t + 1 < K here, so
                            # this can never be mistaken for "still selecting".
                            atom_count[iN] = t + 1

            if not np.any(atom_count == K):
                break

        for iN in range(n):
            count = int(atom_count[iN])
            tmp, _, _, _ = lstsq(data[:, support_set[iN, 0:count]], data[:, iN])
            values[iN, 0:count] = tmp

        sparse_mat = sps.coo_matrix(
            (values.ravel(), (support_set.ravel(), indices.ravel())), shape=(n, n))
        return sparse_mat.toarray()

    def _spectral_clustering(self, W, n_clusters=10, max_iter=1000, n_init=20):
        """
        Cluster the rows/columns of the affinity matrix ``W``.

        Builds the normalised Laplacian of ``W``, takes the ``n_clusters``
        right singular vectors belonging to its smallest singular values,
        normalises each row of that embedding and runs KMeans on it.

        :param W: An NxN affinity matrix
        :param n_clusters: number of clusters
        :param max_iter: KMeans ``max_iter``
        :param n_init: KMeans ``n_init``
        :return: array of N cluster labels
        """
        N, _ = W.shape
        eps = 2.220446049250313e-16
        DN = np.diag(1 / np.sqrt(np.sum(W, axis=0) + eps))
        LapN = np.identity(N) - np.matmul(np.matmul(DN, W), DN)
        _, _, VN = np.linalg.svd(LapN)
        kerN = VN.T[:, (N - n_clusters):N]
        normN = np.sqrt(np.sum(np.square(kerN), axis=1))
        kerNS = (kerN.T / (normN + eps).T).T
        kmeans = KMeans(n_clusters, n_init=n_init, max_iter=max_iter,
                        random_state=self._random_state).fit(kerNS)
        return kmeans.labels_.reshape((N,))

    def produce(self, *, inputs: Inputs, timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        """
        Cluster the rows of ``inputs`` and return one label per row.

        ``iterations`` is used as the KMeans iteration limit; None or a value
        below 5 is replaced by 200. ``timeout`` is not used.
        """
        assert inputs.ndim == 2, "Data is not in the right shape"
        assert self._max_subspace_dim <= inputs.shape[1], \
            "max_subspace dim can't be greater than the " + "input feature space"

        if iterations is None or iterations < 5:
            iterations = 200

        data = inputs.T
        R = SSC_OMP._OMPMatFunction(data, self._max_subspace_dim, self._thres)
        np.fill_diagonal(R, 0)
        A = np.abs(R) + np.abs(R.T)
        labels = self._spectral_clustering(A, n_clusters=self._k,
                                           max_iter=iterations, n_init=20)

        return base.CallResult(Outputs(labels))

    def produce_distance_matrix(self, *, inputs: Inputs, timeout: float = None,
                                iterations: int = None) -> base.CallResult[DistanceMatrixOutput]:
        """
        Returns 1 - the affinity matrix generated from the subspace-transformed data
        """
        assert inputs.ndim == 2, "Data is not in the right shape"
        assert self._max_subspace_dim <= inputs.shape[1], \
            "max_subspace dim can't be greater than the " + "input feature space"

        data = inputs.T
        R = SSC_OMP._OMPMatFunction(data, self._max_subspace_dim, self._thres)
        np.fill_diagonal(R, 0)
        A = np.abs(R) + np.abs(R.T)

        return base.CallResult(DistanceMatrixOutput(1 - A))

    def __getstate__(self) -> dict:
        """Return the constructor arguments plus the current random state."""
        return {
            'constructor': {
                'hyperparams': self.hyperparams,
                'random_seed': self.random_seed,
                'docker_containers': self.docker_containers,
            },
            'random_state': self._random_state,
        }

    def __setstate__(self, state: dict) -> None:
        """Re-run the constructor, then restore the saved random state."""
        self.__init__(**state['constructor'])  # type: ignore
        self._random_state = state['random_state']

    # placeholder for now, just calls base version.
    @classmethod
    def can_accept(cls, *, method_name: str,
                   arguments: typing.Dict[str, typing.Union[metadata_module.Metadata, type]],
                   hyperparams: SSC_OMPHyperparams) -> typing.Optional[metadata_module.DataMetadata]:
        return super().can_accept(method_name=method_name, arguments=arguments,
                                  hyperparams=hyperparams)
class TimeSeriesFormatterPrimitive(
        transformer.TransformerPrimitiveBase[container.Dataset,
                                             container.Dataset,
                                             Hyperparams]):
    """
    Reads the time series files from a given column in an input dataset resource into a new M x N data resource,
    where each value in timeseries occupies one of M rows.  Each row has N columns, representing the union of
    the fields found in the timeseries files and in the main data resource.  The loading process assumes that
    each series file has an identical set of timestamps.
    """

    # Semantic types that mark a column as (potentially) holding time series file names.
    _semantic_types = (
        'https://metadata.datadrivendiscovery.org/types/FileName',
        'https://metadata.datadrivendiscovery.org/types/Timeseries',
        'http://schema.org/Text',
        'https://metadata.datadrivendiscovery.org/types/Attribute')
    # Media types the referenced file column must advertise.
    _media_types = ('text/csv', )

    # NOTE: previously written with a trailing comma, which made this a
    # 1-tuple instead of a string.
    __author__ = 'Uncharted Software'
    metadata = metadata_base.PrimitiveMetadata({
        'id': '24b09066-836f-4b8f-9773-8c86a5eee26c',
        'version': '0.2.0',
        'name': 'Time series formatter',
        'python_path': 'd3m.primitives.data_preprocessing.timeseries_formatter.DistilTimeSeriesFormatter',
        'keywords': ['series', 'reader', 'csv'],
        'source': {
            'name': 'Uncharted Software',
            'contact': 'mailto:[email protected]',
            'uris': ['https://gitlab.com/uncharted-distil/distil-timeseries-loader']
        },
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/uncharted-distil/distil-timeseries-loader.git@' +
                           '{git_commit}#egg=DistilTimeSeriesLoader-0.2.0'.format(
                               git_commit=d3m_utils.current_git_commit(
                                   os.path.dirname(__file__)),
                           ),
        }],
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION,
        ],
        'supported_media_types': _media_types,
        'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
    })

    @classmethod
    def _find_csv_file_column(cls,
                              inputs_metadata: metadata_base.DataMetadata,
                              res_id: str) -> typing.Optional[int]:
        """
        Return the index of the first column of resource ``res_id`` that
        references csv files, or ``None`` when there is no such column.
        """
        indices = inputs_metadata.list_columns_with_semantic_types(
            cls._semantic_types, at=(res_id, ))
        for i in indices:
            if cls._is_csv_file_column(inputs_metadata, res_id, i):
                return i
        return None

    @classmethod
    def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            res_id: str, column_index: int) -> bool:
        """
        Check whether a column is a string column whose foreign key points
        at a csv file column.

        Returns ``False`` (rather than raising) when the column has no
        metadata, is not a string column, or carries no usable foreign key.
        """
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata.get('structural_type') != str:
            return False

        # A column without a foreign key cannot reference a file resource.
        foreign_key = column_metadata.get('foreign_key')
        if foreign_key is None:
            return False

        ref_col_index = foreign_key.get('column_index')
        ref_res_id = foreign_key.get('resource_id')
        # Index 0 is a valid column, so compare against None explicitly.
        if ref_col_index is None or ref_res_id is None:
            return False

        return cls._is_csv_file_reference(inputs_metadata, ref_res_id,
                                          ref_col_index)

    @classmethod
    def _is_csv_file_reference(cls,
                               inputs_metadata: metadata_base.DataMetadata,
                               res_id: str, column_index: int) -> bool:
        """
        Check whether the referenced column is a string column that shares at
        least one of ``_semantic_types`` and lists every one of
        ``_media_types``.
        """
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        if not column_metadata or column_metadata.get('structural_type') != str:
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        media_types = column_metadata.get('media_types', [])

        has_semantic_type = bool(
            semantic_types.intersection(cls._semantic_types))
        has_media_types = set(cls._media_types).issubset(media_types)
        return has_semantic_type and has_media_types

    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:
        """
        Expand every row of the main resource into one row per record of the
        csv file it references.

        The main resource comes from the ``main_resource_index`` hyperparam
        and the file column from ``file_col_index``; when the latter is
        ``None`` the first suitable column is looked up from the metadata.
        ``timeout`` and ``iterations`` are not used.

        Returns a dataset with a single resource ``'0'`` whose rows are the
        main-resource values, a ``series_id`` column holding the originating
        row label, and the columns read from the csv file.

        Raises ``exceptions.InvalidArgumentValueError`` when no main resource
        is configured or no csv file column can be determined.
        """
        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) + ' from ' +
                    str(main_resource_index) +
                    ' does not contain csv file names')
        else:
            # The resource id is required to search the right resource; it
            # was previously omitted, which made this branch always fail.
            file_index = self._find_csv_file_column(inputs.metadata,
                                                    main_resource_index)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column from ' + str(main_resource_index) +
                    ' contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)
        output_rows = []
        for idx, main_row in inputs[main_resource_index].iterrows():
            # read the timeseries data; file_index is a column position
            csv_path = os.path.join(base_path, main_row.iloc[file_index])
            timeseries = pd.read_csv(csv_path)

            # add the timeseries id (pd.concat replaces the removed
            # Series.append)
            main_row = pd.concat(
                [main_row, pd.Series({'series_id': int(idx)})])

            # combine the timeseries data with the value row
            output_rows.extend(
                pd.concat([main_row, value_row])
                for _, value_row in timeseries.iterrows())

        # Build the frame once from all rows with a fresh 0..M-1 index
        # (replaces the removed DataFrame.append(..., ignore_index=True)).
        timeseries_dataframe = pd.DataFrame(output_rows).reset_index(drop=True)

        # wrap as a D3M container
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))

    def _get_base_path(self, inputs_metadata: metadata_base.DataMetadata,
                       res_id: str, column_index: int) -> str:
        """
        Return the first ``location_base_uris`` entry of the column that the
        given column's foreign key points at.
        """
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))

        ref_col_index = column_metadata['foreign_key']['column_index']
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return inputs_metadata.query(
            (ref_res_id, metadata_base.ALL_ELEMENTS,
             ref_col_index))['location_base_uris'][0]

    def _get_ref_resource(self, inputs_metadata: metadata_base.DataMetadata,
                          res_id: str, column_index: int) -> str:
        """
        Return the id of the resource that the given column's foreign key
        points at.
        """
        column_metadata = inputs_metadata.query(
            (res_id, metadata_base.ALL_ELEMENTS, column_index))
        ref_res_id = column_metadata['foreign_key']['resource_id']

        return ref_res_id
class Parrot(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    '''
    Produce the primitive's prediction for future time series data. The output is a list of length 'n_periods' that
    contains a prediction for each of 'n_periods' future time periods. 'n_periods' is a hyperparameter that must be
    set before making the prediction.
    '''
    metadata = metadata_base.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "d473d487-2c32-49b2-98b5-a2b48571e07c",
        'version': __version__,
        'name': "parrot",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Time Series'],
        'source': {
            'name': __author__,
            'contact': __contact__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/parrot-d3m-wrapper",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package': 'cython',
            'version': '0.28.5',
        }, {
            "type": "PIP",
            "package_uri": "git+https://github.com/NewKnowledge/sloth.git@82a1e08049531270256f38ca838e6cc7d1119223#egg=Sloth-2.0.3"
        }, {
            'type': metadata_base.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/parrot-d3m-wrapper.git@{git_commit}#egg=ParrotD3MWrapper'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.time_series_forecasting.arima.Parrot',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_base.PrimitiveAlgorithmType.
            AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE,
        ],
        'primitive_family': metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING,
    })

    def __init__(self, *, hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._params = {}
        self._X_train = None  # training inputs
        self._sloth = Sloth()  # Sloth model
        self._arima = None  # ARIMA classifier

    def fit(self, *, timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """
        Fits ARIMA model using training data from set_training_data and hyperparameters

        The 'seasonal' and 'seasonal_differencing' hyperparameters are passed
        through to the Sloth ARIMA fit. 'timeout' and 'iterations' are not used.
        """
        self._arima = self._sloth.FitSeriesARIMA(
            self._X_train, self.hyperparams['seasonal'],
            self.hyperparams['seasonal_differencing'])
        return CallResult(None)

    def get_params(self) -> Params:
        """
        Return the parameters stored by set_params (an empty dict initially)
        """
        return self._params

    def set_params(self, *, params: Params) -> None:
        """
        Store the given parameters so that get_params returns them
        """
        # Must target the same attribute get_params reads; assigning to
        # 'self.params' silently dropped the value.
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Set primitive's training data

        Parameters
        ----------
        inputs : pandas data frame containing training data where first column contains dates and second column
            contains values
        outputs : not used
        """
        # use column according to hyperparameter index; the builtin float is
        # the dtype the removed 'np.float' alias referred to
        column = inputs.iloc[:, self.hyperparams['index']]
        self._X_train = column.values.astype(float)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's prediction for future time series data

        Parameters
        ----------
        inputs : data frame whose 'd3mIndex' column is copied into the output
        timeout : not used
        iterations : not used

        Returns
        ----------
        Outputs
            The output is a data frame containing the d3m index and a forecast for each of the 'n_periods' future
            time periods
        """
        # just take d3m index from input test set
        index_column = inputs['d3mIndex']

        # produce future forecast using arima
        future_forecast = pandas.DataFrame(
            self._sloth.PredictSeriesARIMA(self._arima,
                                           self.hyperparams['n_periods']))
        output_df = pandas.concat([index_column, future_forecast], axis=1)

        # add metadata to output
        parrot_df = d3m_DataFrame(output_df)

        # first column ('d3mIndex')
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 0)))
        col_dict['structural_type'] = str
        col_dict['name'] = 'd3mIndex'
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 0), col_dict)

        # second column ('predictions'), named after the training column
        col_dict = dict(
            parrot_df.metadata.query((metadata_base.ALL_ELEMENTS, 1)))
        col_dict['structural_type'] = str
        col_dict['name'] = list(inputs)[self.hyperparams['index']]
        col_dict['semantic_types'] = (
            'http://schema.org/Integer',
            'https://metadata.datadrivendiscovery.org/types/Attribute',
        )
        parrot_df.metadata = parrot_df.metadata.update(
            (metadata_base.ALL_ELEMENTS, 1), col_dict)

        return CallResult(parrot_df)
class SegmentCurveFitter(FeaturizationTransformerPrimitiveBase[Inputs, Outputs,
                                                               Hyperparams]):
    """
    BBN D3M Segment Curve Fitter takes segmented sequence of feature vectors as input
    and for each segment and feature dimension separately replaces the series of values
    by coefficients of its polynomial approximation of specified degree

    Input: List of lists of segmented sequence of feature vectors, i.e. List( [ seg_length_1, num_features ], [ seg_length_2, num_features ], ...)
    Output: List of lists of segmented sequence of polynomial coefficients, i.e. List( [ poly_deg, num_features ], [ poly_deg, num_features ], ...)
    Applications include: audio, time-series classification

    For details, refer to Gish, H. and Ng, K., 1996, October. Parametric trajectory models for speech recognition. In Spoken Language, 1996. ICSLP 96. Proceedings., Fourth International Conference on (Vol. 1, pp. 466-469). IEEE.
    """

    __git_commit__ = utils.current_git_commit(os.path.dirname(__file__))
    metadata = metadata_module.PrimitiveMetadata({
        'id': '7c1d88a3-2388-4ba8-97c6-aa0aa2673024',
        'version': __version__,
        'name': "Segment Curve Fitter",
        'description': """BBN D3M Segment Curve Fitter takes segmented sequence of feature vectors as input and for each segment and feature dimension separately replaces the series of values by coefficients of its polynomial approximation of specified degree\n Input: List of lists of segmented sequence of feature vectors, i.e. List( [ seg_length_1, num_features ], [ seg_length_2, num_features ], ...)\n Output: List of lists of segmented sequence of polynomial coefficients, i.e. List( [ poly_deg, num_features ], [ poly_deg, num_features ], ...)\n Applications include: audio, time-series classification""",
        'keywords': [],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                'https://github.com/BBN-E/d3m-bbn-primitives/blob/{git_commit}/bbn_primitives/time_series/segment_curve_fitter.py'
                .format(git_commit=__git_commit__),
                'https://github.com/BBN-E/d3m-bbn-primitives.git',
            ],
        },
        'installation': [{
            'type': 'PIP',
            'package_uri': 'git+https://github.com/BBN-E/d3m-bbn-primitives.git@{git_commit}#egg={egg}'
            .format(git_commit=__git_commit__, egg='bbn_primitives'),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.data_transformation.segment_curve_fitter.SegmentCurveFitter',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.
            PARAMETRIC_TRAJECTORY_MODELING
        ],
        'primitive_family': metadata_module.PrimitiveFamily.DATA_TRANSFORMATION,
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: typing.Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Fit a polynomial of degree ``hyperparams['deg']`` to every segment.

        Each segment is fitted over an evenly spaced grid on [0, 1] with one
        point per frame, independently for every feature column.

        Arguments:
            - inputs: List( List([ num_frames, num_feats ], [ num_frames, num_feats], ...) )
            - timeout: time limit handed to ``stopit.ThreadingTimeout``
            - iterations: not used

        Returns:
            - List( # Data
                List( # Segments
                  [ deg + 1, num_feats ], ...
                )
              )
              ``np.polyfit`` returns ``deg + 1`` coefficient rows per segment.

        Raises:
            - ValueError: a segment is not 2-dimensional or has fewer than
              ``deg`` frames
            - TimeoutError: the computation did not finish within the time limit
        """
        with stopit.ThreadingTimeout(timeout) as timer:
            deg = self.hyperparams['deg']
            outputs = Outputs()

            for cinput in inputs:
                coutput = List()
                for segment in cinput:
                    if segment.ndim != 2 or segment.shape[0] < deg:
                        raise ValueError('Incompatible shape ' +
                                         str(segment.shape) + ' of cinput.')

                    n = segment.shape[0]
                    x = np.linspace(0., 1., n)
                    p = np.polyfit(x, segment, deg=deg)
                    coutput.append(d3m_ndarray(p, generate_metadata=False))
                outputs.append(coutput)

            # Build the metadata only after ``outputs`` is filled, so that the
            # reported dimension length is the real item count; it was
            # previously taken from the still-empty list and was always 0.
            metadata = inputs.metadata.clear(
                {
                    'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                    'structural_type': Outputs,
                    'dimension': {
                        'length': len(outputs)
                    }
                },
                for_value=outputs).update((metadata_module.ALL_ELEMENTS, ), {
                    'structural_type': List,
                })

            # Set metadata attribute.
            outputs.metadata = metadata

        if timer.state != timer.EXECUTED:
            raise TimeoutError('SegmentCurveFitter exceeded time limit')
        return CallResult(outputs)
class simon(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ Simon uses a LSTM-FCN neural network trained on 18 different semantic types to infer the semantic type of each column. A hyperparameter `return_result` controls whether Simon's inferences replace existing metadata, append new columns with inferred metadata, or return a new dataframe with only the inferred columns. Simon can append multiple annotations if the hyperparameter `multi_label_classification` is set to 'True'. If `statistical_classification` is set to True, Simon will use rule-based heuristics to label categorical and ordinal columns. Finally, the `p_threshold` hyperparameter varies the prediction probability threshold for adding annotations. The following annotations will only be considered if `statistical_classification` is set to False: "https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber", "http://schema.org/addressCountry", "http://schema.org/Country", "http://schema.org/longitude", "http://schema.org/latitude", "http://schema.org/postalCode", "http://schema.org/City", "http://schema.org/State", "http://schema.org/address", "http://schema.org/email", "https://metadata.datadrivendiscovery.org/types/FileName" The following annotations will only be considered if `statistical_classification` is set to True: "https://metadata.datadrivendiscovery.org/types/OrdinalData", Arguments: hyperparams {Hyperparams} -- D3M Hyperparameter object Keyword Arguments: random_seed {int} -- random seed (default: {0}) volumes {Dict[str, str]} -- large file dictionary containing model weights (default: {None}) """ metadata = metadata_base.PrimitiveMetadata({ # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". "id": "d2fa8df2-6517-3c26-bafc-87b701c4043a", "version": __version__, "name": "simon", # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. 
"keywords": [ "Data Type Predictor", "Semantic Classification", "Text", "NLP", "Tabular", ], "source": { "name": __author__, "contact": __contact__, "uris": [ # Unstructured URIs. "https://github.com/NewKnowledge/simon-d3m-wrapper", ], }, # A list of dependencies in order. These can be Python packages, system packages, or Docker images. # Of course Python packages can also have their own dependencies, but sometimes it is necessary to # install a Python package first to be even able to run setup.py of another package. Or you have # a dependency which is not on PyPi. "installation": [ { "type": metadata_base.PrimitiveInstallationType.PIP, "package_uri": "git+https://github.com/NewKnowledge/simon-d3m-wrapper.git@{git_commit}#egg=SimonD3MWrapper" .format(git_commit=utils.current_git_commit( os.path.dirname(__file__)), ), }, { "type": "TGZ", "key": "simon_models_1", "file_uri": "http://public.datadrivendiscovery.org/simon_models_1.tar.gz", "file_digest": "d071106b823ab1168879651811dd03b829ab0728ba7622785bb5d3541496c45f", }, ], # The same path the primitive is registered with entry points in setup.py. "python_path": "d3m.primitives.data_cleaning.column_type_profiler.Simon", # Choose these from a controlled vocabulary in the schema. If anything is missing which would # best describe the primitive, make a merge request. 
"algorithm_types": [ metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK, ], "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, }) def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, volumes: typing.Dict[str, str] = None, ) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, volumes=volumes) self._volumes = volumes self._X_train: Inputs = None self._add_semantic_types: typing.List[typing.List[str]] = None self._remove_semantic_types: typing.List[typing.List[str]] = None def set_training_data(self, *, inputs: Inputs) -> None: """ Sets primitive's training data Arguments: inputs {Inputs} -- D3M dataframe """ self._X_train = inputs self._is_fit = False def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Learns column annotations using training data. Saves to apply to testing data. Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Returns: CallResult[None] """ true_target_columns = self._X_train.metadata.list_columns_with_semantic_types( ['https://metadata.datadrivendiscovery.org/types/TrueTarget']) index_columns = self._X_train.metadata.get_index_columns() # Target and index columns should be set only once, if they are set. self.has_set_target_columns = False self.has_set_index_column = False columns_to_use = self._get_columns(self._X_train.metadata) self._add_semantic_types = [] self._remove_semantic_types = [] # compute SIMON annotations self.simon_annotations = self._produce_annotations( inputs=self._X_train) logger.debug(f"simon annotations: {self.simon_annotations}") for col_idx in columns_to_use: # Target and index columns should be set only once, if they are set. 
self.has_set_target_columns = False self.has_set_index_column = False input_column = self._X_train.select_columns([col_idx]) column_metadata = self._X_train.metadata.query_column(col_idx) column_name = column_metadata.get('name', str(col_idx)) column_semantic_types = list( column_metadata.get('semantic_types', [])) # We might be here because column has a known type, but it has "https://metadata.datadrivendiscovery.org/types/SuggestedTarget" set. has_unknown_type = not column_semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in column_semantic_types # A normalized copy of semantic types, which always includes unknown type. normalized_column_semantic_types = copy.copy(column_semantic_types) # If we are processing this column and it does not have semantic type then it has missing semantic types, # we first set it, to normalize the input semantic types. If we will add any other semantic type, # we will then remove this semantic type. if has_unknown_type \ and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in self.hyperparams['detect_semantic_types'] \ and 'https://metadata.datadrivendiscovery.org/types/UnknownType' not in normalized_column_semantic_types: normalized_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/UnknownType' ) # A working copy of semantic types. 
new_column_semantic_types = copy.copy( normalized_column_semantic_types) # append simon labels if has_unknown_type: new_column_semantic_types = self._append_simon_annotations( new_column_semantic_types, col_idx) # handle target columns new_column_semantic_types = self._set_target_column( new_column_semantic_types, true_target_columns) if has_unknown_type: # handle index columns if not index_columns and not self.has_set_index_column: new_column_semantic_types = self._set_index_column( new_column_semantic_types, column_name) # handle attribute columns new_column_semantic_types = self._set_attribute_column( new_column_semantic_types) # handle additional time label new_column_semantic_types = self._set_additional_time_label( new_column_semantic_types) # Have we added any other semantic type besides unknown type? if new_column_semantic_types != normalized_column_semantic_types: if self.hyperparams[ 'remove_unknown_type'] and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in new_column_semantic_types: new_column_semantic_types.remove( 'https://metadata.datadrivendiscovery.org/types/UnknownType' ) new_column_semantic_types_set = set(new_column_semantic_types) column_semantic_types_set = set(column_semantic_types) self._add_semantic_types.append( sorted(new_column_semantic_types_set - column_semantic_types_set)) self._remove_semantic_types.append( sorted(column_semantic_types_set - new_column_semantic_types_set)) assert len(self._add_semantic_types) == len(columns_to_use) assert len(self._remove_semantic_types) == len(columns_to_use) self._is_fit = True return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Inputs]: """ Add SIMON annotations Arguments: inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Raises: 
PrimitiveNotFittedError: if primitive not fit Returns: CallResult[Outputs] -- Input pd frame with metadata augmented """ if not self._is_fit: raise PrimitiveNotFittedError("Primitive not fitted.") ## BEGIN originally from from d3m.primitives.schema_discovery.profiler.Common """ assert self._add_semantic_types is not None assert self._remove_semantic_types is not None columns_to_use, output_columns = self._produce_columns( inputs, self._add_semantic_types, self._remove_semantic_types) if self.hyperparams['replace_index_columns'] and self.hyperparams[ 'return_result'] == 'append': assert len(columns_to_use) == len(output_columns) index_columns = inputs.metadata.get_index_columns() index_columns_to_use = [] other_columns_to_use = [] index_output_columns = [] other_output_columns = [] for column_to_use, output_column in zip(columns_to_use, output_columns): if column_to_use in index_columns: index_columns_to_use.append(column_to_use) index_output_columns.append(output_column) else: other_columns_to_use.append(column_to_use) other_output_columns.append(output_column) outputs = base_utils.combine_columns( inputs, index_columns_to_use, index_output_columns, return_result='replace', add_index_columns=self.hyperparams['add_index_columns']) outputs = base_utils.combine_columns( outputs, other_columns_to_use, other_output_columns, return_result='append', add_index_columns=self.hyperparams['add_index_columns']) else: outputs = base_utils.combine_columns( inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) ## EMD originally from from d3m.primitives.schema_discovery.profiler.Common """ return CallResult(outputs, has_finished=self._is_fit) def produce_metafeatures(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Produce primitive's best guess for the structural type of each input column. 
Arguments: inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target Keyword Arguments: timeout {float} -- timeout, not considered (default: {None}) iterations {int} -- iterations, not considered (default: {None}) Raises: PrimitiveNotFittedError: if primitive not fit Returns: CallResult[Outputs] -- dataframe with two columns: "semantic type classifications" and "probabilities" Each row represents a column in the original dataframe. The column "semantic type classifications" contains a list of all semantic type labels and the column "probabilities" contains a list of the model's confidence in assigning each respective semantic type label """ if not self._is_fit: raise PrimitiveNotFittedError("Primitive not fitted.") out_df = self._produce_annotations(inputs=inputs) # add metadata to output data frame simon_df = d3m_DataFrame(out_df) # first column list of ('semantic types') col_dict = dict( simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 0))) col_dict["structural_type"] = typing.List[str] col_dict["name"] = "semantic types" col_dict["semantic_types"] = ( "http://schema.org/Text", "https://metadata.datadrivendiscovery.org/types/Attribute", ) simon_df.metadata = simon_df.metadata.update( (metadata_base.ALL_ELEMENTS, 0), col_dict) # second column ('probabilities') col_dict = dict( simon_df.metadata.query((metadata_base.ALL_ELEMENTS, 1))) col_dict["structural_type"] = typing.List[float] col_dict["name"] = "probabilities" col_dict["semantic_types"] = ( "http://schema.org/Text", "https://metadata.datadrivendiscovery.org/types/Attribute", "https://metadata.datadrivendiscovery.org/types/FloatVector") simon_df.metadata = simon_df.metadata.update( (metadata_base.ALL_ELEMENTS, 1), col_dict) return CallResult(simon_df, has_finished=self._is_fit) def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: """ originally from from d3m.primitives.schema_discovery.profiler.Common """ column_metadata = 
inputs_metadata.query_column(column_index) semantic_types = column_metadata.get('semantic_types', []) # We detect only on columns which have no semantic types or where it is explicitly set as unknown. if not semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in semantic_types: return True # A special case to handle setting "https://metadata.datadrivendiscovery.org/types/TrueTarget". if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in semantic_types: return True return False def _get_columns( self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: """ originally from from d3m.primitives.schema_discovery.profiler.Common """ def can_use_column(column_index: int) -> bool: # if overwrite, we detect on all columns if self.hyperparams['overwrite']: return True return self._can_use_column(inputs_metadata, column_index) columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) # We are OK if no columns ended up being parsed. # "base_utils.combine_columns" will throw an error if it cannot work with this. if self.hyperparams['use_columns'] and columns_not_to_use: self.logger.warning( "Not all specified columns can parsed. 
Skipping columns: %(columns)s", { 'columns': columns_not_to_use, }) return columns_to_use def _append_simon_annotations(self, new_column_semantic_types, col_idx): simon_labels = self.simon_annotations["semantic types"][col_idx] simon_probabilities = self.simon_annotations["probabilities"][col_idx] # filter labels and probs by those specified in HP filtered_labels, filtered_probabilities = [], [] for label, prob in zip(simon_labels, simon_probabilities): if SIMON_ANNOTATIONS_DICT[label] in self.hyperparams[ 'detect_semantic_types']: filtered_labels.append(SIMON_ANNOTATIONS_DICT[label]) filtered_probabilities.append(prob) if self.hyperparams["multi_label_classification"]: new_column_semantic_types.extend(filtered_labels) else: if len(filtered_labels) > 0: new_column_semantic_types.append( filtered_labels[np.argmax(filtered_probabilities)]) return new_column_semantic_types def _produce_annotations(self, inputs: Inputs) -> Outputs: """ generates dataframe with semantic type classifications and classification probabilities for each column of original dataframe Arguments: inputs {Inputs} -- D3M dataframe Returns: Outputs -- dataframe with two columns: "semantic type classifications" and "probabilities" Each row represents a column in the original dataframe. 
The column "semantic type classifications" contains a list of all semantic type labels and the column "probabilities" contains a list of the model's confidence in assigning each respective semantic type label """ # load model checkpoint checkpoint_dir = (self._volumes["simon_models_1"] + "/simon_models_1/pretrained_models/") if self.hyperparams["statistical_classification"]: execution_config = "Base.pkl" category_list = "/Categories.txt" else: execution_config = "Base_stat_geo.pkl" category_list = "/Categories_base_stat_geo.txt" with open( self._volumes["simon_models_1"] + "/simon_models_1" + category_list, "r") as f: Categories = f.read().splitlines() # create model object Classifier = Simon(encoder={}) config = Classifier.load_config(execution_config, checkpoint_dir) encoder = config["encoder"] checkpoint = config["checkpoint"] model = Classifier.generate_model(20, self.hyperparams["max_rows"], len(Categories)) Classifier.load_weights(checkpoint, None, model, checkpoint_dir) model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"]) # prepare data and make predictions frame = inputs.copy() prepped_data = encoder.encodeDataFrame(frame) preds = model.predict_on_batch(tf.constant(prepped_data)) decoded_preds = encoder.reverse_label_encode( preds, self.hyperparams["p_threshold"]) # apply statistical / ordinal classification if desired if self.hyperparams["statistical_classification"]: logger.debug( "Beginning Guessing categorical/ordinal classifications...") raw_data = frame.values guesses = [ guess(raw_data[:, i], for_types="category") for i in np.arange(raw_data.shape[1]) ] # probability of rule-based statistical / ordinal classifications = min probability of existing classifications for i, g in enumerate(guesses): if g[0] == "category": if len(decoded_preds[1][i]) == 0: guess_prob = self.hyperparams['p_threshold'] else: guess_prob = min(decoded_preds[1][i]) decoded_preds[0][i] += ("categorical", ) 
decoded_preds[1][i].append(guess_prob) if (("int" in decoded_preds[1][i]) or ("float" in decoded_preds[1][i]) or ("datetime" in decoded_preds[1][i])): decoded_preds[0][i] += ("ordinal", ) decoded_preds[1][i].append(guess_prob) logger.debug("Done with statistical variable guessing") # clear tf session, remove unnecessary files Classifier.clear_session() os.remove('unencoded_chars.json') out_df = pd.DataFrame.from_records(list(decoded_preds)).T out_df.columns = ["semantic types", "probabilities"] return out_df def _set_target_column(self, new_column_semantic_types, true_target_columns): """ originally from from d3m.primitives.schema_discovery.profiler.Common """ if not true_target_columns \ and not self.has_set_target_columns \ and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in self.hyperparams['detect_semantic_types'] \ and 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in new_column_semantic_types: # It should not be set because there are no columns with this semantic type in whole DataFrame. 
assert 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/TrueTarget') if 'https://metadata.datadrivendiscovery.org/types/Target' not in new_column_semantic_types: new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/Target') if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: new_column_semantic_types.remove( 'https://metadata.datadrivendiscovery.org/types/Attribute') self.has_set_target_columns = True return new_column_semantic_types def _set_index_column(self, new_column_semantic_types, column_name): """ originally from from d3m.primitives.schema_discovery.profiler.Common """ if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in self.hyperparams['detect_semantic_types'] \ and column_name == 'd3mIndex' \ and 'https://metadata.datadrivendiscovery.org/types/UniqueKey' in new_column_semantic_types: # It should not be set because there are no columns with this semantic type in whole DataFrame. 
assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') new_column_semantic_types.remove( 'https://metadata.datadrivendiscovery.org/types/UniqueKey') if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: new_column_semantic_types.remove( 'https://metadata.datadrivendiscovery.org/types/Attribute') self.has_set_index_column = True elif 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' in self.hyperparams['detect_semantic_types'] \ and column_name == 'd3mIndex': assert 'https://metadata.datadrivendiscovery.org/types/UniqueKey' not in new_column_semantic_types # It should not be set because there are no columns with this semantic type in whole DataFrame. assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' ) if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: new_column_semantic_types.remove( 'https://metadata.datadrivendiscovery.org/types/Attribute') self.has_set_index_column = True return new_column_semantic_types def _set_attribute_column(self, new_column_semantic_types): """ originally from from d3m.primitives.schema_discovery.profiler.Common """ if 'https://metadata.datadrivendiscovery.org/types/Attribute' in self.hyperparams['detect_semantic_types'] \ and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types \ and 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types \ and 
'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types \ and 'https://metadata.datadrivendiscovery.org/types/Attribute' not in new_column_semantic_types: new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/Attribute') return new_column_semantic_types def _set_additional_time_label(self, new_column_semantic_types): """ originally from from d3m.primitives.schema_discovery.profiler.Common """ if 'https://metadata.datadrivendiscovery.org/types/Time' in self.hyperparams['detect_semantic_types'] \ and 'http://schema.org/DateTime' in new_column_semantic_types \ and 'https://metadata.datadrivendiscovery.org/types/Time' not in new_column_semantic_types: new_column_semantic_types.append( 'https://metadata.datadrivendiscovery.org/types/Time') return new_column_semantic_types def _produce_columns( self, inputs: Inputs, add_semantic_types: typing.List[typing.List[str]], remove_semantic_types: typing.List[typing.List[str]], ) -> typing.Tuple[typing.List[int], typing.List[Outputs]]: """ originally from from d3m.primitives.schema_discovery.profiler.Common """ columns_to_use = self._get_columns(inputs.metadata) assert len(add_semantic_types), len(remove_semantic_types) if len(columns_to_use) != len(add_semantic_types): raise exceptions.InvalidStateError( "Producing on a different number of columns than fitting.") output_columns = [] for col_index, column_add_semantic_types, column_remove_semantic_types in zip( columns_to_use, add_semantic_types, remove_semantic_types): output_column = inputs.select_columns([col_index]) for remove_semantic_type in column_remove_semantic_types: output_column.metadata = output_column.metadata.remove_semantic_type( (metadata_base.ALL_ELEMENTS, 0), remove_semantic_type) for add_semantic_type in column_add_semantic_types: output_column.metadata = output_column.metadata.add_semantic_type( (metadata_base.ALL_ELEMENTS, 0), add_semantic_type) output_columns.append(output_column) assert 
len(output_columns) == len(columns_to_use) return columns_to_use, output_columns def get_params(self) -> Params: if not self._is_fit: return Params( add_semantic_types=None, remove_semantic_types=None, ) return Params( add_semantic_types=self._add_semantic_types, remove_semantic_types=self._remove_semantic_types, ) def set_params(self, *, params: Params) -> None: self._add_semantic_types = params['add_semantic_types'] self._remove_semantic_types = params['remove_semantic_types'] self._is_fit = all(param is not None for param in params.values())
"logger": "logging.Logger", "metadata": "d3m.metadata.base.PrimitiveMetadata" }, "instance_attributes": { "hyperparams": "d3m.metadata.hyperparams.Hyperparams", "random_seed": "int", "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", "volumes": "typing.Dict[str, str]", "temporary_directory": "typing.Union[NoneType, str]" } }, "structural_type": "test_primitives.increment.IncrementPrimitive", "description": "A primitive which increments each value by a fixed amount, by default 1." } """.replace('__INTERFACES_VERSION__', d3m.__version__).replace( '__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace( '__DIGEST__', IncrementPrimitive.metadata.query()['digest']) class TestIncrementPrimitive(unittest.TestCase): def call_primitive(self, primitive, method_name, **kwargs): return getattr(primitive, method_name)(**kwargs) def test_basic(self): hyperparams_class = IncrementPrimitive.metadata.get_hyperparams() primitive = IncrementPrimitive( hyperparams=hyperparams_class.defaults()) inputs = container.DataFrame(
class NearestNeighborNomination(TransformerPrimitiveBase[Inputs, Outputs,
                                                         Hyperparams]):
    """
    Creates a similarity matrix from pairwise distances, and subsequently nominates the closest neighbor in the second graph to each vertex in the first graph.
    """
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever.
        # Generated using "uuid.uuid4()".
        'id': '66e09f5b-3538-4d9a-9397-e32230608a35',
        'version': "0.1.0",
        'name': "jhu.nearest_neighbor_nomination",
        # Keywords do not have a controlled vocabulary. Authors can put here
        # whatever they find suitable.
        'keywords': ['nearest', 'neighbor', 'nomination', 'matching'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/neurodata/primitives-interfaces/blob/master/jhu_primitives/nearest_neighbor_nomination/nearest_neighbor_nomination.py',
                'https://github.com/neurodata/primitives-interfaces',
            ],
            'contact': 'mailto:[email protected]'
        },
        'description': 'Creates a similarity matrix from pairwise distances, and subsequently nominates the closest neighbor in the second graph to each vertex in the first graph.',
        'hyperparams_configuration': {},
        # A list of dependencies in order. These can be Python packages, system
        # packages, or Docker images. Of course Python packages can also have
        # their own dependencies, but sometimes it is necessary to install a
        # Python package first to be even able to run setup.py of another
        # package. Or you have a dependency which is not on PyPi.
        'installation': [{
            'type': 'UBUNTU',
            'package': 'libxml2-dev',
            'version': '2.9.4'
        }, {
            'type': 'UBUNTU',
            'package': 'libpcre3-dev',
            'version': '2.9.4'
        }, {
            'type': 'PIP',
            'package_uri':
            'git+https://github.com/neurodata/primitives-interfaces.git@{git_commit}#egg=jhu_primitives'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.graph_matching.nearest_neighbor_nomination.JHU',
        # Choose these from a controlled vocabulary in the schema. If anything
        # is missing which would best describe the primitive, make a merge
        # request.
        'algorithm_types': ["RANDOM_GRAPH"],
        'primitive_family': 'GRAPH_MATCHING',
        'preconditions': ['NO_MISSING_VALUES']
    })

    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: Dict[str, base.DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

    def produce(self,
                *,
                inputs_1: Inputs,
                inputs_2: Inputs,
                reference: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Flag which reference pairs agree with the nearest-neighbor nomination.

        The first column of ``inputs_1`` / ``inputs_2`` is read as the vertex
        id and the remaining columns as the embedding (cast to float32). For
        every row of ``inputs_1`` the row of ``inputs_2`` at the smallest
        ``cdist`` distance is nominated. A reference row is a match when the
        vertex named in its third column is the one nominated for the vertex
        named in its second column.

        Arguments:
            inputs_1 -- embedding of the first graph
            inputs_2 -- embedding of the second graph
            reference -- frame with a 'd3mIndex' column; columns 1 and 2 hold
                the vertex ids of the first and second graph respectively
            timeout, iterations -- accepted for interface compatibility and
                not used here

        Returns:
            CallResult wrapping a dataframe with columns 'd3mIndex' and
            'match' (1 for a match, 0 otherwise). ``reference`` itself is
            left unmodified.
        """
        xhat = inputs_1
        yhat = inputs_2

        # TODO: do this more carefully
        xhat_embedding = xhat.values[:, 1:].astype(np.float32)
        yhat_embedding = yhat.values[:, 1:].astype(np.float32)

        S = cdist(
            xhat_embedding,
            yhat_embedding,
        )
        # For each row of xhat, the position of the closest row of yhat.
        match = np.argmin(S, axis=1)

        # Loop-invariant column lookups, hoisted out of the loop below.
        xhat_ids = xhat[xhat.columns[0]]
        yhat_ids = yhat[yhat.columns[0]]
        first_graph_ids = reference[reference.columns[1]]
        second_graph_ids = reference[reference.columns[2]]

        matches = np.zeros(len(reference), dtype=int)
        for i, (first_id, second_id) in enumerate(
                zip(first_graph_ids, second_graph_ids)):
            e_id = xhat.index[xhat_ids == first_id]
            g_id = yhat.index[yhat_ids == second_id]
            # An id missing from either embedding cannot be a match. Skipping
            # it also avoids comparing index objects of different lengths and
            # taking the truth value of an empty array.
            if len(e_id) == 0 or len(g_id) == 0:
                continue
            # NOTE(review): index labels are used as positions into `match`,
            # which assumes inputs_1 carries a default 0..n-1 index -- confirm.
            matches[i] = 1 if g_id == match[e_id] else 0

        # Build the output from the local array instead of writing a 'match'
        # column into the caller's `reference` frame.
        predictions = {
            "d3mIndex": reference['d3mIndex'],
            "match": matches,
        }
        return base.CallResult(container.DataFrame(predictions),
                               has_finished=True,
                               iterations_done=1)

    def multi_produce(
            self,
            *,
            produce_methods: Sequence[str],
            inputs_1: Inputs,
            inputs_2: Inputs,
            reference: Inputs,
            timeout: float = None,
            iterations: int = None) -> base.MultiCallResult:  # type: ignore
        """Forward all three inputs to the base class ``_multi_produce``."""
        return self._multi_produce(produce_methods=produce_methods,
                                   timeout=timeout,
                                   iterations=iterations,
                                   inputs_1=inputs_1,
                                   inputs_2=inputs_2,
                                   reference=reference)

    def fit_multi_produce(
            self,
            *,
            produce_methods: Sequence[str],
            inputs_1: Inputs,
            inputs_2: Inputs,
            reference: Inputs,
            timeout: float = None,
            iterations: int = None) -> base.MultiCallResult:  # type: ignore
        """Forward all three inputs to the base class ``_fit_multi_produce``."""
        return self._fit_multi_produce(produce_methods=produce_methods,
                                       timeout=timeout,
                                       iterations=iterations,
                                       inputs_1=inputs_1,
                                       inputs_2=inputs_2,
                                       reference=reference)