class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        'id': '67568a80-dec2-4597-a10f-39afb13d3b9c',
        'version': '0.1.0',
        'name': "Test Primitive",
        'source': {
            'name': 'Test',
        },
        'python_path': 'd3m.primitives.test.TestPrimitive',
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.NUMERICAL_METHOD,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.OPERATOR,
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        pass

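# --- Usage sketch (not part of the original source) ---
# Class-level metadata can be inspected without instantiating the primitive;
# PrimitiveMetadata.query() is the standard d3m accessor for the dict above.
test_metadata = TestPrimitive.metadata.query()
assert test_metadata['name'] == "Test Primitive"
assert test_metadata['python_path'] == 'd3m.primitives.test.TestPrimitive'
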
class MultiLabelClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Multi-label classifier primitive
    """
    __author__ = 'UMASS/Pedram Rooshenas'
    metadata = metadata.PrimitiveMetadata({
        'id': '2dfa8611-a55d-47d6-afb6-e5d531cf5281',
        'version': config.VERSION,
        'name': "dsbox-spen-mlclassifier",
        'description': 'Multi-label classification using SPEN',
        'python_path': 'd3m.primitives.dsbox.MLCLassifier',
        'primitive_family': metadata.PrimitiveFamily.CLASSIFICATION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.FEEDFORWARD_NEURAL_NETWORK,
        ],
        'keywords': ['spen', 'multi-label', 'classification'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        # A metafeature about preconditions required for this primitive to operate well.
        'preconditions': [],
        'hyperparams_to_tune': []
    })

    def __init__(self) -> None:
        pass

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        if len(inputs) != len(outputs):
            raise ValueError('Training data sequences "inputs" and "outputs" should have the same length.')

        self._training_size = len(inputs)
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        pass

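# --- Usage sketch (assumed caller code, not from the original source) ---
# The d3m supervised-learner contract is set_training_data -> fit -> produce;
# set_training_data() above also validates that the two sequences align.
# The training lists below are dummy placeholders.
mlc = MultiLabelClassifier()
mlc.set_training_data(inputs=[[0.0, 1.0], [1.0, 0.0]], outputs=[[1, 0], [0, 1]])
mlc.fit()  # still a stub above; a real implementation would train the SPEN model here
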
class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        'id': '67568a80-dec2-4597-a10f-39afb13d3b9c',
        'version': '0.1.0',
        'name': "Test Primitive",
        'source': {
            'name': 'Test',
        },
        'installation': [{
            # Once with enum value.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/[email protected]',
        }],
        'python_path': 'd3m.primitives.test.TestPrimitive',
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.NUMERICAL_METHOD,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.OPERATOR,
    })

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        pass

class ResNet50ImageFeature(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    """
    Image feature generation using the pretrained deep neural network ResNet50.

    Parameters
    ----------
    _layer_index : int, default: 0, domain: range(11)
        Layer of the network to use to generate features. Smaller indices are closer to the
        output layers of the network.

    _resize_data : Boolean, default: False, domain: {True, False}
        If True, resize images to 224 by 224.
    """
    __author__ = 'USC ISI'
    metadata = metadata.PrimitiveMetadata({
        'id': 'dsbox-featurizer-image-resnet50',
        'version': 'v' + config.VERSION,
        'name': "DSBox Image Featurizer ResNet50",
        'description': 'Generate image features using ResNet50',
        'python_path': 'd3m.primitives.dsbox.ResNet50ImageFeature',
        'primitive_family': metadata.PrimitiveFamily.FEATURE_EXTRACTION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.FEEDFORWARD_NEURAL_NETWORK,
        ],
        'keywords': ['image', 'featurization', 'resnet50'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        # A metafeature about preconditions required for this primitive to operate well.
        'preconditions': [],
        'hyperparams_to_tune': []
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Union[typing.Dict[str, str], None] = None) -> None:
        # All primitives must define these attributes
        self.hyperparams = hyperparams
        self.random_seed = random_seed
        self.docker_containers = docker_containers

        # All other attributes must be private with leading underscore
        self._has_finished = False
        self._iterations_done = False
        # ============TODO: these three could be hyperparams=========
        self._layer_index = 0
        self._preprocess_data = True
        self._resize_data = False
        self._RESNET50_MODEL = None
        # ===========================================================
        if self._RESNET50_MODEL is None:
            self._RESNET50_MODEL = resnet50.ResNet50(weights='imagenet')
        self._layer_numbers = [-2, -4, -8, -11, -14, -18, -21, -24, -30, -33, -36]
        # Clamp the layer index into the valid range.
        if self._layer_index < 0:
            self._layer_index = 0
        elif self._layer_index >= len(self._layer_numbers):
            self._layer_index = len(self._layer_numbers) - 1
        self._layer_number = self._layer_numbers[self._layer_index]
        self._org_model = self._RESNET50_MODEL
        self._model = Model(self._org_model.input,
                            self._org_model.layers[self._layer_number].output)
        self._annotation = None

    def _preprocess(self, image_tensor):
        """Preprocess image data by modifying it directly"""
        resnet50.preprocess_input(image_tensor)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """Apply neural network-based feature extraction to image_tensor"""
        image_tensor = inputs
        if not len(image_tensor.shape) == 4:
            raise ValueError('Expected input tensor to have 4 dimensions')
        resized = False
        if self._resize_data:
            if not (image_tensor.shape[1] == 224 and image_tensor.shape[2] == 224):
                resized = True
                y = np.empty((image_tensor.shape[0], 224, 224, 3))
                for index in range(image_tensor.shape[0]):
                    y[index] = imresize(image_tensor[index], (224, 224))
                image_tensor = y
        # preprocess() modifies the data. For now just copy the data.
        if self._preprocess_data:
            if resized:
                # Okay to modify image_tensor, since it's not the input data
                data = image_tensor
            else:
                data = image_tensor.copy()
            self._preprocess(data)
        else:
            data = image_tensor
        result = self._model.predict(data)
        self._has_finished = True
        self._iterations_done = True
        return CallResult(result.reshape(result.shape[0], -1),
                          self._has_finished, self._iterations_done)

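# --- Usage sketch (assumed caller code, not from the original source) ---
# produce() expects a 4-D tensor (batch, height, width, channels). With the
# defaults above (_resize_data = False), the caller must supply 224x224 RGB
# images; passing hyperparams=None is a simplification for this sketch.
featurizer = ResNet50ImageFeature(hyperparams=None)
images = np.random.rand(2, 224, 224, 3)            # two dummy RGB images
features = featurizer.produce(inputs=images).value
# features has shape (2, n_features): one flattened feature vector per image
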
class TensorMachinesBinaryClassification(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Learns a polynomial function using logistic regression for binary classification by modeling the
    polynomial's coefficients as low-rank tensors. Meant as a faster, more scalable alternative to
    polynomial random feature map approaches like CRAFTMaps.
    """

    __author__ = "ICSI"  # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': 'ecc83605-d340-490d-9a2d-81c2ea6cb6cb',  # uuid3(NAMESPACE_DNS, "realML.kernel.TensorMachineBinaryClassification" + __version__)
        'version': __version__,
        'name': 'Tensor Machine Binary Classifier',
        'description': 'Fit a polynomial function for logistic regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path': 'd3m.primitives.realML.kernel.TensorMachinesBinaryClassification',
        'primitive_family': metadata_module.PrimitiveFamily.CLASSIFICATION,
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LOGISTIC_REGRESSION,
        ],
        'keywords': ['kernel learning', 'binary classification', 'adaptive features', 'polynomial model', 'classification'],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris': [
                # URI redacted in this copy of the source.
                "http://*****:*****",
            ],
        },
        'installation': [{
            # The opening of this entry was lost to the same redaction; a PIP install from a
            # pinned git commit is assumed here, matching the other realML primitives.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [
            # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesBinaryClassification.py',
        ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs, self._norms) = tm_preprocess(self._training_inputs)
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        if len(self._training_outputs.shape) == 1:
            self._training_outputs = np.expand_dims(self._training_outputs, axis=1)

        (self._weights, _) = tm_fit(self._training_inputs, self._training_outputs, 'bc',
                                    self.hyperparams['r'], self.hyperparams['q'],
                                    self.hyperparams['gamma'], self.hyperparams['solver'],
                                    self.hyperparams['epochs'], self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)
        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'], self.hyperparams['r'], 'bc')
        return CallResult(sign(pred_test.flatten()).astype(int))

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']

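# --- Usage sketch (assumed caller code, not from the original source) ---
# get_params()/set_params() round-trip the fitted state, so a clone can serve
# predictions without re-fitting; hp, X_train, y_train, and X_test are hypothetical.
model = TensorMachinesBinaryClassification(hyperparams=hp)
model.set_training_data(inputs=X_train, outputs=y_train)
model.fit()
saved = model.get_params()                    # Params(weights=..., norms=...)

clone = TensorMachinesBinaryClassification(hyperparams=hp)
clone.set_params(params=saved)                # restores weights and norms
labels = clone.produce(inputs=X_test).value   # integer labels from sign()
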
class TensorMachinesRegularizedLeastSquares(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Fits an l2-regularized least squares polynomial regression model by modeling the coefficients of
    the polynomial with a low-rank tensor. Intended as a scalable alternative to polynomial random
    feature maps like CRAFTMaps.
    """

    __author__ = "ICSI"  # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': '2d8155bb-3ca8-39de-8964-adb21225868e',
        'version': __version__,
        'name': 'Tensor Machine Regularized Least Squares',
        'description': 'Fit a polynomial function for l2-regularized regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path': 'd3m.primitives.realML.kernel.TensorMachinesRegularizedLeastSquares',
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD,
            metadata_module.PrimitiveAlgorithmType.POLYNOMIAL_NEURAL_NETWORK
        ],
        'keywords': ['kernel learning', 'polynomial regression', 'adaptive features', 'polynomial model', 'regression'],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris': [
                # URI redacted in this copy of the source.
                "http://*****:*****",
            ],
        },
        'installation': [{
            # The opening of this entry was lost to the same redaction; a PIP install from a
            # pinned git commit is assumed here, matching the other realML primitives.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [
            # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesRegularizedLeastSquares.py',
        ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs, self._norms) = tm_preprocess(self._training_inputs)
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        (self._weights, _) = tm_fit(self._training_inputs, self._training_outputs, 'regression',
                                    self.hyperparams['r'], self.hyperparams['q'],
                                    self.hyperparams['gamma'], self.hyperparams['solver'],
                                    self.hyperparams['epochs'], self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True
        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)
        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'], self.hyperparams['r'], 'regression')
        return CallResult(pred_test.flatten())

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']

class MonomialPrimitive(supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    # It is important to provide a docstring because this docstring is used as a description of
    # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive.
    """
    A primitive which fits output = a * input.
    """

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '4a0336ae-63b9-4a42-860e-86c5b64afbdd',
        'version': '0.1.0',
        'name': "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name': "Test team",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._a: float = None
        self._training_inputs: Inputs = None
        self._training_outputs: Outputs = None
        self._fitted: bool = False

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        if self._a is None:
            raise ValueError("Calling produce before fitting.")

        # We compute the result. We use (...) here and not [...] to create a
        # generator and not a list which would then just be copied into "List".
        result = (self._a * input + self.hyperparams['bias'] for input in inputs)

        # We convert a regular list to container list which supports metadata attribute.
        outputs: container.List[float] = container.List[float](result)

        # We clear old metadata (but which keeps history and link to inputs metadata) and set new metadata.
        # "for_value" tells that this new metadata will be associated with "outputs",
        # and "source" tells which primitive generated this metadata.
        metadata = inputs.metadata.clear({
            'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
            'structural_type': type(outputs),
            'dimension': {
                'length': len(outputs)
            }
        }, for_value=outputs, source=self).update((metadata_module.ALL_ELEMENTS,), {
            'structural_type': float,
        }, source=self)

        # Set metadata attribute.
        outputs.metadata = metadata

        # Wrap it into default "CallResult" object: we are not doing any iterations.
        return base.CallResult(outputs)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        if not self._training_inputs or not self._training_outputs:
            raise ValueError("Missing training data.")

        quotients = [output / input for output, input in zip(self._training_outputs, self._training_inputs) if input != 0]
        self._a = sum(quotients) / len(quotients)
        self._fitted = True

        return base.CallResult(None)

    def get_params(self) -> Params:
        # You can pass a dict or keyword arguments.
        return Params(a=self._a)

    def set_params(self, *, params: Params) -> None:
        # Params are just a fancy dict.
        self._a = params['a']

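# --- Worked example (assumed caller code, not from the original source) ---
# With training pairs drawn from output = 2 * input, fit() averages the
# output/input quotients, so _a becomes 2.0 and produce() returns
# 2 * x + bias for each input; 'hp' is a hypothetical Hyperparams carrying 'bias'.
monomial = MonomialPrimitive(hyperparams=hp)
monomial.set_training_data(
    inputs=container.List[float]([1.0, 2.0, 4.0]),
    outputs=container.List[float]([2.0, 4.0, 8.0]))
monomial.fit()
predictions = monomial.produce(inputs=container.List[float]([3.0])).value
# predictions[0] == 2.0 * 3.0 + hp['bias']
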
class BBNSVC(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping sklearn.svm.SVC
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_module.PrimitiveMetadata({
        "algorithm_types": ['SUPPORT_VECTOR_MACHINE'],
        "name": "sklearn.svm.classes.SVC",
        "primitive_family": "CLASSIFICATION",
        "python_path": "d3m.primitives.bbn.time_series.BBNSVC",
        "source": {'name': 'JPL'},
        "version": "0.1.0",
        "id": "a2ee7b2b-99c6-4326-b2e7-e081cd292d78",
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.datadrivendiscovery.org/jpl/d3m_sklearn_wrap.git@{git_commit}'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }]
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, str] = None, _verbose: int = 0) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = SVC(
            C=self.hyperparams['C'],
            kernel=self.hyperparams['kernel'],
            degree=self.hyperparams['degree'],
            gamma=self.hyperparams['gamma'],
            coef0=self.hyperparams['coef0'],
            probability=self.hyperparams['probability'],
            shrinking=self.hyperparams['shrinking'],
            tol=self.hyperparams['tol'],
            class_weight=self.hyperparams['class_weight'],
            max_iter=self.hyperparams['max_iter'],
            decision_function_shape=self.hyperparams['decision_function_shape'],
            verbose=_verbose,
            random_state=self.random_seed,
        )
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        self._clf.fit(self._training_inputs, self._training_outputs)
        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return CallResult(self._clf.predict(inputs))

    def produce_log_proba(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        # Note: requires the 'probability' hyperparameter to be True.
        return CallResult(self._clf.predict_log_proba(inputs))

    def get_params(self) -> Params:
        return Params(
            support=self._clf.support_,
            support_vectors=self._clf.support_vectors_,
            n_support=self._clf.n_support_,
            dual_coef=self._clf.dual_coef_,
            coef=self._clf.coef_,
            intercept=self._clf.intercept_,
        )

    def set_params(self, *, params: Params) -> None:
        # Note: 'coef' is not restored because SVC exposes coef_ as a read-only
        # property derived from dual_coef_.
        self._clf.support_ = params.support
        self._clf.support_vectors_ = params.support_vectors
        self._clf.n_support_ = params.n_support
        self._clf.dual_coef_ = params.dual_coef
        self._clf.intercept_ = params.intercept

class MultiTableFeaturization(FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    __author__ = 'USC ISI'
    metadata = metadata.PrimitiveMetadata({
        'id': 'dsbox-multi-table-featurization-aggregation',
        'version': 'v' + config.VERSION,
        'name': "DSBox Multiple Table Featurizer Aggregation",
        'description': 'Generate a featurized table from multiple-table dataset using aggregation',
        'python_path': 'd3m.primitives.dsbox.MultiTableFeaturization',
        'primitive_family': metadata.PrimitiveFamily.FEATURE_EXTRACTION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.RELATIONAL_DATA_MINING,
        ],
        'keywords': ['multiple table'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        # A metafeature about preconditions required for this primitive to operate well.
        'preconditions': [],
        'hyperparams_to_tune': []
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Union[typing.Dict[str, str], None] = None) -> None:
        # All primitives must define these attributes
        self.hyperparams = hyperparams
        self.random_seed = random_seed
        self.docker_containers = docker_containers

        # All other attributes must be private with leading underscore
        self._has_finished = False
        self._iterations_done = False
        self._verbose = hyperparams['verbose'] if hyperparams else 0

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if timeout is None:
            big_table = self._core(inputs)
            self._has_finished = True
            self._iterations_done = True
            return CallResult(big_table, self._has_finished, self._iterations_done)
        else:
            # setup the timeout
            with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
                assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

                # core computations
                big_table = self._core(inputs)

            if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
                self._has_finished = True
                self._iterations_done = True
                return CallResult(big_table, self._has_finished, self._iterations_done)
            elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
                self._has_finished = False
                self._iterations_done = False
                return CallResult(None, self._has_finished, self._iterations_done)

    def _core(self, inputs) -> Outputs:
        """Core calculations."""
        data = inputs[0]
        names = inputs[1][:-1]
        master_table_name = inputs[1][-1]

        # TODO: format names, make sure it has to be like xx.csv;
        # for example, if the original names are like [0, 1, 2], make them [0.csv, 1.csv, 2.csv]

        # step 1: get relation matrix
        relation_matrix = get_relation_matrix(data, names)

        # step 2: get primary key - foreign key relationships
        relations = relationMat2foreignKey(data, names, relation_matrix)
        # print(relations)  # to see if the relations are correct

        # step 3: featurization
        aggregator = Aggregator(relations, data, names)
        big_table = aggregator.forward(master_table_name)

        return big_table

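# --- Usage sketch (assumed caller code, not from the original source) ---
# With a timeout, produce() runs the aggregation inside a stopit
# ThreadingTimeout and signals failure through CallResult rather than raising;
# 'hp' and 'dataset' (the (tables, names) input pair) are hypothetical.
multi = MultiTableFeaturization(hyperparams=hp)
call = multi.produce(inputs=dataset, timeout=60.0)
if call.value is None:
    # Timed out: has_finished and iterations_done were reported as False.
    pass
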
class JHUGraph(ClusteringPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):

    # TODO: Create metadata for this primitive; the fields below are placeholders
    # copied from the MonomialPrimitive example.
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "crap",
        'name': "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name': "boss",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
    })

    _adjacency_matrix = None
    _num_vertices = None
    _num_edges = None
    _directed = None
    _weighted = None
    _dangling_nodes = None

    def read_graph(self, *, fname: str) -> None:
        dtype = self.hyperparams['dtype']

        if dtype == "gml":
            self._object = read_graph(fname, "gml")
        elif dtype.startswith("edge"):
            self._object = read_graph(fname, "edge")
        else:
            raise NotImplementedError("Reading graphs of type '{}'".format(dtype))

        self._num_vertices = ig_get_num_vertices(self._object)
        self._num_edges = ig_get_num_edges(self._object)
        self._directed = ig_is_directed(self._object)
        self._weighted = ig_is_weighted(self._object)

    def compute_statistics(self) -> Outputs:
        self._dangling_nodes = ig_get_dangling_nodes(self._object)

    def get_adjacency_matrix(self) -> Outputs:
        return ig_get_adjacency_matrix(self._object)

    def get_dense_matrix(self) -> Outputs:
        return ig_get_dense_matrix(self._object)

    def get_num_vertices(self) -> int:
        return self._num_vertices

    def get_num_edges(self) -> int:
        return self._num_edges

    def is_directed(self) -> bool:
        return self._directed

    def is_weighted(self) -> bool:
        return self._weighted

    def get_dangling_nodes(self) -> Outputs:
        if self._dangling_nodes is None:
            self.compute_statistics()
        return self._dangling_nodes

    def summary(self) -> None:
        ig_summary(self._object)

    def set_training_data(self, *, inputs: Inputs) -> None:  # type: ignore
        pass

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return base.CallResult(self.get_adjacency_matrix())

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return base.CallResult(None)

    def get_params(self) -> Params:
        return Params(other={})

    def set_params(self, *, params: Params) -> None:
        return None

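# --- Usage sketch (assumed caller code, not from the original source) ---
# read_graph() loads the igraph object and caches basic statistics, after
# which the accessors are cheap; 'hp' (carrying the 'dtype' hyperparameter)
# and the file path are hypothetical.
graph = JHUGraph(hyperparams=hp)
graph.read_graph(fname='example.gml')
print(graph.get_num_vertices(), graph.get_num_edges(), graph.is_directed())
adjacency = graph.get_adjacency_matrix()
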
class duke(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "46612a42-6120-3559-9db9-3aa9a76eb94f",
        'version': __version__,
        'name': "duke",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Dataset Descriptor'],
        'source': {
            'name': __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/duke-thin-client",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/duke-thin-client.git@{git_commit}#egg=DukeThinClient'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.distil.duke',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._decoder = JSONDecoder()
        self._params = {}

    def fit(self) -> None:
        pass

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        pass

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Accept a pandas data frame and predict the structural type of each column.

        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs
            A list with one entry per column in the input pandas frame. Each entry is a list of
            strings corresponding to that column's multi-label classification.
        """
        filename = inputs[1]
        with open(filename, 'rb') as f:
            files = {'file': f}
            try:
                r = requests.post(inputs[0] + "/fileUpload", files=files)
                return CallResult(self._decoder.decode(r.text))
            except Exception:
                # Should probably do some more sophisticated error logging here
                return CallResult("Failed processing input file")

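# --- Usage sketch (assumed caller code, not from the original source) ---
# produce() expects inputs[0] to be the base URL of a running Duke service and
# inputs[1] a path to the file to upload; both values below are hypothetical,
# as is 'hp'.
descriptor = duke(hyperparams=hp)
result = descriptor.produce(inputs=['http://localhost:5000', 'dataset.csv'])
column_labels = result.value  # one list of labels per input column
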
class RFMPreconditionedGaussianKRR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Performs Gaussian kernel regression using a random feature map to precondition the
    problem for faster convergence: forms the kernel

        K_{ij} = exp(-||x_i - x_j||^2 / (2 sigma^2))

    and solves

        alphahat = argmin ||K alpha - y||_F^2 + lambda ||alpha||_F^2 .

    Predictions are then formed by

        ypred = K(trainingData, x) alphahat
    """

    __author__ = "ICSI"  # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': '511c3536-2fff-369a-b81d-96755ff5b81b',
        'version': __version__,
        'name': 'RFM Preconditioned Gaussian Kernel Ridge Regression',
        'description': 'Gaussian regression using random fourier features as a preconditioner for faster solves',
        'python_path': 'd3m.primitives.realML.kernel.RFMPreconditionedGaussianKRR',
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD
        ],
        'keywords': ['kernel learning', 'kernel ridge regression', 'preconditioned CG', 'Gaussian', 'RBF', 'regression'],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris': [
                # URI redacted in this copy of the source.
                "http://*****:*****",
            ],
        },
        'installation': [{
            # The opening of this entry was lost to the same redaction; a PIP install from a
            # pinned git commit is assumed here, matching the other realML primitives.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [
            # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/RFMPreconditionedGaussianKRR.py',
        ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        """
        Initializes the preconditioned Gaussian kernel ridge regression primitive.
""" super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._seed = random_seed self._Xtrain = None self._ytrain = None self._fitted = False np.random.seed(random_seed) def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: """ Sets the training data: Input: array, shape = [n_samples, n_features] Output: array, shape = [n_samples, n_targets] Only uses one input and output """ self._Xtrain = inputs self._ytrain = outputs maxPCGsize = 20000 # TODO: make a control hyperparameter for when to switch to GS if len(self._ytrain.shape) == 1: self._ytrain = np.expand_dims(self._ytrain, axis=1) if self._Xtrain.shape[0] > maxPCGsize: print("need to implement Gauss-Siedel for large datasets; currently training with a smaller subset") choices = np.random.choice(self._Xtrain.shape[0], size=maxPCGsize, replace=False) self._Xtrain = self._Xtrain[choices, :] self._ytrain = self._ytrain[choices, :] self._n, self._d = self._Xtrain.shape self._fitted = False def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: """ Learns the kernel regression coefficients alpha given training pairs (X,y) """ if self._fitted: return CallResult(None) if self._Xtrain is None or self._ytrain is None: raise ValueError("Missing training data.") self._U = generateGaussianPreconditioner(self._Xtrain, self.hyperparams['sigma'], self.hyperparams['lparam']) def mykernel(X, Y): return GaussianKernel(X, Y, self.hyperparams['sigma']) self._coeffs = PCGfit(self._Xtrain, self._ytrain, mykernel, self._U, self.hyperparams['lparam'], self.hyperparams['eps'], self.hyperparams['maxIters']) self._fitted = True return CallResult(None) def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: """ Predict the value for each sample in X Inputs: X: array of shape [n_samples, n_features] Outputs: y: array of shape [n_samples, n_targets] """ return CallResult(GaussianKernel(inputs, self._Xtrain, self.hyperparams['sigma']).dot(self._coeffs).flatten()) def set_params(self, *, params: Params) -> None: self._Xtrain = params['exemplars'] self._coeffs = params['coeffs'] def get_params(self) -> Params: return Params(exemplars=self._Xtrain, coeffs=self._coeffs)
class AdjacencySpectralEmbedding(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "0.3.0",
        'name': "jhu.ase",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.AdjacencySpectralEmbedding',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['ase primitive'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/youngser/D3M/primitives-interfaces/jhu_primitives/ase/ase.py',
                # 'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/youngser/D3M/primitives-interfaces.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/youngser/D3M/primitives-interfaces.git@{git_commit}#egg=jhu.ase'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            "HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"
        ],
        'primitive_family': "DATA_TRANSFORMATION"
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        # def embed(self, *, g : JHUGraph, dim: int):
        """
        Perform Adjacency Spectral Embedding on a graph.

        TODO: YP description

        **Positional Arguments:**

        g:
            - Graph in JHUGraph format

        **Optional Arguments:**

        dim:
            - The number of dimensions in which to embed the data
        """
        dim = self.hyperparams['dim']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "ase.interface.R")
        cmd = """
        source("%s")
        fn <- function(inputs, dim) {
            ase.interface(inputs, dim)
        }
        """ % path
        print(cmd)

        result = np.array(robjects.r(cmd)(inputs, dim))
        outputs = container.ndarray(result)

        return base.CallResult(outputs)
        # return np.array(robjects.r(cmd)(g._object, dim))

    def set_training_data(self) -> None:  # type: ignore
        """
        A noop.
        """
        return

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        """
        A noop.
        """
        return

    def get_params(self) -> None:
        """
        A noop.
        """
        return None

    def set_params(self, *, params: None) -> None:
        """
        A noop.
        """
        return

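# --- Usage sketch (assumed caller code, not from the original source) ---
# The primitive delegates to the R implementation in ase.interface.R via rpy2,
# so R and that script must be available; 'hp' (carrying 'dim') and 'graph'
# are hypothetical.
ase = AdjacencySpectralEmbedding(hyperparams=hp)
embedding = ase.produce(inputs=graph).value  # container.ndarray, one row per vertex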