Example #1
        class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs,
                                                                 Outputs,
                                                                 Hyperparams]):
            metadata = metadata_module.PrimitiveMetadata({
                'id': '67568a80-dec2-4597-a10f-39afb13d3b9c',
                'version': '0.1.0',
                'name': "Test Primitive",
                'source': {
                    'name': 'Test',
                },
                'python_path': 'd3m.primitives.test.TestPrimitive',
                'algorithm_types': [
                    metadata_module.PrimitiveAlgorithmType.NUMERICAL_METHOD,
                ],
                'primitive_family': metadata_module.PrimitiveFamily.OPERATOR,
            })

            def produce(self,
                        *,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> base.CallResult[Outputs]:
                pass
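Each of these examples assumes that Inputs, Outputs, and Hyperparams are defined elsewhere in the primitive's module. As a rough orientation, a minimal sketch of such definitions could look like the following; the container type and the empty Hyperparams class are illustrative assumptions, not taken from the original code:

from d3m import container
from d3m.metadata import hyperparams

# Container types the primitive consumes and produces; List is just one possible choice.
Inputs = container.List
Outputs = container.List

# Even a primitive with no tunable hyper-parameters declares a Hyperparams class.
class Hyperparams(hyperparams.Hyperparams):
    pass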
Example #2
class MultiLabelClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                          Params,
                                                          Hyperparams]):
    """
  Multi-label classfier primitive
  """

    __author__ = 'UMASS/Pedram Rooshenas'
    metadata = metadata.PrimitiveMetadata({
        'id': '2dfa8611-a55d-47d6-afb6-e5d531cf5281',
        'version': config.VERSION,
        'name': "dsbox-spen-mlclassifier",
        'description': 'Multi-label classification using SPEN',
        'python_path': 'd3m.primitives.dsbox.MLCLassifier',
        'primitive_family': metadata.PrimitiveFamily.CLASSIFICATION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.FEEDFORWARD_NEURAL_NETWORK,
        ],
        'keywords': ['spen', 'multi-label', 'classification'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.

        # A metafeature about preconditions required for this primitive to operate well.
        'precondition': [],
        'hyperparams_to_tune': []
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        # Conform to the SupervisedLearnerPrimitiveBase constructor signature.
        super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                         docker_containers=docker_containers)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        if len(inputs) != len(outputs):
            raise ValueError(
                'Training data sequences "inputs" and "outputs" should have the same length.'
            )
        self._training_size = len(inputs)
        self._training_inputs = inputs
        self._training_outputs = outputs

        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        pass
Example #3
            class TestPrimitive(
                    transformer.TransformerPrimitiveBase[Inputs, Outputs,
                                                         Hyperparams]):
                metadata = metadata_module.PrimitiveMetadata({
                    'id': '67568a80-dec2-4597-a10f-39afb13d3b9c',
                    'version': '0.1.0',
                    'name': "Test Primitive",
                    'source': {
                        'name': 'Test',
                    },
                    'installation': [{
                        # Once with enum value.
                        'type': metadata_module.PrimitiveInstallationType.PIP,
                        'package_uri': 'git+https://gitlab.com/datadrivendiscovery/[email protected]',
                    }],
                    'python_path': 'd3m.primitives.test.TestPrimitive',
                    'algorithm_types': [
                        metadata_module.PrimitiveAlgorithmType.NUMERICAL_METHOD,
                    ],
                    'primitive_family': metadata_module.PrimitiveFamily.OPERATOR,
                })

                def produce(
                        self,
                        *,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> base.CallResult[Outputs]:
                    pass
Example #4
class ResNet50ImageFeature(FeaturizationTransformerPrimitiveBase[Inputs,
                                                                 Outputs,
                                                                 Hyperparams]):
    """
    Image Feature Generation using pretrained deep neural network RestNet50.

    Parameters
    ----------
    _layer_index : int, default: 0, domain: range(11)
        Layer of the network to use to generate features. Smaller
        indices are closer to the output layers of the network.

    _resize_data : Boolean, default: True, domain: {True, False}
        If True resize images to 224 by 224.
    """

    __author__ = 'USC ISI'
    metadata = metadata.PrimitiveMetadata({
        'id': 'dsbox-featurizer-image-resnet50',
        'version': 'v' + config.VERSION,
        'name': "DSBox Image Featurizer ResNet50",
        'description': 'Generate image features using ResNet50',
        'python_path': 'd3m.primitives.dsbox.ResNet50ImageFeature',
        'primitive_family': metadata.PrimitiveFamily.FEATURE_EXTRACTION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.FEEDFORWARD_NEURAL_NETWORK,
        ],
        'keywords': ['image', 'featurization', 'resnet50'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.

        # A metafeature about preconditions required for this primitive to operate well.
        'precondition': [],
        'hyperparams_to_tune': []
    })

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[str, str], None] = None
    ) -> None:
        # All primitives must define these attributes
        self.hyperparams = hyperparams
        self.random_seed = random_seed
        self.docker_containers = docker_containers

        # All other attributes must be private with leading underscore
        self._has_finished = False
        self._iterations_done = False

        #============TODO: these three could be hyperparams=========
        self._layer_index = 0
        self._preprocess_data = True
        self._resize_data = False
        self._RESNET50_MODEL = None
        #===========================================================

        if self._RESNET50_MODEL is None:
            self._RESNET50_MODEL = resnet50.ResNet50(weights='imagenet')
        self._layer_numbers = [
            -2, -4, -8, -11, -14, -18, -21, -24, -30, -33, -36
        ]
        if self._layer_index < 0:
            self._layer_index = 0
        elif self._layer_index >= len(self._layer_numbers):
            self._layer_index = len(self._layer_numbers) - 1

        self._layer_number = self._layer_numbers[self._layer_index]

        self._org_model = self._RESNET50_MODEL
        self._model = Model(self._org_model.input,
                            self._org_model.layers[self._layer_number].output)

        self._annotation = None

    def _preprocess(self, image_tensor):
        """Preprocess image data by modifying it directly"""
        resnet50.preprocess_input(image_tensor)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Apply neural network-based feature extraction to image_tensor"""
        image_tensor = inputs

        if len(image_tensor.shape) != 4:
            raise ValueError('Expected input shape to have 4 dimensions')

        resized = False
        if self._resize_data:
            if not (image_tensor.shape[1] == 224
                    and image_tensor.shape[2] == 224):
                resized = True
                y = np.empty((image_tensor.shape[0], 224, 224, 3))
                for index in range(image_tensor.shape[0]):
                    y[index] = imresize(image_tensor[index], (224, 224))
                image_tensor = y

        # preprocess() modifies the data. For now just copy the data.
        if self._preprocess_data:
            if resized:
                # Okay to modify image_tensor, since it's not the original input
                data = image_tensor
            else:
                data = image_tensor.copy()
            self._preprocess(data)
        else:
            data = image_tensor
        result = self._model.predict(data)

        self._has_finished = True
        self._iterations_done = True
        return CallResult(result.reshape(result.shape[0], -1),
                          self._has_finished, self._iterations_done)
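A hypothetical call sketch for this featurizer. Hyperparams.defaults() is the standard d3m way to obtain default hyper-parameters; the tensor shape follows the 4-dimension check in produce() above:

import numpy as np

# Hypothetical usage: a batch of two 224x224 RGB images as a 4-D tensor.
featurizer = ResNet50ImageFeature(hyperparams=Hyperparams.defaults())
images = np.random.rand(2, 224, 224, 3)
features = featurizer.produce(inputs=images).value
print(features.shape)  # (2, n_features): one flattened feature vector per image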
Example #5
class TensorMachinesBinaryClassification(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Learns a polynomial function using logistic regression for binary classification by modeling the polynomial's coefficients as low-rank tensors.
    Meant as a faster, more scalable alternative to polynomial random feature map approaches like CRAFTMaps.
    """

    __author__ = "ICSI" # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': 'ecc83605-d340-490d-9a2d-81c2ea6cb6cb', #uuid3(NAMESPACE_DNS, "realML.kernel.TensorMachineBinaryClassification" + __version__),
        'version': __version__,
        'name': 'Tensor Machine Binary Classifier',
        'description': 'Fit a polynomial function for logistic regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path': 'd3m.primitives.realML.kernel.TensorMachinesBinaryClassification',
        'primitive_family': metadata_module.PrimitiveFamily.CLASSIFICATION,
        'algorithm_types' : [
            metadata_module.PrimitiveAlgorithmType.LOGISTIC_REGRESSION,
        ],
        'keywords' : ['kernel learning', 'binary classification', 'adaptive features', 'polynomial model', 'classification'],
        'source' : {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris' : [
                # URI masked in the original source.
                'http://*****:*****',
            ],
        },
        'installation': [{
            # Type assumed to be PIP; the URI is masked in the original source.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [ # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesBinaryClassification.py',
            ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs

        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs, self._norms) = tm_preprocess(self._training_inputs)

        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        if len(self._training_outputs.shape) == 1:
            self._training_outputs = np.expand_dims(self._training_outputs, axis=1)
        (self._weights, _) = tm_fit(self._training_inputs, self._training_outputs, 'bc',
                                    self.hyperparams['r'], self.hyperparams['q'],
                                    self.hyperparams['gamma'], self.hyperparams['solver'],
                                    self.hyperparams['epochs'], self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)

        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'],
                               self.hyperparams['r'], 'bc')
        return CallResult(sign(pred_test.flatten()).astype(int))

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']
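A hypothetical training and prediction sketch for this primitive, assuming Hyperparams.defaults() supplies the r, q, gamma, solver, epochs, alpha, and preprocess values that fit() reads:

# X_train, X_test: arrays of shape (n_samples, n_features);
# y_train: labels in {-1, +1}, as implied by the sign() call in produce().
clf = TensorMachinesBinaryClassification(hyperparams=Hyperparams.defaults())
clf.set_training_data(inputs=X_train, outputs=y_train)
clf.fit()
predictions = clf.produce(inputs=X_test).value  # integer -1/+1 predictions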
Example #6
class TensorMachinesRegularizedLeastSquares(
        SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Fits an l2-regularized least squares polynomial regression model by modeling the coefficients of the polynomial with a low-rank tensor. Intended
    as a scalable alternative to polynomial random feature maps like CRAFTMaps.
    """

    __author__ = "ICSI"  # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': '2d8155bb-3ca8-39de-8964-adb21225868e',
        'version': __version__,
        'name': 'Tensor Machine Regularized Least Squares',
        'description': 'Fit a polynomial function for l2-regularized regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path': 'd3m.primitives.realML.kernel.TensorMachinesRegularizedLeastSquares',
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD,
            metadata_module.PrimitiveAlgorithmType.POLYNOMIAL_NEURAL_NETWORK
        ],
        'keywords': [
            'kernel learning', 'polynomial regression', 'adaptive features',
            'polynomial model', 'regression'
        ],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris': [
                # URI masked in the original source.
                'http://*****:*****',
            ],
        },
        'installation': [{
            # Type assumed to be PIP; the URI is masked in the original source.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [  # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesRegularizedLeastSquares.py',
        ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:

        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs

        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs,
             self._norms) = tm_preprocess(self._training_inputs)

        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        (self._weights, _) = tm_fit(self._training_inputs,
                                    self._training_outputs,
                                    'regression',
                                    self.hyperparams['r'],
                                    self.hyperparams['q'],
                                    self.hyperparams['gamma'],
                                    self.hyperparams['solver'],
                                    self.hyperparams['epochs'],
                                    self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)

        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'],
                               self.hyperparams['r'], 'regression')
        return CallResult(pred_test.flatten())

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']
Example #7
class MonomialPrimitive(
        supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                           Params,
                                                           Hyperparams]):
    # It is important to provide a docstring because this docstring is used as a description of
    # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive.
    """
    A primitive which fits output = a * input.
    """

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': '4a0336ae-63b9-4a42-860e-86c5b64afbdd',
        'version': "crap",
        'name': "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name': "boss",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._a: float = None
        self._training_inputs: Inputs = None
        self._training_outputs: Outputs = None
        self._fitted: bool = False

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        if self._a is None:
            raise ValueError("Calling produce before fitting.")

        # We compute the result. We use (...) here and not [...] to create a
        # generator and not a list which would then just be copied into "List".
        result = (self._a * input + self.hyperparams['bias']
                  for input in inputs)

        # We convert a regular list to container list which supports metadata attribute.
        outputs: container.List[float] = container.List[float](result)

        # We clear old metadata (but which keeps history and link to inputs metadata) and set new metadata.
        # "for_value" tells that this new metadata will be associated with "outputs",
        # and "source" tells which primitive generated this metadata.
        metadata = inputs.metadata.clear({
            'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
            'structural_type': type(outputs),
            'dimension': {
                'length': len(outputs)
            }
        }, for_value=outputs, source=self).update((metadata_module.ALL_ELEMENTS,), {
            'structural_type': float,
        }, source=self)

        # Set metadata attribute.
        outputs.metadata = metadata

        # Wrap it into default "CallResult" object: we are not doing any iterations.
        return base.CallResult(outputs)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        if not self._training_inputs or not self._training_outputs:
            raise ValueError("Missing training data.")

        quotients = [
            output / input for output, input in zip(
                self._training_outputs, self._training_inputs) if input != 0
        ]
        self._a = sum(quotients) / len(quotients)
        self._fitted = True

        return base.CallResult(None)

    def get_params(self) -> Params:
        # You can pass a dict or keyword arguments.
        return Params(a=self._a)

    def set_params(self, *, params: Params) -> None:
        # Params are just a fancy dict.
        self._a = params['a']
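A hypothetical end-to-end sketch of this primitive, assuming the 'bias' hyper-parameter defaults to 0; the container.List construction below follows the current d3m API, while the example's own code uses the older generic form:

from d3m import container

primitive = MonomialPrimitive(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=container.List([1.0, 2.0, 3.0]),
                            outputs=container.List([2.0, 4.0, 6.0]))
primitive.fit()
print(primitive.get_params()['a'])  # 2.0 for this training data
print(list(primitive.produce(inputs=container.List([10.0])).value))  # [20.0] with bias 0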
Example #8
class BBNSVC(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn.ensemble.AdaBoostClassifier
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_module.PrimitiveMetadata({
        "algorithm_types": ['ADABOOST'],
        "name": "sklearn.svm.classes.SVC",
        "primitive_family": "CLASSIFICATION",
        "python_path": "d3m.primitives.bbn.time_series.BBNSVC",
        "source": {'name': 'JPL'},
        "version": "0.1.0",
        "id": "a2ee7b2b-99c6-4326-b2e7-e081cd292d78",
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.datadrivendiscovery.org/jpl/d3m_sklearn_wrap.git@{git_commit}'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None,
                 _verbose: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = SVC(
            C=self.hyperparams['C'],
            kernel=self.hyperparams['kernel'],
            degree=self.hyperparams['degree'],
            gamma=self.hyperparams['gamma'],
            coef0=self.hyperparams['coef0'],
            probability=self.hyperparams['probability'],
            shrinking=self.hyperparams['shrinking'],
            tol=self.hyperparams['tol'],
            class_weight=self.hyperparams['class_weight'],
            max_iter=self.hyperparams['max_iter'],
            decision_function_shape=self.hyperparams['decision_function_shape'],
            verbose=_verbose,
            random_state=self.random_seed,
        )
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        self._clf.fit(self._training_inputs, self._training_outputs)
        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return CallResult(self._clf.predict(inputs))

    def produce_log_proba(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return CallResult(self._clf.predict_log_proba(inputs))

    def get_params(self) -> Params:
        return Params(
            support=self._clf.support_,
            support_vectors=self._clf.support_vectors_,
            n_support=self._clf.n_support_,
            dual_coef=self._clf.dual_coef_,
            coef=self._clf.coef_,
            intercept=self._clf.intercept_,
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.support_ = params.support
        self._clf.support_vectors_ = params.support_vectors
        self._clf.n_support_ = params.n_support
        self._clf.dual_coef_ = params.dual_coef
        self._clf.intercept_ = params.intercept
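A hypothetical usage sketch; every SVC setting comes from the hyperparams, so the defaults are enough to exercise the fit/produce path:

# X_train, X_test: arrays of shape (n_samples, n_features); y_train: class labels.
svc = BBNSVC(hyperparams=Hyperparams.defaults())
svc.set_training_data(inputs=X_train, outputs=y_train)
svc.fit()
labels = svc.produce(inputs=X_test).value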
Example #9
class MultiTableFeaturization(
        FeaturizationTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):

    __author__ = 'USC ISI'
    metadata = metadata.PrimitiveMetadata({
        'id': 'dsbox-multi-table-featurization-aggregation',
        'version': 'v' + config.VERSION,
        'name': "DSBox Multiple Table Featurizer Aggregation",
        'description': 'Generate a featurized table from a multiple-table dataset using aggregation',
        'python_path': 'd3m.primitives.dsbox.MultiTableFeaturization',
        'primitive_family': metadata.PrimitiveFamily.FEATURE_EXTRACTION,
        'algorithm_types': [
            metadata.PrimitiveAlgorithmType.RELATIONAL_DATA_MINING,
        ],
        'keywords': ['multiple table'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'uris': [config.REPOSITORY]
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.

        # A metafeature about preconditions required for this primitive to operate well.
        'precondition': [],
        'hyperparams_to_tune': []
    })

    def __init__(
        self,
        *,
        hyperparams: Hyperparams,
        random_seed: int = 0,
        docker_containers: typing.Union[typing.Dict[str, str], None] = None
    ) -> None:

        # All primitives must define these attributes
        self.hyperparams = hyperparams
        self.random_seed = random_seed
        self.docker_containers = docker_containers

        # All other attributes must be private with leading underscore
        self._has_finished = False
        self._iterations_done = False
        self._verbose = hyperparams['verbose'] if hyperparams else 0

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:

        if (timeout is None):
            big_table = self._core(inputs)
            self._has_finished = True
            self._iterations_done = True
            return CallResult(big_table, self._has_finished,
                              self._iterations_done)
        else:
            # setup the timeout
            with stopit.ThreadingTimeout(timeout) as to_ctx_mrg:
                assert to_ctx_mrg.state == to_ctx_mrg.EXECUTING

                # core computations
                big_table = self._core(inputs)

            if to_ctx_mrg.state == to_ctx_mrg.EXECUTED:
                self._has_finished = True
                self._iterations_done = True
                return CallResult(big_table, self._has_finished,
                                  self._iterations_done)
            elif to_ctx_mrg.state == to_ctx_mrg.TIMED_OUT:
                self._has_finished = False
                self._iterations_done = False
                return CallResult(None, self._has_finished,
                                  self._iterations_done)

    def _core(self, inputs) -> Outputs:
        """
        core calculations
        """
        data = inputs[0]
        names = inputs[1][:-1]
        master_table_name = inputs[1][-1]
        # TODO: format names, make sure it has to be like xx.csv
        # for example, if original names is like [0,1,2], make it to be [0.csv, 1.csv, 2.csv]

        # step 1: get relation matrix
        relation_matrix = get_relation_matrix(data, names)

        # step 2: get primary key - foreign key relationships
        relations = relationMat2foreignKey(data, names, relation_matrix)
        # print (relations) # to see if the relations are correct

        # step 3: featurization
        aggregator = Aggregator(relations, data, names)
        big_table = aggregator.forward(master_table_name)

        return big_table
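_core() implies a specific input layout: inputs[0] holds the tables and inputs[1] holds the table names with the master table's name appended. A hypothetical sketch (the DataFrame variables stand in for loaded CSV tables):

inputs = [
    [orders_df, customers_df],                      # inputs[0]: the tables
    ['orders.csv', 'customers.csv', 'orders.csv'],  # inputs[1]: names + master table name
]
featurizer = MultiTableFeaturization(hyperparams=Hyperparams.defaults())
big_table = featurizer.produce(inputs=inputs).value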
Example #10
class JHUGraph(ClusteringPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):

    # TODO: Create metadata for this
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "crap",
        'name': "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name': "boss",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
    })

    _adjacency_matrix = None
    _num_vertices = None
    _num_edges = None
    _directed = None
    _weighted = None
    _dangling_nodes = None

    def read_graph(self, *, fname: str) -> None:

        dtype = self.hyperparams['dtype']

        if dtype == "gml":
            self._object = read_graph(fname, "gml")
        elif dtype.startswith("edge"):
            self._object = read_graph(fname, "edge")
        else:
            raise NotImplementedError("Reading graphs of type '{}'".format(dtype))

        self._num_vertices = ig_get_num_vertices(self._object)
        self._num_edges = ig_get_num_edges(self._object)
        self._directed = ig_is_directed(self._object)
        self._weighted = ig_is_weighted(self._object)

    def compute_statistics(self) -> None:
        self._dangling_nodes = ig_get_dangling_nodes(self._object)

    def get_adjacency_matrix(self) -> Outputs:
        return ig_get_adjacency_matrix(self._object)

    def get_dense_matrix(self) -> Outputs:
        return ig_get_dense_matrix(self._object)

    def get_num_vertices(self) -> int:
        return self._num_vertices

    def get_num_edges(self) -> int:
        return self._num_edges

    def is_directed(self) -> bool:
        return self._directed

    def is_weighted(self) -> bool:
        return self._weighted

    def get_dangling_nodes(self) -> Outputs:
        if (self._dangling_nodes is None):
            self.compute_statistics()
        return self._dangling_nodes

    def summary(self) -> None:
        ig_summary(self._object)

    def set_training_data(self, *, inputs: Inputs) -> None:  # type: ignore
        pass

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return base.CallResult(self.get_adjacency_matrix())

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return base.CallResult(None)

    def get_params(self) -> Params:
        return Params(other={})

    def set_params(self, *, params: Params) -> None:
        return None
Example #11
class duke(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': "46612a42-6120-3559-9db9-3aa9a76eb94f",
        'version': __version__,
        'name': "duke",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Dataset Descriptor'],
        'source': {
            'name': __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/duke-thin-client",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/NewKnowledge/duke-thin-client.git@{git_commit}#egg=DukeThinClient'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.distil.duke',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._decoder = JSONDecoder()
        self._params = {}

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return CallResult(None)

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self._params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        pass

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's best guess for the structural type of each input column.
        
        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs
            The outputs is a list that has length equal to number of columns in input pandas frame. 
            Each entry is a list of strings corresponding to each column's multi-label classification.
        """
        """ Accept a pandas data frame, predicts column types in it
        frame: a pandas data frame containing the data to be processed
        -> a list of lists of column labels
        """

        filename = inputs[1]

        try:
            with open(filename, 'rb') as f:
                r = requests.post(inputs[0] + "/fileUpload", files={'file': f})
            return CallResult(self._decoder.decode(r.text))
        except Exception:
            # Should probably do some more sophisticated error logging here
            return CallResult("Failed processing input file")
Example #12
class RFMPreconditionedGaussianKRR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Performs gaussian kernel regression using a random feature map to precondition the
    problem for faster convergence:
    forms the kernel 
        K_{ij} = exp(-||x_i - x_j||^2/(2sigma^2)) 
    and solves 
        alphahat = argmin ||K alpha - y||_F^2 + lambda ||alpha||_F^2 
    predictions are then formed by 
        ypred = K(trainingData, x) alphahat
    """

    __author__ = "ICSI" # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': '511c3536-2fff-369a-b81d-96755ff5b81b',
        'version': __version__,
        'name': 'RFM Preconditioned Gaussian Kernel Ridge Regression',
        'description': 'Gaussian regression using random fourier features as a preconditioner for faster solves',
        'python_path': 'd3m.primitives.realML.kernel.RFMPreconditionedGaussianKRR',
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types' : [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD
        ],
        'keywords' : ['kernel learning', 'kernel ridge regression', 'preconditioned CG', 'Gaussian', 'RBF', 'regression'],
        'source' : {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris' : [
                # URI masked in the original source.
                'http://*****:*****',
            ],
        },
        'installation': [{
            # Type assumed to be PIP; the URI is masked in the original source.
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'http://*****:*****@{git_commit}#egg=realML'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__))),
        }],
        'location_uris': [ # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/RFMPreconditionedGaussianKRR.py',
            ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:
        """
        Initializes the preconditioned gaussian kernel ridge regression primitive.
        """
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._seed = random_seed
        self._Xtrain = None
        self._ytrain = None
        self._fitted = False
        np.random.seed(random_seed)
    
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Sets the training data:
            Input: array, shape = [n_samples, n_features]
            Output: array, shape = [n_samples, n_targets]
        Only uses one input and output
        """
        self._Xtrain = inputs
        self._ytrain = outputs

        maxPCGsize = 20000 # TODO: make a control hyperparameter for when to switch to GS

        if len(self._ytrain.shape) == 1:
            self._ytrain = np.expand_dims(self._ytrain, axis=1) 

        if self._Xtrain.shape[0] > maxPCGsize:
            print("need to implement Gauss-Siedel for large datasets; currently training with a smaller subset")
            choices = np.random.choice(self._Xtrain.shape[0], size=maxPCGsize, replace=False)
            self._Xtrain = self._Xtrain[choices, :] 
            self._ytrain = self._ytrain[choices, :]

        self._n, self._d = self._Xtrain.shape
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Learns the kernel regression coefficients alpha given training pairs (X,y)
        """
        if self._fitted:
            return CallResult(None)

        if self._Xtrain is None or self._ytrain is None:
            raise ValueError("Missing training data.")

        self._U = generateGaussianPreconditioner(self._Xtrain, self.hyperparams['sigma'],
                                                 self.hyperparams['lparam'])
        def mykernel(X, Y):
            return GaussianKernel(X, Y, self.hyperparams['sigma'])
        self._coeffs = PCGfit(self._Xtrain, self._ytrain, mykernel, self._U, self.hyperparams['lparam'],
                              self.hyperparams['eps'], self.hyperparams['maxIters'])
        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Predict the value for each sample in X

        Inputs:
            X: array of shape [n_samples, n_features]
        Outputs:
            y: array of shape [n_samples, n_targets]
        """
        return CallResult(GaussianKernel(inputs, self._Xtrain, self.hyperparams['sigma']).dot(self._coeffs).flatten())

    def set_params(self, *, params: Params) -> None:
        self._Xtrain = params['exemplars']
        self._coeffs = params['coeffs']

    def get_params(self) -> Params:
        return Params(exemplars=self._Xtrain, coeffs=self._coeffs)
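The GaussianKernel helper used by fit() and produce() is module-local and not shown. A minimal numpy sketch of the kernel described in the class docstring, for orientation only:

import numpy as np

def gaussian_kernel(X, Y, sigma):
    # K[i, j] = exp(-||x_i - y_j||^2 / (2 * sigma^2))
    sq_dists = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-sq_dists / (2.0 * sigma ** 2))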
Example #13
class AdjacencySpectralEmbedding(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "0.3.0",
        'name': "jhu.ase",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.AdjacencySpectralEmbedding',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['ase primitive'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/youngser/D3M/primitives-interfaces/jhu_primitives/ase/ase.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/youngser/D3M/primitives-interfaces.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/youngser/D3M/primitives-interfaces.git@{git_commit}#egg=jhu.ase'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            "HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"
        ],
        'primitive_family': "DATA_TRANSFORMATION"
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
#    def embed(self, *, g : JHUGraph, dim: int):
        """
        Perform Adjacency Spectral Embedding on a graph
        TODO: YP description

        **Positional Arguments:**

        g:
            - Graph in JHUGraph format

        **Optional Arguments:**

        dim:
            - The number of dimensions in which to embed the data
        """

        dim = self.hyperparams['dim']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                "ase.interface.R")
        cmd = """
        source("%s")
        fn <- function(inputs, dim) {
            ase.interface(inputs, dim)
        }
        """ % path
        print(cmd)

        result = np.array(robjects.r(cmd)(inputs, dim))

        outputs = container.ndarray(result)

        return base.CallResult(outputs)

        #return np.array(robjects.r(cmd)(g._object, dim))

    def set_training_data(self) -> None:  # type: ignore
        """
        A noop.
        """

        return

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        """
        A noop.
        """

        return

    def get_params(self) -> None:
        """
        A noop.
        """

        return None

    def set_params(self, *, params: None) -> None:
        """
        A noop.
        """

        return