Esempio n. 1
0
class TensorMachinesBinaryClassification(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Learns a polynomial function using logistic regression for binary classification by modeling the polynomial's coefficients as low-rank tensors.
    Meant as a faster, more scalable alternative to polynomial random feature map approaches like CRAFTMaps.
    """

    __author__ = "ICSI" # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': 'ecc83605-d340-490d-9a2d-81c2ea6cb6cb', #uuid3(NAMESPACE_DNS, "realML.kernel.TensorMachineBinaryClassification" + __version__),
        'version': __version__,
        'name': 'Tensor Machine Binary Classifier',
        'description': 'Fit a polynomial function for logistic regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path': 'd3m.primitives.realML.kernel.TensorMachinesBinaryClassification',
        'primitive_family': metadata_module.PrimitiveFamily.CLASSIFICATION,
        'algorithm_types' : [
            metadata_module.PrimitiveAlgorithmType.LOGISTIC_REGRESSION,
        ],
        'keywords' : ['kernel learning', 'binary classification', 'adaptive features', 'polynomial model', 'classification'],
        'source' : {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris' : [
                "http://*****:*****@{git_commit}#egg=realML'.format(git_commit=utils.current_git_commit(os.path.dirname(__file__)))
            }
        ],
        'location_uris': [ # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesBinaryClassification.py',
            ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs

        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs, self._norms) = tm_preprocess(self._training_inputs)

        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        if len(self._training_outputs.shape) == 1:
            self._training_outputs = np.expand_dims(self._training_outputs, axis=1)
        (self._weights, _) = tm_fit(self._training_inputs, self._training_outputs, 'bc', self.hyperparams['r'],
           self.hyperparams['q'], self.hyperparams['gamma'], self.hyperparams['solver'],
           self.hyperparams['epochs'], self.hyperparams['alpha'], seed=self._seed)

        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)

        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'],
                                     self.hyperparams['r'], 'bc')
        return CallResult(sign(pred_test.flatten()).astype(int))

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']
Esempio n. 2
0
import os
from d3m_metadata import utils

D3M_API_VERSION = '2018.1.26'
VERSION = "0.1.0"
TAG_NAME = "{git_commit}".format(git_commit=utils.current_git_commit(
    os.path.dirname(__file__)), )

REPOSITORY = "https://github.com/rooshenas/dsbox-spen"
PACAKGE_NAME = "dsbox-spen"

D3M_PERFORMER_TEAM = 'UMASS'

if TAG_NAME:
    PACKAGE_URI = "git+" + REPOSITORY + "@" + TAG_NAME
else:
    PACKAGE_URI = "git+" + REPOSITORY

PACKAGE_URI = PACKAGE_URI + "#egg=" + PACAKGE_NAME

INSTALLATION_TYPE = 'GIT'
if INSTALLATION_TYPE == 'PYPI':
    INSTALLATION = {"type": "PIP", "package": PACAKGE_NAME, "version": VERSION}
else:
    # INSTALLATION_TYPE == 'GIT'
    INSTALLATION = {
        "type": "PIP",
        "package_uri": PACKAGE_URI,
    }
Esempio n. 3
0
class MonomialPrimitive(
        supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                           Params,
                                                           Hyperparams]):
    # It is important to provide a docstring because this docstring is used as a description of
    # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive.
    """
    A primitive which fits output = a * input.
    """

    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        '4a0336ae-63b9-4a42-860e-86c5b64afbdd',
        'version':
        "crap",
        'name':
        "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name':
            "boss",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type':
            metadata_module.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family':
        metadata_module.PrimitiveFamily.REGRESSION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._a: float = None
        self._training_inputs: Inputs = None
        self._training_outputs: Outputs = None
        self._fitted: bool = False

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:
        if self._a is None:
            raise ValueError("Calling produce before fitting.")

        # We compute the result. We use (...) here and not [...] to create a
        # generator and not a list which would then just be copied into "List".
        result = (self._a * input + self.hyperparams['bias']
                  for input in inputs)

        # We convert a regular list to container list which supports metadata attribute.
        outputs: container.List[float] = container.List[float](result)

        # We clear old metadata (but which keeps history and link to inputs metadata) and set new metadata.
        # "for_value" tells that this new metadata will be associated with "outputs",
        # and "source" tells which primitive generated this metadata.
        metadata = inputs.metadata.clear(
            {
                'schema': metadata_module.CONTAINER_SCHEMA_VERSION,
                'structural_type': type(outputs),
                'dimension': {
                    'length': len(outputs)
                }
            },
            for_value=outputs,
            source=self).update((metadata_module.ALL_ELEMENTS, ), {
                'structural_type': float,
            },
                                source=self)

        # Set metadata attribute.
        outputs.metadata = metadata

        # Wrap it into default "CallResult" object: we are not doing any iterations.
        return base.CallResult(outputs)

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[None]:
        if self._fitted:
            return base.CallResult(None)

        if not self._training_inputs or not self._training_inputs:
            raise ValueError("Missing training data.")

        quotients = [
            output / input for output, input in zip(
                self._training_outputs, self._training_inputs) if input != 0
        ]
        self._a = sum(quotients) / len(quotients)
        self._fitted = True

        return base.CallResult(None)

    def get_params(self) -> Params:
        # You can pass a dict or keyword arguments.
        return Params(a=self._a)

    def set_params(self, *, params: Params) -> None:
        # Params are just a fancy dict.
        self._a = params['a']
class TensorMachinesRegularizedLeastSquares(
        SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Fits an l2-regularized least squares polynomial regression model by modeling the coefficients of the polynomial with a low-rank tensor. Intended
    as a scalable alternative to polynomial random feature maps like CRAFTMaps.
    """

    __author__ = "ICSI"  # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id':
        '2d8155bb-3ca8-39de-8964-adb21225868e',
        'version':
        __version__,
        'name':
        'Tensor Machine Regularized Least Squares',
        'description':
        'Fit a polynomial function for l2-regularized regression by modeling the polynomial coefficients as collection of low-rank tensors',
        'python_path':
        'd3m.primitives.realML.kernel.TensorMachinesRegularizedLeastSquares',
        'primitive_family':
        metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD,
            metadata_module.PrimitiveAlgorithmType.POLYNOMIAL_NEURAL_NETWORK
        ],
        'keywords': [
            'kernel learning', 'polynomial regression', 'adaptive features',
            'polynomial model', 'regression'
        ],
        'source': {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'citation': 'https://arxiv.org/abs/1504.01697',
            'uris': [
                "http://*****:*****@{git_commit}#egg=realML'
            .format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)))
        }],
        'location_uris': [  # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/TensorMachinesRegularizedLeastSquares.py',
        ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None) -> None:

        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._seed = random_seed
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False
        self._weights = None
        self._norms = None

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs

        if self.hyperparams['preprocess'] == 'YES':
            (self._training_inputs,
             self._norms) = tm_preprocess(self._training_inputs)

        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        (self._weights, _) = tm_fit(self._training_inputs,
                                    self._training_outputs,
                                    'regression',
                                    self.hyperparams['r'],
                                    self.hyperparams['q'],
                                    self.hyperparams['gamma'],
                                    self.hyperparams['solver'],
                                    self.hyperparams['epochs'],
                                    self.hyperparams['alpha'],
                                    seed=self._seed)

        self._fitted = True

        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if self.hyperparams['preprocess'] == 'YES':
            inputs = tm_preprocess(inputs, colnorms=self._norms)

        pred_test = tm_predict(self._weights, inputs, self.hyperparams['q'],
                               self.hyperparams['r'], 'regression')
        return CallResult(pred_test.flatten())

    def get_params(self) -> Params:
        return Params(weights=self._weights, norms=self._norms)

    def set_params(self, *, params: Params) -> None:
        self._weights = params['weights']
        self._norms = params['norms']
Esempio n. 5
0
class BBNSVC(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn.ensemble.AdaBoostClassifier
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_module.PrimitiveMetadata({ 
         "algorithm_types": ['ADABOOST'],
         "name": "sklearn.svm.classes.SVC",
         "primitive_family": "CLASSIFICATION",
         "python_path": "d3m.primitives.bbn.time_series.BBNSVC",
         "source": {'name': 'JPL'},
         "version": "0.1.0",
         "id": "a2ee7b2b-99c6-4326-b2e7-e081cd292d78",
         'installation': [{'type': metadata_module.PrimitiveInstallationType.PIP,
                           'package_uri': 'git+https://gitlab.datadrivendiscovery.org/jpl/d3m_sklearn_wrap.git@{git_commit}'.format(
                               git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                            ),
                         }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, str] = None,
                 _verbose: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = SVC(
            C=self.hyperparams['C'],
            kernel=self.hyperparams['kernel'],
            degree=self.hyperparams['degree'],
            gamma=self.hyperparams['gamma'],
            coef0=self.hyperparams['coef0'],
            probability=self.hyperparams['probability'],
            shrinking=self.hyperparams['shrinking'],
            tol=self.hyperparams['tol'],
            class_weight=self.hyperparams['class_weight'],
            max_iter=self.hyperparams['max_iter'],
            decision_function_shape=self.hyperparams['decision_function_shape'],
            verbose=_verbose,
            random_state=self.random_seed,
        )
        self._training_inputs = None
        self._training_outputs = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._training_inputs = inputs
        self._training_outputs = outputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        if self._training_inputs is None or self._training_outputs is None:
            raise ValueError("Missing training data.")

        self._clf.fit(self._training_inputs, self._training_outputs)
        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return CallResult(self._clf.predict(inputs))

    def produce_log_proba(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return CallResult(self._clf.predict_log_proba(inputs))

    def get_params(self) -> Params:
        return Params(
            support=self._clf.support_,
            support_vectors=self._clf.support_vectors_,
            n_support=self._clf.n_support_,
            dual_coef=self._clf.dual_coef_,
            coef=self._clf.coef_,
            intercept=self._clf.intercept_,
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.support_ = params.support
        self._clf.support_vectors_ = params.support_vectors
        self._clf.n_support_ = params.n_support
        self._clf.dual_coef_ = params.dual_coef
        self._clf.intercept_ = params.intercept
Esempio n. 6
0
        "class_attributes": {
            "metadata": "d3m_metadata.metadata.PrimitiveMetadata"
        },
        "instance_attributes": {
            "hyperparams": "d3m_metadata.hyperparams.Hyperparams",
            "random_seed": "int",
            "docker_containers": "typing.Dict[str, str]"
        }
    },
    "structural_type": "test_primitives.increment.IncrementPrimitive",
    "description": "A primitive which increments each value by a fixed amount, by default 1."
}
""".replace('__INTERFACES_VERSION__',
            primitive_interfaces.__version__).replace(
                '__GIT_COMMIT__',
                utils.current_git_commit(TEST_PRIMITIVES_DIR))


class TestIncrementPrimitive(unittest.TestCase):
    # It is not necessary to call "can_accept" before calling a method
    # during runtime, but we are doing it here for testing purposes.
    def call_primitive(self, primitive, method_name, **kwargs):
        primitive_arguments = primitive.metadata.query(
        )['primitive_code']['arguments']

        arguments = {}

        for argument_name, argument_value in kwargs.items():
            if primitive_arguments[argument_name][
                    'kind'] == metadata.PrimitiveArgumentKind.PIPELINE:
                arguments[argument_name] = argument_value.metadata
Esempio n. 7
0
class JHUGraph(ClusteringPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):

    # TODO: Create metadata for this
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "crap",
        'name': "Monomial Regressor",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['test primitive'],
        'source': {
            'name': "boss",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py',
                'https://gitlab.com/datadrivendiscovery/tests-data.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirecto\
ry=primitives'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        'location_uris': [
            'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
            ),
        ],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.test.MonomialPrimitive',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION,
        ],
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
    })

    _adjacency_matrix = None
    _num_vertices = None
    _num_edges = None
    _directed = None
    _weighted = None
    _dangling_nodes = None

    def read_graph(self, *, fname: str) -> None:

        dtype = self.hyperparams['dtype']

        if dtype == "gml":
            self._object = read_graph(fname, "gml")
        elif dtype.startswith("edge"):
            self._object = read_graph(fname, "edge")
        else:
            raise NotImplementedError("Reading graphs of type '{}'".\
                    format(dtype))

        self._num_vertices = ig_get_num_vertices(self._object)
        self._num_edges = ig_get_num_edges(self._object)
        self._directed = ig_is_directed(self._object)
        self._weighted = ig_is_weighted(self._object)

    def compute_statistics(self) -> Outputs:
        self._dangling_nodes = ig_get_dangling_nodes(self._object)

    def get_adjacency_matrix(self) -> Outputs:
        return ig_get_adjacency_matrix(self._object)

    def get_dense_matrix(self) -> Outputs:
        return ig_get_dense_matrix(self._object)

    def get_num_vertices(self) -> int:
        return self._num_vertices

    def get_num_edges(self) -> int:
        return self._num_edges

    def is_directed(self) -> bool:
        return self._directed

    def is_weighted(self) -> bool:
        return self._weighted

    def get_dangling_nodes(self) -> Outputs:
        if (self._dangling_nodes is None):
            self.compute_statistics()
        return self._dangling_nodes

    def summary(self) -> None:
        ig_summary(self._object)

    def set_training_data(self, *, inputs: Inputs) -> None:  # type: ignore
        pass

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        return base.CallResult(self.get_adjacency_matrix())

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        return base.CallResult(None)

    def get_params(self) -> Params:
        return Params(other={})

    def set_params(self, *, params: Params) -> None:
        return None
Esempio n. 8
0
                "description": "A noop."
            }
        },
        "class_attributes": {
            "metadata": "d3m_metadata.metadata.PrimitiveMetadata"
        },
        "instance_attributes": {
            "hyperparams": "d3m_metadata.hyperparams.Hyperparams",
            "random_seed": "int",
            "docker_containers": "typing.Dict[str, str]"
        }
    },
    "structural_type": "test_primitives.sum.SumPrimitive",
    "description": "A primitive which sums all the values on input into one number."
}
""".replace('__INTERFACES_VERSION__', primitive_interfaces.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR))


class TestSumPrimitive(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.docker_client = docker.from_env()

        cls.docker_containers = {}

        # Start all containers (this pulls images if they do not yet exist).
        installation = SumPrimitive.metadata.query().get('installation', [])
        for entry in installation:
            if entry['type'] != metadata.PrimitiveInstallationType.DOCKER:
                continue
Esempio n. 9
0
class duke(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id':
        "46612a42-6120-3559-9db9-3aa9a76eb94f",
        'version':
        __version__,
        'name':
        "duke",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['Dataset Descriptor'],
        'source': {
            'name':
            __author__,
            'uris': [
                # Unstructured URIs.
                "https://github.com/NewKnowledge/duke-thin-client",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type':
            metadata_module.PrimitiveInstallationType.PIP,
            'package_uri':
            'git+https://github.com/NewKnowledge/duke-thin-client.git@{git_commit}#egg=DukeThinClient'
            .format(git_commit=utils.current_git_commit(
                os.path.dirname(__file__)), ),
        }],
        # The same path the primitive is registered with entry points in setup.py.
        'python_path':
        'd3m.primitives.distil.duke',
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            metadata_module.PrimitiveAlgorithmType.RECURRENT_NEURAL_NETWORK,
        ],
        'primitive_family':
        metadata_module.PrimitiveFamily.DATA_CLEANING,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: typing.Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._decoder = JSONDecoder()
        self._params = {}

    def fit(self) -> None:
        pass

    def get_params(self) -> Params:
        return self._params

    def set_params(self, *, params: Params) -> None:
        self.params = params

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        pass

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """
        Produce primitive's best guess for the structural type of each input column.
        
        Parameters
        ----------
        inputs : Input pandas frame

        Returns
        -------
        Outputs
            The outputs is a list that has length equal to number of columns in input pandas frame. 
            Each entry is a list of strings corresponding to each column's multi-label classification.
        """
        """ Accept a pandas data frame, predicts column types in it
        frame: a pandas data frame containing the data to be processed
        -> a list of lists of column labels
        """

        filename = inputs[1]
        files = {'file': open(filename, 'rb')}

        try:
            r = requests.post(inputs[0] + "/fileUpload", files=files)
            return self._decoder.decode(r.text)
        except:
            # Should probably do some more sophisticated error logging here
            return "Failed processing input file"
class RFMPreconditionedGaussianKRR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Performs gaussian kernel regression using a random feature map to precondition the
    problem for faster convergence:
    forms the kernel 
        K_{ij} = exp(-||x_i - x_j||^2/(2sigma^2)) 
    and solves 
        alphahat = argmin ||K alpha - y||_F^2 + lambda ||alpha||_F^2 
    predictions are then formed by 
        ypred = K(trainingData, x) alphahat
    """

    __author__ = "ICSI" # a la directions on https://gitlab.datadrivendiscovery.org/jpl/primitives_repo
    metadata = metadata_module.PrimitiveMetadata({
        'id': '511c3536-2fff-369a-b81d-96755ff5b81b',
        'version': __version__,
        'name': 'RFM Preconditioned Gaussian Kernel Ridge Regression',
        'description': 'Gaussian regression using random fourier features as a preconditioner for faster solves',
        'python_path': 'd3m.primitives.realML.kernel.RFMPreconditionedGaussianKRR',
        'primitive_family': metadata_module.PrimitiveFamily.REGRESSION,
        'algorithm_types' : [
            metadata_module.PrimitiveAlgorithmType.KERNEL_METHOD
        ],
        'keywords' : ['kernel learning', 'kernel ridge regression', 'preconditioned CG', 'Gaussian', 'RBF', 'regression'],
        'source' : {
            'name': __author__,
            'contact': 'mailto:[email protected]',
            'uris' : [
                "http://*****:*****@{git_commit}#egg=realML'.format(git_commit=utils.current_git_commit(os.path.dirname(__file__)))
            }
        ],
        'location_uris': [ # NEED TO REF SPECIFIC COMMIT
            'https://github.com/alexgittens/realML/blob/master/realML/kernel/RFMPreconditionedGaussianKRR.py',
            ],
        'preconditions': [
            metadata_module.PrimitivePrecondition.NO_MISSING_VALUES,
            metadata_module.PrimitivePrecondition.NO_CATEGORICAL_VALUES
        ],
    })

    def __init__(self, *, 
                 hyperparams : Hyperparams,
                 random_seed: int = 0,
                 docker_containers : Dict[str, str] = None) -> None:
        """
        Initializes the preconditioned gaussian kernel ridge regression primitive.
        """
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._seed = random_seed
        self._Xtrain = None
        self._ytrain = None
        self._fitted = False
        np.random.seed(random_seed)
    
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """
        Sets the training data:
            Input: array, shape = [n_samples, n_features]
            Output: array, shape = [n_samples, n_targets]
        Only uses one input and output
        """
        self._Xtrain = inputs
        self._ytrain = outputs

        maxPCGsize = 20000 # TODO: make a control hyperparameter for when to switch to GS

        if len(self._ytrain.shape) == 1:
            self._ytrain = np.expand_dims(self._ytrain, axis=1) 

        if self._Xtrain.shape[0] > maxPCGsize:
            print("need to implement Gauss-Siedel for large datasets; currently training with a smaller subset")
            choices = np.random.choice(self._Xtrain.shape[0], size=maxPCGsize, replace=False)
            self._Xtrain = self._Xtrain[choices, :] 
            self._ytrain = self._ytrain[choices, :]

        self._n, self._d = self._Xtrain.shape
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Learns the kernel regression coefficients alpha given training pairs (X,y)
        """
        if self._fitted:
            return CallResult(None)

        if self._Xtrain is None or self._ytrain is None:
            raise ValueError("Missing training data.")

        self._U = generateGaussianPreconditioner(self._Xtrain, self.hyperparams['sigma'],
                                                 self.hyperparams['lparam'])
        def mykernel(X, Y):
            return GaussianKernel(X, Y, self.hyperparams['sigma'])
        self._coeffs = PCGfit(self._Xtrain, self._ytrain, mykernel, self._U, self.hyperparams['lparam'],
                              self.hyperparams['eps'], self.hyperparams['maxIters'])
        self._fitted = True

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        """
        Predict the value for each sample in X

        Inputs:
            X: array of shape [n_samples, n_features]
        Outputs:
            y: array of shape [n_samples, n_targets]
        """
        return CallResult(GaussianKernel(inputs, self._Xtrain, self.hyperparams['sigma']).dot(self._coeffs).flatten())

    def set_params(self, *, params: Params) -> None:
        self._Xtrain = params['exemplars']
        self._coeffs = params['coeffs']

    def get_params(self) -> Params:
        return Params(exemplars=self._Xtrain, coeffs=self._coeffs)
Esempio n. 11
0
File: ase.py Progetto: youngser/D3M3
class AdjacencySpectralEmbedding(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
    # This should contain only metadata which cannot be automatically determined from the code.
    metadata = metadata_module.PrimitiveMetadata({
        # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        'id': 'b940ccbd-9e9b-3166-af50-210bfd79251b',
        'version': "0.3.0",
        'name': "jhu.ase",
        # The same path the primitive is registered with entry points in setup.py.
        'python_path': 'd3m.primitives.jhu_primitives.AdjacencySpectralEmbedding',
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        'keywords': ['ase primitive'],
        'source': {
            'name': "JHU",
            'uris': [
                # Unstructured URIs. Link to file and link to repo in this case.
                'https://github.com/youngser/D3M/primitives-interfaces/jhu_primitives/ase/ase.py',
#                'https://github.com/youngser/primitives-interfaces/blob/jp-devM1/jhu_primitives/ase/ase.py',
                'https://github.com/youngser/D3M/primitives-interfaces.git',
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        'installation': [{
            'type': metadata_module.PrimitiveInstallationType.PIP,
            'package_uri': 'git+https://github.com/youngser/D3M/primitives-interfaces.git@{git_commit}#egg=jhu.ase'.format(
                git_commit=utils.current_git_commit(os.path.dirname(__file__)),
                ),
        }],
        # URIs at which one can obtain code for the primitive, if available.
        # 'location_uris': [
        #     'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format(
        #         git_commit=utils.current_git_commit(os.path.dirname(__file__)),
        #     ),
        # ],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        'algorithm_types': [
            "HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"
        ],
        'primitive_family': "DATA_TRANSFORMATION"
    })

    def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, str] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
#    def embed(self, *, g : JHUGraph, dim: int):
        """
        Perform Adjacency Spectral Embedding on a graph
        TODO: YP description

        **Positional Arguments:**

        g:
            - Graph in JHUGraph format

        **Optional Arguments:**

        dim:
            - The number of dimensions in which to embed the data
        """

        dim = self.hyperparams['dim']

        path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                "ase.interface.R")
        cmd = """
        source("%s")
        fn <- function(inputs, dim) {
            ase.interface(inputs, dim)
        }
        """ % path
        print(cmd)

        result = np.array(robjects.r(cmd)(inputs, dim))

        outputs = container.ndarray(result)

        return base.CallResult(outputs)

        #return np.array(robjects.r(cmd)(g._object, dim))

    def set_training_data(self) -> None:  # type: ignore
        """
        A noop.
        """

        return

    def fit(self, *, timeout: float = None, iterations: int = None) -> None:
        """
        A noop.
        """

        return

    def get_params(self) -> None:
        """
        A noop.
        """

        return None

    def set_params(self, *, params: None) -> None:
        """
        A noop.
        """

        return