Example #1
    def fit(self, data, args):
        self.model = Normalizer(norm="l2")

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Example #2
def make_models(X, y, y_bin):
    return dict(ols=LinearRegression().fit(X, y),
                lr_bin=LogisticRegression().fit(X, y_bin),
                lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
                lr_mn=LogisticRegression(solver='lbfgs',
                                         multi_class='multinomial').fit(X, y),
                svc=SVC(kernel='linear').fit(X, y_bin),
                svr=SVR(kernel='linear').fit(X, y),
                dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
                dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
                rfc=RandomForestClassifier(n_estimators=3,
                                           max_depth=3,
                                           random_state=1).fit(X, y),
                rfr=RandomForestRegressor(n_estimators=3,
                                          max_depth=3,
                                          random_state=1).fit(X, y),
                gbc=GradientBoostingClassifier(n_estimators=3,
                                               max_depth=3,
                                               random_state=1).fit(X, y),
                gbr=GradientBoostingRegressor(n_estimators=3,
                                              max_depth=3,
                                              random_state=1).fit(X, y),
                abc=AdaBoostClassifier(algorithm='SAMME',
                                       n_estimators=3,
                                       random_state=1).fit(X, y),
                abc2=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y),
                abc3=AdaBoostClassifier(algorithm='SAMME',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                abc4=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                km=KMeans(1).fit(X),
                km2=KMeans(5).fit(X),
                pc1=PCA(1).fit(X),
                pc2=PCA(2).fit(X),
                pc3=PCA(2, whiten=True).fit(X),
                mlr1=MLPRegressor([2], 'relu').fit(X, y),
                mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
                mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
                mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
                mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
                bin=Binarizer(0.5),
                mms=MinMaxScaler().fit(X),
                mas=MaxAbsScaler().fit(X),
                ss1=StandardScaler().fit(X),
                ss2=StandardScaler(with_mean=False).fit(X),
                ss3=StandardScaler(with_std=False).fit(X),
                n1=Normalizer('l1'),
                n2=Normalizer('l2'),
                n3=Normalizer('max'))
Example #3
class NormalizerImpl():
    def __init__(self, norm='l2', copy=True):
        self._hyperparams = {'norm': norm, 'copy': copy}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #4
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in ((StandardScaler(), Normalizer(), Binarizer())):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
Example #5
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #6
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sparse.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sparse.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is not X)
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is X)
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
        X = init(X_dense)
        X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)

        assert_true(X_norm is not X)
        assert_true(isinstance(X_norm, sparse.csr_matrix))

        X_norm = toarray(X_norm)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
Example #7
class CreateNormalizer(CreateModel):
    def fit(self, data, args):
        self.model = Normalizer(norm="l2")

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Example #8
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        
        # Wrapped scikit-learn Normalizer, configured from the 'norm' hyperparameter
        self._clf = Normalizer(
              norm=self.hyperparams['norm'],
        )
        
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
Example #9
class DFNormalizer(TransformerMixin):
    # Row wise transformer? - Can be removed if so
    def __init__(self, norm='l2', copy=True):
        self.norm = norm
        self.copy = copy
        self.ss_ = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        self.ss_ = Normalizer()
        Xss = self.ss_.transform(X)
        Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
        return Xscaled
Example #10
def test_normalizer_l1():
    rng = np.random.RandomState(0)
    X_dense = rng.randn(4, 5)
    X_sparse_unpruned = sparse.csr_matrix(X_dense)

    # set the row number 3 to zero
    X_dense[3, :] = 0.0

    # set the row number 3 to zero without pruning (can happen in real life)
    indptr_3 = X_sparse_unpruned.indptr[3]
    indptr_4 = X_sparse_unpruned.indptr[4]
    X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0

    # build the pruned variant using the regular constructor
    X_sparse_pruned = sparse.csr_matrix(X_dense)

    # check inputs that support the no-copy optim
    for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):

        normalizer = Normalizer(norm='l1', copy=True)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is not X)
        X_norm1 = toarray(X_norm)

        normalizer = Normalizer(norm='l1', copy=False)
        X_norm = normalizer.transform(X)
        assert_true(X_norm is X)
        X_norm2 = toarray(X_norm)

        for X_norm in (X_norm1, X_norm2):
            row_sums = np.abs(X_norm).sum(axis=1)
            for i in range(3):
                assert_almost_equal(row_sums[i], 1.0)
            assert_almost_equal(row_sums[3], 0.0)

    # check input for which copy=False won't prevent a copy
    for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
        X = init(X_dense)
        X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)

        assert_true(X_norm is not X)
        assert_true(isinstance(X_norm, sparse.csr_matrix))

        X_norm = toarray(X_norm)
        for i in range(3):
            assert_almost_equal(row_sums[i], 1.0)
        assert_almost_equal(la.norm(X_norm[3]), 0.0)
Example #11
 def __init__(self, norm='l2', copy=True):
     self._hyperparams = {'norm': norm, 'copy': copy}
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #12
			'MaxAbsScaler':MaxAbsScaler(),
			'MeanShift':MeanShift(),
			'MinCovDet':MinCovDet(),
			'MinMaxScaler':MinMaxScaler(),
			'MiniBatchDictionaryLearning':MiniBatchDictionaryLearning(),
			'MiniBatchKMeans':MiniBatchKMeans(),
			'MiniBatchSparsePCA':MiniBatchSparsePCA(),
			'MultiTaskElasticNet':MultiTaskElasticNet(),
			'MultiTaskElasticNetCV':MultiTaskElasticNetCV(),
			'MultiTaskLasso':MultiTaskLasso(),
			'MultiTaskLassoCV':MultiTaskLassoCV(),
			'MultinomialNB':MultinomialNB(),
			'NMF':NMF(),
			'NearestCentroid':NearestCentroid(),
			'NearestNeighbors':NearestNeighbors(),
			'Normalizer':Normalizer(),
			'NuSVC':NuSVC(),
			'NuSVR':NuSVR(),
			'Nystroem':Nystroem(),
			'OAS':OAS(),
			'OneClassSVM':OneClassSVM(),
			'OrthogonalMatchingPursuit':OrthogonalMatchingPursuit(),
			'OrthogonalMatchingPursuitCV':OrthogonalMatchingPursuitCV(),
			'PCA':PCA(),
			'PLSCanonical':PLSCanonical(),
			'PLSRegression':PLSRegression(),
			'PLSSVD':PLSSVD(),
			'PassiveAggressiveClassifier':PassiveAggressiveClassifier(),
			'PassiveAggressiveRegressor':PassiveAggressiveRegressor(),
			'Perceptron':Perceptron(),
			'ProjectedGradientNMF':ProjectedGradientNMF(),
Example #13
from sklearn import datasets

from sklearn.preprocessing import Normalizer

iris = datasets.load_iris()

newX = Normalizer().fit_transform(iris.data)

print(iris.data)
print('==============')
print(newX)
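A quick aside on what the snippet above prints: Normalizer rescales each sample (row) independently to unit norm, so every non-zero row of newX has Euclidean length 1. A minimal check, assuming only numpy and scikit-learn are available (the variable names below are illustrative):

import numpy as np
from sklearn import datasets
from sklearn.preprocessing import Normalizer

iris = datasets.load_iris()
newX = Normalizer(norm='l2').fit_transform(iris.data)

# Each row was scaled independently, so all row norms are (numerically) 1.0.
row_norms = np.linalg.norm(newX, axis=1)
print(np.allclose(row_norms, 1.0))  # True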
Example #14
class MovingAverageTransform(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs,
                                                              Params,
                                                              Hyperparams]):
    """
    A primitive to generate moving average
    Genrates moving average based on the window_size passed as hyperparameter.
    Columns for which moving average is calculated is passed as hyperparameter . Default is all values column    
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.MOVING_AVERAGE_TRANSFORM,
        ],
        "name":
        "pandas.preprocessing.data.MovingAverageTransform",
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "python_path":
        "d3m.primitives.tods.timeseries_processing.transformation.moving_average_transform",
        "source": {
            'name':
            'DATA Lab at Texas A&M University',
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/MovingAverageTransform.py'
            ]
        },
        "version":
        "0.0.1",
        "id":
        "ab8c90a6-d10e-49f1-8c5a-38884defc570",
        "hyperparams_to_tune": ['window_size'],
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        # Wrapped scikit-learn Normalizer, configured from the 'norm' hyperparameter
        self._clf = Normalizer(norm=self.hyperparams['norm'], )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if self._training_inputs is None:
            return CallResult(None)

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        self.logger.info('Time Series Moving Average Primitive called')
        outputs = inputs
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)

        try:
            columns_to_calculate_moving_average: List[str]
            if (self.hyperparams['use_columns'] == ()):
                columns_to_calculate_moving_average = list(
                    set(inputs.columns) -
                    set(['d3mIndex', 'timestamp', 'ground_truth']))
            else:
                columns_to_calculate_moving_average = self.hyperparams[
                    'use_columns']
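            # Note: the rolling window below is hardcoded to 3; the 'window_size'
            # hyperparameter listed in the primitive metadata is not consulted here.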
            for column in self._training_indices:
                outputs[inputs.columns[column] +
                        "_moving_average"] = (inputs.iloc[:, column]).rolling(
                            3, min_periods=1, center=True).mean()
        except Exception as e:
            self.logger.error("Error in Calculating Moving Average", e)
        self._update_metadata(outputs)
        #  print(inputs)
        #  print("-------------")
        #  print(outputs)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs, )

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata)

        return Params(input_column_names=self._input_column_names,
                      training_indices_=self._training_indices,
                      target_names_=self._target_names,
                      target_column_indices_=self._target_column_indices,
                      target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index,
                                           hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(
        cls, inputs_metadata: metadata_base.DataMetadata,
        outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(
                target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(
                column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(
            inputs.metadata, self._training_indices, outputs.metadata,
            self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                              input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata,
                              hyperparams):
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(
                inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        #  If outputs has more columns than index, add Attribute Type to all remaining
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata
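The produce() method of this primitive boils down to a centered rolling mean with a window of 3 per selected column. A standalone sketch of that same computation on a plain pandas DataFrame (the column names here are illustrative, not part of the primitive):

import pandas as pd

df = pd.DataFrame({'timestamp': range(6),
                   'value': [1.0, 2.0, 4.0, 8.0, 16.0, 32.0]})
# Same rolling call as produce(): window of 3, centered, min_periods=1 so the
# edge rows are averaged over the points that exist instead of becoming NaN.
df['value_moving_average'] = df['value'].rolling(3, min_periods=1, center=True).mean()
print(df)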
Example #15
class SimpleExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs,
                                                                  Outputs,
                                                                  Params,
                                                                  Hyperparams]
                                 ):
    """
    Primitive wrapping for simple exponential smoothing
    `statsmodels documentation <https://www.statsmodels.org/stable/generated/statsmodels.tsa.holtwinters.SimpleExpSmoothing.html#statsmodels.tsa.holtwinters.SimpleExpSmoothing>`_
    
    """

    __author__ = "DATA Lab at Texas A&M University"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.SIMPLE_EXPONENTIAL_SMOOTHING,
        ],
        "name":
        "statsmodels.preprocessing.data.SimpleExponentialSmoothing",
        "primitive_family":
        metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
        "python_path":
        "d3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing",
        "source": {
            'name':
            'DATA Lab at Texas A&M University',
            'contact':
            'mailto:[email protected]',
            'uris': [
                'https://gitlab.com/lhenry15/tods.git',
                'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/SimpleExponentialSmoothing.py'
            ]
        },
        "version":
        "0.0.1",
        "id":
        "3e92984e-b7d1-4de0-9203-3a6093ddb38e",
        "hyperparams_to_tune": ['endog', 'use_columns'],
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        # Wrapped scikit-learn Normalizer, configured from the 'norm' hyperparameter
        self._clf = Normalizer(norm=self.hyperparams['norm'], )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False

    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            self._inputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if self._training_inputs is None:
            return CallResult(None)

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warn("No input columns were selected")
        return CallResult(None)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        self.logger.info('Simple Exponential Smoothing Primitive called')
        outputs = inputs
        self._training_inputs, self._training_indices = self._get_columns_to_fit(
            inputs, self.hyperparams)
        try:
            columns_to_calculate_simple_exponential_smoothing: List[str]
            if (self.hyperparams['use_columns'] == ()):
                columns_to_calculate_simple_exponential_smoothing = list(
                    set(inputs.columns) -
                    set(['d3mIndex', 'timestamp', 'ground_truth']))
            else:
                columns_to_calculate_simple_exponential_smoothing = self.hyperparams[
                    'use_columns']
            for column in self._training_indices:
                outputs[inputs.columns[column] +
                        "_simple_exponential_smoothing"] = SimpleExpSmoothing(
                            inputs.iloc[:,
                                        column]).fit(
                                            smoothing_level=0.2,
                                            optimized=False).fittedvalues
        except Exception as e:
            self.logger.error(
                "Error in Calculating simple exponential smoothing: %s", e)
        self._update_metadata(outputs)
        #print(inputs)
        #print("-------------")
        #print(outputs)

        return base.CallResult(outputs)

    def _update_metadata(self, outputs):
        outputs.metadata = outputs.metadata.generate(outputs, )

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata)

        return Params(input_column_names=self._input_column_names,
                      training_indices_=self._training_indices,
                      target_names_=self._target_names,
                      target_column_indices_=self._target_column_indices,
                      target_columns_metadata_=self._target_columns_metadata)

    def set_params(self, *, params: Params) -> None:
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index,
                                           hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce
        # return columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                            column_index: int,
                            hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add(
            "https://metadata.datadrivendiscovery.org/types/Attribute")
        if not issubclass(column_metadata['structural_type'],
                          accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))

        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False

    @classmethod
    def _get_target_columns_metadata(
            cls, outputs_metadata: metadata_base.DataMetadata,
            hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(
                outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(
        cls, inputs_metadata: metadata_base.DataMetadata,
        outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]
    ) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(
                target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(
                column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs,
                          predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._copy_inputs_metadata(
            inputs.metadata, self._training_indices, outputs.metadata,
            self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(
            inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                              input_indices: List[int],
                              outputs_metadata: metadata_base.DataMetadata,
                              hyperparams):
        outputs_length = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
        target_columns_metadata: List[OrderedDict] = []
        for column_index in input_indices:
            column_name = inputs_metadata.query(
                (metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)

            column_metadata = OrderedDict(
                inputs_metadata.query_column(column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([])
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        #  If outputs has more columns than index, add Attribute Type to all remaining
        if outputs_length > len(input_indices):
            for column_index in range(len(input_indices), outputs_length):
                column_metadata = OrderedDict()
                semantic_types = set()
                semantic_types.add(hyperparams["return_semantic_type"])
                column_name = "output_{}".format(column_index)
                column_metadata["semantic_types"] = list(semantic_types)
                column_metadata["name"] = str(column_name)
                target_columns_metadata.append(column_metadata)

        return target_columns_metadata
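For this class, produce() fits statsmodels' SimpleExpSmoothing on each selected column with a fixed smoothing level and uses the in-sample fitted values as the smoothed series. A minimal sketch of that call on a toy series, assuming statsmodels is installed:

import pandas as pd
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

series = pd.Series([3.0, 10.0, 12.0, 13.0, 12.0, 10.0, 12.0])
# Same call pattern as produce(): fixed smoothing_level, no optimization,
# fittedvalues gives the smoothed (in-sample) series.
smoothed = SimpleExpSmoothing(series).fit(smoothing_level=0.2, optimized=False).fittedvalues
print(smoothed)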
Example #16
 def transform(self, X):
     # assumes X is a DataFrame
     self.ss_ = Normalizer()
     Xss = self.ss_.transform(X)
     Xscaled = pd.DataFrame(Xss, index=X.index, columns=X.columns)
     return Xscaled