Ejemplo n.º 1
0
    def create_partition(self, partition_type: str,
                         **kwargs) -> tuple[str, str]:
        """Create the train/validation partition and record it on this object.

        Args:
            partition_type: ``'random'`` to split the data table with
                ``split_index``, or ``'manual'`` to register index ids
                supplied by the caller.
            **kwargs: For ``'random'``: ``trn_ratio`` (required) and
                ``val_ratio`` (optional). For ``'manual'``: ``trn_idx_id``
                (required) and ``val_idx_id`` (optional). The full mapping
                is stored on the resulting ``Partition`` as ``param``.

        Returns:
            The training and validation index ids. In the ``'manual'`` case
            the validation id is ``None`` when the caller did not pass one.

        Raises:
            RuntimeError: If a partition has already been created.
            ValueError: If ``partition_type`` is not recognized.
            KeyError: If a required keyword argument is missing.
        """
        set_seed(self.seed)
        if self.partition is not None:
            raise RuntimeError('Partition already created.')
        table = self.catalog.load_table(self.data_id)

        # Reject unknown types up front so the branches below only deal
        # with the two supported cases.
        if partition_type not in ('random', 'manual'):
            raise ValueError('Unknown partition_type.')

        if partition_type == 'random':
            trn_ratio = kwargs['trn_ratio']
            val_ratio = kwargs.get('val_ratio')
            trn_idx, val_idx = split_index(table, trn_ratio,
                                           b_ratio=val_ratio)
            # Persist both indices; the catalog hands back their ids.
            trn_idx_id = self.catalog.save_index(self.data_id, trn_idx)
            val_idx_id = self.catalog.save_index(self.data_id, val_idx)
            names = (trn_idx_id, val_idx_id)
        else:
            trn_idx_id = kwargs['trn_idx_id']
            val_idx_id = kwargs.get('val_idx_id')
            names = self._register_partition(trn_idx_id,
                                             val_idx_id=val_idx_id)

        self.partition = Partition(names=names,
                                   type=partition_type,
                                   param=kwargs)
        return trn_idx_id, val_idx_id
Ejemplo n.º 2
0
    def create_cv_partition(self, cv_type: str,
                            **kwargs) -> list[tuple[str, str]]:
        """Create the cross-validation partition and record it on this object.

        Args:
            cv_type: Name of a selector registered in ``SELECTORS`` or
                ``'manual'``. Matching is case-insensitive for both.
            **kwargs: For a selector: passed unchanged to the selector's
                constructor. For ``'manual'``: must contain
                ``partition_idx_ids``, which is used as the result as-is.
                The full mapping is stored on the ``CVPartition`` as
                ``param``.

        Returns:
            One ``(train index id, validation index id)`` pair per fold.

        Raises:
            RuntimeError: If a CV partition has already been created.
            ValueError: If ``cv_type`` is neither a known selector nor
                ``'manual'``.
            KeyError: If ``cv_type`` is ``'manual'`` and
                ``partition_idx_ids`` is missing.
        """
        if self.cv_partition is not None:
            raise RuntimeError('CV partition already created.')
        set_seed(self.seed)
        data = self.catalog.load_table(self.data_id)

        # Normalize once so selector names and 'manual' follow the same
        # case-insensitive rule (previously only selectors were lowered).
        cv_key = cv_type.lower()
        selector_class = SELECTORS.get(cv_key)
        if selector_class is not None:
            selector: BaseSelector = selector_class(**kwargs)
            cv_names = []
            for fold in selector.split(data):
                trn_idx = pd.Index(fold[0])
                val_idx = pd.Index(fold[1])
                # Persist each fold's indices and keep only their ids.
                cv_names.append((
                    self.catalog.save_index(self.data_id, trn_idx),
                    self.catalog.save_index(self.data_id, val_idx),
                ))
        elif cv_key == 'manual':
            # NOTE(review): manual ids are used as given and are not passed
            # through _register_partition, unlike create_partition; confirm
            # this is intended.
            cv_names = kwargs['partition_idx_ids']
        else:
            raise ValueError('Unknown cv_type.')

        self.cv_partition = CVPartition(cv_names=cv_names,
                                        type=cv_type,
                                        param=kwargs)

        return cv_names
Ejemplo n.º 3
0
    def __init__(self,
                 target_type: str,
                 predictor_class: Type[BasePredictor],
                 target: str,
                 catalog: Catalog,
                 data_id: str,
                 metrics: Optional[list[Type[BaseMetric]]] = None,
                 train_param: Optional[dict] = None,
                 seed: Optional[int] = None) -> None:
        """Initialize object.

        Args:
            target_type: Stored as ``self.target_type``.
            predictor_class: Predictor class, stored as
                ``self.predictor_class``.
            target: Name of the target column in the data table.
            catalog: Catalog used to load and save tables, indices and
                models.
            data_id: Id of the data table in ``catalog``.
            metrics: Metric classes, stored as given (may be ``None``).
            train_param: Training parameters; ``None`` becomes an empty
                dict.
            seed: Seed to apply. When ``None``, ``set_seed_random()`` is
                called and the value it returns is stored instead.
        """
        if seed is None:
            # No seed given: let the helper choose one and remember it.
            seed = set_seed_random()
        else:
            set_seed(seed)
        self.seed = seed
        self.target_type = target_type
        self.target = target
        self.catalog = catalog
        self.predictor_class = predictor_class
        self.metrics = metrics
        self.train_param = {} if train_param is None else train_param

        # Result containers, empty until filled elsewhere in the class.
        self.score: dict = {}
        self.perm: dict = {}

        self.data_id = data_id
        # Ids and partitions start unset; create_partition and
        # create_cv_partition refuse to run once theirs is set.
        self.model_id: Optional[str] = None
        self.partition: Optional[Partition] = None
        self.cv_partition: Optional[CVPartition] = None

        self.cv_model_ids: Optional[list[str]] = None

        self.column_stats: Optional[pd.DataFrame] = None
Ejemplo n.º 4
0
 def _calc_perm(self, model_id: str, idx_id: str,
                **kwargs) -> dict[str, pd.DataFrame]:
     """Compute permutation importance of a stored model on stored rows.

     Args:
         model_id: Id of the model to load from the catalog.
         idx_id: Id of the stored index selecting the rows to use.
         **kwargs: Forwarded unchanged to the model's ``calc_perm``.

     Returns:
         Whatever the model's ``calc_perm`` returns for the selected rows.
     """
     set_seed(self.seed)
     model = self.catalog.load_model(model_id)
     row_positions = self.catalog.load_index(idx_id)
     table = self.catalog.load_table(self.data_id)
     # NOTE(review): the stored index is applied positionally (iloc);
     # confirm it holds positions rather than labels.
     subset = table.iloc[row_positions]
     features = subset.drop(columns=[self.target])
     labels = subset[self.target]
     # TODO: Save perm
     return model.calc_perm(features, labels, **kwargs)
Ejemplo n.º 5
0
    def _validate(self, model_id: str, val_idx_id: str) -> dict:
        """Score a stored model on the rows selected by a stored index.

        Args:
            model_id: Id of the model to load from the catalog.
            val_idx_id: Id of the stored index selecting validation rows.

        Returns:
            The score returned by the model's ``validate``; the predictions
            it also returns are discarded.
        """
        set_seed(self.seed)
        table = self.catalog.load_table(self.data_id)
        model = self.catalog.load_model(model_id)
        # NOTE(review): the stored index is applied positionally (iloc);
        # confirm it holds positions rather than labels.
        rows = table.iloc[self.catalog.load_index(val_idx_id)]
        features = rows.drop(columns=[self.target])
        labels = rows[self.target]

        # TODO: Save y_pred?
        score, _y_pred = model.validate(features, labels)
        return score
Ejemplo n.º 6
0
    def split(self,
              x: X_TYPE,
              y: Y_TYPE = None) -> Iterable[tuple[pd.Index, pd.Index]]:
        """Yield one random train/validation index pair per seed.

        For every seed in ``self.seeds`` the seed is applied with
        ``set_seed``, a training sample of ``round(rows * trn_ratio)`` rows
        is drawn, and the validation index is taken from the rows that are
        left.

        Args:
            x: Data to split; must provide ``shape``, ``sample`` and
                ``drop`` (presumably a pandas DataFrame; confirm).
            y: Unused.

        Yields:
            ``(trn_idx, val_idx)``. When ``self.val_ratio`` is falsy the
            validation index is every row not in the training sample.
            Otherwise it is a sample of ``round(rows * val_ratio)`` of the
            remaining rows, capped at the number that remain.
        """
        for seed in self.seeds:
            set_seed(seed)
            n_rows = x.shape[0]
            trn_size = round(n_rows * self.trn_ratio)
            trn_idx = x.sample(trn_size).index
            remainder = x.drop(trn_idx)
            if self.val_ratio:
                # The two sizes are rounded independently, so together they
                # can exceed the row count (3 rows at 0.5/0.5 gives 2 + 2).
                # Cap the request so sampling without replacement succeeds.
                val_size = min(round(n_rows * self.val_ratio),
                               remainder.shape[0])
                val_idx = remainder.sample(val_size).index
            else:
                val_idx = remainder.index

            yield trn_idx, val_idx
Ejemplo n.º 7
0
    def _train(self, trn_idx_id: str) -> str:
        """Train a model on the rows selected by a stored index.

        The model id is derived from the training data with ``hash_model``.
        If the catalog already holds a model under that id, nothing is
        trained; otherwise the model is trained and saved.

        Args:
            trn_idx_id: Id of the stored index selecting training rows.

        Returns:
            The id of the model for this training data.
        """
        set_seed(self.seed)
        table = self.catalog.load_table(self.data_id)
        # NOTE(review): the stored index is applied positionally (iloc);
        # confirm it holds positions rather than labels.
        rows = table.iloc[self.catalog.load_index(trn_idx_id)]
        features = rows.drop(columns=[self.target])
        labels = rows[self.target]

        model = self._get_model()
        model_id = model.hash_model(features, labels)
        try:
            model = self.catalog.load_model(model_id)
        except IndexError:
            # Presumably the catalog signals a missing model with
            # IndexError (confirm); train and store a fresh one.
            model.train(features, labels)
            saved_id = self.catalog.save_model(model)
            # The id assigned on save must match the precomputed hash.
            assert saved_id == model_id

        return model_id
Ejemplo n.º 8
0
def test_set_seed():
    """Test set_seed()."""
    seed_value = 4

    # The same seed must reproduce the same first draw.
    seed.set_seed(seed_value)
    first_draw = random.random()
    seed.set_seed(seed_value)
    repeated_draw = random.random()
    assert repeated_draw == first_draw

    # A different seed must give a different first draw.
    seed.set_seed(1)
    other_draw = random.random()
    assert other_draw != first_draw
from power_ml.ai.predictor.sklearn_predictor import SklearnPredictor
from power_ml.util.seed import set_seed

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this example
# needs an older release.
dataset = load_boston()
feature_names = dataset['feature_names']
x = pd.DataFrame(dataset['data'], columns=feature_names)
y = dataset['target']

# Fit a linear regression wrapped in the project's predictor.
predictor = SklearnPredictor(LinearRegression, 'regression')
predictor.train(x, y)

# Set up permutation importance for the trained predictor, scored with MAE.
perm = PermutationImportance(predictor, MAE, x, y, n=20)

# Setting a seed is recommended.
set_seed(3)

# Metric after shuffling a single column.
col = 'DIS'
metric = perm.shuffle_and_evaluate(col)
print(f'{col}: {metric}')

# Walk through the permutation importance one column at a time.
for col, weight, score in perm.iter_perm():
    print(f'{col:10},  {weight:.4f},  {score:.4f}')

# Compute the full permutation importance in one call.
# n_jobs=-1: Use all CPU cores.
perms = perm.calc(n_jobs=-1)
print(perms)
Ejemplo n.º 10
0
"""Example of model."""

import pandas as pd
from sklearn.datasets import load_boston

from power_ml.ai.metrics import MAE, MAPE
from power_ml.ai.model import Model
from power_ml.ai.predictor.light_gbm import LightGBM
from power_ml.util.seed import set_seed

# Setting a seed is recommended.
set_seed(82)

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this example
# needs an older release.
dataset = load_boston()
feature_names = dataset['feature_names']
x = pd.DataFrame(dataset['data'], columns=feature_names)
y = dataset['target']

# First 400 rows for training, the rest for testing.
x_trn = x[:400]
y_trn = y[:400]
x_tst = x[400:]
y_tst = y[400:]

param = {'objective': 'regression'}
predictor = LightGBM('regression', param=param)
model = Model(predictor, metrics=[MAE, MAPE])
model.train(x_trn, y_trn)

# Permutation Importance
# NOTE(review): `perms` is never read; the loop below recomputes with n=10.
# The call is kept so the random state seen by the second call is unchanged.
perms = model.calc_perm(x_trn, y_trn, n=5, n_jobs=1)
perm_by_metric = model.calc_perm(x_trn, y_trn, n=10, n_jobs=1)
for metric, perm in perm_by_metric.items():
    print(metric)
    print(perm)
Ejemplo n.º 11
0
"""Example of seed."""

import random

from power_ml.util.seed import set_seed, set_seed_random

# Apply each seed in turn and print the first two draws that follow it.
for v in (5, 5, 6):
    set_seed(v)
    print(f'Seed: {v}')
    print(random.random())
    print(random.random())

# Let the helper choose a seed, then print it and the first draw.
v = set_seed_random()
print(f'Seed: {v}')
print(random.random())