Example #1
0
    def cv_score(self, X, y, context, metric=None, cv=None):
        """Cross Validate this pipeline.

        For every fold produced by ``cv``, a fresh pipeline is built from
        the tunable template, fitted on the training split and scored on
        the held-out split. Per-fold scores are stored in ``self.cv_scores``
        and the aggregated score, std and rank on the instance.
        """
        scorer = METRICS_DICT[metric or self.metric]

        LOGGER.debug('CV Scoring pipeline %s', self)

        self.cv_scores = []

        for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
            LOGGER.debug('Scoring fold: %s', fold)

            # Build and fit a brand new pipeline for this fold.
            X_fold_train, y_fold_train = self._get_split(X, y, train_index)
            fold_pipeline = MLPipeline.from_dict(self._tunable)
            fold_pipeline.fit(X_fold_train, y_fold_train, **context)

            # Score it on the held-out portion of the fold.
            X_holdout, y_holdout = self._get_split(X, y, test_index)
            predictions = fold_pipeline.predict(X_holdout, **context)
            fold_score = scorer(predictions, y_holdout)
            self.cv_scores.append(fold_score)

            LOGGER.debug('Fold %s score: %s', fold, fold_score)

        score, std, rank = self._get_score()

        LOGGER.debug('CV score: %s +/- %s; rank: %s', score, std, rank)

        self.score = score
        self.std = std
        # Tiny random jitter avoids rank collisions between pipelines.
        self.rank = rank + random.random() * 1.e-12
Example #2
0
    def _load_mlpipeline(self, template):
        """Build an MLPipeline from a template dict or a template name."""
        if isinstance(template, dict):
            template_dict = template
        else:
            # Resolve the template name into its dict representation.
            template_dict = self._load_template(template)

        self.template = template_dict
        return MLPipeline.from_dict(template_dict)
Example #3
0
def pipeline_score(pipeline_dict, X, y, scorer, context=None,
                   n_splits=5, cv=None, random_state=0):
    """Cross validate a pipeline specification.

    Args:
        pipeline_dict (dict): MLPipeline specification to evaluate.
        X, y: Dataset features and target.
        scorer (callable): Called as ``scorer(predictions, y_test)``.
        context (dict): Extra keyword arguments passed to fit/predict.
            Defaults to an empty dict.
        n_splits (int): Number of folds when ``cv`` is not given.
        cv: Optional pre-built cross validator. When not given, one is
            built from the pipeline metadata (stratified for classification).
        random_state (int): Seed for the generated cross validator.

    Returns:
        tuple: mean and standard deviation of the per-fold scores.
    """
    context = context or dict()

    # Bug fix: the original call had a '%s' placeholder but no argument,
    # so the literal string 'CV Scoring pipeline %s' was logged.
    LOGGER.debug('CV Scoring pipeline %s', pipeline_dict)

    cv_scores = list()

    if cv is None:
        metadata = pipeline_dict.get('metadata', pipeline_dict.get('loader', dict()))
        if metadata.get('task_type') == 'classification':
            # Stratify to keep class proportions stable across folds.
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        else:
            cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        LOGGER.debug('Scoring fold: %s', fold)

        # Fit a fresh pipeline instance on this fold's training data.
        X_train, y_train = get_split(X, y, train_index)
        pipeline = MLPipeline.from_dict(pipeline_dict)
        pipeline.fit(X_train, y_train, **context)

        X_test, y_test = get_split(X, y, test_index)
        pred = pipeline.predict(X_test, **context)

        score = scorer(pred, y_test)
        cv_scores.append(score)

        LOGGER.debug('Fold %s score: %s', fold, score)

    return np.mean(cv_scores), np.std(cv_scores)
Example #4
0
    def fit(self, data_params):
        """Fit the pipeline on the given params.

        Rebuilds the pipeline from its dict specification, fits it on the
        data carried by ``data_params`` and flags the instance as fitted.
        """
        self.pipeline = MLPipeline.from_dict(self.pipeline_dict)
        self.pipeline.fit(data_params.X, data_params.y, **data_params.context)
        self.fitted = True
Example #5
0
 def preprocess(self, X, y, context):
     """Execute the preprocessing steps of the pipeline.

     Returns the transformed ``X`` when preprocessing steps exist,
     otherwise returns ``X`` unchanged.
     """
     if not self._preprocessing:
         LOGGER.info("No preprocessing steps found")
         return X

     LOGGER.info("Executing preprocessing pipeline")
     preprocessor = MLPipeline.from_dict(self._preprocessing)
     preprocessor.fit(X, y, **context)
     return preprocessor.predict(X, **context)
Example #6
0
def _load_pipeline(pipeline):
    """Coerce ``pipeline`` into an ``MLPipeline`` instance.

    Accepts an already built ``MLPipeline``, a string to load from, or a
    dict specification.

    Raises:
        ValueError: If ``pipeline`` is of any other type.
    """
    if isinstance(pipeline, MLPipeline):
        return pipeline

    if isinstance(pipeline, str):
        return MLPipeline.load(pipeline)

    if isinstance(pipeline, dict):
        return MLPipeline.from_dict(pipeline)

    # Bug fix: ValueError does not apply %-formatting to extra arguments,
    # so the original raised with an unformatted (message, pipeline) tuple.
    raise ValueError('Invalid pipeline {}'.format(pipeline))
Example #7
0
    def _load_mlpipeline(self, template):
        """Resolve ``template`` into an ``MLPipeline`` instance.

        ``template`` may already be a dict specification; otherwise it is
        treated as a name and loaded from a JSON file on disk or, failing
        that, from the database.

        Raises:
            ValueError: If the template name cannot be resolved to a dict.
        """
        if not isinstance(template, dict):
            template_name = template
            # Bug fix: reset to None so an unresolved name does not leak
            # through the ``if not template`` check below (a non-empty name
            # string is truthy, so the original never raised ValueError and
            # passed the raw string to MLPipeline.from_dict).
            template = None

            if os.path.isfile(template_name):
                with open(template_name, 'r') as template_file:
                    template = json.load(template_file)

            elif self._db:
                template = self._db.load_template(template_name)

            if not template:
                raise ValueError('Unknown template {}'.format(template_name))

            self.template = template

        return MLPipeline.from_dict(template)
Example #8
0
    def __init__(self, pipeline_dict, loader, metric, problem_doc):
        """Wrap a pipeline dict, splitting it into preprocessing and tunable parts.

        If ``pipeline_dict`` declares ``preprocessing_blocks``, the first N
        primitives become a fixed preprocessing pipeline and the remaining
        ones stay tunable; otherwise the whole pipeline is tunable.
        """
        self.pipeline_dict = pipeline_dict
        self.name = pipeline_dict['name']
        self.template = pipeline_dict.get('template')
        self.loader = loader
        self.metric = metric
        self.problem_doc = problem_doc

        # Number of leading primitives that form the fixed preprocessing part.
        preprocessing_blocks = self.pipeline_dict.get('preprocessing_blocks')
        if preprocessing_blocks:
            # Shallow copy is enough here: the 'primitives' key is re-assigned
            # with a slice below rather than mutated in place.
            preprocessing = pipeline_dict.copy()
            preprocessing_primitives = preprocessing[
                'primitives'][:preprocessing_blocks]
            preprocessing['primitives'] = preprocessing_primitives
            self._preprocessing = preprocessing

            # The remaining primitives form the tunable part of the pipeline.
            tunable = pipeline_dict.copy()
            tunable_primitives = tunable['primitives'][preprocessing_blocks:]
            tunable['primitives'] = tunable_primitives
            self._tunable = tunable

            # Split the hyperparameters between the two sub-pipelines.
            pre_params, tun_params = self._extract_hyperparameters(
                preprocessing_primitives)
            self._preprocessing['hyperparameters'] = pre_params
            self._tunable['hyperparameters'] = tun_params

        else:
            # No preprocessing declared: the full pipeline is tunable.
            self._preprocessing = None
            self._tunable = pipeline_dict

        self.id = str(uuid.uuid4())
        self.cv_scores = list()

        self.rank = None
        self.score = None
        self.dumped = False
        self.fitted = False

        self.pipeline = MLPipeline.from_dict(pipeline_dict)
Example #9
0
def score_pipeline(pipeline_metadata, n_splits=5, random_state=0, dataset=None):
    """Score a pipeline over dataset splits and return mean and std score."""
    if isinstance(pipeline_metadata, str):
        # A path was given: read the pipeline specification from disk.
        LOGGER.info('Loading pipeline %s', pipeline_metadata)
        with open(pipeline_metadata, 'r') as pipeline_file:
            pipeline_metadata = json.load(pipeline_file)

    validation = pipeline_metadata['validation']
    if dataset is None:
        dataset = validation['dataset']

    LOGGER.info('Loading dataset %s', dataset)
    dataset = load_dataset(dataset)

    # Prefer an explicitly configured metric, otherwise fall back to the
    # dataset's own scoring function.
    metric = validation.get('metric')
    if metric:
        scorer = get_scorer(metric, validation.get('metric_args', dict()))
    else:
        scorer = dataset.score
        metric = dataset.metric

    splits = dataset.get_splits(n_splits, random_state)
    if n_splits == 1:
        # A single split comes back as a bare tuple; normalize to a list.
        splits = [splits]

    scores = list()
    for split, (X_train, X_test, y_train, y_test) in enumerate(splits):
        LOGGER.info('Scoring split %s', split + 1)
        context = get_context(dataset, validation.get('context', dict()))

        # Build and fit a fresh pipeline for every split.
        pipeline = MLPipeline.from_dict(pipeline_metadata)
        pipeline.fit(X_train, y_train, **context)
        predictions = pipeline.predict(X_test, **context)

        score = scorer(y_test, predictions)
        LOGGER.info('Split %s %s: %s', split + 1, metric, score)
        scores.append(score)

    return np.mean(scores), np.std(scores)
Example #10
0
 def load_pipeline(self, pipeline):
     """Build an MLPipeline from the stored pipeline document."""
     LOGGER.info("Loading pipeline %s", pipeline.name)
     mlpipeline_dict = pipeline.mlpipeline
     return MLPipeline.from_dict(mlpipeline_dict)
Example #11
0
 def _clone_pipeline(pipeline):
     """Return an independent copy of ``pipeline`` via its dict form."""
     pipeline_dict = pipeline.to_dict()
     return MLPipeline.from_dict(pipeline_dict)
Example #12
0
 def load_template(self, template_name):
     """Get the dict representation of the template."""
     template = self._load_template(template_name)

     # Round-trip through MLPipeline to normalize the specification.
     mlpipeline = MLPipeline.from_dict(template)
     mlpipeline_dict = mlpipeline.to_dict()

     # Carry the template metadata over to the serialized pipeline.
     mlpipeline_dict['metadata'] = template['metadata']
     return mlpipeline_dict
Example #13
0
 def get_tunable_hyperparameters(self, template_name):
     """Get the tunable hyperparameters of the given template.

     ``load_template`` already returns the serialized pipeline dict, so the
     hyperparameters can be read from it directly.
     """
     # Bug fix: the two statements that followed the original return
     # (building an MLPipeline and returning its tunable hyperparameters)
     # were unreachable dead code and have been removed.
     template = self.load_template(template_name)
     return template['tunable_hyperparameters']