def cv_score(self, X, y, context, metric=None, cv=None):
    """Cross Validate this pipeline.

    For each fold produced by ``cv``, a fresh pipeline is built from the
    tunable part of this pipeline, fitted on the training portion and
    scored on the test portion. The per-fold scores are accumulated in
    ``self.cv_scores`` and the aggregated score, std and rank are stored
    on the instance.
    """
    score_fn = METRICS_DICT[metric or self.metric]
    LOGGER.debug('CV Scoring pipeline %s', self)

    self.cv_scores = []
    for fold_number, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        LOGGER.debug('Scoring fold: %s', fold_number)

        # Rebuild the pipeline from scratch for every fold so no state
        # leaks across folds.
        fold_pipeline = MLPipeline.from_dict(self._tunable)

        X_fold_train, y_fold_train = self._get_split(X, y, train_idx)
        fold_pipeline.fit(X_fold_train, y_fold_train, **context)

        X_fold_test, y_fold_test = self._get_split(X, y, test_idx)
        predictions = fold_pipeline.predict(X_fold_test, **context)

        fold_score = score_fn(predictions, y_fold_test)
        self.cv_scores.append(fold_score)
        LOGGER.debug('Fold %s score: %s', fold_number, fold_score)

    score, std, rank = self._get_score()
    LOGGER.debug('CV score: %s +/- %s; rank: %s', score, std, rank)

    self.score = score
    self.std = std
    # Tiny random jitter keeps ranks unique so later sorting never collides.
    self.rank = rank + random.random() * 1.e-12
def _load_mlpipeline(self, template):
    """Build an MLPipeline from a template dict or a template reference.

    Anything that is not already a dict is resolved through
    ``self._load_template``. The resolved dict is cached on
    ``self.template`` before the pipeline is built.
    """
    if isinstance(template, dict):
        template_dict = template
    else:
        template_dict = self._load_template(template)

    self.template = template_dict
    return MLPipeline.from_dict(template_dict)
def pipeline_score(pipeline_dict, X, y, scorer, context=None, n_splits=5, cv=None, random_state=0):
    """Cross validate the pipeline described by ``pipeline_dict``.

    If no ``cv`` splitter is given, one is built from the pipeline
    metadata: a ``StratifiedKFold`` for classification tasks, a plain
    ``KFold`` otherwise. For each fold a fresh pipeline is built,
    fitted and scored with ``scorer(predictions, y_test)``.

    Returns:
        tuple: mean and standard deviation of the per-fold scores.
    """
    context = context or dict()

    # FIX: the original call passed a '%s' format string with no argument,
    # which makes the logging module emit an internal formatting error.
    LOGGER.debug('CV Scoring pipeline %s', pipeline_dict)

    # FIX: use an explicit identity check instead of truthiness — a custom
    # splitter object must not be discarded just because it is "falsy".
    if cv is None:
        metadata = pipeline_dict.get('metadata', pipeline_dict.get('loader', dict()))
        if metadata.get('task_type') == 'classification':
            # Stratify classification folds to keep class balance per fold.
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        else:
            cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    cv_scores = list()
    for fold, (train_index, test_index) in enumerate(cv.split(X, y)):
        LOGGER.debug('Scoring fold: %s', fold)

        X_train, y_train = get_split(X, y, train_index)
        pipeline = MLPipeline.from_dict(pipeline_dict)
        pipeline.fit(X_train, y_train, **context)

        X_test, y_test = get_split(X, y, test_index)
        pred = pipeline.predict(X_test, **context)

        score = scorer(pred, y_test)
        cv_scores.append(score)
        LOGGER.debug('Fold %s score: %s', fold, score)

    return np.mean(cv_scores), np.std(cv_scores)
def fit(self, data_params):
    """Fit the pipeline on the given params."""
    # Rebuild the pipeline from its dict representation before fitting.
    self.pipeline = MLPipeline.from_dict(self.pipeline_dict)
    self.pipeline.fit(data_params.X, data_params.y, **data_params.context)
    self.fitted = True
def preprocess(self, X, y, context):
    """Execute the preprocessing steps of the pipeline."""
    # Guard clause: nothing to do when no preprocessing part was split off.
    if not self._preprocessing:
        LOGGER.info("No preprocessing steps found")
        return X

    LOGGER.info("Executing preprocessing pipeline")
    preprocessor = MLPipeline.from_dict(self._preprocessing)
    preprocessor.fit(X, y, **context)
    return preprocessor.predict(X, **context)
def _load_pipeline(pipeline):
    """Coerce ``pipeline`` into an ``MLPipeline`` instance.

    Accepts an already-built ``MLPipeline`` (returned as-is), a string
    (treated as a path/name and loaded), or a dict representation.

    Raises:
        ValueError: if ``pipeline`` is none of the accepted types.
    """
    if isinstance(pipeline, MLPipeline):
        return pipeline

    if isinstance(pipeline, str):
        return MLPipeline.load(pipeline)

    if isinstance(pipeline, dict):
        return MLPipeline.from_dict(pipeline)

    # FIX: the original passed ('Invalid pipeline %s', pipeline) to
    # ValueError, which does not interpolate %-style arguments — the
    # exception carried a tuple instead of a readable message.
    raise ValueError('Invalid pipeline {}'.format(pipeline))
def _load_mlpipeline(self, template):
    """Build an MLPipeline from a template dict, file path or DB name.

    If ``template`` is not already a dict it is treated as a reference:
    first as a path to a JSON file on disk, then as a template name to
    look up in ``self._db``. The resolved dict is cached on
    ``self.template``.

    Raises:
        ValueError: if the template reference cannot be resolved.
    """
    if not isinstance(template, dict):
        template_name = template
        if os.path.isfile(template_name):
            with open(template_name, 'r') as template_file:
                template = json.load(template_file)
        else:
            # FIX: previously, when the name was not a file and there was
            # no database, the string fell through unchanged and was passed
            # to MLPipeline.from_dict, failing with a cryptic error instead
            # of the intended 'Unknown template' ValueError.
            template = self._db.load_template(template_name) if self._db else None
            if not template:
                raise ValueError('Unknown template {}'.format(template_name))

    self.template = template
    return MLPipeline.from_dict(template)
def __init__(self, pipeline_dict, loader, metric, problem_doc):
    """Wrap a pipeline dict, optionally splitting off a preprocessing part.

    If ``pipeline_dict`` declares ``preprocessing_blocks``, the primitive
    list is split at that index into a static preprocessing pipeline
    (``self._preprocessing``) and a tunable remainder (``self._tunable``);
    otherwise the whole pipeline is treated as tunable.
    """
    self.pipeline_dict = pipeline_dict
    self.name = pipeline_dict['name']
    self.template = pipeline_dict.get('template')
    self.loader = loader
    self.metric = metric
    self.problem_doc = problem_doc

    # Number of leading primitives that form the (non-tunable) preprocessing part.
    preprocessing_blocks = self.pipeline_dict.get('preprocessing_blocks')
    if preprocessing_blocks:
        # Shallow copy: the preprocessing pipeline keeps only the first
        # `preprocessing_blocks` primitives.
        preprocessing = pipeline_dict.copy()
        preprocessing_primitives = preprocessing[
            'primitives'][:preprocessing_blocks]
        preprocessing['primitives'] = preprocessing_primitives
        self._preprocessing = preprocessing

        # The tunable pipeline keeps the remaining primitives.
        tunable = pipeline_dict.copy()
        tunable_primitives = tunable['primitives'][preprocessing_blocks:]
        tunable['primitives'] = tunable_primitives
        self._tunable = tunable

        # Split the hyperparameters between the two partial pipelines.
        # NOTE(review): _extract_hyperparameters is defined elsewhere —
        # presumably it partitions by the preprocessing primitive names;
        # confirm against its implementation.
        pre_params, tun_params = self._extract_hyperparameters(
            preprocessing_primitives)
        self._preprocessing['hyperparameters'] = pre_params
        self._tunable['hyperparameters'] = tun_params
    else:
        # No split requested: everything is tunable, nothing is preprocessing.
        self._preprocessing = None
        self._tunable = pipeline_dict

    self.id = str(uuid.uuid4())
    self.cv_scores = list()
    self.rank = None
    self.score = None
    self.dumped = False
    self.fitted = False

    self.pipeline = MLPipeline.from_dict(pipeline_dict)
def score_pipeline(pipeline_metadata, n_splits=5, random_state=0, dataset=None):
    """Score a pipeline over dataset splits and aggregate the results.

    ``pipeline_metadata`` may be a dict or a path to a JSON file. The
    dataset and scoring metric are taken from the pipeline's
    ``validation`` section unless ``dataset`` is given explicitly.

    Returns:
        tuple: mean and standard deviation of the per-split scores.
    """
    if isinstance(pipeline_metadata, str):
        LOGGER.info('Loading pipeline %s', pipeline_metadata)
        with open(pipeline_metadata, 'r') as pipeline_file:
            pipeline_metadata = json.load(pipeline_file)

    validation = pipeline_metadata['validation']

    if dataset is None:
        dataset = validation['dataset']

    LOGGER.info('Loading dataset %s', dataset)
    dataset = load_dataset(dataset)

    metric = validation.get('metric')
    if metric:
        scorer = get_scorer(metric, validation.get('metric_args', dict()))
    else:
        # Fall back to the dataset's own default metric and scorer.
        scorer = dataset.score
        metric = dataset.metric

    splits = dataset.get_splits(n_splits, random_state)
    if n_splits == 1:
        # A single split comes back bare; normalize to a list of splits.
        splits = [splits]

    scores = []
    for split, (X_train, X_test, y_train, y_test) in enumerate(splits):
        LOGGER.info('Scoring split %s', split + 1)

        context = get_context(dataset, validation.get('context', dict()))
        pipeline = MLPipeline.from_dict(pipeline_metadata)
        pipeline.fit(X_train, y_train, **context)
        predictions = pipeline.predict(X_test, **context)

        score = scorer(y_test, predictions)
        LOGGER.info('Split %s %s: %s', split + 1, metric, score)
        scores.append(score)

    return np.mean(scores), np.std(scores)
def load_pipeline(self, pipeline):
    """Build an MLPipeline from a stored pipeline document."""
    LOGGER.info("Loading pipeline %s", pipeline.name)
    pipeline_dict = pipeline.mlpipeline
    return MLPipeline.from_dict(pipeline_dict)
def _clone_pipeline(pipeline):
    """Return a fresh MLPipeline built from this pipeline's dict form."""
    serialized = pipeline.to_dict()
    return MLPipeline.from_dict(serialized)
def load_template(self, template_name):
    """Get the dict representation of the template."""
    template = self._load_template(template_name)

    # Round-trip through MLPipeline to normalize the dict, then restore
    # the metadata that the round-trip does not carry over.
    mlpipeline = MLPipeline.from_dict(template)
    normalized = mlpipeline.to_dict()
    normalized['metadata'] = template['metadata']

    return normalized
def get_tunable_hyperparameters(self, template_name):
    """Get the tunable hyperparameters of the given template.

    FIX: removed two unreachable statements that followed the return
    (an MLPipeline round-trip that could never execute) and fixed the
    'hyperparmeters' typo in the docstring.
    """
    template = self.load_template(template_name)
    return template['tunable_hyperparameters']