Example #1
def search_base_learner(id):
    """Creates a set of base learners from base learner origin using grid search
    and queues them up
    """
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()
    if req_body['method'] == 'grid':
        param_grid = functions.import_object_from_string_code(
            req_body['source'], 'param_grid')
        iterator = ParameterGrid(param_grid)
    elif req_body['method'] == 'random':
        param_distributions = functions.import_object_from_string_code(
            req_body['source'], 'param_distributions')
        iterator = ParameterSampler(param_distributions,
                                    n_iter=req_body['n_iter'])
    else:
        raise exceptions.UserError('{} not a valid search method'.format(
            req_body['method']))

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        learners = []
        for params in iterator:
            est = base_learner_origin.return_estimator()
            try:
                est.set_params(**params)
            except Exception as e:
                # Skip parameter combinations the estimator rejects
                print(repr(e))
                continue

            hyperparameters = functions.make_serializable(est.get_params())

            base_learners = session.query(models.BaseLearner).\
                filter_by(base_learner_origin_id=id,
                          hyperparameters=hyperparameters).all()
            if base_learners:  # already exists
                continue

            base_learner = models.BaseLearner(hyperparameters, 'queued',
                                              base_learner_origin)

            session.add(base_learner)
            session.commit()
            with Connection(get_redis_connection()):
                rqtasks.generate_meta_features.delay(path, base_learner.id)
            learners.append(base_learner)
        return jsonify([learner.serialize for learner in learners])
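
The request's 'source' field is user code executed server-side; for the 'grid' method the endpoint imports the name param_grid from it and expands it with sklearn's ParameterGrid. A minimal sketch of such a source string, with hypothetical parameter names for a tree ensemble:

# Hypothetical user-supplied source for the 'grid' search method.
# The endpoint imports the name 'param_grid' and iterates over every combination.
param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [3, 5, None]
}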
Example #2
    def return_estimator(self):
        """Returns estimator from base learner origin

        Returns:
            est (estimator): Estimator object
        """
        extraction_code = self.source
        estimator = functions.import_object_from_string_code(extraction_code, "base_learner")

        return estimator
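
The origin's source attribute is arbitrary user code that must bind an estimator to the name base_learner. A minimal sketch, assuming a scikit-learn classifier:

from sklearn.ensemble import RandomForestClassifier

# import_object_from_string_code executes this source and returns the object
# bound to the name 'base_learner'.
base_learner = RandomForestClassifier(random_state=8)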
Example #3
def create_base_learner(id):
    """This creates a single base learner from a base learner origin and queues it up"""
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        req_body = request.get_json()

        # Retrieve full hyperparameters
        est = base_learner_origin.return_estimator()
        hyperparameters = functions.import_object_from_string_code(
            req_body['source'], 'params')
        est.set_params(**hyperparameters)
        hyperparameters = functions.make_serializable(est.get_params())

        base_learners = session.query(models.BaseLearner).\
            filter_by(base_learner_origin_id=id,
                      hyperparameters=hyperparameters).all()
        if base_learners:
            raise exceptions.UserError(
                'Base learner exists with given hyperparameters')

        base_learner = models.BaseLearner(hyperparameters, 'queued',
                                          base_learner_origin)

        if 'single_searches' not in base_learner_origin.description:
            base_learner_origin.description['single_searches'] = []
        base_learner_origin.description['single_searches'] += [req_body['source']]

        session.add(base_learner)
        session.add(base_learner_origin)
        session.commit()

        with Connection(get_redis_connection()):
            rqtasks.generate_meta_features.delay(path, base_learner.id)

        return jsonify(base_learner.serialize)
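
Here the request's 'source' must define a dict named params; it is layered over the origin estimator's defaults via set_params, and the full get_params() result becomes the stored hyperparameters. A sketch with hypothetical keys:

# Hypothetical user source for create_base_learner: only overrides are listed;
# est.get_params() fills in everything else before the duplicate check.
params = {
    'n_estimators': 500,
    'max_depth': 7
}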
Example #4
    def return_main_dataset(self):
        """Returns main data set from self

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        if not self.main_dataset['source']:
            raise exceptions.UserError('Source is empty')

        extraction_code = self.main_dataset["source"]

        extraction_function = functions.import_object_from_string_code(extraction_code,
                                                                       "extract_main_dataset")
        try:
            X, y = extraction_function()
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        X, y = np.array(X), np.array(y)

        return X, y
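
The main dataset source must define a callable named extract_main_dataset returning (X, y). A minimal sketch using a scikit-learn toy dataset:

from sklearn.datasets import load_iris

def extract_main_dataset():
    # Must return features and labels; return_main_dataset coerces both
    # to numpy arrays after the call.
    X, y = load_iris(return_X_y=True)
    return X, y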
Example #5
def generate_meta_features(path, base_learner_id):
    """Generates meta-features for specified base learner

    After generation of meta-features, the file is saved into the meta-features folder

    Args:
        path (str): Path to Xcessiv notebook

        base_learner_id (str): Base learner ID
    """
    with functions.DBContextManager(path) as session:
        base_learner = session.query(models.BaseLearner).filter_by(id=base_learner_id).first()
        if not base_learner:
            raise exceptions.UserError('Base learner {} '
                                       'does not exist'.format(base_learner_id))

        base_learner.job_id = get_current_job().id
        base_learner.job_status = 'started'

        session.add(base_learner)
        session.commit()

        try:
            est = base_learner.return_estimator()
            extraction = session.query(models.Extraction).first()
            X, y = extraction.return_train_dataset()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )

            meta_features_list = []
            trues_list = []
            for train_index, test_index in return_splits_iterable(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                meta_features_list.append(
                    getattr(est, base_learner.base_learner_origin.
                            meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            meta_features = np.concatenate(meta_features_list, axis=0)
            y_true = np.concatenate(trues_list)

            for key in base_learner.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    base_learner.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                base_learner.individual_score[key] = metric_generator(y_true, meta_features)

            meta_features_path = base_learner.meta_features_path(path)

            if not os.path.exists(os.path.dirname(meta_features_path)):
                os.makedirs(os.path.dirname(meta_features_path))

            np.save(meta_features_path, meta_features, allow_pickle=False)
            base_learner.job_status = 'finished'
            base_learner.meta_features_exists = True
            session.add(base_learner)
            session.commit()

        except:
            session.rollback()
            # Record error details on the learner so the UI can surface them,
            # then re-raise so the worker marks the job as failed
            base_learner.job_status = 'errored'
            base_learner.description['error_type'] = repr(sys.exc_info()[0])
            base_learner.description['error_value'] = repr(sys.exc_info()[1])
            base_learner.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(base_learner)
            session.commit()
            raise
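
The meta-feature generation source referenced above must define return_splits_iterable(X, y) yielding (train_index, test_index) pairs whose test folds together cover the training set. A minimal sketch with StratifiedKFold (fold count and seed are illustrative):

from sklearn.model_selection import StratifiedKFold

def return_splits_iterable(X, y):
    # Each yielded split drives one fit/predict round in generate_meta_features.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
    return splitter.split(X, y)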
Example #6
def evaluate_stacked_ensemble(path, ensemble_id):
    """Evaluates the ensemble and updates the database when finished/

    Args:
        path (str): Path to Xcessiv notebook

        ensemble_id (str): Ensemble ID
    """
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(models.StackedEnsemble).filter_by(
            id=ensemble_id).first()
        if not stacked_ensemble:
            raise exceptions.UserError('Stacked ensemble {} '
                                       'does not exist'.format(ensemble_id))

        stacked_ensemble.job_id = get_current_job().id
        stacked_ensemble.job_status = 'started'

        session.add(stacked_ensemble)
        session.commit()

        try:
            meta_features_list = []
            for base_learner in stacked_ensemble.base_learners:
                mf = np.load(base_learner.meta_features_path(path))
                if len(mf.shape) == 1:  # promote 1-D meta-features to a column
                    mf = mf.reshape(-1, 1)
                meta_features_list.append(mf)

            secondary_features = np.concatenate(meta_features_list, axis=1)

            # Get data
            extraction = session.query(models.Extraction).first()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )
            X, y = extraction.return_train_dataset()

            # Meta-features are saved in fold order, so reorder X and y to match
            indices_list = [test_index for train_index, test_index in return_splits_iterable(X, y)]
            indices = np.concatenate(indices_list)
            X, y = X[indices], y[indices]

            est = stacked_ensemble.return_secondary_learner()

            return_splits_iterable_stacked_ensemble = functions.import_object_from_string_code(
                extraction.stacked_ensemble_cv['source'],
                'return_splits_iterable'
            )
            preds = []
            trues_list = []
            for train_index, test_index in return_splits_iterable_stacked_ensemble(secondary_features, y):
                X_train, X_test = secondary_features[train_index], secondary_features[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                preds.append(
                    getattr(est, stacked_ensemble.base_learner_origin.
                            meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            preds = np.concatenate(preds, axis=0)
            y_true = np.concatenate(trues_list)

            for key in stacked_ensemble.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    stacked_ensemble.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                stacked_ensemble.individual_score[key] = metric_generator(y_true, preds)

            stacked_ensemble.job_status = 'finished'
            session.add(stacked_ensemble)
            session.commit()

        except:
            session.rollback()
            stacked_ensemble.job_status = 'errored'
            stacked_ensemble.description['error_type'] = repr(sys.exc_info()[0])
            stacked_ensemble.description['error_value'] = repr(sys.exc_info()[1])
            stacked_ensemble.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(stacked_ensemble)
            session.commit()
            raise
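
Each entry in metric_generators is likewise a source string defining metric_generator(y_true, predictions). A sketch scoring accuracy, assuming the meta-feature generator is predict_proba so the predictions are class probabilities:

from sklearn.metrics import accuracy_score

def metric_generator(y_true, probas):
    # Meta-features are class probabilities here; argmax recovers hard labels.
    return accuracy_score(y_true, probas.argmax(axis=1))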
Example #7
    def func_to_optimize(**params):
        base_estimator = base_learner_origin.return_estimator()
        base_estimator.set_params(**default_params)
        # Cast integer hyperparameters back to int; the optimizer proposes floats
        params = dict((key, val) if key not in integers else (key, int(val))
                      for key, val in iteritems(params))
        base_estimator.set_params(**params)
        hyperparameters = functions.make_serializable(base_estimator.get_params())

        # Check whether a base learner with these hyperparameters already exists
        base_learner = session.query(models.BaseLearner).\
            filter_by(base_learner_origin_id=base_learner_origin.id,
                      hyperparameters=hyperparameters).first()

        calculate_only = False

        # If base learner exists and has finished, just return its result
        if base_learner and base_learner.job_status == 'finished':
            if invert_metric:
                return -base_learner.individual_score[metric_to_optimize]
            else:
                return base_learner.individual_score[metric_to_optimize]

        # else if base learner exists but is unfinished, just calculate the result without storing
        elif base_learner and base_learner.job_status != 'finished':
            calculate_only = True

        # else if base learner does not exist, create it
        else:
            base_learner = models.BaseLearner(hyperparameters,
                                              'started',
                                              base_learner_origin)
            base_learner.job_id = get_current_job().id
            session.add(base_learner)
            session.commit()

        try:
            est = base_learner.return_estimator()
            extraction = session.query(models.Extraction).first()
            X, y = extraction.return_train_dataset()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )

            meta_features_list = []
            trues_list = []
            for train_index, test_index in return_splits_iterable(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                meta_features_list.append(
                    getattr(est, base_learner.base_learner_origin.
                            meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            meta_features = np.concatenate(meta_features_list, axis=0)
            y_true = np.concatenate(trues_list)

            for key in base_learner.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    base_learner.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                base_learner.individual_score[key] = metric_generator(y_true, meta_features)

            # Persist meta-features and status only when this job owns the learner record
            if not calculate_only:
                meta_features_path = base_learner.meta_features_path(path)

                if not os.path.exists(os.path.dirname(meta_features_path)):
                    os.makedirs(os.path.dirname(meta_features_path))

                np.save(meta_features_path, meta_features, allow_pickle=False)
                base_learner.job_status = 'finished'
                base_learner.meta_features_exists = True
                session.add(base_learner)
                session.commit()

            if invert_metric:
                return -base_learner.individual_score[metric_to_optimize]
            else:
                return base_learner.individual_score[metric_to_optimize]

        except:
            session.rollback()
            base_learner.job_status = 'errored'
            base_learner.description['error_type'] = repr(sys.exc_info()[0])
            base_learner.description['error_value'] = repr(sys.exc_info()[1])
            base_learner.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(base_learner)
            session.commit()
            raise
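
This closure is shaped for a black-box optimizer that maximizes its objective, which is why invert_metric negates scores where lower is better. A minimal sketch of driving it with the bayes_opt package (the bounds are hypothetical, and path, session, and the other closed-over names must already be in scope):

from bayes_opt import BayesianOptimization

# Hypothetical search bounds for the hyperparameters exposed to the optimizer.
pbounds = {'max_depth': (2, 10), 'learning_rate': (0.01, 0.3)}

optimizer = BayesianOptimization(f=func_to_optimize, pbounds=pbounds)
optimizer.maximize(init_points=5, n_iter=25)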
Example #8
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    This is in rqtasks.py but not as a job yet. Temporarily call this directly
    while I'm figuring out Javascript lel.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.test_dataset["source"]
            extraction_function = functions.\
                import_object_from_string_code(extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        # test base learner cross-validation
        extraction_code = extraction.meta_feature_generation['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits = 0
        test_indices = []
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits += 1
                test_indices.append(test_idx)
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # preparation before testing stacked ensemble cross-validation
        test_indices = np.concatenate(test_indices)
        X, y = X[test_indices], y[test_indices]

        # test stacked ensemble cross-validation
        extraction_code = extraction.stacked_ensemble_cv['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits_stacked_cv = 0
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits_stacked_cv += 1
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        data_stats['holdout_data_stats'] = {'number_of_splits': number_of_splits}
        data_stats['stacked_ensemble_cv_stats'] = {'number_of_splits': number_of_splits_stacked_cv}

        extraction.data_statistics = data_stats

        session.add(extraction)
        session.commit()
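
When the test dataset method is 'source', the user code must define extract_test_dataset returning (X_test, y_test). A minimal sketch (the file path is illustrative only):

import numpy as np

def extract_test_dataset():
    # Hypothetical held-out data stored as CSV with the label in the last column.
    data = np.loadtxt('holdout.csv', delimiter=',')
    return data[:, :-1], data[:, -1]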
Example #9
def eval_stacked_ensemble(stacked_ensemble, session, path):
    """Evaluates a stacked ensemble and records its cross-validated scores

    Args:
        stacked_ensemble (xcessiv.models.StackedEnsemble): Ensemble to evaluate

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder

    Returns:
        stacked_ensemble (xcessiv.models.StackedEnsemble): The evaluated ensemble
    """
    try:
        meta_features_list = []
        for base_learner in stacked_ensemble.base_learners:
            mf = np.load(base_learner.meta_features_path(path))
            if len(mf.shape) == 1:
                mf = mf.reshape(-1, 1)
            meta_features_list.append(mf)

        secondary_features = np.concatenate(meta_features_list, axis=1)

        # Get data
        extraction = session.query(models.Extraction).first()
        return_splits_iterable = functions.import_object_from_string_code(
            extraction.meta_feature_generation['source'],
            'return_splits_iterable'
        )
        X, y = extraction.return_train_dataset()

        # Meta-features are saved in fold order, so reorder X and y to match
        indices_list = [test_index for train_index, test_index in return_splits_iterable(X, y)]
        indices = np.concatenate(indices_list)
        X, y = X[indices], y[indices]

        est = stacked_ensemble.return_secondary_learner()

        return_splits_iterable_stacked_ensemble = functions.import_object_from_string_code(
            extraction.stacked_ensemble_cv['source'],
            'return_splits_iterable'
        )
        preds = []
        trues_list = []
        for train_index, test_index in return_splits_iterable_stacked_ensemble(secondary_features, y):
            X_train, X_test = secondary_features[train_index], secondary_features[test_index]
            y_train, y_test = y[train_index], y[test_index]
            est = est.fit(X_train, y_train)
            preds.append(
                getattr(est, stacked_ensemble.base_learner_origin.
                        meta_feature_generator)(X_test)
            )
            trues_list.append(y_test)
        preds = np.concatenate(preds, axis=0)
        y_true = np.concatenate(trues_list)

        for key in stacked_ensemble.base_learner_origin.metric_generators:
            metric_generator = functions.import_object_from_string_code(
                stacked_ensemble.base_learner_origin.metric_generators[key],
                'metric_generator'
            )
            stacked_ensemble.individual_score[key] = metric_generator(y_true, preds)

        stacked_ensemble.job_status = 'finished'
        session.add(stacked_ensemble)
        session.commit()
        return stacked_ensemble

    except:
        session.rollback()
        stacked_ensemble.job_status = 'errored'
        stacked_ensemble.description['error_type'] = repr(sys.exc_info()[0])
        stacked_ensemble.description['error_value'] = repr(sys.exc_info()[1])
        stacked_ensemble.description['error_traceback'] = \
            traceback.format_exception(*sys.exc_info())
        session.add(stacked_ensemble)
        session.commit()
        raise
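
A sketch of how this helper might be called, mirroring the session handling used by the task functions above:

with functions.DBContextManager(path) as session:
    stacked_ensemble = session.query(models.StackedEnsemble).filter_by(
        id=ensemble_id).first()
    if stacked_ensemble is None:
        raise exceptions.UserError('Stacked ensemble {} '
                                   'does not exist'.format(ensemble_id))
    stacked_ensemble = eval_stacked_ensemble(stacked_ensemble, session, path)
    print(stacked_ensemble.individual_score)  # scores keyed by metric name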