Example #1
def specific_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(base_learner_origin.serialize)

        if request.method == 'PATCH':
            if base_learner_origin.final:
                raise exceptions.UserError('Cannot modify a final base learner origin')
            req_body = request.get_json()

            modifiable_attr = ('meta_feature_generator', 'name', 'source',
                               'metric_generators')
            for attr in modifiable_attr:
                if attr in req_body:
                    setattr(base_learner_origin, attr, req_body[attr])

            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)

        if request.method == 'DELETE':
            base_learner_origin.cleanup(path)
            session.delete(base_learner_origin)
            session.commit()
            return jsonify(message='Deleted base learner origin')
Example #2
def verify_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if request.method == 'POST':
            req_body = request.get_json()
            if base_learner_origin.final:
                raise exceptions.UserError('Base learner origin {} '
                                           'is already final'.format(id))
            base_learner = base_learner_origin.return_estimator()
            validation_results, hyperparameters = functions.verify_estimator_class(
                base_learner, base_learner_origin.meta_feature_generator,
                base_learner_origin.metric_generators, req_body['dataset'])
            base_learner_origin.validation_results = {
                req_body['dataset']: validation_results
            }
            base_learner_origin.hyperparameters = hyperparameters
            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)
Example #3
def confirm_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if request.method == 'GET':
            if base_learner_origin.final:
                raise exceptions.UserError('Base learner origin {} '
                                           'is already final'.format(id))
            if not base_learner_origin.validation_results:
                raise exceptions.UserError('Base learner origin {} has not yet been '
                                           'verified on a dataset'.format(id))
            base_learner = base_learner_origin.return_estimator()
            validation_results, hyperparameters = functions.verify_estimator_class(
                base_learner,
                base_learner_origin.meta_feature_generator,
                base_learner_origin.metric_generators,
                base_learner_origin.validation_results['dataset']
            )
            base_learner_origin.validation_results = {
                'dataset': base_learner_origin.validation_results['dataset'],
                'metrics': validation_results
            }
            base_learner_origin.hyperparameters = hyperparameters
            base_learner_origin.final = True
            session.add(base_learner_origin)
            session.commit()
            return jsonify(base_learner_origin.serialize)
Example #4
def verify_dataset(X, y):
    """Verifies if a dataset is valid for use i.e. scikit-learn format

    Used to verify a dataset by returning shape and basic statistics of
    returned data. This will also provide quick and dirty check on
    capability of host machine to process the data.

    Args:
        X (array-like): Features array

        y (array-like): Label array

    Returns:
        dataset_statistics (dict): Dictionary with keys ``features_shape`` and
            ``labels_shape`` containing the shapes of `X` and `y`, respectively.

    Raises:
        exceptions.UserError: Raised if `X` is not a 2-dimensional array, `y`
            is not a 1-dimensional array, or `X` does not contain the same
            number of samples as `y` i.e. X_shape[0] == y_shape[0].
    """
    X_shape, y_shape = np.array(X).shape, np.array(y).shape
    if len(X_shape) != 2:
        raise exceptions.UserError("X must be 2-dimensional array")
    if len(y_shape) != 1:
        raise exceptions.UserError("y must be 1-dimensional array")
    if X_shape[0] != y_shape[0]:
        raise exceptions.UserError("X must have same number of elements as y")
    return dict(features_shape=X_shape, labels_shape=y_shape)
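# Usage sketch for verify_dataset above (hypothetical; iris is used purely as
# an example of a scikit-learn-formatted dataset).
from sklearn import datasets

X_iris, y_iris = datasets.load_iris(return_X_y=True)
print(verify_dataset(X_iris, y_iris))
# -> {'features_shape': (150, 4), 'labels_shape': (150,)}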
Example #5
def start_automated_run(id):
    """This starts an automated run using the passed in source code for configuration"""
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()
    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError('Base learner origin {} is not final'.format(id))

        # Check for any syntax errors
        module = functions.import_string_code_as_module(req_body['source'])
        del module

        automated_run = models.AutomatedRun(req_body['source'],
                                            'queued',
                                            base_learner_origin)

        session.add(automated_run)
        session.commit()

        with Connection(get_redis_connection()):
            rqtasks.start_automated_run.delay(path, automated_run.id)

        return jsonify(automated_run.serialize)
Example #6
def search_base_learner(id):
    """Creates a set of base learners from base learner origin using grid search
    and queues them up
    """
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()
    if req_body['method'] == 'grid':
        param_grid = functions.import_object_from_string_code(
            req_body['source'], 'param_grid')
        iterator = ParameterGrid(param_grid)
    elif req_body['method'] == 'random':
        param_distributions = functions.import_object_from_string_code(
            req_body['source'], 'param_distributions')
        iterator = ParameterSampler(param_distributions,
                                    n_iter=req_body['n_iter'])

    else:
        raise exceptions.UserError('{} not a valid search method'.format(
            req_body['method']))

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        learners = []
        for params in iterator:
            est = base_learner_origin.return_estimator()
            try:
                est.set_params(**params)
            except Exception as e:
                print(repr(e))
                continue

            hyperparameters = functions.make_serializable(est.get_params())

            base_learners = session.query(models.BaseLearner).\
                filter_by(base_learner_origin_id=id,
                          hyperparameters=hyperparameters).all()
            if base_learners:  # already exists
                continue

            base_learner = models.BaseLearner(hyperparameters, 'queued',
                                              base_learner_origin)

            session.add(base_learner)
            session.commit()
            with Connection(get_redis_connection()):
                rqtasks.generate_meta_features.delay(path, base_learner.id)
            learners.append(base_learner)
        return jsonify(list(map(lambda x: x.serialize, learners)))
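# Illustrative req_body['source'] strings for the two search methods above; the
# code only requires that they define `param_grid` (grid) or
# `param_distributions` (random). The hyperparameter names are hypothetical.
grid_search_source = """
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, None],
}
"""

random_search_source = """
from scipy.stats import randint

param_distributions = {
    'n_estimators': randint(50, 400),
    'max_depth': randint(2, 12),
}
"""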
Example #7
def create_new_stacked_ensemble():
    path = functions.get_path_from_query_string(request)
    req_body = request.get_json()

    with functions.DBContextManager(path) as session:
        if request.method == 'GET':
            return jsonify(
                list(
                    map(lambda x: x.serialize,
                        session.query(models.StackedEnsemble).all())))

        if request.method == 'POST':
            base_learners = session.query(models.BaseLearner).\
                filter(models.BaseLearner.id.in_(req_body['base_learner_ids'])).all()
            if len(base_learners) != len(req_body['base_learner_ids']):
                raise exceptions.UserError('Not all base learners found')
            for learner in base_learners:
                if learner.job_status != 'finished':
                    raise exceptions.UserError(
                        'Not all base learners have finished')

            base_learner_origin = session.query(models.BaseLearnerOrigin).\
                filter_by(id=req_body['base_learner_origin_id']).first()
            if base_learner_origin is None:
                raise exceptions.UserError(
                    'Base learner origin {} not '
                    'found'.format(req_body['base_learner_origin_id']), 404)

            # Retrieve full hyperparameters
            est = base_learner_origin.return_estimator()
            params = functions.import_object_from_string_code(
                req_body['secondary_learner_hyperparameters_source'], 'params')
            est.set_params(**params)
            hyperparameters = functions.make_serializable(est.get_params())

            stacked_ensembles = session.query(models.StackedEnsemble).\
                filter_by(base_learner_origin_id=req_body['base_learner_origin_id'],
                          secondary_learner_hyperparameters=hyperparameters,
                          base_learner_ids=sorted([bl.id for bl in base_learners])).all()
            if stacked_ensembles:
                raise exceptions.UserError('Stacked ensemble exists')

            stacked_ensemble = models.StackedEnsemble(
                secondary_learner_hyperparameters=hyperparameters,
                base_learners=base_learners,
                base_learner_origin=base_learner_origin,
                job_status='queued')

            session.add(stacked_ensemble)
            session.commit()

            with Connection(get_redis_connection()):
                rqtasks.evaluate_stacked_ensemble.delay(
                    path, stacked_ensemble.id)

            return jsonify(stacked_ensemble.serialize)
Example #8
def get_automated_runs():
    """Return all automated runs"""
    path = functions.get_path_from_query_string(request)

    if request.method == 'GET':
        with functions.DBContextManager(path) as session:
            automated_runs = session.query(models.AutomatedRun).all()
            return jsonify(list(map(lambda x: x.serialize, automated_runs)))

    if request.method == 'POST':
        req_body = request.get_json()
        with functions.DBContextManager(path) as session:
            base_learner_origin = None

            if req_body['category'] in ('bayes', 'greedy_ensemble_search'):
                base_learner_origin = session.query(models.BaseLearnerOrigin).\
                    filter_by(id=req_body['base_learner_origin_id']).first()
                if base_learner_origin is None:
                    raise exceptions.UserError(
                        'Base learner origin {} not found'.format(
                            req_body['base_learner_origin_id']), 404)
                if not base_learner_origin.final:
                    raise exceptions.UserError(
                        'Base learner origin {} is not final'.format(
                            req_body['base_learner_origin_id']))

            elif req_body['category'] == 'tpot':
                pass

            else:
                raise exceptions.UserError('Automated run category'
                                           ' {} not recognized'.format(
                                               req_body['category']))

            # Check for any syntax errors
            module = functions.import_string_code_as_module(req_body['source'])
            del module

            automated_run = models.AutomatedRun(req_body['source'], 'queued',
                                                req_body['category'],
                                                base_learner_origin)

            session.add(automated_run)
            session.commit()

            with Connection(get_redis_connection()):
                rqtasks.start_automated_run.delay(path, automated_run.id)

            return jsonify(automated_run.serialize)
Example #9
def create_base_learner(id):
    """This creates a single base learner from a base learner origin and queues it up"""
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner_origin = session.query(
            models.BaseLearnerOrigin).filter_by(id=id).first()
        if base_learner_origin is None:
            raise exceptions.UserError(
                'Base learner origin {} not found'.format(id), 404)

        if not base_learner_origin.final:
            raise exceptions.UserError(
                'Base learner origin {} is not final'.format(id))

        req_body = request.get_json()

        # Retrieve full hyperparameters
        est = base_learner_origin.return_estimator()
        hyperparameters = functions.import_object_from_string_code(
            req_body['source'], 'params')
        est.set_params(**hyperparameters)
        hyperparameters = functions.make_serializable(est.get_params())

        base_learners = session.query(models.BaseLearner).\
            filter_by(base_learner_origin_id=id,
                      hyperparameters=hyperparameters).all()
        if base_learners:
            raise exceptions.UserError(
                'Base learner exists with given hyperparameters')

        base_learner = models.BaseLearner(hyperparameters, 'queued',
                                          base_learner_origin)

        if 'single_searches' not in base_learner_origin.description:
            base_learner_origin.description['single_searches'] = []
        base_learner_origin.description['single_searches'] += [req_body['source']]

        session.add(base_learner)
        session.add(base_learner_origin)
        session.commit()

        with Connection(get_redis_connection()):
            rqtasks.generate_meta_features.delay(path, base_learner.id)

        return jsonify(base_learner.serialize)
Example #10
    def return_test_dataset(self):
        """Returns test data set

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        if self.test_dataset['method'] == 'split_from_main':
            X, y = self.return_main_dataset()
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=self.test_dataset['split_ratio'],
                random_state=self.test_dataset['split_seed'],
                stratify=y)

            return X_test, y_test

        if self.test_dataset['method'] == 'source':
            if not self.test_dataset.get('source'):
                raise exceptions.UserError('Source is empty')

            extraction_code = self.test_dataset["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()

            return np.array(X_test), np.array(y_test)
Example #11
def export_stacked_ensemble_as_base_learner_origin(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(
            models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError(
                'Stacked ensemble {} not found'.format(id), 404)

        extraction = session.query(models.Extraction).first()

        if request.method == 'POST':
            source = stacked_ensemble.export_as_code(
                extraction.meta_feature_generation['source'])

            new_base_learner_origin = models.BaseLearnerOrigin(
                source=source,
                name='Xcessiv Ensemble',
                meta_feature_generator=stacked_ensemble.base_learner_origin.
                meta_feature_generator)

            session.add(new_base_learner_origin)
            session.commit()
            return jsonify(new_base_learner_origin.serialize)
Example #12
def export_stacked_ensemble(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(
            models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError(
                'Stacked ensemble {} not found'.format(id), 404)

        extraction = session.query(models.Extraction).first()

        if request.method == 'POST':
            req_body = request.get_json()
            if req_body['type'] == 'package':
                stacked_ensemble.export_as_package(
                    os.path.join(path, req_body['name']),
                    extraction.meta_feature_generation['source'])
            elif req_body['type'] == 'file':
                if not req_body['name'].endswith('.py'):
                    req_body['name'] += '.py'
                stacked_ensemble.export_as_file(
                    os.path.join(path, req_body['name']),
                    extraction.meta_feature_generation['source'])
            return jsonify(
                message='Stacked ensemble successfully '
                'exported as {} in {}'.format(req_body['name'], path))
Example #13
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2,
                                                     random_state=8).split(
                                                         X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2,
                                                 random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(
            dataset_properties['type']))
    return X, y, splits
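# Usage sketch for get_sample_dataset above; in the 'multiclass' case the extra
# keys are forwarded to sklearn's make_classification (values are hypothetical).
sample_properties = {
    'type': 'multiclass',
    'n_samples': 200,
    'n_features': 10,
    'n_informative': 5,
    'n_classes': 3,
}
X_sample, y_sample, sample_splits = get_sample_dataset(sample_properties)
print(X_sample.shape, y_sample.shape)  # (200, 10) (200,)
print(len(list(sample_splits)))        # 2 train/test splits from StratifiedKFold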
Example #14
    def __enter__(self):
        if not os.path.exists(self.path):
            raise exceptions.UserError('{} does not exist'.format(self.path))
        sqlite_url = 'sqlite:///{}'.format(self.path)
        engine = create_engine(sqlite_url)

        self.session = Session(bind=engine)

        return self.session
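    # A plausible companion __exit__ for the __enter__ above (a sketch; the
    # project's actual implementation may handle errors differently).
    def __exit__(self, exc_type, exc_value, tb):
        try:
            if exc_type is not None:
                self.session.rollback()  # undo pending changes on error
        finally:
            self.session.close()  # always release the connection
        return False  # never suppress the original exception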
Example #15
def verify_extraction_meta_feature_generation():
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()

    if extraction.meta_feature_generation['method'] == 'cv':
        raise exceptions.UserError('Xcessiv will use cross-validation to'
                                   ' generate meta-features')

    X_holdout, y_holdout = extraction.return_holdout_dataset()

    return jsonify(functions.verify_dataset(X_holdout, y_holdout))
Example #16
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(
            models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError(
                'Automated run {} '
                'does not exist'.format(automated_run_id))
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'

        session.add(automated_run)
        session.commit()

        try:
            if automated_run.category == 'bayes':
                automatedruns.start_naive_bayes(automated_run, session, path)

            elif automated_run.category == 'tpot':
                automatedruns.start_tpot(automated_run, session, path)

            elif automated_run.category == 'greedy_ensemble_search':
                automatedruns.start_greedy_ensemble_search(
                    automated_run, session, path)

            else:
                raise Exception(
                    'Something went wrong. Invalid category for automated run')

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
Example #17
    def return_main_dataset(self):
        """Returns main data set from self

        Returns:
            X (numpy.ndarray): Features

            y (numpy.ndarray): Labels
        """
        if not self.main_dataset['source']:
            raise exceptions.UserError('Source is empty')

        extraction_code = self.main_dataset["source"]

        extraction_function = functions.import_object_from_string_code(extraction_code,
                                                                       "extract_main_dataset")
        try:
            X, y = extraction_function()
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        X, y = np.array(X), np.array(y)

        return X, y
Example #18
def import_object_from_string_code(code, object):
    """Used to import an object from arbitrary passed code.

    Passed in code is treated as a module and is imported and added
    to `sys.modules` with its SHA256 hash as key.

    Args:
        code (string): Python code to import as module

        object (string): Name of object to extract from imported module
    """
    sha256 = hashlib.sha256(code.encode('UTF-8')).hexdigest()
    module = imp.new_module(sha256)
    try:
        exec_(code, module.__dict__)
    except Exception as e:
        raise exceptions.UserError('User code exception',
                                   exception_message=str(e))
    sys.modules[sha256] = module
    try:
        return getattr(module, object)
    except AttributeError:
        raise exceptions.UserError("{} not found in code".format(object))
Example #19
def specific_automated_run(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=id).first()
        if automated_run is None:
            raise exceptions.UserError('Automated run {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(automated_run.serialize)

        if request.method == 'DELETE':
            session.delete(automated_run)
            session.commit()
            return jsonify(message='Deleted automated run')
Example #20
def specific_stacked_ensemble(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(models.StackedEnsemble).filter_by(id=id).first()
        if stacked_ensemble is None:
            raise exceptions.UserError('Stacked ensemble {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(stacked_ensemble.serialize)

        if request.method == 'DELETE':
            session.delete(stacked_ensemble)
            session.commit()
            return jsonify(message='Deleted stacked ensemble')
Example #21
def get_path_from_query_string(req):
    """Gets path from query string

    Args:
        req (flask.request): Request object from Flask

    Returns:
        path (str): Value of "path" parameter from query string

    Raises:
        exceptions.UserError: If "path" is not found in query string
    """
    if req.args.get('path') is None:
        raise exceptions.UserError('Path not found in query string')
    return req.args.get('path')
Example #22
def specific_base_learner(id):
    path = functions.get_path_from_query_string(request)

    with functions.DBContextManager(path) as session:
        base_learner = session.query(models.BaseLearner).filter_by(id=id).first()
        if base_learner is None:
            raise exceptions.UserError('Base learner {} not found'.format(id), 404)

        if request.method == 'GET':
            return jsonify(base_learner.serialize)

        if request.method == 'DELETE':
            base_learner.cleanup(path)
            session.delete(base_learner)
            session.commit()
            return jsonify(message='Deleted base learner')
Example #23
    def export_as_file(self, file_path, cv_source):
        """Export the ensemble as a single Python file and saves it to `file_path`.

        This is EXPERIMENTAL as putting different modules together would probably wreak havoc
        especially on modules that make heavy use of global variables.

        Args:
            file_path (str, unicode): Absolute/local path of place to save file in

            cv_source (str, unicode): String containing actual code for base learner
                cross-validation used to generate secondary meta-features.
        """
        if os.path.exists(file_path):
            raise exceptions.UserError('{} already exists'.format(file_path))

        with open(file_path, 'wb') as f:
            f.write(self.export_as_code(cv_source).encode('utf8'))
Example #24
def import_string_code_as_module(code):
    """Used to run arbitrary passed code as a module

    Args:
        code (string): Python code to import as module

    Returns:
        module: Python module
    """
    sha256 = hashlib.sha256(code.encode('UTF-8')).hexdigest()
    module = imp.new_module(sha256)
    try:
        exec_(code, module.__dict__)
    except Exception as e:
        raise exceptions.UserError('User code exception',
                                   exception_message=str(e))
    sys.modules[sha256] = module
    return module
Example #25
def generate_meta_features(path, base_learner_id):
    """Generates meta-features for specified base learner

    After generation of meta-features, the file is saved into the meta-features folder

    Args:
        path (str): Path to Xcessiv notebook

        base_learner_id (str): Base learner ID
    """
    with functions.DBContextManager(path) as session:
        base_learner = session.query(models.BaseLearner).filter_by(id=base_learner_id).first()
        if not base_learner:
            raise exceptions.UserError('Base learner {} '
                                       'does not exist'.format(base_learner_id))

        base_learner.job_id = get_current_job().id
        base_learner.job_status = 'started'

        session.add(base_learner)
        session.commit()

        try:
            est = base_learner.return_estimator()
            extraction = session.query(models.Extraction).first()
            X, y = extraction.return_train_dataset()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )

            meta_features_list = []
            trues_list = []
            for train_index, test_index in return_splits_iterable(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                meta_features_list.append(
                    getattr(est, base_learner.base_learner_origin.
                            meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            meta_features = np.concatenate(meta_features_list, axis=0)
            y_true = np.concatenate(trues_list)

            for key in base_learner.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    base_learner.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                base_learner.individual_score[key] = metric_generator(y_true, meta_features)

            meta_features_path = base_learner.meta_features_path(path)

            if not os.path.exists(os.path.dirname(meta_features_path)):
                os.makedirs(os.path.dirname(meta_features_path))

            np.save(meta_features_path, meta_features, allow_pickle=False)
            base_learner.job_status = 'finished'
            base_learner.meta_features_exists = True
            session.add(base_learner)
            session.commit()

        except:
            session.rollback()
            base_learner.job_status = 'errored'
            base_learner.description['error_type'] = repr(sys.exc_info()[0])
            base_learner.description['error_value'] = repr(sys.exc_info()[1])
            base_learner.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(base_learner)
            session.commit()
            raise
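# Illustrative user code for the two objects imported by generate_meta_features
# above (hypothetical; any equivalent definitions would work).
# meta_feature_generation['source'] must define return_splits_iterable(X, y):
example_cv_source = """
from sklearn.model_selection import StratifiedKFold

def return_splits_iterable(X, y):
    return StratifiedKFold(n_splits=5, shuffle=True, random_state=8).split(X, y)
"""
# Each entry of metric_generators must define metric_generator(y_true, probas),
# where probas are the saved meta-features (e.g. predict_proba output):
example_metric_source = """
from sklearn.metrics import roc_auc_score

def metric_generator(y_true, probas):
    return roc_auc_score(y_true, probas[:, 1])
"""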
Example #26
def evaluate_stacked_ensemble(path, ensemble_id):
    """Evaluates the ensemble and updates the database when finished/

    Args:
        path (str): Path to Xcessiv notebook

        ensemble_id (str): Ensemble ID
    """
    with functions.DBContextManager(path) as session:
        stacked_ensemble = session.query(models.StackedEnsemble).filter_by(
            id=ensemble_id).first()
        if not stacked_ensemble:
            raise exceptions.UserError('Stacked ensemble {} '
                                       'does not exist'.format(ensemble_id))

        stacked_ensemble.job_id = get_current_job().id
        stacked_ensemble.job_status = 'started'

        session.add(stacked_ensemble)
        session.commit()

        try:
            meta_features_list = []
            for base_learner in stacked_ensemble.base_learners:
                mf = np.load(base_learner.meta_features_path(path))
                if len(mf.shape) == 1:
                    mf = mf.reshape(-1, 1)
                meta_features_list.append(mf)

            secondary_features = np.concatenate(meta_features_list, axis=1)

            # Get data
            extraction = session.query(models.Extraction).first()
            return_splits_iterable = functions.import_object_from_string_code(
                extraction.meta_feature_generation['source'],
                'return_splits_iterable'
            )
            X, y = extraction.return_train_dataset()

            #  We need to retrieve original order of meta-features
            indices_list = [test_index for train_index, test_index in return_splits_iterable(X, y)]
            indices = np.concatenate(indices_list)
            X, y = X[indices], y[indices]

            est = stacked_ensemble.return_secondary_learner()

            return_splits_iterable_stacked_ensemble = functions.import_object_from_string_code(
                extraction.stacked_ensemble_cv['source'],
                'return_splits_iterable'
            )
            preds = []
            trues_list = []
            for train_index, test_index in return_splits_iterable_stacked_ensemble(secondary_features, y):
                X_train, X_test = secondary_features[train_index], secondary_features[test_index]
                y_train, y_test = y[train_index], y[test_index]
                est = est.fit(X_train, y_train)
                preds.append(
                    getattr(est, stacked_ensemble.base_learner_origin.
                            meta_feature_generator)(X_test)
                )
                trues_list.append(y_test)
            preds = np.concatenate(preds, axis=0)
            y_true = np.concatenate(trues_list)

            for key in stacked_ensemble.base_learner_origin.metric_generators:
                metric_generator = functions.import_object_from_string_code(
                    stacked_ensemble.base_learner_origin.metric_generators[key],
                    'metric_generator'
                )
                stacked_ensemble.individual_score[key] = metric_generator(y_true, preds)

            stacked_ensemble.job_status = 'finished'
            session.add(stacked_ensemble)
            session.commit()

        except:
            session.rollback()
            stacked_ensemble.job_status = 'errored'
            stacked_ensemble.description['error_type'] = repr(sys.exc_info()[0])
            stacked_ensemble.description['error_value'] = repr(sys.exc_info()[1])
            stacked_ensemble.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(stacked_ensemble)
            session.commit()
            raise
Example #27
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError('Automated run {} '
                                       'does not exist'.format(automated_run_id))
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'

        session.add(automated_run)
        session.commit()

        try:
            module = functions.import_string_code_as_module(automated_run.source)
            random_state = getattr(module, 'random_state', 8)
            assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

            # get non-searchable parameters
            base_estimator = automated_run.base_learner_origin.return_estimator()
            base_estimator.set_params(**module.default_params)
            default_params = functions.make_serializable(base_estimator.get_params())
            non_searchable_params = dict((key, val) for key, val in iteritems(default_params)
                                         if key not in module.pbounds)

            # get already calculated base learners in search space
            existing_base_learners = []
            for base_learner in automated_run.base_learner_origin.base_learners:
                if base_learner.job_status != 'finished':
                    continue
                in_search_space = True
                for key, val in iteritems(non_searchable_params):
                    if base_learner.hyperparameters[key] != val:
                        in_search_space = False
                        break  # If no match, move on to the next base learner
                if in_search_space:
                    existing_base_learners.append(base_learner)

            # build initialize dictionary
            target = []
            initialization_dict = dict((key, list()) for key in module.pbounds.keys())
            for base_learner in existing_base_learners:
                # check if base learner's searchable hyperparameters are all numerical
                all_numerical = True
                for key in module.pbounds.keys():
                    if not isinstance(base_learner.hyperparameters[key], numbers.Number):
                        all_numerical = False
                        break
                if not all_numerical:
                    continue  # if there is a non-numerical hyperparameter, skip this.

                for key in module.pbounds.keys():
                    initialization_dict[key].append(base_learner.hyperparameters[key])
                target.append(base_learner.individual_score[module.metric_to_optimize])
            initialization_dict['target'] = target if not module.invert_metric \
                else list(map(lambda x: -x, target))
            print('{} existing in initialization dictionary'.
                  format(len(initialization_dict['target'])))

            # Create function to be optimized
            func_to_optimize = return_func_to_optimize(
                path, session, automated_run.base_learner_origin, module.default_params,
                module.metric_to_optimize, module.invert_metric, set(module.integers)
            )

            # Create Bayes object
            bo = BayesianOptimization(func_to_optimize, module.pbounds)

            bo.initialize(initialization_dict)

            np.random.seed(random_state)

            bo.maximize(**module.maximize_config)

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
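# Illustrative automated_run.source for the 'bayes' category; the attribute
# names below are exactly those read by the task above, while the values are
# hypothetical.
example_bayes_source = """
metric_to_optimize = 'Accuracy'         # must be a key of the origin's metric_generators
invert_metric = False                   # True when a lower metric value is better
integers = ['max_depth']                # hyperparameters to cast to int
default_params = {'n_estimators': 200}  # non-searchable parameters
pbounds = {'max_depth': (2, 10)}        # search bounds for BayesianOptimization
maximize_config = {'init_points': 2, 'n_iter': 10}
random_state = 8                        # optional; defaults to 8 when absent
"""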
Example #28
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    This lives in rqtasks.py but is not yet run as a job; for now it is called
    directly until the JavaScript front end is sorted out.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.test_dataset["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        # test base learner cross-validation
        extraction_code = extraction.meta_feature_generation['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits = 0
        test_indices = []
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits += 1
                test_indices.append(test_idx)
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        # preparation before testing stacked ensemble cross-validation
        test_indices = np.concatenate(test_indices)
        X, y = X[test_indices], y[test_indices]

        # test stacked ensemble cross-validation
        extraction_code = extraction.stacked_ensemble_cv['source']
        return_splits_iterable = functions.import_object_from_string_code(
            extraction_code,
            'return_splits_iterable'
        )
        number_of_splits_stacked_cv = 0
        try:
            for train_idx, test_idx in return_splits_iterable(X, y):
                number_of_splits_stacked_cv += 1
        except Exception as e:
            raise exceptions.UserError('User code exception', exception_message=str(e))

        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        data_stats['holdout_data_stats'] = {'number_of_splits': number_of_splits}
        data_stats['stacked_ensemble_cv_stats'] = {'number_of_splits': number_of_splits_stacked_cv}

        extraction.data_statistics = data_stats

        session.add(extraction)
        session.commit()
Example #29
    def export_as_package(self, package_path, cv_source):
        """Exports the ensemble as a Python package and saves it to `package_path`.

        Args:
            package_path (str, unicode): Absolute/local path of place to save package in

            cv_source (str, unicode): String containing actual code for base learner
                cross-validation used to generate secondary meta-features.

        Raises:
            exceptions.UserError: If `package_path` already exists.
        """
        if os.path.exists(package_path):
            raise exceptions.UserError(
                '{} already exists'.format(package_path))

        package_name = os.path.basename(os.path.normpath(package_path))

        os.makedirs(package_path)

        # Write __init__.py
        with open(os.path.join(package_path, '__init__.py'), 'wb') as f:
            f.write('from {}.builder import xcessiv_ensemble'.format(
                package_name).encode('utf8'))

        # Create package baselearners with each base learner having its own module
        os.makedirs(os.path.join(package_path, 'baselearners'))
        open(os.path.join(package_path, 'baselearners', '__init__.py'),
             'a').close()
        for idx, base_learner in enumerate(self.base_learners):
            base_learner.export_as_file(
                os.path.join(package_path, 'baselearners',
                             'baselearner' + str(idx)))

        # Create metalearner.py containing secondary learner
        self.base_learner_origin.export_as_file(
            os.path.join(package_path, 'metalearner'),
            self.secondary_learner_hyperparameters)

        # Create cv.py containing CV method for getting meta-features
        with open(os.path.join(package_path, 'cv.py'), 'wb') as f:
            f.write(cv_source.encode('utf8'))

        # Create stacker.py containing class for Xcessiv ensemble
        ensemble_source = ''
        stacker_file_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), 'stacker.py')
        with open(stacker_file_loc) as f:
            ensemble_source += f.read()

        ensemble_source += '\n\n' \
                           '    def {}(self, X):\n' \
                           '        return self._process_using_' \
                           'meta_feature_generator(X, "{}")\n\n'\
            .format(self.base_learner_origin.meta_feature_generator,
                    self.base_learner_origin.meta_feature_generator)

        with open(os.path.join(package_path, 'stacker.py'), 'wb') as f:
            f.write(ensemble_source.encode('utf8'))

        # Create builder.py containing file where `xcessiv_ensemble` is instantiated for import
        builder_source = ''

        for idx, base_learner in enumerate(self.base_learners):
            builder_source += 'from {}.baselearners import baselearner{}\n'.format(
                package_name, idx)

        builder_source += 'from {}.cv import return_splits_iterable\n'.format(
            package_name)

        builder_source += 'from {} import metalearner\n'.format(package_name)

        builder_source += 'from {}.stacker import XcessivStackedEnsemble\n'.format(
            package_name)

        builder_source += '\nbase_learners = [\n'
        for idx, base_learner in enumerate(self.base_learners):
            builder_source += '    baselearner{}.base_learner,\n'.format(idx)
        builder_source += ']\n'

        builder_source += '\nmeta_feature_generators = [\n'
        for idx, base_learner in enumerate(self.base_learners):
            builder_source += '    baselearner{}.meta_feature_generator,\n'.format(
                idx)
        builder_source += ']\n'

        builder_source += '\nxcessiv_ensemble = XcessivStackedEnsemble(base_learners=base_learners,' \
                          ' meta_feature_generators=meta_feature_generators,' \
                          ' secondary_learner=metalearner.base_learner,' \
                          ' cv_function=return_splits_iterable,' \
                          ' append_original={})\n'.format(self.append_original)

        with open(os.path.join(package_path, 'builder.py'), 'wb') as f:
            f.write(builder_source.encode('utf8'))
Example #30
def extraction_data_statistics(path):
    """ Generates data statistics for the given data extraction setup stored
    in Xcessiv notebook.

    This lives in rqtasks.py but is not yet run as a job; for now it is called
    directly until the JavaScript front end is sorted out.

    Args:
        path (str, unicode): Path to xcessiv notebook
    """
    with functions.DBContextManager(path) as session:
        extraction = session.query(models.Extraction).first()
        X, y = extraction.return_main_dataset()
        functions.verify_dataset(X, y)

        if extraction.test_dataset['method'] == 'split_from_main':
            X, X_test, y, y_test = train_test_split(
                X,
                y,
                test_size=extraction.test_dataset['split_ratio'],
                random_state=extraction.test_dataset['split_seed'],
                stratify=y
            )
        elif extraction.test_dataset['method'] == 'source':
            if 'source' not in extraction.test_dataset or not extraction.test_dataset['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.test_dataset["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_test_dataset")
            X_test, y_test = extraction_function()
        else:
            X_test, y_test = None, None

        if extraction.meta_feature_generation['method'] == 'holdout_split':
            X, X_holdout, y, y_holdout = train_test_split(
                X,
                y,
                test_size=extraction.meta_feature_generation['split_ratio'],
                random_state=extraction.meta_feature_generation['seed'],
                stratify=y
            )
        elif extraction.meta_feature_generation['method'] == 'holdout_source':
            if 'source' not in extraction.meta_feature_generation or \
                    not extraction.meta_feature_generation['source']:
                raise exceptions.UserError('Source is empty')

            extraction_code = extraction.meta_feature_generation["source"]
            extraction_function = functions.import_object_from_string_code(
                extraction_code, "extract_holdout_dataset")
            X_holdout, y_holdout = extraction_function()
        else:
            X_holdout, y_holdout = None, None

        data_stats = dict()
        data_stats['train_data_stats'] = functions.verify_dataset(X, y)
        if X_test is not None:
            data_stats['test_data_stats'] = functions.verify_dataset(X_test, y_test)
        else:
            data_stats['test_data_stats'] = None
        if X_holdout is not None:
            data_stats['holdout_data_stats'] = functions.verify_dataset(X_holdout, y_holdout)
        else:
            data_stats['holdout_data_stats'] = None

        extraction.data_statistics = data_stats

        session.add(extraction)
        session.commit()