Example #1
0
def start_automated_run(id):
    """Create and enqueue an automated run configured by the source code in the request body."""
    project_path = functions.get_path_from_query_string(request)
    payload = request.get_json()

    with functions.DBContextManager(project_path) as session:
        origin = session.query(models.BaseLearnerOrigin).filter_by(id=id).first()

        if origin is None:
            raise exceptions.UserError('Base learner origin {} not found'.format(id), 404)
        if not origin.final:
            raise exceptions.UserError('Base learner origin {} is not final'.format(id))

        # Importing the submitted source verifies it is at least syntactically valid
        functions.import_string_code_as_module(payload['source'])

        automated_run = models.AutomatedRun(payload['source'],
                                            'queued',
                                            origin)
        session.add(automated_run)
        session.commit()

        # Hand the run off to a background worker
        with Connection(get_redis_connection()):
            rqtasks.start_automated_run.delay(project_path, automated_run.id)

        return jsonify(automated_run.serialize)
Example #2
0
    def test_source(self):
        """The leave-one-out CV source must compile and yield working splits."""
        cv_module = functions.import_string_code_as_module(cvsetting.leave_one_out['source'])
        assert hasattr(cv_module, 'return_splits_iterable')

        # Materialize the splits to prove the generator runs without raising
        list(cv_module.return_splits_iterable(self.X, self.y))

        del cv_module
    def test_source(self):
        """The F1-from-predictions metric source must reproduce known scores."""
        metric_module = functions.import_string_code_as_module(
            metricsetting.f1_score_from_preds['source'])

        # Binary and multiclass fixtures have pre-computed expected scores
        assert np.round(metric_module.metric_generator(binary_y, binary_preds), 2) == 0.96
        assert np.round(metric_module.metric_generator(multiclass_y, multiclass_preds), 2) == 0.95

        del metric_module
Example #4
0
    def test_source(self):
        """group_k_fold source compiles, but must reject data lacking group labels."""
        cv_module = functions.import_string_code_as_module(
            cvsetting.group_k_fold['source'])
        assert hasattr(cv_module, 'return_splits_iterable')

        # Consuming the splits without groups is expected to raise ValueError
        splits = cv_module.return_splits_iterable(self.X, self.y)
        self.assertRaises(ValueError, list, splits)

        del cv_module
Example #5
0
    def test_learner_settings(self):
        """Compile each learner preset's source and smoke-test its estimator."""
        # NOTE(review): this iterates self.transformer_settings although the
        # presets are looked up in `learnersetting` — looks like a copy-paste
        # from a transformer test; confirm the attribute holds learner preset names.
        for key in self.transformer_settings:
            setting = getattr(learnersetting, key)
            module = functions.import_string_code_as_module(setting['source'])

            # Every preset must expose a scikit-learn style estimator interface,
            # including its declared meta-feature generator method
            assert hasattr(module.base_learner, 'get_params')
            assert hasattr(module.base_learner, 'set_params')
            assert hasattr(module.base_learner, 'fit')
            assert hasattr(module.base_learner,
                           setting['meta_feature_generator'])

            # Fitting proves the estimator actually runs on the test data
            module.base_learner.fit(self.X, self.y)

            del module
Example #6
0
def get_automated_runs():
    """Return all automated runs"""
    path = functions.get_path_from_query_string(request)

    if request.method == 'GET':
        # List every stored automated run
        with functions.DBContextManager(path) as session:
            runs = session.query(models.AutomatedRun).all()
            return jsonify([run.serialize for run in runs])

    if request.method == 'POST':
        # Validate the request, create the run, and enqueue it on a worker
        req_body = request.get_json()
        category = req_body['category']

        with functions.DBContextManager(path) as session:
            base_learner_origin = None

            if category in ('bayes', 'greedy_ensemble_search'):
                # These categories search over an existing, finalized origin
                origin_id = req_body['base_learner_origin_id']
                base_learner_origin = session.query(models.BaseLearnerOrigin).\
                    filter_by(id=origin_id).first()
                if base_learner_origin is None:
                    raise exceptions.UserError(
                        'Base learner origin {} not found'.format(origin_id), 404)
                if not base_learner_origin.final:
                    raise exceptions.UserError(
                        'Base learner origin {} is not final'.format(origin_id))

            elif category == 'tpot':
                pass  # TPOT runs need no origin up front

            else:
                raise exceptions.UserError(
                    'Automated run category {} not recognized'.format(category))

            # Importing the source verifies it is at least syntactically valid
            functions.import_string_code_as_module(req_body['source'])

            automated_run = models.AutomatedRun(req_body['source'], 'queued',
                                                category,
                                                base_learner_origin)

            session.add(automated_run)
            session.commit()

            with Connection(get_redis_connection()):
                rqtasks.start_automated_run.delay(path, automated_run.id)

            return jsonify(automated_run.serialize)
Example #7
0
def start_tpot(automated_run, session, path):
    """Runs a TPOT automated run and stores the exported pipeline as a new
    base learner origin.

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    config = functions.import_string_code_as_module(automated_run.source)
    X, y = session.query(models.Extraction).first().return_train_dataset()

    learner = config.tpot_learner
    learner.fit(X, y)

    # TPOT only exports to a file, so round-trip the generated source through
    # a pid-suffixed temp file inside the project folder
    export_path = os.path.join(path,
                               'tpot-temp-export-{}'.format(os.getpid()))
    learner.export(export_path)

    with open(export_path) as f:
        base_learner_source = constants.tpot_learner_docstring + f.read()

    # Best-effort cleanup of the temp file
    try:
        os.remove(export_path)
    except OSError:
        pass

    blo = models.BaseLearnerOrigin(source=base_learner_source,
                                   name='TPOT Learner',
                                   meta_feature_generator='predict')

    automated_run.job_status = 'finished'

    session.add(blo)
    session.add(automated_run)
    session.commit()
Example #8
0
def start_automated_run(path, automated_run_id):
    """Starts automated run. This will automatically create
    base learners until the run finishes or errors out.

    The run's stored source is imported as a configuration module and drives a
    Bayesian optimization over the base learner origin's hyperparameters.

    Args:
        path (str): Path to Xcessiv notebook

        automated_run_id (str): Automated Run ID

    Raises:
        exceptions.UserError: If no automated run with the given id exists.
    """
    with functions.DBContextManager(path) as session:
        automated_run = session.query(models.AutomatedRun).filter_by(id=automated_run_id).first()
        if not automated_run:
            raise exceptions.UserError('Automated run {} '
                                       'does not exist'.format(automated_run_id))
        # Record which worker job owns this run so its status is trackable
        # (presumably an rq job, given the rqtasks callers — confirm)
        automated_run.job_id = get_current_job().id
        automated_run.job_status = 'started'

        session.add(automated_run)
        session.commit()

        try:
            # The source must define metric_to_optimize, default_params, pbounds,
            # invert_metric, integers and maximize_config; random_state is optional
            module = functions.import_string_code_as_module(automated_run.source)
            random_state = 8 if not hasattr(module, 'random_state') else module.random_state
            # NOTE(review): assert is stripped under -O; a UserError would be safer
            assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

            # get non-searchable parameters
            base_estimator = automated_run.base_learner_origin.return_estimator()
            base_estimator.set_params(**module.default_params)
            default_params = functions.make_serializable(base_estimator.get_params())
            non_searchable_params = dict((key, val) for key, val in iteritems(default_params)
                                         if key not in module.pbounds)

            # get already calculated base learners in search space
            existing_base_learners = []
            for base_learner in automated_run.base_learner_origin.base_learners:
                if not base_learner.job_status == 'finished':
                    continue
                # A learner is in-space only if every fixed (non-searched)
                # hyperparameter matches this run's configuration
                in_search_space = True
                for key, val in iteritems(non_searchable_params):
                    if base_learner.hyperparameters[key] != val:
                        in_search_space = False
                        break  # If no match, move on to the next base learner
                if in_search_space:
                    existing_base_learners.append(base_learner)

            # build initialize dictionary
            target = []
            initialization_dict = dict((key, list()) for key in module.pbounds.keys())
            for base_learner in existing_base_learners:
                # check if base learner's searchable hyperparameters are all numerical
                all_numerical = True
                for key in module.pbounds.keys():
                    if not isinstance(base_learner.hyperparameters[key], numbers.Number):
                        all_numerical = False
                        break
                if not all_numerical:
                    continue  # if there is a non-numerical hyperparameter, skip this.

                for key in module.pbounds.keys():
                    initialization_dict[key].append(base_learner.hyperparameters[key])
                target.append(base_learner.individual_score[module.metric_to_optimize])
            # Negate scores when the metric should be minimized — the optimizer
            # always maximizes its target
            initialization_dict['target'] = target if not module.invert_metric \
                else list(map(lambda x: -x, target))
            print('{} existing in initialization dictionary'.
                  format(len(initialization_dict['target'])))

            # Create function to be optimized
            func_to_optimize = return_func_to_optimize(
                path, session, automated_run.base_learner_origin, module.default_params,
                module.metric_to_optimize, module.invert_metric, set(module.integers)
            )

            # Create Bayes object
            bo = BayesianOptimization(func_to_optimize, module.pbounds)

            # Seed the optimizer with already-evaluated points so finished
            # work is not redone
            bo.initialize(initialization_dict)

            np.random.seed(random_state)

            bo.maximize(**module.maximize_config)

            automated_run.job_status = 'finished'
            session.add(automated_run)
            session.commit()

        except:
            # Deliberately bare: record any failure (including BaseException)
            # on the run before re-raising so the job system still sees it fail
            session.rollback()
            automated_run.job_status = 'errored'
            automated_run.description['error_type'] = repr(sys.exc_info()[0])
            automated_run.description['error_value'] = repr(sys.exc_info()[1])
            automated_run.description['error_traceback'] = \
                traceback.format_exception(*sys.exc_info())
            session.add(automated_run)
            session.commit()
            raise
Example #9
0
def start_naive_bayes(automated_run, session, path):
    """Starts naive bayes automated run

    Imports the run's stored source as a configuration module and drives a
    Bayesian optimization over the base learner origin's hyperparameters.

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    # The source must define metric_to_optimize, default_params, pbounds,
    # invert_metric, integers and maximize_config; random_state is optional
    module = functions.import_string_code_as_module(automated_run.source)
    random_state = 8 if not hasattr(module,
                                    'random_state') else module.random_state
    # NOTE(review): assert is stripped under -O; a UserError would be safer
    assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

    # get non-searchable parameters
    base_estimator = automated_run.base_learner_origin.return_estimator()
    base_estimator.set_params(**module.default_params)
    default_params = functions.make_serializable(base_estimator.get_params())
    non_searchable_params = dict((key, val)
                                 for key, val in iteritems(default_params)
                                 if key not in module.pbounds)

    # get already calculated base learners in search space
    existing_base_learners = []
    for base_learner in automated_run.base_learner_origin.base_learners:
        if not base_learner.job_status == 'finished':
            continue
        # A learner is in-space only if every fixed (non-searched)
        # hyperparameter matches this run's configuration
        in_search_space = True
        for key, val in iteritems(non_searchable_params):
            if base_learner.hyperparameters[key] != val:
                in_search_space = False
                break  # If no match, move on to the next base learner
        if in_search_space:
            existing_base_learners.append(base_learner)

    # build initialize dictionary
    target = []
    initialization_dict = dict((key, list()) for key in module.pbounds.keys())
    for base_learner in existing_base_learners:
        # check if base learner's searchable hyperparameters are all numerical
        all_numerical = True
        for key in module.pbounds.keys():
            if not isinstance(base_learner.hyperparameters[key],
                              numbers.Number):
                all_numerical = False
                break
        if not all_numerical:
            continue  # if there is a non-numerical hyperparameter, skip this.

        for key in module.pbounds.keys():
            initialization_dict[key].append(base_learner.hyperparameters[key])
        target.append(base_learner.individual_score[module.metric_to_optimize])
    # Negate scores when the metric should be minimized — the optimizer
    # always maximizes its target
    initialization_dict['target'] = target if not module.invert_metric \
        else list(map(lambda x: -x, target))
    print('{} existing in initialization dictionary'.format(
        len(initialization_dict['target'])))

    # Create function to be optimized
    func_to_optimize = return_func_to_optimize(
        path, session, automated_run.base_learner_origin,
        module.default_params, module.metric_to_optimize, module.invert_metric,
        set(module.integers))

    # Create Bayes object
    bo = BayesianOptimization(func_to_optimize, module.pbounds)

    # Seed the optimizer with already-evaluated points so finished work
    # is not redone
    bo.initialize(initialization_dict)

    np.random.seed(random_state)

    bo.maximize(**module.maximize_config)

    automated_run.job_status = 'finished'
    session.add(automated_run)
    session.commit()
    def test_source(self):
        """The explained-variance metric source must reproduce the known score."""
        metric_module = functions.import_string_code_as_module(
            metricsetting.explained_variance_score['source'])

        # Regression fixtures have a pre-computed expected score
        assert np.round(metric_module.metric_generator(regression_y, regression_preds), 2) == -0.89

        del metric_module
    def test_source(self):
        """The median-absolute-error metric source must reproduce the known score."""
        metric_module = functions.import_string_code_as_module(
            metricsetting.median_absolute_error['source'])

        # Regression fixtures have a pre-computed expected score
        assert np.round(metric_module.metric_generator(regression_y, regression_preds), 2) == 3.72

        del metric_module
    def test_source(self):
        """The MSE metric source must reproduce the known score."""
        metric_module = functions.import_string_code_as_module(
            metricsetting.mse['source'])

        # Regression fixtures have a pre-computed expected score
        assert np.round(metric_module.metric_generator(regression_y, regression_preds), 2) == 168.09

        del metric_module
Example #13
0
def start_greedy_ensemble_search(automated_run, session, path):
    """Starts an automated ensemble search using greedy forward model selection.

    The steps for this search are adapted from "Ensemble Selection from Libraries of Models" by
    Caruana.

    1. Start with the empty ensemble

    2. Add to the ensemble the model in the library that maximizes the ensemmble's
    performance on the error metric.

    3. Repeat step 2 for a fixed number of iterations or until all models have been used.

    Args:
        automated_run (xcessiv.models.AutomatedRun): Automated run object

        session: Valid SQLAlchemy session

        path (str, unicode): Path to project folder
    """
    module = functions.import_string_code_as_module(automated_run.source)
    assert module.metric_to_optimize in automated_run.base_learner_origin.metric_generators

    best_ensemble = []  # List containing IDs of best performing ensemble for the last round

    secondary_learner = automated_run.base_learner_origin.return_estimator()
    secondary_learner.set_params(**module.secondary_learner_hyperparameters)

    for i in range(module.max_num_base_learners):
        best_score = -float('inf')  # Best metric for this round (not in total!)
        current_ensemble = best_ensemble[:]  # Shallow copy of best ensemble
        for base_learner in session.query(models.BaseLearner).filter_by(job_status='finished').all():
            if base_learner in current_ensemble:  # Don't append when learner is already in
                continue
            current_ensemble.append(base_learner)

            # Check if our "best ensemble" already exists
            existing_ensemble = session.query(models.StackedEnsemble).\
                filter_by(base_learner_origin_id=automated_run.base_learner_origin.id,
                          secondary_learner_hyperparameters=secondary_learner.get_params(),
                          base_learner_ids=sorted([bl.id for bl in current_ensemble])).first()

            if existing_ensemble and existing_ensemble.job_status == 'finished':
                score = existing_ensemble.individual_score[module.metric_to_optimize]

            elif existing_ensemble and existing_ensemble.job_status != 'finished':
                eval_stacked_ensemble(existing_ensemble, session, path)
                score = existing_ensemble.individual_score[module.metric_to_optimize]

            else:
                stacked_ensemble = models.StackedEnsemble(
                    secondary_learner_hyperparameters=secondary_learner.get_params(),
                    base_learners=current_ensemble,
                    base_learner_origin=automated_run.base_learner_origin,
                    job_status='started'
                )
                session.add(stacked_ensemble)
                session.commit()
                eval_stacked_ensemble(stacked_ensemble, session, path)
                score = stacked_ensemble.individual_score[module.metric_to_optimize]

            score = -score if module.invert_metric else score

            if best_score < score:
                best_score = score
                best_ensemble = current_ensemble[:]

            current_ensemble.pop()