Beispiel #1
0
def test_run_per_partition(dataset):
    """Check that ``run_per_partition`` yields one datarun per hyperpartition.

    A 'logreg' run is entered for the given dataset with ``run_per_partition``
    enabled. The number of dataruns found in the database must match
    ``METHOD_HYPERPARTS['logreg']`` and each one must own a single
    hyperpartition.
    """
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())

    run_settings = {
        'dataset_id': dataset.id,
        'methods': ['logreg'],
        'run_per_partition': True
    }
    run_conf = RunConfig(run_settings)

    atm = ATM(sql_conf, None, None)
    created = atm.enter_data(None, run_conf)

    with db_session(db):
        # look up every returned datarun, keeping only the ones the database knows
        fetched = (db.get_datarun(entry.id) for entry in created)
        runs = [found for found in fetched if found is not None]

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all([len(found.hyperpartitions) == 1 for found in runs])
Beispiel #2
0
    def __init__(
        self,

        # SQL Conf
        dialect='sqlite',
        database='atm.db',
        username=None,
        password=None,
        host=None,
        port=None,
        query=None,

        # AWS Conf
        access_key=None,
        secret_key=None,
        s3_bucket=None,
        s3_folder=None,

        # Log Conf
        models_dir='models',
        metrics_dir='metrics',
        verbose_metrics=False,
    ):
        """Open the database connection and store the AWS and output settings.

        Args:
            dialect, database, username, password, host, port, query:
                Connection settings, forwarded in this order to ``Database``.
            access_key, secret_key:
                AWS credentials, stored as ``aws_access_key`` and
                ``aws_secret_key``.
            s3_bucket, s3_folder:
                S3 settings, stored unchanged on the instance.
            models_dir, metrics_dir, verbose_metrics:
                Output settings, stored unchanged on the instance.
        """
        # ``password`` must sit between ``username`` and ``host``: leaving it
        # out silently shifts host/port/query into the wrong positions.
        self.db = Database(dialect, database, username, password, host, port, query)
        self.aws_access_key = access_key
        self.aws_secret_key = secret_key
        self.s3_bucket = s3_bucket
        self.s3_folder = s3_folder

        self.models_dir = models_dir
        self.metrics_dir = metrics_dir
        self.verbose_metrics = verbose_metrics
Beispiel #3
0
def get_new_worker(**kwargs):
    """Create a ``Worker`` bound to a newly entered datarun.

    Keyword arguments are forwarded to ``RunConfig``; ``methods`` falls back
    to ``['logreg', 'dt']`` when the caller does not supply it.
    """
    kwargs.setdefault('methods', ['logreg', 'dt'])

    sql_conf = SQLConfig(database=DB_PATH)
    new_run_id = enter_data(sql_conf, RunConfig(**kwargs))

    db = Database(**vars(sql_conf))
    return Worker(db, db.get_datarun(new_run_id))
Beispiel #4
0
def db():
    """Return a fresh sqlite ``Database`` at ``DB_PATH`` loaded from the cache.

    Any existing database file is deleted first so every caller starts from
    the same state.
    """
    # A missing file (e.g. the very first run) is fine: there is simply
    # nothing to clean up, so do not let os.remove raise.
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)

    db = Database(dialect='sqlite', database=DB_PATH)
    # load cached ModelHub state. This database snapshot has one dataset
    # (pollution.csv) and two dataruns, one complete and one with 33/100
    # classifiers finished.
    db.from_csv(DB_CACHE_PATH)
    return db
Beispiel #5
0
def db():
    """Return a fresh sqlite ``Database`` at ``DB_PATH`` loaded from the cache.

    Any existing database file is deleted first so every caller starts from
    the same state.
    """
    # Only remove the file when it is actually there; on a clean checkout
    # there is no previous database and os.remove would raise.
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)

    db = Database(dialect='sqlite', database=DB_PATH)
    # load cached ModelHub state. This database snapshot has one dataset
    # (pollution_1.csv) and two dataruns, one complete and one with 33/100
    # classifiers finished.
    db.from_csv(DB_CACHE_PATH)
    return db
Beispiel #6
0
def get_new_worker(**kwargs):
    """Enter a new datarun and wrap it in a ``Worker``.

    ``kwargs`` become the ``RunConfig`` settings. When ``methods`` is not
    given, ``['logreg', 'dt']`` is used.
    """
    if 'methods' not in kwargs:
        kwargs['methods'] = ['logreg', 'dt']

    sql_conf = SQLConfig(database=DB_PATH)
    run_conf = RunConfig(**kwargs)
    entered_id = enter_data(sql_conf, run_conf)

    # connect with the very same settings that were used to enter the run
    connection = Database(**vars(sql_conf))
    return Worker(connection, connection.get_datarun(entered_id))
Beispiel #7
0
    def __init__(self, **kwargs):
        """Load the configuration, connect to the database and set up logging.

        ``kwargs`` are passed to ``load_config``. When ``log_config`` is
        missing or ``None``, the ``log-script.yaml`` template under
        ``PROJECT_ROOT`` is used instead.
        """
        if kwargs.get('log_config') is None:
            default_log_config = os.path.join(
                PROJECT_ROOT, 'config/templates/log-script.yaml')
            kwargs['log_config'] = default_log_config

        loaded = load_config(**kwargs)
        self.sql_conf, self.run_conf, self.aws_conf, self.log_conf = loaded

        self.db = Database(**vars(self.sql_conf))

        initialize_logging(self.log_conf)
def test_enter_data_all(dataset):
    """Enter one datarun covering every method and check its hyperpartitions.

    The datarun must belong to ``dataset`` and hold as many hyperpartitions
    as the sum of all ``METHOD_HYPERPARTS`` values.
    """
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))

    all_methods = METHOD_HYPERPARTS.keys()
    run_conf = RunConfig(dataset_id=dataset.id, methods=all_methods)
    run_id = enter_data(sql_conf, run_conf)

    expected_parts = sum(METHOD_HYPERPARTS.values())
    with db_session(db):
        stored = db.get_datarun(run_id)
        assert stored.dataset.id == dataset.id
        assert len(stored.hyperpartitions) == expected_parts
Beispiel #9
0
def test_enter_data_all(dataset):
    """Enter one datarun covering every method through ``ATM.enter_data``.

    The stored datarun must belong to ``dataset`` and hold as many
    hyperpartitions as the sum of all ``METHOD_HYPERPARTS`` values.
    """
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())

    run_settings = {'dataset_id': dataset.id, 'methods': METHOD_HYPERPARTS.keys()}
    run_conf = RunConfig(run_settings)

    atm = ATM(sql_conf, None, None)
    entered = atm.enter_data(None, run_conf)

    expected_parts = sum(METHOD_HYPERPARTS.values())
    with db_session(db):
        stored = db.get_datarun(entered.id)
        assert stored.dataset.id == dataset.id
        assert len(stored.hyperpartitions) == expected_parts
Beispiel #10
0
def get_db():
    """Return the database connection cached on ``g``, creating it if needed.

    On the first call a ``Database`` is built from the ``SQL_CONF`` entry of
    the current app config, checked with ``check_db_mappers``, given a
    session and stored as ``g.db``. Later calls return that same object.
    """
    if 'db' in g:
        return g.db

    conf = current_app.config['SQL_CONF']
    connection = Database(
        conf.dialect,
        conf.database,
        conf.username,
        conf.password,
        conf.host,
        conf.port,
        conf.query,
    )
    check_db_mappers(connection)
    connection.session = connection.get_session()
    g.db = connection
    return g.db
def test_enter_data_by_methods(dataset):
    """Enter one datarun per method and check each hyperpartition count.

    For every entry of ``METHOD_HYPERPARTS`` a datarun restricted to that
    method is entered. It must exist, belong to ``dataset`` and hold the
    expected number of hyperpartitions.
    """
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id)

    for method_name, expected_parts in METHOD_HYPERPARTS.items():
        # reuse the same config, switching only the method under test
        run_conf.methods = [method_name]
        new_id = enter_data(sql_conf, run_conf)

        assert db.get_datarun(new_id)
        with db_session(db):
            stored = db.get_datarun(new_id)
            assert stored.dataset.id == dataset.id
            assert len(stored.hyperpartitions) == expected_parts
Beispiel #12
0
def test_enter_data_by_methods(dataset):
    """Enter one datarun per method through ``ATM.enter_data`` and check it.

    For every entry of ``METHOD_HYPERPARTS`` a datarun restricted to that
    method is entered. It must belong to ``dataset`` and hold the expected
    number of hyperpartitions.
    """
    sql_conf = SQLConfig({'sql_database': DB_PATH})
    db = Database(**sql_conf.to_dict())
    run_conf = RunConfig({'dataset_id': dataset.id})

    atm = ATM(sql_conf, None, None)

    for method_name, expected_parts in METHOD_HYPERPARTS.items():
        # reuse the same config, switching only the method under test
        run_conf.methods = [method_name]
        entered = atm.enter_data(None, run_conf)

        with db_session(db):
            stored = db.get_datarun(entered.id)
            assert stored.dataset.id == dataset.id
            assert len(stored.hyperpartitions) == expected_parts
Beispiel #13
0
def get_new_worker(**kwargs):
    """Enter a datarun through ``ATM.enter_data`` and wrap it in a ``Worker``.

    ``kwargs`` feed both ``RunConfig`` and ``DatasetConfig``. ``dataset_id``
    defaults to ``None`` and ``methods`` to ``['logreg', 'dt']`` when absent.
    """
    kwargs.setdefault('dataset_id', None)
    kwargs.setdefault('methods', ['logreg', 'dt'])

    sql_conf = SQLConfig({'sql_database': DB_PATH})
    run_conf = RunConfig(kwargs)
    dataset_conf = DatasetConfig(kwargs)

    db = Database(**sql_conf.to_dict())
    atm = ATM(sql_conf, None, None)

    entered = atm.enter_data(dataset_conf, run_conf)
    return Worker(db, db.get_datarun(entered.id))
def test_run_per_partition(dataset):
    """Check that ``run_per_partition`` yields one datarun per hyperpartition.

    A 'logreg' run is entered with ``run_per_partition=True``. The number of
    dataruns found must match ``METHOD_HYPERPARTS['logreg']`` and each one
    must own a single hyperpartition.
    """
    sql_conf = SQLConfig(database=DB_PATH)
    db = Database(**vars(sql_conf))
    run_conf = RunConfig(dataset_id=dataset.id, methods=['logreg'])

    created_ids = enter_data(sql_conf, run_conf, run_per_partition=True)

    with db_session(db):
        # look up every returned ID, keeping only the dataruns that exist
        fetched = (db.get_datarun(created_id) for created_id in created_ids)
        runs = [found for found in fetched if found is not None]

        assert len(runs) == METHOD_HYPERPARTS['logreg']
        assert all([len(found.hyperpartitions) == 1 for found in runs])
Beispiel #15
0
def work(datarun_id, args=None):
    """
    Start and run a simple worker on a single datarun.

    A copy of the code in atm/scripts/worker.py

    Args:
        datarun_id: ID of the datarun to work on.
        args (list): command-line style arguments handed to the worker's
            argument parser. Defaults to an empty list.

    Raises:
        Exception: anything raised by ``atm_work`` is logged, the datarun is
            passed to ``mark_running_datarun_pending`` and the exception is
            re-raised.
    """
    _logger = logging.getLogger('atm_server.worker:work')
    _logger.setLevel(logging.DEBUG)

    # getLogger returns the same logger object on every call, so attach the
    # file handler only once; otherwise each call to work() would add another
    # handler, duplicating every log line and leaking an open file.
    log_file = os.path.abspath('logs/works.log')
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_file
        for handler in _logger.handlers
    )
    if not already_attached:
        fh = logging.FileHandler('logs/works.log')
        fmt = '%(asctime)-12s %(name)s - %(levelname)s  %(message)s'
        fh.setFormatter(logging.Formatter(fmt))
        _logger.addHandler(fh)

    parser = argparse.ArgumentParser(description='Add more classifiers to database')
    add_arguments_sql(parser)
    add_arguments_aws_s3(parser)
    add_arguments_logging(parser)

    # add worker-specific arguments
    parser.add_argument('--cloud-mode', action='store_true', default=False,
                        help='Whether to run this worker in cloud mode')
    parser.add_argument('--time', help='Number of seconds to run worker', type=int)
    parser.add_argument('--choose-randomly', action='store_true',
                        help='Choose dataruns to work on randomly (default = sequential order)')
    parser.add_argument('--no-save', dest='save_files', default=True,
                        action='store_const', const=False,
                        help="don't save models and metrics at all")

    # parse arguments and load configuration
    if args is None:
        args = []
    _args = parser.parse_args(args)

    # default logging config is different if initialized from the command line
    if _args.log_config is None:
        _args.log_config = os.path.join(atm.PROJECT_ROOT,
                                        'config/templates/log-script.yaml')

    sql_config, _, aws_config, log_config = load_config(**vars(_args))
    initialize_logging(log_config)

    # let's go
    _logger.warning('Worker started!')
    with datarun_config(datarun_id) as config:
        _logger.warning('Using configs from ' + config.config_path)
        db = Database(**vars(sql_config))
        try:
            atm_work(db=db,
                     datarun_ids=[datarun_id],
                     choose_randomly=_args.choose_randomly,
                     save_files=_args.save_files,
                     cloud_mode=_args.cloud_mode,
                     aws_config=aws_config,
                     log_config=log_config,
                     total_time=_args.time,
                     wait=False)
        except Exception as e:
            # log with traceback and hand the datarun back before propagating
            _logger.exception(e)
            mark_running_datarun_pending(db, datarun_id)
            raise
    _logger.warning('Worker exited.')
Beispiel #16
0
def get_new_worker(**kwargs):
    """Register the pollution dataset plus a datarun and return a ``Worker``.

    ``kwargs`` feed both ``RunConfig`` and ``DatasetConfig``. ``dataset_id``
    defaults to ``None`` and ``methods`` to ``['logreg', 'dt']`` when absent;
    ``train_path`` is always set to ``POLLUTION_PATH`` for the dataset config.
    """
    kwargs.setdefault('dataset_id', None)
    kwargs.setdefault('methods', ['logreg', 'dt'])
    run_conf = RunConfig(kwargs)

    # train_path is added only after the run config has been built
    kwargs['train_path'] = POLLUTION_PATH
    dataset_conf = DatasetConfig(kwargs)

    sqlite_settings = dict(dialect='sqlite', database=DB_PATH)
    db = Database(**sqlite_settings)
    atm = ATM(**sqlite_settings)

    new_dataset = atm.add_dataset(**dataset_conf.to_dict())
    run_conf.dataset_id = new_dataset.id
    new_datarun = atm.add_datarun(**run_conf.to_dict())

    return Worker(db, new_datarun)
Beispiel #17
0
def btb_test(tuner=None,
             selector=None,
             dataruns=None,
             datasets=None,
             processes=1,
             graph=False):
    """
    Run test dataruns with the chosen tuner and selector and compare them to
    the baseline performance.

    When no datarun IDs are given, one datarun is created per dataset
    (``DATASETS_MAX_FIRST`` unless ``datasets`` is passed). All dataruns are
    then worked on with ``processes`` processes.

    Returns a dict mapping each datarun ID to
    ``{'test': ..., 'baseline': ...}``, taken from the first two items
    returned by ``report_auc_vs_baseline``.
    """
    sql_conf, run_conf, _ = load_config(sql_path=SQL_CONFIG,
                                        run_path=RUN_CONFIG)

    # explicit arguments take precedence over the loaded run configuration
    if tuner is not None:
        run_conf.tuner = tuner
    if selector is not None:
        run_conf.selector = selector

    db = Database(**vars(sql_conf))
    datarun_ids = dataruns if dataruns else []
    datasets = datasets if datasets else DATASETS_MAX_FIRST

    # if necessary, generate datasets and dataruns
    if not datarun_ids:
        for dataset_name in datasets:
            run_conf.train_path = DATA_URL + dataset_name
            run_conf.dataset_id = None
            print('Creating datarun for', run_conf.train_path)
            new_id = enter_datarun(sql_conf, run_conf)
            datarun_ids.append(new_id)

    # work on the dataruns til they're done
    print('Working on %d dataruns' % len(datarun_ids))
    work_parallel(db=db, datarun_ids=datarun_ids, n_procs=processes)
    print('Finished!')

    # compute and maybe graph the results
    results = {}
    for datarun_id in datarun_ids:
        report = report_auc_vs_baseline(db, datarun_id, graph=graph)
        results[datarun_id] = dict(test=report[0], baseline=report[1])

    return results
Beispiel #18
0
def btb_test(dataruns=None, datasets=None, processes=1, graph=False, **kwargs):
    """
    Run test dataruns and compare them to the baseline performance.

    Tuner and selector are specified in **kwargs, along with the rest of the
    standard datarun arguments; all of them are handed to ``load_config``.

    When no datarun IDs are given, 10 dataruns are created per dataset
    (``DATASETS_MAX_FIRST`` unless ``datasets`` is passed). Given IDs are
    reported on one by one.

    Returns a dict mapping each tuple of datarun IDs to
    ``{'test': ..., 'baseline': ...}``, taken from the first two items
    returned by ``report_auc_vs_baseline``.
    """
    sql_conf, run_conf, _, _ = load_config(sql_path=SQL_CONFIG,
                                           run_path=RUN_CONFIG,
                                           **kwargs)

    db = Database(**vars(sql_conf))
    datarun_ids = dataruns if dataruns else []
    if dataruns:
        # every explicitly given datarun forms its own single-element group
        datarun_ids_per_dataset = [[given_id] for given_id in dataruns]
    else:
        datarun_ids_per_dataset = []
    datasets = datasets if datasets else DATASETS_MAX_FIRST

    # if necessary, generate datasets and dataruns
    if not datarun_ids:
        for dataset_name in datasets:
            run_conf.train_path = DATA_URL + dataset_name
            run_conf.dataset_id = None
            print('Creating 10 dataruns for', run_conf.train_path)
            new_ids = [enter_data(sql_conf, run_conf) for _ in range(10)]
            datarun_ids_per_dataset.append(new_ids)
            datarun_ids.extend(new_ids)

    # work on the dataruns til they're done
    print('Working on %d dataruns' % len(datarun_ids))
    work_parallel(db=db, datarun_ids=datarun_ids, n_procs=processes)
    print('Finished!')

    # compute and maybe graph the results for each dataset
    results = {}
    for group in datarun_ids_per_dataset:
        report = report_auc_vs_baseline(db, group, graph=graph)
        results[tuple(group)] = dict(test=report[0], baseline=report[1])

    return results
Beispiel #19
0
class ATM(object):
    """Register datasets and dataruns in a ``Database`` and work on them.

    The instance keeps the database connection together with the AWS and
    output settings that are handed to every ``Worker`` created in ``work``.
    """

    # seconds to sleep between database polls and after a ClassifierError
    LOOP_WAIT = 5

    def __init__(
        self,

        # SQL Conf
        dialect='sqlite',
        database='atm.db',
        username=None,
        password=None,
        host=None,
        port=None,
        query=None,

        # AWS Conf
        access_key=None,
        secret_key=None,
        s3_bucket=None,
        s3_folder=None,

        # Log Conf
        models_dir='models',
        metrics_dir='metrics',
        verbose_metrics=False,
    ):
        """Open the database connection and store the AWS and output settings.

        Args:
            dialect, database, username, password, host, port, query:
                Connection settings, forwarded in this order to ``Database``.
            access_key, secret_key:
                AWS credentials, stored as ``aws_access_key`` and
                ``aws_secret_key``.
            s3_bucket, s3_folder:
                S3 settings, passed to each ``Worker``.
            models_dir, metrics_dir, verbose_metrics:
                Output settings, passed to each ``Worker``.
        """
        # ``password`` must sit between ``username`` and ``host``: leaving it
        # out silently shifts host/port/query into the wrong positions.
        self.db = Database(dialect, database, username, password, host, port, query)
        self.aws_access_key = access_key
        self.aws_secret_key = secret_key
        self.s3_bucket = s3_bucket
        self.s3_folder = s3_folder

        self.models_dir = models_dir
        self.metrics_dir = metrics_dir
        self.verbose_metrics = verbose_metrics

    def add_dataset(self, train_path, test_path=None, name=None,
                    description=None, class_column=None):
        """Create a dataset in the database and return it.

        All arguments are forwarded to ``Database.create_dataset`` together
        with the AWS credentials stored on this instance.
        """
        return self.db.create_dataset(
            train_path=train_path,
            test_path=test_path,
            name=name,
            description=description,
            class_column=class_column,
            aws_access_key=self.aws_access_key,
            aws_secret_key=self.aws_secret_key,
        )

    def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                    gridding=0, k_window=3, metric='f1', methods=['logreg', 'dt', 'knn'],
                    r_minimum=2, run_per_partition=False, score_target='cv', priority=1,
                    selector='uniform', tuner='uniform', deadline=None):
        """Register one or more dataruns, with their hyperpartitions.

        The hyperpartitions of every method in ``methods`` are enumerated.
        If ``run_per_partition`` is true, one datarun is created for each
        hyperpartition; otherwise a single datarun holds all of them.

        If ``deadline`` is given it is parsed with ``TIME_FMT`` and
        ``budget_type`` is forced to ``'walltime'``. If only ``budget_type``
        is ``'walltime'``, the deadline is set ``budget`` minutes from now.

        Returns:
            The created datarun, or the list of created dataruns when
            ``run_per_partition`` is true.
        """
        dataruns = list()

        if deadline:
            deadline = datetime.strptime(deadline, TIME_FMT)
            budget_type = 'walltime'

        elif budget_type == 'walltime':
            deadline = datetime.now() + timedelta(minutes=budget)

        run_description = '___'.join([tuner, selector])
        target = score_target + '_judgment_metric'

        method_parts = {}
        for method in methods:
            # enumerate all combinations of categorical variables for this method
            method_instance = Method(method)
            method_parts[method] = method_instance.get_hyperpartitions()

            LOGGER.info('method {} has {} hyperpartitions'.format(
                method, len(method_parts[method])))

        # settings shared by every datarun created below
        datarun_kwargs = dict(
            dataset_id=dataset_id,
            description=run_description,
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=target,
            k_window=k_window,
            r_minimum=r_minimum
        )

        if not run_per_partition:
            datarun = self.db.create_datarun(**datarun_kwargs)
            dataruns.append(datarun)

        for method, parts in method_parts.items():
            for part in parts:
                # if necessary, create a new datarun for each hyperpartition.
                # This setting is useful for debugging.
                if run_per_partition:
                    datarun = self.db.create_datarun(**datarun_kwargs)
                    dataruns.append(datarun)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(datarun_id=datarun.id,
                                              method=method,
                                              tunables=part.tunables,
                                              constants=part.constants,
                                              categoricals=part.categoricals,
                                              status=PartitionStatus.INCOMPLETE)

        dataset = self.db.get_dataset(dataset_id)
        LOGGER.info('Dataruns created. Summary:')
        LOGGER.info('\tDataset ID: {}'.format(dataset.id))
        LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
        LOGGER.info('\tTest data: {}'.format(dataset.test_path))

        if run_per_partition:
            LOGGER.info('\tDatarun IDs: {}'.format(
                ', '.join(str(datarun.id) for datarun in dataruns)))

        else:
            LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

        LOGGER.info('\tHyperpartition selection strategy: {}'.format(dataruns[0].selector))
        LOGGER.info('\tParameter tuning strategy: {}'.format(dataruns[0].tuner))
        LOGGER.info('\tBudget: {} ({})'.format(dataruns[0].budget, dataruns[0].budget_type))

        return dataruns if run_per_partition else dataruns[0]

    def work(self, datarun_ids=None, save_files=True, choose_randomly=True,
             cloud_mode=False, total_time=None, wait=True, verbose=False):
        """
        Check the ModelHub database for unfinished dataruns, and spawn workers to
        work on them as they are added. This process will continue to run until it
        exceeds total_time or is broken with ctrl-C.

        datarun_ids (optional): list of IDs of dataruns to compute on. If None,
            this will work on all unfinished dataruns in the database.
        save_files: passed to each Worker.
        choose_randomly: if True, work on all highest-priority dataruns in random
            order. If False, work on them in sequential order (by ID)
        cloud_mode: if True, save processed datasets to AWS.
        total_time (optional): if set to an integer, this worker will only work for
            total_time seconds. Otherwise, it will continue working until all
            dataruns are complete (or indefinitely).
        wait: if True, once all dataruns in the database are complete, keep spinning
            and wait for new runs to be added. If False, exit once all dataruns are
            complete.
        verbose: if True, show a progress bar while a datarun is computed.
        """
        start_time = datetime.now()

        # main loop
        while True:
            # get all pending and running dataruns, or all pending/running dataruns
            # from the list we were given
            dataruns = self.db.get_dataruns(include_ids=datarun_ids, ignore_complete=True)
            if not dataruns:
                if wait:
                    LOGGER.debug('No dataruns found. Sleeping %d seconds and trying again.',
                                 ATM.LOOP_WAIT)
                    time.sleep(ATM.LOOP_WAIT)
                    continue

                else:
                    LOGGER.info('No dataruns found. Exiting.')
                    break

            # either choose a run randomly between priority, or take the run with the lowest ID
            if choose_randomly:
                run = random.choice(dataruns)
            else:
                run = sorted(dataruns, key=attrgetter('id'))[0]

            # say we've started working on this datarun, if we haven't already
            self.db.mark_datarun_running(run.id)

            LOGGER.info('Computing on datarun %d' % run.id)
            # actual work happens here
            worker = Worker(self.db, run, save_files=save_files,
                            cloud_mode=cloud_mode, aws_access_key=self.aws_access_key,
                            aws_secret_key=self.aws_secret_key, s3_bucket=self.s3_bucket,
                            s3_folder=self.s3_folder, models_dir=self.models_dir,
                            metrics_dir=self.metrics_dir, verbose_metrics=self.verbose_metrics)
            try:
                # the two budget types differ only in how the bar is set up
                if run.budget_type == 'classifier':
                    pbar = tqdm(
                        total=run.budget,
                        ascii=True,
                        initial=run.completed_classifiers,
                        disable=not verbose
                    )

                elif run.budget_type == 'walltime':
                    pbar = tqdm(
                        disable=not verbose,
                        ascii=True,
                        initial=run.completed_classifiers,
                        unit=' Classifiers'
                    )

                else:
                    # any other budget type: nothing is computed for this run
                    pbar = None

                if pbar is not None:
                    try:
                        while run.status != RunStatus.COMPLETE:
                            worker.run_classifier()
                            run = self.db.get_datarun(run.id)  # Refresh the datarun object.
                            if verbose and run.completed_classifiers > pbar.last_print_n:
                                pbar.update(run.completed_classifiers - pbar.last_print_n)

                    finally:
                        # close the bar even when run_classifier raises
                        pbar.close()

            except ClassifierError:
                # the exception has already been handled; just wait a sec so we
                # don't go out of control reporting errors
                LOGGER.error('Something went wrong. Sleeping %d seconds.', ATM.LOOP_WAIT)
                time.sleep(ATM.LOOP_WAIT)

            elapsed_time = (datetime.now() - start_time).total_seconds()
            if total_time is not None and elapsed_time >= total_time:
                LOGGER.info('Total run time for worker exceeded; exiting.')
                break

    def run(self, train_path, test_path=None, name=None, description=None,
            class_column='class', budget=100, budget_type='classifier', gridding=0, k_window=3,
            metric='f1', methods=['logreg', 'dt', 'knn'], r_minimum=2, run_per_partition=False,
            score_target='cv', selector='uniform', tuner='uniform', deadline=None, priority=1,
            save_files=True, choose_randomly=True, cloud_mode=False, total_time=None,
            wait=True, verbose=True):
        """Add a dataset, create its datarun(s) and work on them.

        Combines ``add_dataset``, ``add_datarun`` and ``work``. ``work`` is
        always called with ``wait=False``; the ``wait`` argument of this
        method is not used.

        Returns:
            The list of dataruns when ``run_per_partition`` is true. Otherwise
            the single datarun, or ``None`` if the final query does not
            return exactly one datarun.
        """
        dataset = self.add_dataset(train_path, test_path, name, description, class_column)
        datarun = self.add_datarun(
            dataset.id,
            budget,
            budget_type,
            gridding,
            k_window,
            metric,
            methods,
            r_minimum,
            run_per_partition,
            score_target,
            priority,
            selector,
            tuner,
            deadline
        )

        if run_per_partition:
            datarun_ids = [_datarun.id for _datarun in datarun]

        else:
            datarun_ids = [datarun.id]

        if verbose:
            print('Processing dataset {}'.format(train_path))

        self.work(
            datarun_ids,
            save_files,
            choose_randomly,
            cloud_mode,
            total_time,
            False,
            verbose=verbose
        )

        # fetch the dataruns again so the returned objects reflect the work done
        dataruns = self.db.get_dataruns(
            include_ids=datarun_ids,
            ignore_complete=False,
            ignore_pending=True
        )

        if run_per_partition:
            return dataruns

        elif len(dataruns) == 1:
            return dataruns[0]

    def load_model(self, classifier_id):
        """Return ``load_model()`` of the classifier with the given ID."""
        return self.db.get_classifier(classifier_id).load_model()
Beispiel #20
0
class ATM(object):

    _LOOP_WAIT = 5

    def __init__(
        self,

        # SQL Conf
        dialect='sqlite',
        database='atm.db',
        username=None,
        password=None,
        host=None,
        port=None,
        query=None,

        # AWS Conf
        access_key=None,
        secret_key=None,
        s3_bucket=None,
        s3_folder=None,

        # Log Conf
        models_dir='models',
        metrics_dir='metrics',
        verbose_metrics=False,
    ):
        """Open the database connection and store the AWS and output settings.

        Args:
            dialect, database, username, password, host, port, query:
                Connection settings, forwarded in this order to ``Database``.
            access_key, secret_key:
                AWS credentials, stored as ``aws_access_key`` and
                ``aws_secret_key``.
            s3_bucket, s3_folder:
                S3 settings, stored unchanged on the instance.
            models_dir, metrics_dir, verbose_metrics:
                Output settings, stored unchanged on the instance.
        """
        # ``password`` must sit between ``username`` and ``host``: leaving it
        # out silently shifts host/port/query into the wrong positions.
        self.db = Database(dialect, database, username, password, host, port, query)
        self.aws_access_key = access_key
        self.aws_secret_key = secret_key
        self.s3_bucket = s3_bucket
        self.s3_folder = s3_folder

        self.models_dir = models_dir
        self.metrics_dir = metrics_dir
        self.verbose_metrics = verbose_metrics

    def add_dataset(self, train_path, test_path=None, name=None,
                    description=None, class_column=None):
        """Add a new dataset to the Database.

        Args:
            train_path (str):
                Path to the training CSV file. It can be a local filesystem path,
                absolute or relative, or an HTTP or HTTPS URL, or an S3 path in the
                format ``s3://{bucket_name}/{key}``. Required.
            test_path (str):
                Path to the testing CSV file. It can be a local filesystem path,
                absolute or relative, or an HTTP or HTTPS URL, or an S3 path in the
                format ``s3://{bucket_name}/{key}``.
                Optional. If not given, the training CSV will be split in two parts,
                train and test.
            name (str):
                Name given to this dataset. Optional. If not given, a hash will be
                generated from the training_path and used as the Dataset name.
            description (str):
                Human friendly description of the Dataset. Optional.
            class_column (str):
                Name of the column that will be used as the target variable.
                Optional. Defaults to ``'class'``.

        Returns:
            Dataset:
                The created dataset.
        """

        return self.db.create_dataset(
            train_path=train_path,
            test_path=test_path,
            name=name,
            description=description,
            class_column=class_column,
            aws_access_key=self.aws_access_key,
            aws_secret_key=self.aws_secret_key,
        )

    def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                    gridding=0, k_window=3, metric='f1', methods=['logreg', 'dt', 'knn'],
                    r_minimum=2, run_per_partition=False, score_target='cv', priority=1,
                    selector='uniform', tuner='uniform', deadline=None):
        """Register one or more Dataruns to the Database.

        The hyperpartitions of every method are enumerated first. With
        ``run_per_partition`` set to ``True`` each Hyperpartition gets its own
        Datarun; otherwise one Datarun is shared by all of them.

        Args:
            dataset_id (int):
                Id of the Dataset which this Datarun will belong to.
            budget (int):
                Budget amount. Optional. Defaults to ``100``.
            budget_type (str):
                Budget Type. Can be 'classifier' or 'walltime'.
                Optional. Defaults to ``'classifier'``.
            gridding (int):
                ``gridding`` setting for the Tuner. Optional. Defaults to ``0``.
            k_window (int):
                ``k`` setting for the Selector. Optional. Defaults to ``3``.
            metric (str):
                Metric to use for the tuning and selection. Optional. Defaults to ``'f1'``.
            methods (list):
                List of methods to try. Optional. Defaults to ``['logreg', 'dt', 'knn']``.
            r_minimum (int):
                ``r_minimum`` setting for the Tuner. Optional. Defaults to ``2``.
            run_per_partition (bool):
                whether to create a separated Datarun for each Hyperpartition or not.
                Optional. Defaults to ``False``.
            score_target (str):
                Which score to use for the tuning and selection process. It can be ``'cv'`` or
                ``'test'``. Optional. Defaults to ``'cv'``.
            priority (int):
                Priority of this Datarun. The higher the better. Optional. Defaults to ``1``.
            selector (str):
                Type of selector to use. Optional. Defaults to ``'uniform'``.
            tuner (str):
                Type of tuner to use. Optional. Defaults to ``'uniform'``.
            deadline (str):
                Time deadline. It must be a string representing a datetime in the format
                ``'%Y-%m-%d %H:%M'``. If given, ``budget_type`` will be set to ``'walltime'``.

        Returns:
            Datarun:
                The created Datarun or list of Dataruns.
        """
        if deadline:
            # an explicit deadline always implies a walltime budget
            deadline = datetime.strptime(deadline, TIME_FMT)
            budget_type = 'walltime'

        elif budget_type == 'walltime':
            # no deadline given: it lies ``budget`` minutes from now
            deadline = datetime.now() + timedelta(minutes=budget)

        # enumerate all combinations of categorical variables for each method
        method_parts = {}
        for method_name in methods:
            hyperpartitions = Method(method_name).get_hyperpartitions()
            method_parts[method_name] = hyperpartitions

            LOGGER.info('method {} has {} hyperpartitions'.format(
                method_name, len(hyperpartitions)))

        # settings shared by every Datarun created below
        datarun_settings = dict(
            dataset_id=dataset_id,
            description='___'.join((tuner, selector)),
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=score_target + '_judgment_metric',
            k_window=k_window,
            r_minimum=r_minimum
        )

        dataruns = []
        if not run_per_partition:
            # a single Datarun owns every hyperpartition
            datarun = self.db.create_datarun(**datarun_settings)
            dataruns.append(datarun)

        for method_name, parts in method_parts.items():
            for part in parts:
                if run_per_partition:
                    # one dedicated Datarun per hyperpartition.
                    # This setting is useful for debugging.
                    datarun = self.db.create_datarun(**datarun_settings)
                    dataruns.append(datarun)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(
                    datarun_id=datarun.id,
                    method=method_name,
                    tunables=part.tunables,
                    constants=part.constants,
                    categoricals=part.categoricals,
                    status=PartitionStatus.INCOMPLETE
                )

        dataset = self.db.get_dataset(dataset_id)
        LOGGER.info('Dataruns created. Summary:')
        LOGGER.info('\tDataset ID: {}'.format(dataset.id))
        LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
        LOGGER.info('\tTest data: {}'.format(dataset.test_path))

        if run_per_partition:
            created_ids = ', '.join(str(created.id) for created in dataruns)
            LOGGER.info('\tDatarun IDs: {}'.format(created_ids))
        else:
            LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

        # all Dataruns share the same settings, so the first one is reported
        first = dataruns[0]
        LOGGER.info('\tHyperpartition selection strategy: {}'.format(first.selector))
        LOGGER.info('\tParameter tuning strategy: {}'.format(first.tuner))
        LOGGER.info('\tBudget: {} ({})'.format(first.budget, first.budget_type))

        if run_per_partition:
            return dataruns

        return first

    def work(self, datarun_ids=None, save_files=True, choose_randomly=True,
             cloud_mode=False, total_time=None, wait=True, verbose=False):
        """Get unfinished Dataruns from the database and work on them.

        Check the ModelHub Database for unfinished Dataruns, and work on them
        as they are added. This process will continue to run until it exceeds
        total_time or there are no more Dataruns to process or it is killed.

        Args:
            datarun_ids (list):
                list of IDs of Dataruns to work on. If ``None``, this will work on any
                unfinished Dataruns found in the database. Optional. Defaults to ``None``.
            save_files (bool):
                Whether to save the fitted classifiers and their metrics or not.
                Optional. Defaults to True.
            choose_randomly (bool):
                If ``True``, work on all the highest-priority dataruns in random order.
                Otherwise, work on them in sequential order (by ID).
                Optional. Defaults to ``True``.
            cloud_mode (bool):
                Save the models and metrics in AWS S3 instead of locally. This option
                works only if S3 configuration has been provided on initialization.
                Optional. Defaults to ``False``.
            total_time (int):
                Total time to run the work process, in seconds. If ``None``, continue to
                run until interrupted or there are no more Dataruns to process.
                Optional. Defaults to ``None``.
            wait (bool):
                If ``True``, wait for more Dataruns to be inserted into the Database
                once all have been processed. Otherwise, exit the worker loop
                when they run out.
                Optional. Defaults to ``True``.
            verbose (bool):
                Whether to show a progress bar while classifiers are computed.
                Optional. Defaults to ``False``.
        """
        start_time = datetime.now()

        # main loop
        while True:
            # get all pending and running dataruns, or all pending/running dataruns
            # from the list we were given
            dataruns = self.db.get_dataruns(include_ids=datarun_ids, ignore_complete=True)
            if not dataruns:
                if wait:
                    LOGGER.debug('No dataruns found. Sleeping %d seconds and trying again.',
                                 self._LOOP_WAIT)
                    time.sleep(self._LOOP_WAIT)
                    continue

                LOGGER.info('No dataruns found. Exiting.')
                break

            # either choose a run randomly, or take the run with the lowest ID
            if choose_randomly:
                run = random.choice(dataruns)
            else:
                run = min(dataruns, key=attrgetter('id'))

            # say we've started working on this datarun, if we haven't already
            self.db.mark_datarun_running(run.id)

            LOGGER.info('Computing on datarun %d', run.id)
            # actual work happens here
            worker = Worker(self.db, run, save_files=save_files,
                            cloud_mode=cloud_mode, aws_access_key=self.aws_access_key,
                            aws_secret_key=self.aws_secret_key, s3_bucket=self.s3_bucket,
                            s3_folder=self.s3_folder, models_dir=self.models_dir,
                            metrics_dir=self.metrics_dir, verbose_metrics=self.verbose_metrics)

            try:
                # Both budget types run the same loop; only the progress bar
                # setup differs. Any other budget type is left untouched.
                if run.budget_type == 'classifier':
                    pbar_options = {'total': run.budget}
                elif run.budget_type == 'walltime':
                    pbar_options = {'unit': ' Classifiers'}
                else:
                    pbar_options = None

                if pbar_options is not None:
                    # The context manager closes the bar even when a
                    # ClassifierError interrupts the loop.
                    with tqdm(ascii=True, initial=run.completed_classifiers,
                              disable=not verbose, **pbar_options) as pbar:
                        while run.status != RunStatus.COMPLETE:
                            worker.run_classifier()
                            run = self.db.get_datarun(run.id)  # Refresh the datarun object.
                            if verbose and run.completed_classifiers > pbar.last_print_n:
                                pbar.update(run.completed_classifiers - pbar.last_print_n)

            except ClassifierError:
                # the exception has already been handled; just wait a sec so we
                # don't go out of control reporting errors
                LOGGER.error('Something went wrong. Sleeping %d seconds.', self._LOOP_WAIT)
                time.sleep(self._LOOP_WAIT)

            elapsed_time = (datetime.now() - start_time).total_seconds()
            if total_time is not None and elapsed_time >= total_time:
                LOGGER.info('Total run time for worker exceeded; exiting.')
                break

    def run(self, train_path, test_path=None, name=None, description=None,
            class_column='class', budget=100, budget_type='classifier', gridding=0, k_window=3,
            metric='f1', methods=['logreg', 'dt', 'knn'], r_minimum=2, run_per_partition=False,
            score_target='cv', selector='uniform', tuner='uniform', deadline=None, priority=1,
            save_files=True, choose_randomly=True, cloud_mode=False, total_time=None,
            verbose=True):
        """Register a Dataset, create its Datarun(s) and work on them until done.

        Args:
            train_path (str):
                Path to the training CSV file. It can be a local filesystem path,
                absolute or relative, or an HTTP or HTTPS URL, or an S3 path in the
                format ``s3://{bucket_name}/{key}``. Required.
            test_path (str):
                Path to the testing CSV file, in any of the formats accepted for
                ``train_path``. Optional. If not given, the training CSV will be
                split in two parts, train and test.
            name (str):
                Name given to this dataset. Optional. If not given, a hash will be
                generated from the training_path and used as the Dataset name.
            description (str):
                Human friendly description of the Dataset. Optional.
            class_column (str):
                Name of the column that will be used as the target variable.
                Optional. Defaults to ``'class'``.
            budget (int):
                Budget amount. Optional. Defaults to ``100``.
            budget_type (str):
                Budget Type. Can be 'classifier' or 'walltime'.
                Optional. Defaults to ``'classifier'``.
            gridding (int):
                ``gridding`` setting for the Tuner. Optional. Defaults to ``0``.
            k_window (int):
                ``k`` setting for the Selector. Optional. Defaults to ``3``.
            metric (str):
                Metric to use for the tuning and selection. Optional. Defaults to ``'f1'``.
            methods (list):
                List of methods to try. Optional. Defaults to ``['logreg', 'dt', 'knn']``.
            r_minimum (int):
                ``r_minimum`` setting for the Tuner. Optional. Defaults to ``2``.
            run_per_partition (bool):
                Whether to create a separate Datarun for each Hyperpartition or not.
                Optional. Defaults to ``False``.
            score_target (str):
                Which score to use for the tuning and selection process. It can be ``'cv'`` or
                ``'test'``. Optional. Defaults to ``'cv'``.
            selector (str):
                Type of selector to use. Optional. Defaults to ``'uniform'``.
            tuner (str):
                Type of tuner to use. Optional. Defaults to ``'uniform'``.
            deadline (str):
                Time deadline. It must be a string representing a datetime in the format
                ``'%Y-%m-%d %H:%M'``. If given, ``budget_type`` will be set to ``'walltime'``.
            priority (int):
                Priority of this Datarun. The higher the better. Optional. Defaults to ``1``.
            save_files (bool):
                Forwarded to ``work``. Optional. Defaults to ``True``.
            choose_randomly (bool):
                Forwarded to ``work``. Optional. Defaults to ``True``.
            cloud_mode (bool):
                Forwarded to ``work``. Optional. Defaults to ``False``.
            total_time (int):
                Forwarded to ``work``. Optional. Defaults to ``None``.
            verbose (bool):
                Whether to be verbose about the process. Optional. Defaults to ``True``.

        Returns:
            Datarun:
                The list of Dataruns read back from the database when
                ``run_per_partition`` is set. Otherwise the single Datarun, or
                ``None`` if the database did not return exactly one.
        """
        dataset = self.add_dataset(train_path, test_path, name, description, class_column)
        created = self.add_datarun(
            dataset.id, budget, budget_type, gridding, k_window, metric, methods,
            r_minimum, run_per_partition, score_target, priority, selector, tuner,
            deadline
        )

        # With run_per_partition the creation step hands back several dataruns.
        if run_per_partition:
            datarun_ids = [item.id for item in created]
        else:
            datarun_ids = [created.id]

        if verbose:
            print('Processing dataset {}'.format(train_path))

        # wait=False: stop as soon as the dataruns created above are finished.
        self.work(
            datarun_ids=datarun_ids,
            save_files=save_files,
            choose_randomly=choose_randomly,
            cloud_mode=cloud_mode,
            total_time=total_time,
            wait=False,
            verbose=verbose
        )

        # Read the dataruns back so the caller gets their state after the work.
        finished = self.db.get_dataruns(
            include_ids=datarun_ids,
            ignore_complete=False,
            ignore_pending=True
        )

        if run_per_partition:
            return finished

        if len(finished) == 1:
            return finished[0]

        return None

    def load_model(self, classifier_id):
        """Fetch a classifier from the Database and load its Model.

        Args:
            classifier_id (int):
                Id of the classifier whose Model will be loaded.

        Returns:
            Model:
                Whatever the classifier's ``load_model`` method returns.
        """
        classifier = self.db.get_classifier(classifier_id)
        return classifier.load_model()
Beispiel #21
0
    def start(self):
        """Build a Database from the stored SQL config and pass it to ``work``."""
        connection_kwargs = vars(self._sql_config)

        work(Database(**connection_kwargs), log_config=self._log_config, save_files=True)
Beispiel #22
0
 def __init__(self, sql_conf, aws_conf, log_conf):
     """Create the Database from ``sql_conf`` and keep the AWS and log configs."""
     db_kwargs = sql_conf.to_dict()
     self.db = Database(**db_kwargs)
     self.aws_conf, self.log_conf = aws_conf, log_conf
Beispiel #23
0
class ATM(object):
    """
    This class is the code API that allows you to use ATM from your Python code.
    """

    # seconds to sleep between polls of the database and after a classifier error
    LOOP_WAIT = 1

    def __init__(self, **kwargs):
        """
        Load the configuration, connect to the database and initialize logging.

        kwargs: keyword arguments forwarded to load_config. If 'log_config' is
            missing or None, the 'config/templates/log-script.yaml' template under
            PROJECT_ROOT is used instead.
        """
        if kwargs.get('log_config') is None:
            kwargs['log_config'] = os.path.join(
                PROJECT_ROOT, 'config/templates/log-script.yaml')

        self.sql_conf, self.run_conf, self.aws_conf, self.log_conf = load_config(
            **kwargs)

        self.db = Database(**vars(self.sql_conf))

        initialize_logging(self.log_conf)

    def work(self,
             datarun_ids=None,
             save_files=False,
             choose_randomly=True,
             cloud_mode=False,
             total_time=None,
             wait=True):
        """
        Check the ModelHub database for unfinished dataruns, and spawn workers to
        work on them as they are added. This process will continue to run until it
        exceeds total_time or is broken with ctrl-C.

        datarun_ids (optional): list of IDs of dataruns to compute on. If None,
            this will work on all unfinished dataruns in the database.
        save_files: forwarded to the Worker as its save_files option.
        choose_randomly: if True, work on all highest-priority dataruns in random
            order. If False, work on them in sequential order (by ID)
        cloud_mode: if True, save processed datasets to AWS. If this option is set,
            aws_config must be supplied.
        total_time (optional): if set to an integer, this worker will only work for
            total_time seconds. Otherwise, it will continue working until all
            dataruns are complete (or indefinitely).
        wait: if True, once all dataruns in the database are complete, keep spinning
            and wait for new runs to be added. If False, exit once all dataruns are
            complete.
        """
        start_time = datetime.now()
        public_ip = get_public_ip()

        # main loop
        while True:
            # get all pending and running dataruns, or all pending/running dataruns
            # from the list we were given
            dataruns = self.db.get_dataruns(include_ids=datarun_ids,
                                            ignore_complete=True)
            if not dataruns:
                if wait:
                    logger.warning(
                        'No dataruns found. Sleeping %d seconds and trying again.',
                        ATM.LOOP_WAIT)
                    time.sleep(ATM.LOOP_WAIT)
                    continue

                logger.warning('No dataruns found. Exiting.')
                break

            # only the dataruns that share the highest priority are candidates
            max_priority = max(datarun.priority for datarun in dataruns)
            priority_runs = [r for r in dataruns if r.priority == max_priority]

            # either choose a run randomly, or take the run with the lowest ID;
            # in both cases pick among the highest-priority runs only
            if choose_randomly:
                run = random.choice(priority_runs)
            else:
                run = min(priority_runs, key=attrgetter('id'))

            # say we've started working on this datarun, if we haven't already
            self.db.mark_datarun_running(run.id)

            logger.info('Computing on datarun %d', run.id)
            # actual work happens here
            worker = Worker(self.db,
                            run,
                            save_files=save_files,
                            cloud_mode=cloud_mode,
                            aws_config=self.aws_conf,
                            log_config=self.log_conf,
                            public_ip=public_ip)
            try:
                worker.run_classifier()

            except ClassifierError:
                # the exception has already been handled; just wait a sec so we
                # don't go out of control reporting errors
                logger.warning('Something went wrong. Sleeping %d seconds.',
                               ATM.LOOP_WAIT)
                time.sleep(ATM.LOOP_WAIT)

            elapsed_time = (datetime.now() - start_time).total_seconds()
            if total_time is not None and elapsed_time >= total_time:
                logger.warning('Total run time for worker exceeded; exiting.')
                break

    def create_dataset(self):
        """
        Create a dataset and add it to the ModelHub database.

        Returns: the dataset object returned by the database.
        """
        # download data to the local filesystem to extract metadata
        train_local, test_local = download_data(self.run_conf.train_path,
                                                self.run_conf.test_path,
                                                self.aws_conf)

        # create the name of the dataset from the path to the data
        name = os.path.basename(train_local)
        name = name.replace("_train.csv", "").replace(".csv", "")

        # extract the metadata of the dataset from the local copies
        meta = MetaData(self.run_conf.class_column, train_local, test_local)

        # enter dataset into database
        dataset = self.db.create_dataset(
            name=name,
            description=self.run_conf.data_description,
            train_path=self.run_conf.train_path,
            test_path=self.run_conf.test_path,
            class_column=self.run_conf.class_column,
            n_examples=meta.n_examples,
            k_classes=meta.k_classes,
            d_features=meta.d_features,
            majority=meta.majority,
            size_kb=old_div(meta.size, 1000))
        return dataset

    def create_datarun(self, dataset):
        """
        Create a datarun for the given dataset from the run configuration and
        enter it into the database.

        dataset: Dataset SQLAlchemy ORM object

        Returns: the created datarun object.
        """
        # describe the datarun by its tuner and selector
        run_description = '__'.join(
            [self.run_conf.tuner, self.run_conf.selector])

        # set the deadline, if applicable
        deadline = self.run_conf.deadline
        if deadline:
            deadline = datetime.strptime(deadline, TIME_FMT)
            # this overrides the otherwise configured budget_type
            # TODO: why not walltime and classifiers budget simultaneously?
            self.run_conf.budget_type = 'walltime'
        elif self.run_conf.budget_type == 'walltime':
            # a walltime budget is expressed in minutes from now
            deadline = datetime.now() + timedelta(minutes=self.run_conf.budget)

        target = self.run_conf.score_target + '_judgment_metric'
        datarun = self.db.create_datarun(dataset_id=dataset.id,
                                         description=run_description,
                                         tuner=self.run_conf.tuner,
                                         selector=self.run_conf.selector,
                                         gridding=self.run_conf.gridding,
                                         priority=self.run_conf.priority,
                                         budget_type=self.run_conf.budget_type,
                                         budget=self.run_conf.budget,
                                         deadline=deadline,
                                         metric=self.run_conf.metric,
                                         score_target=target,
                                         k_window=self.run_conf.k_window,
                                         r_minimum=self.run_conf.r_minimum)
        return datarun

    def enter_data(self, run_per_partition=False):
        """
        Generate a datarun, including a dataset if necessary.

        run_per_partition: if True, create one datarun per hyperpartition
            instead of a single datarun holding all of them.

        Returns: the list of created datarun IDs when run_per_partition produced
            any, otherwise the ID of the single generated datarun.
        """
        # if the user has provided a dataset id, use that. Otherwise, create a new
        # dataset based on the arguments we were passed.
        if self.run_conf.dataset_id is None:
            dataset = self.create_dataset()
            self.run_conf.dataset_id = dataset.id
        else:
            dataset = self.db.get_dataset(self.run_conf.dataset_id)

        method_parts = {}
        for m in self.run_conf.methods:
            # enumerate all combinations of categorical variables for this method
            method = Method(m)
            method_parts[m] = method.get_hyperpartitions()
            logger.info('method %s has %d hyperpartitions',
                        m, len(method_parts[m]))

        # create hyperpartitions and datarun(s)
        run_ids = []
        if not run_per_partition:
            logger.debug('saving datarun...')
            datarun = self.create_datarun(dataset)

        logger.debug('saving hyperpartions...')
        for method, parts in method_parts.items():
            for part in parts:
                # if necessary, create a new datarun for each hyperpartition.
                # This setting is useful for debugging.
                if run_per_partition:
                    datarun = self.create_datarun(dataset)
                    run_ids.append(datarun.id)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(
                    datarun_id=datarun.id,
                    method=method,
                    tunables=part.tunables,
                    constants=part.constants,
                    categoricals=part.categoricals,
                    status=PartitionStatus.INCOMPLETE)

        logger.info('Data entry complete. Summary:')
        logger.info('\tDataset ID: %d', dataset.id)
        logger.info('\tTraining data: %s', dataset.train_path)
        logger.info('\tTest data: %s', (dataset.test_path or 'None'))

        if run_per_partition:
            logger.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids)))

        else:
            logger.info('\tDatarun ID: %d', datarun.id)

        # the strategy and budget are read from the last datarun created
        logger.info('\tHyperpartition selection strategy: %s',
                    datarun.selector)
        logger.info('\tParameter tuning strategy: %s', datarun.tuner)
        logger.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type)

        return run_ids or datarun.id
Beispiel #24
0
                    help='Only train on dataruns with these ids',
                    nargs='+')
# Optional integer option: number of seconds the worker should run.
parser.add_argument('--time', help='Number of seconds to run worker', type=int)
# Boolean flag: False unless --choose-randomly is given on the command line.
parser.add_argument(
    '--choose-randomly',
    action='store_true',
    help='Choose dataruns to work on randomly (default = sequential order)')
# `save_files` is True by default; passing --no-save stores False instead.
parser.add_argument('--no-save',
                    dest='save_files',
                    default=True,
                    action='store_const',
                    const=False,
                    help="don't save models and metrics at all")

# parse arguments and load configuration
args = parser.parse_args()

# load_config returns four values; the second one is not used in this script.
sql_config, _, aws_config, log_config = load_config(**vars(args))

db = Database(**vars(sql_config))

with db_session(db):  # keep a database session open to access the dataruns
    # Get the classifiers from the database (called without any filter).
    classifiers = db.get_classifiers()
    # Alternatively, get a single classifier by its ID:
    #     classifier = db.get_classifier(classifier_id)
    print("total {} classifiers".format(len(classifiers)))
    for classifier in classifiers:
        # Load the metrics of each classifier from ./metrics and print them.
        metrics = load_metrics(classifier, metric_dir="./metrics")
        print(metrics)
Beispiel #25
0
def enter_datarun(sql_config, run_config, aws_config=None, upload_data=False,
                  run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    NOTE: this function uses Python 2 print statements.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.
    upload_data: whether to store processed data in the cloud
    run_per_partition: if True, create a separate datarun for every
        hyperpartition instead of one datarun holding all of them.

    Returns: the list of created datarun IDs when run_per_partition produced
        any, otherwise the ID of the single generated datarun.
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config,
                                upload_data=upload_data)
    else:
        dataset = db.get_dataset(run_config.dataset_id)


    # create hyperpartitions for the new datarun
    print
    print 'creating hyperpartitions...'
    session = db.get_session()

    # flat list of (method name, hyperpartition) pairs, in configuration order
    method_and_parts = []
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(METHODS_MAP[m])
        method_hyperparitions = method.get_hyperpartitions()

        for method_hyperparition in method_hyperparitions:
            method_and_parts.append((m, method_hyperparition))

        print 'method', m, 'has', len(method_hyperparitions), 'hyperpartitions'

    # create and save datarun to database
    print
    print 'creating datarun...'

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        # a single datarun shared by every hyperpartition
        datarun = create_datarun(db, session, dataset, run_config)
        # NOTE(review): presumably the commit is what assigns datarun.id,
        # which is read below; confirm before reordering.
        session.commit()

    for method, part in method_and_parts:
        # if necessary, create a new datarun for each hyperpartition.
        # This setting is useful for debugging.
        if run_per_partition:
            datarun = create_datarun(db, session, dataset, run_config)
            session.commit()
            run_ids.append(datarun.id)

        # attach the hyperpartition to the current datarun and persist it
        hp = db.Hyperpartition(datarun_id=datarun.id,
                               method=method,
                               tunables=part.tunables,
                               constants=part.constants,
                               categoricals=part.categoricals,
                               status=PartitionStatus.INCOMPLETE)
        session.add(hp)
        session.commit()


    # print a summary; strategy and budget come from the last datarun created
    print
    print '========== Summary =========='
    print 'Dataset ID:', dataset.id
    print 'Training data:', dataset.train_path
    print 'Test data:', (dataset.test_path or '(None)')
    if run_per_partition:
        print 'Datarun IDs:', ', '.join(map(str, run_ids))
    else:
        print 'Datarun ID:', datarun.id
    print 'Hyperpartition selection strategy:', datarun.selector
    print 'Parameter tuning strategy:', datarun.tuner
    print 'Budget: %d (%s)' % (datarun.budget, datarun.budget_type)
    print

    # run_ids is only filled when run_per_partition is set
    return run_ids or datarun.id
Beispiel #26
0
def enter_datarun(sql_config,
                  run_config,
                  aws_config=None,
                  run_per_partition=False):
    """
    Enter a datarun (and, if needed, its dataset) into the database.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.
    run_per_partition: if True, create one datarun per hyperpartition instead
        of a single datarun holding all of them.

    Returns: the list of created datarun IDs when run_per_partition produced
        any, otherwise the ID of the single generated datarun.
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # reuse the dataset the user pointed at, or enter a new one built from
    # the run configuration
    if run_config.dataset_id is not None:
        dataset = db.get_dataset(run_config.dataset_id)
    else:
        dataset = enter_dataset(db, run_config, aws_config=aws_config)

    # map each method name to all combinations of its categorical variables
    partitions_by_method = {}
    for method_name in run_config.methods:
        partitions = Method(method_name).get_hyperpartitions()
        partitions_by_method[method_name] = partitions
        print('method', method_name, 'has', len(partitions), 'hyperpartitions')

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        print('saving datarun...')
        datarun = create_datarun(db, dataset, run_config)

    print('saving hyperpartions...')
    for method_name, partitions in partitions_by_method.items():
        for partition in partitions:
            if run_per_partition:
                # one datarun per hyperpartition; useful for debugging
                datarun = create_datarun(db, dataset, run_config)
                run_ids.append(datarun.id)

            # store the hyperpartition under the current datarun
            db.create_hyperpartition(datarun_id=datarun.id,
                                     method=method_name,
                                     tunables=partition.tunables,
                                     constants=partition.constants,
                                     categoricals=partition.categoricals,
                                     status=PartitionStatus.INCOMPLETE)

    print('done!')
    print()
    print('========== Summary ==========')
    print('Dataset ID:', dataset.id)
    print('Training data:', dataset.train_path)
    print('Test data:', (dataset.test_path or '(None)'))
    if run_per_partition:
        print('Datarun IDs:', ', '.join(str(run_id) for run_id in run_ids))
    else:
        print('Datarun ID:', datarun.id)
    print('Hyperpartition selection strategy:', datarun.selector)
    print('Parameter tuning strategy:', datarun.tuner)
    print('Budget: %d (%s)' % (datarun.budget, datarun.budget_type))
    print()

    if run_ids:
        return run_ids

    return datarun.id
Beispiel #27
0
class Admin:
    """Small facade for creating and inspecting ATM dataruns and classifiers.

    The instance keeps a logging configuration, a SQL configuration that uses
    the ``mysql`` dialect, and a ``Database`` built from that SQL
    configuration. Every public method returns plain dictionaries.
    """

    # Method codes assigned to every run configuration built by this class.
    _DEFAULT_METHODS = (
        'logreg', 'svm', 'sgd', 'dt', 'et', 'rf', 'gnb', 'mnb', 'bnb',
        'gp', 'pa', 'knn', 'mlp', 'ada',
    )

    def __init__(self, host, port, username, password, database):
        """Initialize logging and build the database handle.

        Args:
            host: database host stored on the SQL configuration.
            port: database port stored on the SQL configuration.
            username: database user stored on the SQL configuration.
            password: database password stored on the SQL configuration.
            database: database name stored on the SQL configuration.
        """
        self._log_config = self._build_log_config()
        atm.config.initialize_logging(self._log_config)
        self._sql_config = self._build_sql_config(host, port, username,
                                                  password, database)
        # The attributes of the SQL config are passed as keyword arguments.
        self._db = Database(**vars(self._sql_config))

    def create_datarun(self, dataset_url, class_column, budget_type, budget):
        """Register a new datarun and return its identifier.

        Args:
            dataset_url: stored as ``train_path`` of the run configuration.
            class_column: stored as ``class_column`` of the run configuration.
            budget_type: stored as ``budget_type`` of the run configuration.
            budget: stored as ``budget`` of the run configuration.

        Returns:
            dict: ``{'id': <value returned by enter_data>}``.
        """
        run_config = self._build_run_config(dataset_url=dataset_url,
                                            class_column=class_column,
                                            budget_type=budget_type,
                                            budget=budget)
        # Named ``datarun_id`` so the builtin ``id`` is not shadowed.
        datarun_id = enter_data(self._sql_config, run_config)
        return {'id': datarun_id}

    def get_datarun(self, datarun_id):
        """Return a summary of a datarun and its best classifier so far.

        Args:
            datarun_id: identifier passed to the database lookups.

        Returns:
            dict: the datarun's status, budget, budget type and start/end
            times, plus ``best_classifier_id``. The latter is ``None`` when
            the database reports no best classifier for this datarun
            (presumably because nothing has been trained yet).
        """
        # NOTE(review): if the database returns no datarun for this id, the
        # attribute accesses below will fail -- confirm the desired behaviour.
        datarun = self._db.get_datarun(datarun_id)
        best = self._db.get_best_classifier(
            score_target='cv',  # TODO: change to accuracy on test data
            datarun_id=datarun_id)

        return {
            'id': datarun_id,
            'status': datarun.status,
            'budget': datarun.budget,
            'budget_type': datarun.budget_type,
            'start_time': datarun.start_time,
            'end_time': datarun.end_time,
            # Guard against a missing classifier instead of raising
            # AttributeError on ``None.id``.
            'best_classifier_id': best.id if best is not None else None,
        }

    def get_classifier(self, classifier_id):
        """Return the method, hyperparameters and CV score of a classifier.

        Args:
            classifier_id: identifier passed to ``Database.get_classifier``.

        Returns:
            dict: ``id``, ``method`` (read from the classifier's
            hyperpartition), ``hyperparameters`` and ``cv_accuracy`` (the
            classifier's ``cv_judgment_metric`` converted to ``float``).
        """
        classifier = self._db.get_classifier(classifier_id)
        hyperpartition = self._db.get_hyperpartition(
            classifier.hyperpartition_id)
        return {
            'id': classifier_id,
            'method': hyperpartition.method,
            'hyperparameters': classifier.hyperparameter_values,
            'cv_accuracy': float(classifier.cv_judgment_metric),
        }

    def query_classifier(self, classifier_id, queries):
        """Run a stored model on the given queries.

        Args:
            classifier_id: identifier passed to ``Database.load_model``.
            queries: data used to build a ``pandas.DataFrame`` with one row
                per element (presumably a list of records -- TODO confirm
                against callers).

        Returns:
            dict: the original ``queries`` and the model's ``predictions``
            as a list.
        """
        model = self._db.load_model(classifier_id)
        query_df = pd.DataFrame(queries, index=range(len(queries)))
        predictions = model.predict(query_df)
        return {'queries': queries, 'predictions': list(predictions)}

    def _build_log_config(self):
        """Return a default ``atm.config.LogConfig``."""
        return atm.config.LogConfig()

    def _build_sql_config(self, host, port, username, password, database):
        """Return an ``atm.config.SQLConfig`` for a MySQL connection."""
        sql_config = atm.config.SQLConfig()
        sql_config.dialect = 'mysql'
        sql_config.database = database
        sql_config.username = username
        sql_config.host = host
        sql_config.port = port
        sql_config.password = password
        return sql_config

    def _build_run_config(self, dataset_url, class_column, budget_type,
                          budget):
        """Return an ``atm.config.RunConfig`` with this class's fixed settings.

        Only the training path, class column and budget come from the
        arguments; every other field is set to a constant below.
        """
        run_config = atm.config.RunConfig()
        run_config.train_path = dataset_url
        run_config.class_column = class_column
        # Fresh list per config so callers cannot mutate the shared constant.
        run_config.methods = list(self._DEFAULT_METHODS)
        run_config.priority = 1
        run_config.budget_type = budget_type
        run_config.budget = budget
        run_config.tuner = 'uniform'
        run_config.selector = 'uniform'
        run_config.r_minimum = 2
        run_config.k_window = 3
        run_config.gridding = 0
        run_config.metric = 'f1'
        run_config.score_target = 'cv'
        return run_config
Beispiel #28
0
Run a single end-to-end test with 10 sample datasets.
The script will create a datarun for each dataset, then run a worker until the
jobs are finished.
''')
parser.add_argument('--processes',
                    help='number of processes to run concurrently',
                    type=int,
                    default=1)
parser.add_argument('--method', help='code for method to test')
parser.add_argument('--method-path',
                    help='path to JSON config for method to test')

args = parser.parse_args()
sql_config, run_config, aws_config = load_config(sql_path=SQL_CONFIG,
                                                 run_path=RUN_CONFIG)
db = Database(**vars(sql_config))

print('creating dataruns...')
datarun_ids = []
for ds in DATASETS:
    run_config.train_path = join(DATA_DIR, ds)
    run_config.methods = [args.method]
    dataset = enter_dataset(db, run_config, aws_config)
    datarun_ids.extend(
        enter_datarun(sql_config,
                      run_config,
                      aws_config,
                      run_per_partition=True))

print('computing on dataruns', datarun_ids)
work_parallel(db=db,
Beispiel #29
0
 def __init__(self, host, port, username, password, database):
     """Initialize logging and build the database handle.

     The connection parameters are forwarded to ``_build_sql_config``; the
     resulting configuration object is kept on the instance.
     """
     # Build the logging configuration first and initialize ATM logging
     # with it before anything else is constructed.
     self._log_config = self._build_log_config()
     atm.config.initialize_logging(self._log_config)
     self._sql_config = self._build_sql_config(host, port, username,
                                               password, database)
     # The attributes of the SQL config are passed as keyword arguments.
     self._db = Database(**vars(self._sql_config))