Esempio n. 1
0
    def __init__(self,
                 code,
                 params,
                 judgment_metric,
                 label_column,
                 testing_ratio=0.3):
        """
        Parameters:
            code: the short method code (as defined in constants.py)
            params: parameters passed to the sklearn classifier constructor
            judgment_metric: string that indicates which metric should be
                optimized for.
            label_column: name of the column holding the target labels
            testing_ratio: fraction of the data held out for testing
                (default 0.3)
        """
        # configuration & database
        self.code = code
        self.params = params
        self.judgment_metric = judgment_metric
        self.label_column = label_column
        self.testing_ratio = testing_ratio

        # load the classifier method's class: resolve the dotted class path
        # declared in the method config into an actual class object
        path = Method(METHODS_MAP[code]).class_path.split('.')
        mod_str, cls_str = '.'.join(path[:-1]), path[-1]
        mod = import_module(mod_str)
        self.class_ = getattr(mod, cls_str)

        # pipelining
        self.pipeline = None

        # persistent random state; randint requires an integer upper bound,
        # so cast the float literal explicitly
        self.random_state = np.random.randint(int(1e7))
Esempio n. 2
0
def test_enumerate():
    """Write a method config to disk and verify hyperpartition enumeration."""
    hyperparameters = {
        'a': {'type': 'int_cat', 'values': [0, 3]},
        'b': {'type': 'int', 'range': [0, 3]},
        'c': {'type': 'bool', 'values': [True, False]},
        'd': {'type': 'string', 'values': ['x', 'y']},
        'e': {'type': 'float_cat', 'values': [-0.5, 0.5, 1.0]},
        'f': {'type': 'float', 'range': [0.5]},
        'g': {'type': 'list',
              'list_length': [1, 2, 3],
              'element': {'type': 'int_exp', 'range': [1e-3, 1e3]}},
    }
    js = {
        'name': 'test',
        'class': 'test',
        'hyperparameters': hyperparameters,
        'root_hyperparameters': ['a', 'f'],
        'conditional_hyperparameters': {
            'a': {'0': ['b'], '3': ['c']},
            'c': {'True': ['d'], 'False': ['e', 'g']},
        },
    }

    config_path = '/tmp/method.json'
    with open(config_path, 'w') as f:
        json.dump(js, f)

    hps = Method(config_path).get_hyperpartitions()

    # the conditional tree above should expand to exactly 12 partitions
    assert len(hps) == 12
    # 'a' is a root categorical, so every partition carries it; 'f' has a
    # single-value range and therefore becomes a constant everywhere
    assert all('a' in list(zip(*hp.categoricals))[0] for hp in hps)
    assert all(('f', 0.5) in hp.constants for hp in hps)
    # 'b' is only reachable through a == 0, so exactly one partition tunes it
    tuned_b = [hp for hp in hps
               if hp.tunables and 'b' in list(zip(*hp.tunables))[0]]
    assert len(tuned_b) == 1
Esempio n. 3
0
    def __init__(self,
                 method,
                 params,
                 judgment_metric,
                 class_column,
                 testing_ratio=0.3,
                 verbose_metrics=False):
        """
        Parameters:
            method: the short method code (as defined in constants.py) or path
                to method json
            params: parameters passed to the sklearn classifier constructor
            judgment_metric: string that indicates which metric should be
                optimized for.
            class_column: name of the column holding the target classes
            testing_ratio: fraction of the data held out for testing
                (default 0.3)
            verbose_metrics: flag stored for later use -- presumably enables
                reporting the full metric set during scoring; confirm against
                the scoring code
        """
        # configuration & database
        self.method = method
        self.params = params
        self.judgment_metric = judgment_metric
        self.class_column = class_column
        self.testing_ratio = testing_ratio
        self.verbose_metrics = verbose_metrics

        # load the classifier method's class: resolve the dotted class path
        # declared in the method config into an actual class object
        path = Method(method).class_path.split('.')
        mod_str, cls_str = '.'.join(path[:-1]), path[-1]
        mod = import_module(mod_str)
        self.class_ = getattr(mod, cls_str)

        # pipelining
        self.pipeline = None

        # persistent random state; randint requires an integer upper bound,
        # so cast the float literal explicitly
        self.random_state = np.random.randint(int(1e7))
Esempio n. 4
0
def enter_datarun(sql_config, run_config, aws_config=None, upload_data=False,
                  run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.
    upload_data: whether to store processed data in the cloud
    run_per_partition: if True, create a separate datarun for each
        hyperpartition (useful for debugging).

    Returns: ID of the generated datarun, or a list of datarun IDs when
        run_per_partition is True
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config,
                                upload_data=upload_data)
    else:
        dataset = db.get_dataset(run_config.dataset_id)


    # create hyperpartitions for the new datarun
    print
    print 'creating hyperpartitions...'
    session = db.get_session()

    # pair each method code with every one of its hyperpartitions so they
    # can be persisted one at a time below
    method_and_parts = []
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(METHODS_MAP[m])
        method_hyperparitions = method.get_hyperpartitions()

        for method_hyperparition in method_hyperparitions:
            method_and_parts.append((m, method_hyperparition))

        print 'method', m, 'has', len(method_hyperparitions), 'hyperpartitions'

    # create and save datarun to database
    print
    print 'creating datarun...'

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        # a single datarun is shared by every hyperpartition
        datarun = create_datarun(db, session, dataset, run_config)
        session.commit()

    for method, part in method_and_parts:
        # if necessary, create a new datarun for each hyperpartition.
        # This setting is useful for debugging.
        if run_per_partition:
            datarun = create_datarun(db, session, dataset, run_config)
            session.commit()
            run_ids.append(datarun.id)

        # NOTE(review): if run_per_partition is True but method_and_parts is
        # empty, `datarun` is never bound and the summary below raises
        # NameError -- confirm callers always supply at least one method.
        hp = db.Hyperpartition(datarun_id=datarun.id,
                               method=method,
                               tunables=part.tunables,
                               constants=part.constants,
                               categoricals=part.categoricals,
                               status=PartitionStatus.INCOMPLETE)
        session.add(hp)
        session.commit()


    print
    print '========== Summary =========='
    print 'Dataset ID:', dataset.id
    print 'Training data:', dataset.train_path
    print 'Test data:', (dataset.test_path or '(None)')
    if run_per_partition:
        print 'Datarun IDs:', ', '.join(map(str, run_ids))
    else:
        print 'Datarun ID:', datarun.id
    print 'Hyperpartition selection strategy:', datarun.selector
    print 'Parameter tuning strategy:', datarun.tuner
    print 'Budget: %d (%s)' % (datarun.budget, datarun.budget_type)
    print

    return run_ids or datarun.id
Esempio n. 5
0
def enter_datarun(sql_config,
                  run_config,
                  aws_config=None,
                  run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.

    Returns: ID of the generated datarun
    """
    # open a database connection from the sql settings
    db = Database(sql_config.dialect, sql_config.database, sql_config.username,
                  sql_config.password, sql_config.host, sql_config.port,
                  sql_config.query)

    # reuse an existing dataset when an id was supplied; otherwise register
    # a new one built from the run configuration
    if run_config.dataset_id is not None:
        dataset = db.get_dataset(run_config.dataset_id)
    else:
        dataset = enter_dataset(db, run_config, aws_config=aws_config)

    # enumerate every combination of categorical values for each method
    method_parts = {}
    for name in run_config.methods:
        parts = Method(name).get_hyperpartitions()
        method_parts[name] = parts
        print('method', name, 'has', len(parts), 'hyperpartitions')

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        print('saving datarun...')
        datarun = create_datarun(db, dataset, run_config)

    print('saving hyperpartions...')
    for name, parts in method_parts.items():
        for part in parts:
            # debugging aid: optionally give every hyperpartition its own
            # dedicated datarun
            if run_per_partition:
                datarun = create_datarun(db, dataset, run_config)
                run_ids.append(datarun.id)

            # persist the hyperpartition under the current datarun
            db.create_hyperpartition(datarun_id=datarun.id,
                                     method=name,
                                     tunables=part.tunables,
                                     constants=part.constants,
                                     categoricals=part.categoricals,
                                     status=PartitionStatus.INCOMPLETE)

    print('done!')
    print()
    print('========== Summary ==========')
    print('Dataset ID:', dataset.id)
    print('Training data:', dataset.train_path)
    print('Test data:', (dataset.test_path or '(None)'))
    if run_per_partition:
        print('Datarun IDs:', ', '.join(map(str, run_ids)))
    else:
        print('Datarun ID:', datarun.id)
    print('Hyperpartition selection strategy:', datarun.selector)
    print('Parameter tuning strategy:', datarun.tuner)
    print('Budget: %d (%s)' % (datarun.budget, datarun.budget_type))
    print()

    return run_ids or datarun.id
Esempio n. 6
0
    def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                    gridding=0, k_window=3, metric='f1', methods=None,
                    r_minimum=2, run_per_partition=False, score_target='cv', priority=1,
                    selector='uniform', tuner='uniform', deadline=None):
        """Register one or more Dataruns to the Database.

        Hyperpartitions are generated from each method's hyperparameters.
        If ``run_per_partition`` is True, one Datarun is created per
        Hyperpartition; otherwise a single Datarun covers all of them.

        Args:
            dataset_id: id of the Dataset the Datarun(s) belong to.
            budget: budget amount (default 100).
            budget_type: 'classifier' or 'walltime' (default 'classifier').
            gridding: gridding setting for the Tuner (default 0).
            k_window: ``k`` setting for the Selector (default 3).
            metric: metric to optimize (default 'f1').
            methods: list of method codes to try; defaults to
                ['logreg', 'dt', 'knn'] when None.
            r_minimum: ``r_minimum`` setting for the Tuner (default 2).
            run_per_partition: create one Datarun per Hyperpartition.
            score_target: 'cv' or 'test' (default 'cv').
            priority: datarun priority, higher is better (default 1).
            selector: selector type (default 'uniform').
            tuner: tuner type (default 'uniform').
            deadline: datetime string in TIME_FMT; when given, budget_type
                is forced to 'walltime'.

        Returns:
            The created Datarun, or the list of Dataruns when
            ``run_per_partition`` is True.
        """
        # avoid the shared-mutable-default-argument pitfall: build the
        # default list per call
        if methods is None:
            methods = ['logreg', 'dt', 'knn']

        dataruns = list()

        if deadline:
            deadline = datetime.strptime(deadline, TIME_FMT)
            # an explicit deadline implies a walltime budget
            budget_type = 'walltime'

        elif budget_type == 'walltime':
            deadline = datetime.now() + timedelta(minutes=budget)

        run_description = '___'.join([tuner, selector])
        target = score_target + '_judgment_metric'

        method_parts = {}
        for method in methods:
            # enumerate all combinations of categorical variables for this method
            method_instance = Method(method)
            method_parts[method] = method_instance.get_hyperpartitions()

            LOGGER.info('method {} has {} hyperpartitions'.format(
                method, len(method_parts[method])))

        # every datarun created below shares exactly this configuration,
        # so build the kwargs once instead of duplicating the call
        datarun_kwargs = dict(
            dataset_id=dataset_id,
            description=run_description,
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=target,
            k_window=k_window,
            r_minimum=r_minimum,
        )

        if not run_per_partition:
            datarun = self.db.create_datarun(**datarun_kwargs)
            dataruns.append(datarun)

        for method, parts in method_parts.items():
            for part in parts:
                # if necessary, create a new datarun for each hyperpartition.
                # This setting is useful for debugging.
                if run_per_partition:
                    datarun = self.db.create_datarun(**datarun_kwargs)
                    dataruns.append(datarun)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(datarun_id=datarun.id,
                                              method=method,
                                              tunables=part.tunables,
                                              constants=part.constants,
                                              categoricals=part.categoricals,
                                              status=PartitionStatus.INCOMPLETE)

        dataset = self.db.get_dataset(dataset_id)
        LOGGER.info('Dataruns created. Summary:')
        LOGGER.info('\tDataset ID: {}'.format(dataset.id))
        LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
        LOGGER.info('\tTest data: {}'.format(dataset.test_path))

        if run_per_partition:
            LOGGER.info('\tDatarun IDs: {}'.format(
                ', '.join(str(datarun.id) for datarun in dataruns)))

        else:
            LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

        LOGGER.info('\tHyperpartition selection strategy: {}'.format(dataruns[0].selector))
        LOGGER.info('\tParameter tuning strategy: {}'.format(dataruns[0].tuner))
        LOGGER.info('\tBudget: {} ({})'.format(dataruns[0].budget, dataruns[0].budget_type))

        return dataruns if run_per_partition else dataruns[0]
Esempio n. 7
0
    def enter_data(self, run_per_partition=False):
        """
        Generate a datarun, including a dataset if necessary.

        Returns: ID of the generated datarun
        """
        # resolve the dataset: reuse an existing one when an id is present,
        # otherwise create it and remember its id on the run config
        if self.run_conf.dataset_id is not None:
            dataset = self.db.get_dataset(self.run_conf.dataset_id)
        else:
            dataset = self.create_dataset()
            self.run_conf.dataset_id = dataset.id

        # enumerate every combination of categorical values for each method
        method_parts = {}
        for name in self.run_conf.methods:
            parts = Method(name).get_hyperpartitions()
            method_parts[name] = parts
            logger.info('method %s has %d hyperpartitions' %
                        (name, len(parts)))

        # create hyperpartitions and datarun(s)
        run_ids = []
        if not run_per_partition:
            logger.debug('saving datarun...')
            datarun = self.create_datarun(dataset)

        logger.debug('saving hyperpartions...')
        for name, parts in method_parts.items():
            for part in parts:
                # debugging aid: optionally give every hyperpartition its
                # own dedicated datarun
                if run_per_partition:
                    datarun = self.create_datarun(dataset)
                    run_ids.append(datarun.id)

                # persist the hyperpartition under the current datarun
                self.db.create_hyperpartition(
                    datarun_id=datarun.id,
                    method=name,
                    tunables=part.tunables,
                    constants=part.constants,
                    categoricals=part.categoricals,
                    status=PartitionStatus.INCOMPLETE)

        logger.info('Data entry complete. Summary:')
        logger.info('\tDataset ID: %d', dataset.id)
        logger.info('\tTraining data: %s', dataset.train_path)
        logger.info('\tTest data: %s', (dataset.test_path or 'None'))

        if run_per_partition:
            logger.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids)))

        else:
            logger.info('\tDatarun ID: %d', datarun.id)

        logger.info('\tHyperpartition selection strategy: %s',
                    datarun.selector)
        logger.info('\tParameter tuning strategy: %s', datarun.tuner)
        logger.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type)

        return run_ids or datarun.id
Esempio n. 8
0
    def create_dataruns(self, run_conf):
        """
        Generate one or more dataruns for an existing dataset.

        run_conf: all attributes necessary to initialize the Datarun(s),
            including the dataset_id of an already-registered Dataset and
            the run_per_partition flag.

        Raises:
            ValueError: if run_conf.dataset_id does not match a Dataset.

        Returns: list of the created Datarun objects (one per
            hyperpartition when run_per_partition is set, otherwise one).
        """
        dataset = self.db.get_dataset(run_conf.dataset_id)
        if not dataset:
            raise ValueError('Invalid Dataset ID: {}'.format(
                run_conf.dataset_id))

        method_parts = {}
        for m in run_conf.methods:
            # enumerate all combinations of categorical variables for this method
            method = Method(m)
            method_parts[m] = method.get_hyperpartitions()
            LOGGER.info('method %s has %d hyperpartitions' %
                        (m, len(method_parts[m])))

        # create hyperpartitions and datarun(s)
        dataruns = []
        if not run_conf.run_per_partition:
            LOGGER.debug('saving datarun...')
            datarun = self.create_datarun(dataset, run_conf)
            dataruns.append(datarun)

        LOGGER.debug('saving hyperpartions...')
        for method, parts in method_parts.items():
            for part in parts:
                # if necessary, create a new datarun for each hyperpartition.
                # This setting is useful for debugging.
                if run_conf.run_per_partition:
                    datarun = self.create_datarun(dataset, run_conf)
                    dataruns.append(datarun)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(
                    datarun_id=datarun.id,
                    method=method,
                    tunables=part.tunables,
                    constants=part.constants,
                    categoricals=part.categoricals,
                    status=PartitionStatus.INCOMPLETE)

        LOGGER.info('Dataruns created. Summary:')
        LOGGER.info('\tDataset ID: %d', dataset.id)
        LOGGER.info('\tTraining data: %s', dataset.train_path)
        LOGGER.info('\tTest data: %s', (dataset.test_path or 'None'))

        # NOTE(review): dataruns is empty if run_per_partition is set and no
        # hyperpartitions were produced; dataruns[0] would raise IndexError.
        datarun = dataruns[0]
        if run_conf.run_per_partition:
            LOGGER.info('\tDatarun IDs: %s',
                        ', '.join(str(datarun.id) for datarun in dataruns))

        else:
            LOGGER.info('\tDatarun ID: %d', datarun.id)

        LOGGER.info('\tHyperpartition selection strategy: %s',
                    datarun.selector)
        LOGGER.info('\tParameter tuning strategy: %s', datarun.tuner)
        LOGGER.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type)

        return dataruns
Esempio n. 9
0
    def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                    gridding=0, k_window=3, metric='f1', methods=None,
                    r_minimum=2, run_per_partition=False, score_target='cv', priority=1,
                    selector='uniform', tuner='uniform', deadline=None):

        """Register one or more Dataruns to the Database.

        The methods hyperparameters will be analyzed and Hyperpartitions generated
        from them.
        If ``run_per_partition`` is ``True``, one Datarun will be created for each
        Hyperpartition. Otherwise, a single one will be created for all of them.

        Args:
            dataset_id (int):
                Id of the Dataset which this Datarun will belong to.
            budget (int):
                Budget amount. Optional. Defaults to ``100``.
            budget_type (str):
                Budget Type. Can be 'classifier' or 'walltime'.
                Optional. Defaults to ``'classifier'``.
            gridding (int):
                ``gridding`` setting for the Tuner. Optional. Defaults to ``0``.
            k_window (int):
                ``k`` setting for the Selector. Optional. Defaults to ``3``.
            metric (str):
                Metric to use for the tuning and selection. Optional. Defaults to ``'f1'``.
            methods (list):
                List of methods to try. Optional. Defaults to
                ``['logreg', 'dt', 'knn']`` when ``None``.
            r_minimum (int):
                ``r_minimum`` setting for the Tuner. Optional. Defaults to ``2``.
            run_per_partition (bool):
                whether to create a separated Datarun for each Hyperpartition or not.
                Optional. Defaults to ``False``.
            score_target (str):
                Which score to use for the tuning and selection process. It can be ``'cv'`` or
                ``'test'``. Optional. Defaults to ``'cv'``.
            priority (int):
                Priority of this Datarun. The higher the better. Optional. Defaults to ``1``.
            selector (str):
                Type of selector to use. Optional. Defaults to ``'uniform'``.
            tuner (str):
                Type of tuner to use. Optional. Defaults to ``'uniform'``.
            deadline (str):
                Time deadline. It must be a string representing a datetime in the format
                ``'%Y-%m-%d %H:%M'``. If given, ``budget_type`` will be set to ``'walltime'``.

        Returns:
            Datarun:
                The created Datarun or list of Dataruns.
        """
        # avoid the shared-mutable-default-argument pitfall: build the
        # default list per call
        if methods is None:
            methods = ['logreg', 'dt', 'knn']

        if deadline:
            deadline = datetime.strptime(deadline, TIME_FMT)
            # an explicit deadline implies a walltime budget
            budget_type = 'walltime'

        elif budget_type == 'walltime':
            deadline = datetime.now() + timedelta(minutes=budget)

        run_description = '___'.join([tuner, selector])
        target = score_target + '_judgment_metric'

        method_parts = {}
        for method in methods:
            # enumerate all combinations of categorical variables for this method
            method_instance = Method(method)
            method_parts[method] = method_instance.get_hyperpartitions()

            LOGGER.info('method {} has {} hyperpartitions'.format(
                method, len(method_parts[method])))

        # every datarun created below shares exactly this configuration,
        # so build the kwargs once instead of duplicating the call
        datarun_kwargs = dict(
            dataset_id=dataset_id,
            description=run_description,
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=target,
            k_window=k_window,
            r_minimum=r_minimum,
        )

        dataruns = list()
        if not run_per_partition:
            datarun = self.db.create_datarun(**datarun_kwargs)
            dataruns.append(datarun)

        for method, parts in method_parts.items():
            for part in parts:
                # if necessary, create a new datarun for each hyperpartition.
                # This setting is useful for debugging.
                if run_per_partition:
                    datarun = self.db.create_datarun(**datarun_kwargs)
                    dataruns.append(datarun)

                # create a new hyperpartition in the database
                self.db.create_hyperpartition(datarun_id=datarun.id,
                                              method=method,
                                              tunables=part.tunables,
                                              constants=part.constants,
                                              categoricals=part.categoricals,
                                              status=PartitionStatus.INCOMPLETE)

        dataset = self.db.get_dataset(dataset_id)
        LOGGER.info('Dataruns created. Summary:')
        LOGGER.info('\tDataset ID: {}'.format(dataset.id))
        LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
        LOGGER.info('\tTest data: {}'.format(dataset.test_path))

        if run_per_partition:
            LOGGER.info('\tDatarun IDs: {}'.format(
                ', '.join(str(datarun.id) for datarun in dataruns)))

        else:
            LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

        LOGGER.info('\tHyperpartition selection strategy: {}'.format(dataruns[0].selector))
        LOGGER.info('\tParameter tuning strategy: {}'.format(dataruns[0].tuner))
        LOGGER.info('\tBudget: {} ({})'.format(dataruns[0].budget, dataruns[0].budget_type))

        return dataruns if run_per_partition else dataruns[0]
Esempio n. 10
0
def new_datarun(db, run_config, run_per_partition=False):
    """
    A modification of the atm.enter_data.enter_data

    Generate a datarun for an already-registered dataset.

    db: an instance of atm.Database.
    run_config: all attributes necessary to initialize a Datarun; its
        dataset_id must refer to a Dataset that already exists.
    run_per_partition: if True, create a separate datarun for each
        hyperpartition (useful for debugging).

    Returns: ID of the generated datarun, or a list of datarun IDs when
        run_per_partition is True
    """
    # unlike the upstream enter_data, this helper does not create datasets
    # on the fly -- the dataset must already exist in the database
    dataset = db.get_dataset(run_config.dataset_id)

    method_parts = {}
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(m)
        method_parts[m] = method.get_hyperpartitions()
        logger.info('method %s has %d hyperpartitions' %
                    (m, len(method_parts[m])))

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        logger.debug('saving datarun...')
        datarun = create_datarun(db, dataset, run_config)

    logger.debug('saving hyperpartions...')
    for method, parts in method_parts.items():
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = create_datarun(db, dataset, run_config)
                run_ids.append(datarun.id)

            # create a new hyperpartition in the database
            db.create_hyperpartition(datarun_id=datarun.id,
                                     method=method,
                                     tunables=part.tunables,
                                     constants=part.constants,
                                     categoricals=part.categoricals,
                                     status=PartitionStatus.INCOMPLETE)

    logger.info('Data entry complete. Summary:')
    logger.info('\tDataset ID: %d' % dataset.id)
    logger.info('\tTraining data: %s' % dataset.train_path)
    logger.info('\tTest data: %s' % (dataset.test_path or 'None'))
    if run_per_partition:
        logger.info('\tDatarun IDs: %s' % ', '.join(map(str, run_ids)))
    else:
        logger.info('\tDatarun ID: %d' % datarun.id)
    logger.info('\tHyperpartition selection strategy: %s' % datarun.selector)
    logger.info('\tParameter tuning strategy: %s' % datarun.tuner)
    logger.info('\tBudget: %d (%s)' % (datarun.budget, datarun.budget_type))

    return run_ids or datarun.id
Esempio n. 11
0
def create_wrapper(params, judgment_metric):
    """Build a Wrapper around the sklearn class named by params['function']."""
    method_code = params["function"]
    method_config = METHODS_MAP[method_code]
    method_class = Method(method_config).class_
    return Wrapper(method_code, judgment_metric, params, method_class)