def __init__(self, code, params, judgment_metric, label_column,
             testing_ratio=0.3):
    """
    Parameters:
        code: the short method code (as defined in constants.py)
        params: parameters passed to the sklearn classifier constructor
        judgment_metric: string that indicates which metric should be
            optimized for.
        label_column: name of the label column in the dataset
        testing_ratio: fraction of the data to hold out for testing

    The sklearn classifier class itself is resolved from the method's
    class_path and stored as self.class_.
    """
    # configuration & database
    self.code = code
    self.params = params
    self.judgment_metric = judgment_metric
    self.label_column = label_column
    self.testing_ratio = testing_ratio

    # load the classifier method's class
    path = Method(METHODS_MAP[code]).class_path.split('.')
    mod_str, cls_str = '.'.join(path[:-1]), path[-1]
    mod = import_module(mod_str)
    self.class_ = getattr(mod, cls_str)

    # pipelining
    self.pipeline = None

    # persistent random state
    self.random_state = np.random.randint(1e7)
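# A minimal, self-contained sketch of the class-path resolution used above,
# pulled out of the Wrapper class. The 'sklearn.linear_model.LogisticRegression'
# path is an illustrative example; the real path comes from the method's
# JSON config (Method(...).class_path).
from importlib import import_module


def load_class(class_path):
    # split 'pkg.module.Class' into module path and class name
    mod_str, cls_str = class_path.rsplit('.', 1)
    return getattr(import_module(mod_str), cls_str)


clf_class = load_class('sklearn.linear_model.LogisticRegression')
clf = clf_class(C=1.0)  # instantiate with ordinary constructor kwargs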
def test_enumerate():
    js = {'name': 'test', 'class': 'test'}
    js['hyperparameters'] = {
        'a': {'type': 'int_cat', 'values': [0, 3]},
        'b': {'type': 'int', 'range': [0, 3]},
        'c': {'type': 'bool', 'values': [True, False]},
        'd': {'type': 'string', 'values': ['x', 'y']},
        'e': {'type': 'float_cat', 'values': [-0.5, 0.5, 1.0]},
        # a single-element range, so 'f' becomes the constant ('f', 0.5)
        'f': {'type': 'float', 'range': [0.5]},
        'g': {'type': 'list', 'list_length': [1, 2, 3],
              'element': {'type': 'int_exp', 'range': [1e-3, 1e3]}},
    }
    js['root_hyperparameters'] = ['a', 'f']
    js['conditional_hyperparameters'] = {
        'a': {'0': ['b'], '3': ['c']},
        'c': {'True': ['d'], 'False': ['e', 'g']},
    }

    config_path = '/tmp/method.json'
    with open(config_path, 'w') as f:
        json.dump(js, f)

    hps = Method(config_path).get_hyperpartitions()

    assert len(hps) == 12
    assert all('a' in list(zip(*hp.categoricals))[0] for hp in hps)
    assert all(('f', 0.5) in hp.constants for hp in hps)
    # 'b' is only conditioned on a=0, so exactly one partition tunes it
    assert len([hp for hp in hps
                if hp.tunables and 'b' in list(zip(*hp.tunables))[0]]) == 1
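# Where the expected count of 12 comes from: each categorical value opens a
# branch of the conditional tree, and partitions multiply within a branch
# (a sketch of the enumeration arithmetic, not the library code itself):
#   a=0               -> 'b' stays tunable                 -> 1 partition
#   a=3, c=True       -> 'd' in {'x', 'y'}                 -> 2 partitions
#   a=3, c=False      -> 'e' (3 values) x 'g' (3 lengths)  -> 9 partitions
assert 1 + 2 + 3 * 3 == 12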
def __init__(self, method, params, judgment_metric, class_column,
             testing_ratio=0.3, verbose_metrics=False):
    """
    Parameters:
        method: the short method code (as defined in constants.py) or path
            to a method json
        params: parameters passed to the sklearn classifier constructor
        judgment_metric: string that indicates which metric should be
            optimized for.
        class_column: name of the class (label) column in the dataset
        testing_ratio: fraction of the data to hold out for testing
        verbose_metrics: whether to compute the full set of verbose metrics

    The sklearn classifier class itself is resolved from the method's
    class_path and stored as self.class_.
    """
    # configuration & database
    self.method = method
    self.params = params
    self.judgment_metric = judgment_metric
    self.class_column = class_column
    self.testing_ratio = testing_ratio
    self.verbose_metrics = verbose_metrics

    # load the classifier method's class
    path = Method(method).class_path.split('.')
    mod_str, cls_str = '.'.join(path[:-1]), path[-1]
    mod = import_module(mod_str)
    self.class_ = getattr(mod, cls_str)

    # pipelining
    self.pipeline = None

    # persistent random state
    self.random_state = np.random.randint(1e7)
def enter_datarun(sql_config, run_config, aws_config=None,
                  upload_data=False, run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.
    upload_data: whether to store processed data in the cloud

    Returns: ID of the generated datarun
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database,
                  sql_config.username, sql_config.password,
                  sql_config.host, sql_config.port, sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config,
                                upload_data=upload_data)
    else:
        dataset = db.get_dataset(run_config.dataset_id)

    # create hyperpartitions for the new datarun
    print()
    print('creating hyperpartitions...')
    session = db.get_session()
    method_and_parts = []
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(METHODS_MAP[m])
        method_hyperpartitions = method.get_hyperpartitions()
        for method_hyperpartition in method_hyperpartitions:
            method_and_parts.append((m, method_hyperpartition))
        print('method', m, 'has', len(method_hyperpartitions),
              'hyperpartitions')

    # create and save datarun to database
    print()
    print('creating datarun...')

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        datarun = create_datarun(db, session, dataset, run_config)
        session.commit()

    for method, part in method_and_parts:
        # if necessary, create a new datarun for each hyperpartition.
        # This setting is useful for debugging.
        if run_per_partition:
            datarun = create_datarun(db, session, dataset, run_config)
            session.commit()
            run_ids.append(datarun.id)

        hp = db.Hyperpartition(datarun_id=datarun.id,
                               method=method,
                               tunables=part.tunables,
                               constants=part.constants,
                               categoricals=part.categoricals,
                               status=PartitionStatus.INCOMPLETE)
        session.add(hp)
        session.commit()

    print()
    print('========== Summary ==========')
    print('Dataset ID:', dataset.id)
    print('Training data:', dataset.train_path)
    print('Test data:', (dataset.test_path or '(None)'))
    if run_per_partition:
        print('Datarun IDs:', ', '.join(map(str, run_ids)))
    else:
        print('Datarun ID:', datarun.id)
    print('Hyperpartition selection strategy:', datarun.selector)
    print('Parameter tuning strategy:', datarun.tuner)
    print('Budget: %d (%s)' % (datarun.budget, datarun.budget_type))
    print()

    return run_ids or datarun.id
def enter_datarun(sql_config, run_config, aws_config=None,
                  run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    sql_config: Object with all attributes necessary to initialize a Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.
    aws_config: all attributes necessary to connect to an S3 bucket.

    Returns: ID of the generated datarun
    """
    # connect to the database
    db = Database(sql_config.dialect, sql_config.database,
                  sql_config.username, sql_config.password,
                  sql_config.host, sql_config.port, sql_config.query)

    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if run_config.dataset_id is None:
        dataset = enter_dataset(db, run_config, aws_config=aws_config)
    else:
        dataset = db.get_dataset(run_config.dataset_id)

    method_parts = {}
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(m)
        method_parts[m] = method.get_hyperpartitions()
        print('method', m, 'has', len(method_parts[m]), 'hyperpartitions')

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        print('saving datarun...')
        datarun = create_datarun(db, dataset, run_config)

    print('saving hyperpartitions...')
    for method, parts in method_parts.items():
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = create_datarun(db, dataset, run_config)
                run_ids.append(datarun.id)

            # create a new hyperpartition in the database
            db.create_hyperpartition(datarun_id=datarun.id,
                                     method=method,
                                     tunables=part.tunables,
                                     constants=part.constants,
                                     categoricals=part.categoricals,
                                     status=PartitionStatus.INCOMPLETE)
    print('done!')

    print()
    print('========== Summary ==========')
    print('Dataset ID:', dataset.id)
    print('Training data:', dataset.train_path)
    print('Test data:', (dataset.test_path or '(None)'))
    if run_per_partition:
        print('Datarun IDs:', ', '.join(map(str, run_ids)))
    else:
        print('Datarun ID:', datarun.id)
    print('Hyperpartition selection strategy:', datarun.selector)
    print('Parameter tuning strategy:', datarun.tuner)
    print('Budget: %d (%s)' % (datarun.budget, datarun.budget_type))
    print()

    return run_ids or datarun.id
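# A hedged invocation sketch for enter_datarun. The real callers build proper
# config objects; SimpleNamespace stand-ins with the attribute names from the
# docstring are used here purely for illustration.
from types import SimpleNamespace

sql_config = SimpleNamespace(dialect='sqlite', database='atm.db',
                             username=None, password=None,
                             host=None, port=None, query=None)
run_config = SimpleNamespace(dataset_id=42, methods=['logreg', 'dt'])

datarun_id = enter_datarun(sql_config, run_config)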
def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                gridding=0, k_window=3, metric='f1',
                methods=['logreg', 'dt', 'knn'], r_minimum=2,
                run_per_partition=False, score_target='cv', priority=1,
                selector='uniform', tuner='uniform', deadline=None):
    dataruns = list()

    if deadline:
        deadline = datetime.strptime(deadline, TIME_FMT)
        budget_type = 'walltime'
    elif budget_type == 'walltime':
        deadline = datetime.now() + timedelta(minutes=budget)

    run_description = '___'.join([tuner, selector])
    target = score_target + '_judgment_metric'

    method_parts = {}
    for method in methods:
        # enumerate all combinations of categorical variables for this method
        method_instance = Method(method)
        method_parts[method] = method_instance.get_hyperpartitions()
        LOGGER.info('method {} has {} hyperpartitions'.format(
            method, len(method_parts[method])))

    if not run_per_partition:
        datarun = self.db.create_datarun(
            dataset_id=dataset_id,
            description=run_description,
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=target,
            k_window=k_window,
            r_minimum=r_minimum
        )
        dataruns.append(datarun)

    for method, parts in method_parts.items():
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = self.db.create_datarun(
                    dataset_id=dataset_id,
                    description=run_description,
                    tuner=tuner,
                    selector=selector,
                    gridding=gridding,
                    priority=priority,
                    budget_type=budget_type,
                    budget=budget,
                    deadline=deadline,
                    metric=metric,
                    score_target=target,
                    k_window=k_window,
                    r_minimum=r_minimum
                )
                dataruns.append(datarun)

            # create a new hyperpartition in the database
            self.db.create_hyperpartition(datarun_id=datarun.id,
                                          method=method,
                                          tunables=part.tunables,
                                          constants=part.constants,
                                          categoricals=part.categoricals,
                                          status=PartitionStatus.INCOMPLETE)

    dataset = self.db.get_dataset(dataset_id)
    LOGGER.info('Dataruns created. Summary:')
    LOGGER.info('\tDataset ID: {}'.format(dataset.id))
    LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
    LOGGER.info('\tTest data: {}'.format(dataset.test_path))

    if run_per_partition:
        LOGGER.info('\tDatarun IDs: {}'.format(
            ', '.join(str(datarun.id) for datarun in dataruns)))
    else:
        LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

    LOGGER.info('\tHyperpartition selection strategy: {}'.format(dataruns[0].selector))
    LOGGER.info('\tParameter tuning strategy: {}'.format(dataruns[0].tuner))
    LOGGER.info('\tBudget: {} ({})'.format(dataruns[0].budget, dataruns[0].budget_type))

    return dataruns if run_per_partition else dataruns[0]
def enter_data(self, run_per_partition=False):
    """
    Generate a datarun, including a dataset if necessary.

    Returns: ID of the generated datarun
    """
    # if the user has provided a dataset id, use that. Otherwise, create a new
    # dataset based on the arguments we were passed.
    if self.run_conf.dataset_id is None:
        dataset = self.create_dataset()
        self.run_conf.dataset_id = dataset.id
    else:
        dataset = self.db.get_dataset(self.run_conf.dataset_id)

    method_parts = {}
    for m in self.run_conf.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(m)
        method_parts[m] = method.get_hyperpartitions()
        logger.info('method %s has %d hyperpartitions'
                    % (m, len(method_parts[m])))

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        logger.debug('saving datarun...')
        datarun = self.create_datarun(dataset)

    logger.debug('saving hyperpartitions...')
    for method, parts in list(method_parts.items()):
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = self.create_datarun(dataset)
                run_ids.append(datarun.id)

            # create a new hyperpartition in the database
            self.db.create_hyperpartition(
                datarun_id=datarun.id,
                method=method,
                tunables=part.tunables,
                constants=part.constants,
                categoricals=part.categoricals,
                status=PartitionStatus.INCOMPLETE)

    logger.info('Data entry complete. Summary:')
    logger.info('\tDataset ID: %d', dataset.id)
    logger.info('\tTraining data: %s', dataset.train_path)
    logger.info('\tTest data: %s', (dataset.test_path or 'None'))

    if run_per_partition:
        logger.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids)))
    else:
        logger.info('\tDatarun ID: %d', datarun.id)

    logger.info('\tHyperpartition selection strategy: %s', datarun.selector)
    logger.info('\tParameter tuning strategy: %s', datarun.tuner)
    logger.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type)

    return run_ids or datarun.id
def create_dataruns(self, run_conf):
    """
    Generate one or more dataruns for an existing dataset.

    Returns: the list of generated Dataruns
    """
    dataset = self.db.get_dataset(run_conf.dataset_id)
    if not dataset:
        raise ValueError('Invalid Dataset ID: {}'.format(run_conf.dataset_id))

    method_parts = {}
    for m in run_conf.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(m)
        method_parts[m] = method.get_hyperpartitions()
        LOGGER.info('method %s has %d hyperpartitions'
                    % (m, len(method_parts[m])))

    # create hyperpartitions and datarun(s)
    dataruns = []
    if not run_conf.run_per_partition:
        LOGGER.debug('saving datarun...')
        datarun = self.create_datarun(dataset, run_conf)
        dataruns.append(datarun)

    LOGGER.debug('saving hyperpartitions...')
    for method, parts in list(method_parts.items()):
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_conf.run_per_partition:
                datarun = self.create_datarun(dataset, run_conf)
                dataruns.append(datarun)

            # create a new hyperpartition in the database
            self.db.create_hyperpartition(
                datarun_id=datarun.id,
                method=method,
                tunables=part.tunables,
                constants=part.constants,
                categoricals=part.categoricals,
                status=PartitionStatus.INCOMPLETE)

    LOGGER.info('Dataruns created. Summary:')
    LOGGER.info('\tDataset ID: %d', dataset.id)
    LOGGER.info('\tTraining data: %s', dataset.train_path)
    LOGGER.info('\tTest data: %s', (dataset.test_path or 'None'))

    datarun = dataruns[0]
    if run_conf.run_per_partition:
        LOGGER.info('\tDatarun IDs: %s',
                    ', '.join(str(datarun.id) for datarun in dataruns))
    else:
        LOGGER.info('\tDatarun ID: %d', datarun.id)

    LOGGER.info('\tHyperpartition selection strategy: %s', datarun.selector)
    LOGGER.info('\tParameter tuning strategy: %s', datarun.tuner)
    LOGGER.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type)

    return dataruns
def add_datarun(self, dataset_id, budget=100, budget_type='classifier',
                gridding=0, k_window=3, metric='f1',
                methods=['logreg', 'dt', 'knn'], r_minimum=2,
                run_per_partition=False, score_target='cv', priority=1,
                selector='uniform', tuner='uniform', deadline=None):
    """Register one or more Dataruns to the Database.

    The methods' hyperparameters will be analyzed and Hyperpartitions
    generated from them. If ``run_per_partition`` is ``True``, one Datarun
    will be created for each Hyperpartition. Otherwise, a single one will
    be created for all of them.

    Args:
        dataset_id (int): Id of the Dataset which this Datarun will belong to.
        budget (int): Budget amount. Optional. Defaults to ``100``.
        budget_type (str): Budget Type. Can be 'classifier' or 'walltime'.
            Optional. Defaults to ``'classifier'``.
        gridding (int): ``gridding`` setting for the Tuner. Optional.
            Defaults to ``0``.
        k_window (int): ``k`` setting for the Selector. Optional.
            Defaults to ``3``.
        metric (str): Metric to use for the tuning and selection. Optional.
            Defaults to ``'f1'``.
        methods (list): List of methods to try. Optional.
            Defaults to ``['logreg', 'dt', 'knn']``.
        r_minimum (int): ``r_minimum`` setting for the Tuner. Optional.
            Defaults to ``2``.
        run_per_partition (bool): whether to create a separate Datarun for
            each Hyperpartition or not. Optional. Defaults to ``False``.
        score_target (str): Which score to use for the tuning and selection
            process. It can be ``'cv'`` or ``'test'``. Optional.
            Defaults to ``'cv'``.
        priority (int): Priority of this Datarun. The higher the better.
            Optional. Defaults to ``1``.
        selector (str): Type of selector to use. Optional.
            Defaults to ``'uniform'``.
        tuner (str): Type of tuner to use. Optional.
            Defaults to ``'uniform'``.
        deadline (str): Time deadline. It must be a string representing a
            datetime in the format ``'%Y-%m-%d %H:%M'``. If given,
            ``budget_type`` will be set to ``'walltime'``.

    Returns:
        Datarun: The created Datarun or list of Dataruns.
    """
    if deadline:
        deadline = datetime.strptime(deadline, TIME_FMT)
        budget_type = 'walltime'
    elif budget_type == 'walltime':
        deadline = datetime.now() + timedelta(minutes=budget)

    run_description = '___'.join([tuner, selector])
    target = score_target + '_judgment_metric'

    method_parts = {}
    for method in methods:
        # enumerate all combinations of categorical variables for this method
        method_instance = Method(method)
        method_parts[method] = method_instance.get_hyperpartitions()
        LOGGER.info('method {} has {} hyperpartitions'.format(
            method, len(method_parts[method])))

    dataruns = list()
    if not run_per_partition:
        datarun = self.db.create_datarun(
            dataset_id=dataset_id,
            description=run_description,
            tuner=tuner,
            selector=selector,
            gridding=gridding,
            priority=priority,
            budget_type=budget_type,
            budget=budget,
            deadline=deadline,
            metric=metric,
            score_target=target,
            k_window=k_window,
            r_minimum=r_minimum
        )
        dataruns.append(datarun)

    for method, parts in method_parts.items():
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = self.db.create_datarun(
                    dataset_id=dataset_id,
                    description=run_description,
                    tuner=tuner,
                    selector=selector,
                    gridding=gridding,
                    priority=priority,
                    budget_type=budget_type,
                    budget=budget,
                    deadline=deadline,
                    metric=metric,
                    score_target=target,
                    k_window=k_window,
                    r_minimum=r_minimum
                )
                dataruns.append(datarun)

            # create a new hyperpartition in the database
            self.db.create_hyperpartition(datarun_id=datarun.id,
                                          method=method,
                                          tunables=part.tunables,
                                          constants=part.constants,
                                          categoricals=part.categoricals,
                                          status=PartitionStatus.INCOMPLETE)

    dataset = self.db.get_dataset(dataset_id)
    LOGGER.info('Dataruns created. Summary:')
    LOGGER.info('\tDataset ID: {}'.format(dataset.id))
    LOGGER.info('\tTraining data: {}'.format(dataset.train_path))
    LOGGER.info('\tTest data: {}'.format(dataset.test_path))

    if run_per_partition:
        LOGGER.info('\tDatarun IDs: {}'.format(
            ', '.join(str(datarun.id) for datarun in dataruns)))
    else:
        LOGGER.info('\tDatarun ID: {}'.format(dataruns[0].id))

    LOGGER.info('\tHyperpartition selection strategy: {}'.format(dataruns[0].selector))
    LOGGER.info('\tParameter tuning strategy: {}'.format(dataruns[0].tuner))
    LOGGER.info('\tBudget: {} ({})'.format(dataruns[0].budget, dataruns[0].budget_type))

    return dataruns if run_per_partition else dataruns[0]
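# A usage sketch for add_datarun, assuming an object `atm` that exposes this
# method and a Dataset with id=1 already registered. The 'gp' tuner and
# 'ucb1' selector names are illustrative; substitute whatever strategies
# your installed version provides.
datarun = atm.add_datarun(
    dataset_id=1,
    budget=200,
    budget_type='classifier',
    methods=['logreg', 'knn'],
    tuner='gp',
    selector='ucb1',
)
print(datarun.id)  # a single Datarun, since run_per_partition defaults to False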
def new_datarun(db, run_config, run_per_partition=False):
    """
    A modification of atm.enter_data.enter_data.

    Generate a datarun for an existing dataset.

    db: an instance of atm.Database.
    run_config: all attributes necessary to initialize a Datarun, including
        Dataset info if the dataset has not already been created.

    Returns: ID of the generated datarun
    """
    # unlike enter_data, this variant never creates a dataset; the dataset
    # referenced by run_config.dataset_id must already exist.
    # if run_config.dataset_id is None:
    #     raise ValueError('')
    #     dataset = create_dataset(db, run_config, aws_config=aws_config)
    #     run_config.dataset_id = dataset.id
    # else:
    dataset = db.get_dataset(run_config.dataset_id)

    method_parts = {}
    for m in run_config.methods:
        # enumerate all combinations of categorical variables for this method
        method = Method(m)
        method_parts[m] = method.get_hyperpartitions()
        logger.info('method %s has %d hyperpartitions'
                    % (m, len(method_parts[m])))

    # create hyperpartitions and datarun(s)
    run_ids = []
    if not run_per_partition:
        logger.debug('saving datarun...')
        datarun = create_datarun(db, dataset, run_config)

    logger.debug('saving hyperpartitions...')
    for method, parts in list(method_parts.items()):
        for part in parts:
            # if necessary, create a new datarun for each hyperpartition.
            # This setting is useful for debugging.
            if run_per_partition:
                datarun = create_datarun(db, dataset, run_config)
                run_ids.append(datarun.id)

            # create a new hyperpartition in the database
            db.create_hyperpartition(datarun_id=datarun.id,
                                     method=method,
                                     tunables=part.tunables,
                                     constants=part.constants,
                                     categoricals=part.categoricals,
                                     status=PartitionStatus.INCOMPLETE)

    logger.info('Data entry complete. Summary:')
    logger.info('\tDataset ID: %d' % dataset.id)
    logger.info('\tTraining data: %s' % dataset.train_path)
    logger.info('\tTest data: %s' % (dataset.test_path or 'None'))

    if run_per_partition:
        logger.info('\tDatarun IDs: %s' % ', '.join(map(str, run_ids)))
    else:
        logger.info('\tDatarun ID: %d' % datarun.id)

    logger.info('\tHyperpartition selection strategy: %s' % datarun.selector)
    logger.info('\tParameter tuning strategy: %s' % datarun.tuner)
    logger.info('\tBudget: %d (%s)' % (datarun.budget, datarun.budget_type))

    return run_ids or datarun.id
def create_wrapper(params, judgment_metric):
    config = METHODS_MAP[params["function"]]
    class_ = Method(config).class_
    return Wrapper(params["function"], judgment_metric, params, class_)
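# A hedged usage sketch for create_wrapper. The 'function' key selects the
# short method code; 'dt' and the extra keys are illustrative, standing in
# for whatever codes METHODS_MAP defines and whatever constructor kwargs
# the chosen sklearn class accepts.
params = {'function': 'dt', 'max_depth': 5, 'criterion': 'gini'}
wrapper = create_wrapper(params, judgment_metric='f1')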