def produce(self, specification):
    configuration = specification.get('configuration', {})
    predict_type = configuration.get('predict_type', 'RAW')

    # REFIT
    dataframe_train = Dataset(specification['train']).get_dataframe().dropna()
    stimulus = self.make_stimulus(dataframe_train[self.predictors])
    self.fit(stimulus, dataframe_train[self.targets[0]], specification['train'])

    # PRODUCE
    dataframe = Dataset(specification['input']).get_dataframe().dropna()
    dataframe.reset_index(drop=True, inplace=True)
    stimulus = self.make_stimulus(dataframe[self.predictors])

    # write predictions to a uniquely named file under the output directory
    output_directory_path = specification['output']['resource_uri'].replace('file://', '')
    output_path = '/' + os.path.join(
        *output_directory_path.split('/'),
        str(uuid.uuid4()) + '.csv')

    if self.system == 'mljar-supervised':
        # mljar's predict() returns one probability column per class
        predictions = self.model.predict(stimulus)
        if predict_type == 'RAW':
            # recover hard labels from the per-class probabilities
            predictions = pandas.DataFrame((predictions.idxmax(axis=1) == 'p_1').astype(int))
            predictions.columns = [self.targets[0]]
    else:
        if predict_type == 'RAW':
            predictions = self.model.predict(stimulus)
            if len(predictions.shape) > 1:
                predictions = np.argmax(predictions, axis=-1)
            predictions = pandas.DataFrame(predictions, columns=[self.targets[0]]).astype(int)
        else:
            predictions = self.model.predict_proba(stimulus)
            # TODO: standardize probability column names
            predictions = pandas.DataFrame(
                predictions,
                columns=[f'p_{i}' for i in range(predictions.shape[1])])

    predictions.reset_index(drop=True, inplace=True)
    predictions.insert(0, 'd3mIndex', dataframe['d3mIndex'])

    if not os.path.exists(output_directory_path):
        os.makedirs(output_directory_path)

    # write from the filesystem root, then restore the working directory
    cwd = os.getcwd()
    try:
        os.chdir('/')
        predictions.to_csv(output_path, index=False)
    finally:
        os.chdir(cwd)

    return {
        'produce': {
            'input': specification['input'],
            'configuration': configuration,
            'data_pointer': output_path
        },
        'search_id': self.search_id,
        'model_id': self.model_id,
        'system': self.system
    }

def score(self, specification):
    # TODO: refitting -> respect configuration
    configuration = specification['configuration']
    dataframe = Dataset(specification['input']).get_dataframe()

    target = self.targets[0]
    if self.task == 'CLASSIFICATION':
        dataframe[target] = dataframe[target].astype(str)

    predicted = self.model.predict(dataframe)

    scores = []
    for metric in specification['performanceMetrics']:
        scores.append({
            'value': get_metric(metric)(
                dataframe[target],
                predicted[f'{target}_predictions']),
            'metric': metric,
            'target': target
        })

    return {
        'search_id': self.search_id,
        'model_id': self.model_id,
        'scores': scores,
        'system': self.system
    }

def produce(self, produce_specification):
    configuration = produce_specification.get('configuration', {})
    predict_type = configuration.get('predict_type', 'RAW')

    dataframe = Dataset(produce_specification['input']).get_dataframe()

    if self.task in ['REGRESSION', 'CLASSIFICATION']:
        dataframe_train = Dataset(produce_specification['train']).get_dataframe().dropna()
        self.fit(
            dataframe=dataframe_train,
            data_specification=produce_specification['train'])

    if predict_type == 'RAW':
        predicted = self.model.predict(dataframe)
        # if len(predicted.columns.values) > 1:
        #     predicted = np.argmax(predicted, axis=-1)
    else:
        predicted = self.model.predict_proba(dataframe)
        # TODO: standardize probability column names
        predicted = pandas.DataFrame(
            predicted,
            columns=[f'p_{i}' for i in range(predicted.shape[1])])

    output_directory_path = produce_specification['output']['resource_uri'].replace('file://', '')
    output_path = '/' + os.path.join(
        *output_directory_path.split('/'),
        str(uuid.uuid4()) + '.csv')

    if 'd3mIndex' not in predicted.columns.values:
        predicted.insert(0, 'd3mIndex', dataframe['d3mIndex'])

    if not os.path.exists(output_directory_path):
        os.makedirs(output_directory_path)

    cwd = os.getcwd()
    try:
        os.chdir('/')
        predicted.to_csv(output_path, index=False)
    finally:
        os.chdir(cwd)

    return {
        'produce': {
            'input': produce_specification['input'],
            'configuration': configuration,
            'data_pointer': output_path
        },
        'search_id': self.search_id,
        'model_id': self.model_id,
        'system': self.system
    }

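# Illustrative only: a minimal `specification` of the shape the produce()/score()
# methods above consume. The key names mirror the accesses in this module; the
# URIs, the inner structure of 'input'/'train', and the metric list are
# hypothetical placeholders.
_EXAMPLE_MODEL_SPECIFICATION = {
    'input': {'resource_uri': 'file:///ravens_volume/example/learningData.csv'},
    'train': {'resource_uri': 'file:///ravens_volume/example/trainData.csv'},
    'output': {'resource_uri': 'file:///ravens_volume/example/predictions/'},
    'configuration': {'predict_type': 'RAW'},  # or 'PROBABILITIES'
    'performanceMetrics': [{'metric': 'ACCURACY'}]
}
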
def run(self):
    from supervised.automl import AutoML

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe().dropna()

    X = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    stimulus, preprocessor = preprocess(dataframe, self.specification)

    if self.specification.get('timeBoundSearch'):
        self.system_params['total_time_limit'] = self.specification['timeBoundSearch']
    if self.specification.get('timeBoundRun'):
        self.system_params['learner_time_limit'] = self.specification['timeBoundRun']

    automl = AutoML(**self.system_params)

    # mljar seems kind of fragile? coerce column labels to stripped strings
    stimulus = pandas.DataFrame(stimulus)
    stimulus.columns = [str(i).strip() for i in stimulus.columns]

    automl.fit(stimulus, dataframe[y])

    # keep the four models with the lowest final loss
    for model_mljar in sorted(automl._models, key=lambda m: m.get_final_loss())[:4]:
        model = ModelSklearn(
            model_mljar,
            system='mljar-supervised',
            search_id=self.search_id,
            predictors=X,
            targets=[y],
            preprocess=preprocessor,
            task=self.specification['problem']['taskType'])
        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](
            model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'mljar-supervised'
        }
    }

def score(self, score_specification):
    # configuration = score_specification['configuration']
    dataframe = Dataset(score_specification['input']).get_dataframe()

    if self.task == "FORECASTING":
        # dataframe_train = Dataset(score_specification['train']).get_dataframe()
        # horizon = configuration.get('forecastingHorizon', {}).get('value', 1)
        # if len(dataframe) < horizon:
        #     raise ValueError(f'No predictions with a horizon of {horizon} are within range of the test data.')
        predicted = self.model.predict(dataframe)
        # predicted = self.forecast(
        #     dataframe=dataframe_train,
        #     dataframe_rolling=dataframe,
        #     horizon=horizon)[:len(dataframe) - horizon + 1]

    elif self.task in ['CLASSIFICATION', 'REGRESSION']:
        # TODO: respect configuration on holdout vs cross-validation, do refitting, etc.
        if self.task == 'CLASSIFICATION':
            for target in self.targets:
                dataframe[target] = dataframe[target].astype(str)

        predicted = self.model.predict(dataframe)

        if self.task == 'CLASSIFICATION':
            for target in self.targets:
                predicted[target] = predicted[target].astype(str)
    else:
        raise NotImplementedError

    scores = []
    for target in self.targets:
        for metric in score_specification['performanceMetrics']:
            # drop rows where either the actual or the predicted value is missing
            results = pandas.DataFrame({
                'actual': dataframe[target],
                'predicted': predicted[target]
            })
            results.dropna(inplace=True)
            scores.append({
                'value': get_metric(metric)(results['actual'], results['predicted']),
                'metric': metric,
                'target': target
            })

    return {
        'search_id': self.search_id,
        'model_id': self.model_id,
        'scores': scores,
        'system': self.system
    }

def score(self, specification):
    dataframe = Dataset(specification['input']).get_dataframe()[self.predictors + self.targets].dropna()
    dataframe.reset_index(drop=True, inplace=True)

    configuration = specification['configuration']
    splits = self.make_splits(configuration, dataframe)

    split_scores = defaultdict(list)
    split_weights = defaultdict(list)
    for train_split, test_split in splits:
        # refit on the train split, evaluate on the held-out split
        self.fit(self.make_stimulus(train_split), train_split[self.targets[0]])

        actual = np.array(test_split[self.targets[0]])
        predicted = self.model.predict(self.make_stimulus(test_split))

        if 'CLASSIFICATION' in self.task:
            actual = actual.astype(int)

        if self.system == 'mljar-supervised':
            # mljar returns per-class probabilities; recover hard labels
            predicted = pandas.DataFrame((predicted.idxmax(axis=1) == 'p_1').astype(int))
            predicted.columns = [self.targets[0]]

        for metric in specification['performanceMetrics']:
            try:
                split_scores[json.dumps(metric)].append(get_metric(metric)(actual, predicted))
                split_weights[json.dumps(metric)].append(test_split.size)
            except ValueError:
                pass

    scores = []
    for metric in split_scores:
        if split_scores[metric]:
            scores.append({
                # split-size-weighted mean of the per-split metric values
                'value': np.average(split_scores[metric], weights=split_weights[metric]),
                'metric': json.loads(metric),
                'target': self.targets[0]
            })

    return {
        'search_id': self.search_id,
        'model_id': self.model_id,
        'scores': scores,
        'system': self.system
    }

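# For reference: the score() above aggregates per-split metric values with a
# split-size-weighted mean. With hypothetical fold scores and test-split sizes:
#     np.average([0.81, 0.78, 0.84], weights=[120, 115, 130])  # ~0.8112
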
def produce(self, specification):
    configuration = specification.get('configuration', {})
    predict_type = configuration.get('predict_type', 'RAW')

    dataset = Dataset(specification['input'])
    dataframe = dataset.get_dataframe()

    predictions = self.model.predict(dataframe)

    if predict_type == 'RAW':
        predictions = predictions[[f'{self.targets[0]}_predictions']]
        predictions.columns = [self.targets[0]]
    if predict_type == 'PROBABILITIES':
        # keep only the per-class probability columns for the first target
        predictions = predictions[[
            i for i in predictions.columns.values
            if i.startswith(f'{self.targets[0]}_probabilities_')]]

    predictions.insert(0, 'd3mIndex', dataframe['d3mIndex'])

    output_directory_path = specification['output']['resource_uri'].replace('file://', '')
    output_path = '/' + os.path.join(
        *output_directory_path.split('/'),
        str(uuid.uuid4()) + '.csv')

    if not os.path.exists(output_directory_path):
        os.makedirs(output_directory_path)

    cwd = os.getcwd()
    try:
        os.chdir('/')
        predictions.to_csv(output_path, index=False)
    finally:
        os.chdir(cwd)

    return {
        'produce': {
            'input': specification['input'],
            'configuration': configuration,
            'data_pointer': output_path
        },
        'search_id': self.search_id,
        'model_id': self.model_id,
        'system': self.system
    }

def run(self):
    import autosklearn.classification
    import autosklearn.regression

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe().dropna()
    dataframe.reset_index(drop=True, inplace=True)

    stimulus, preprocessor = preprocess(dataframe, self.specification)

    x = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    # if os.path.exists(tmp_folder):
    #     shutil.rmtree(tmp_folder)
    # if os.path.exists(output_folder):
    #     shutil.rmtree(output_folder)

    # TODO: auto_sklearn has a bug with weak references when certain non-default options are used.
    #  Just avoiding this bug for now
    # if 'configuration' in self.specification:
    #     config = self.specification['configuration']
    #
    #     self.system_params['resampling_strategy_arguments'] = self.system_params.get('resampling_strategy_arguments', {})
    #     self.system_params['resampling_strategy_arguments']['shuffle'] = config.get('shuffle', False)
    #
    #     if config['method'] == "HOLDOUT":
    #         self.system_params['resampling_strategy'] = 'holdOut'
    #         self.system_params['resampling_strategy_arguments']['train_size'] = max(0, config.get('trainTestRatio')) or .6
    #
    #     if config['method'] == "K_FOLD":
    #         self.system_params['resampling_strategy'] = 'cv'
    #         self.system_params['resampling_strategy_arguments']['folds'] = config.get('folds') or 10

    if self.specification.get('timeBoundSearch'):
        self.system_params['time_left_for_this_task'] = self.specification.get('timeBoundSearch')
    if self.specification.get('timeBoundRun'):
        self.system_params['per_run_time_limit'] = self.specification.get('timeBoundRun')

    # sklearn_temp_path = '/ravens_volume/solvers/auto_sklearn/temporary/' + str(uuid.uuid4())
    # tmp_folder = os.path.join(*sklearn_temp_path.split('/'), 'temp')
    # output_folder = os.path.join(*sklearn_temp_path.split('/'), 'output')
    # self.system_params['tmp_folder'] = tmp_folder
    # self.system_params['output_folder'] = output_folder
    # self.system_params['delete_tmp_folder_after_terminate'] = False

    # turn off daemon flag from the currently running process, to allow child processes from auto_sklearn
    multiprocessing.current_process()._config['daemon'] = False
    self.system_params['n_jobs'] = 1

    # valid system params:
    # https://automl.github.io/auto-sklearn/master/api.html#api
    automl = {
        'REGRESSION': autosklearn.regression.AutoSklearnRegressor,
        'CLASSIFICATION': autosklearn.classification.AutoSklearnClassifier
    }[self.specification['problem']['taskType']](**self.system_params)

    automl.fit(stimulus.copy(), dataframe[y].copy())

    # if self.system_params.get('resampling_strategy') == 'cv':
    automl.refit(stimulus, dataframe[y])

    model = ModelSklearn(
        automl,
        system='auto_sklearn',
        search_id=self.search_id,
        predictors=x,
        targets=[y],
        preprocess=preprocessor,
        task=self.specification['problem']['taskType'])
    model.save()

    from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
    FOUND_MODEL_CALLBACKS[self.callback_found](model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'auto_sklearn'
        }
    }

def run(self):
    from ludwig.api import LudwigModel

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe()

    predictors = self.specification['problem']['predictors']
    targets = self.specification['problem']['targets']

    target_type = {
        "REGRESSION": 'numerical',
        "CLASSIFICATION": 'category'
    }[self.specification['problem']['taskType']]

    if self.specification['problem']['taskType'] == 'CLASSIFICATION':
        dataframe[targets[0]] = dataframe[targets[0]].astype(str)

    # https://github.com/uber/ludwig/blob/master/tests/integration_tests/utils.py
    model_definition = {
        "input_features": [{
            "name": predictor,
            "type": 'category' if predictor in self.specification['problem']['categorical'] else 'numerical'
        } for predictor in predictors],
        "output_features": [{
            "name": target,
            "type": target_type
        } for target in targets]
    }

    automl = LudwigModel(model_definition)
    train_statistics = automl.train(dataframe)

    print('train_statistics')
    print(train_statistics)

    model = ModelLudwig(
        automl,
        search_id=self.search_id,
        predictors=predictors,
        targets=targets,
        task=self.specification['problem']['taskType'])
    model.save()

    from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
    FOUND_MODEL_CALLBACKS[self.callback_found](model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'ludwig'
        }
    }

def run(self):
    import mlbox.model.classification
    import mlbox.model.regression

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe().dropna()

    X = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    stimulus, preprocessor = preprocess(dataframe, self.specification)

    strategies = {
        'REGRESSION': [
            "LightGBM", "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost", "Linear"
        ],
        'CLASSIFICATION': [
            "LightGBM", "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost", "Linear"
        ],
    }
    if self.FAST_DEBUG:
        strategies = {
            'REGRESSION': ["RandomForest"],
            'CLASSIFICATION': ["RandomForest"],
        }

    solver = {
        'REGRESSION': mlbox.model.regression.Regressor,
        'CLASSIFICATION': mlbox.model.classification.Classifier
    }

    for strategy in strategies[self.specification['problem']['taskType']]:
        automl = solver[self.specification['problem']['taskType']](
            strategy=strategy, **self.system_params)

        # mlbox expects a dense dataframe
        if issubclass(type(stimulus), csr_matrix):
            stimulus = stimulus.toarray()

        automl.fit(df_train=pandas.DataFrame(stimulus), y_train=dataframe[y])

        model = ModelSklearn(
            automl,
            system='mlbox',
            search_id=self.search_id,
            predictors=X,
            targets=[y],
            preprocess=preprocessor,
            task=self.specification['problem']['taskType'])
        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](
            model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'mlbox'
        }
    }

def run(self):
    import tpot

    dataset = Dataset(self.specification['input'])
    dataframe = dataset.get_dataframe().dropna()

    stimulus, preprocessor = preprocess(dataframe, self.specification)

    X = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    self.system_params['config_dict'] = 'TPOT sparse'

    # if 'configuration' in self.specification:
    #     config = self.specification['configuration']
    #
    #     if config['method'] == "HOLDOUT":
    #         self.system_params['cv'] =
    #
    #     if config['method'] == "K_FOLD":
    #         self.system_params['cv'] =

    if self.specification.get('timeBoundSearch'):
        self.system_params['max_time_mins'] = self.specification.get('timeBoundSearch') / 60.
    if self.specification.get('timeBoundRun'):
        self.system_params['max_eval_time_mins'] = self.specification.get('timeBoundRun') / 60.

    # custom scorers cause an unidentified SIGSEGV upon exit of search
    # scorer = make_scorer(
    #     get_metric(self.specification['performanceMetric']),
    #     greater_is_better=should_maximize(self.specification['performanceMetric']))
    # self.system_params['scoring'] = scorer

    self.system_params['n_jobs'] = 1

    automl = {
        'REGRESSION': tpot.TPOTRegressor,
        'CLASSIFICATION': tpot.TPOTClassifier
    }[self.specification['problem']['taskType']](**self.system_params)

    automl.fit(stimulus, dataframe[y])

    # selected models along the cost-complexity vs accuracy frontier
    for model_str in automl.pareto_front_fitted_pipelines_:
        model = ModelSklearn(
            automl.pareto_front_fitted_pipelines_[model_str],
            system='tpot',
            search_id=self.search_id,
            predictors=X,
            targets=[y],
            preprocess=preprocessor,
            task=self.specification['problem']['taskType'])
        model.save()

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](
            model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'tpot'
        }
    }

def run(self):
    import h2o
    from h2o.automl import H2OAutoML

    # ensure backend solver is running
    h2o.init()

    train = h2o.import_file(Dataset(self.specification['input']).get_resource_uri())
    test = None

    X = self.specification['problem']['predictors']
    y = self.specification['problem']['targets'][0]

    if self.specification['problem']['taskType'] == 'CLASSIFICATION':
        if train.types[y] == u'real':
            train[y] = train[y].ascharacter()
        # For classification, response should be a factor
        train[y] = train[y].asfactor()

    if 'configuration' in self.specification:
        config = self.specification['configuration']

        if config['method'] == "HOLDOUT":
            train, test = train.split_frame(
                ratios=[max(0, config.get('trainTestRatio')) or .6],
                seed=config.get('randomSeed'))

        if config['method'] == "K_FOLD":
            self.system_params['nfolds'] = config.get('folds') or 10
            self.system_params['balance_classes'] = config.get('stratified', False)

    if 'timeBoundSearch' in self.specification:
        self.system_params['max_runtime_secs'] = self.specification['timeBoundSearch']
    if 'timeBoundRun' in self.specification:
        self.system_params['max_runtime_secs_per_model'] = self.specification['timeBoundRun']
    if 'rankSolutionsLimit' in self.specification:
        self.system_params['max_models'] = self.specification['rankSolutionsLimit']

    # sort_metrics = {
    #     'ACCURACY': "rmse",
    #     'ROC_AUC': "auc",
    #     'MEAN_SQUARED_ERROR': "mse",
    #     'ROOT_MEAN_SQUARED_ERROR': "rmse",
    #     'MEAN_ABSOLUTE_ERROR': "mae",
    #     'LOSS': "logloss",
    # }
    # if 'performanceMetric' in self.specification:
    #     metric_spec = self.specification['performanceMetric']
    #     if metric_spec['metric'] in sort_metrics:
    #         self.system_params['sort_metric'] = sort_metrics[metric_spec['metric']]
    #         self.system_params['stopping_metric'] = sort_metrics[metric_spec['metric']]

    # CV models are useful for model comparisons
    # self.system_params['keep_cross_validation_models'] = True

    if 'CLASSIFICATION' in self.specification['problem']['taskType']:
        train[y] = train[y].asfactor()

    train_params = {"x": X, "y": y, "training_frame": train}
    # truth-testing an H2OFrame raises; compare against None instead
    if test is not None:
        train_params['leaderboard_frame'] = test

    automl = H2OAutoML(**self.system_params)
    automl.train(**train_params)

    if not automl.leader:
        return {
            KEY_SUCCESS: False,
            KEY_MESSAGE: 'no models found',
            KEY_DATA: {
                'search_id': self.search_id,
                'system': 'h2o'
            }
        }

    leaderboard = automl.leaderboard

    # take up to 10 models
    for model_id in leaderboard.head(10).as_data_frame()['model_id']:
        model = ModelH2O(
            h2o.get_model(model_id),
            search_id=self.search_id,
            predictors=X,
            targets=[y],
            task=self.specification['problem']['taskType'])

        from tworaven_apps.solver_interfaces.tasks import FOUND_MODEL_CALLBACKS
        FOUND_MODEL_CALLBACKS[self.callback_found](
            model, **(self.callback_arguments or {}))

    return {
        KEY_SUCCESS: True,
        KEY_MESSAGE: 'search complete',
        KEY_DATA: {
            'search_id': self.search_id,
            'system': 'h2o'
        }
    }

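# Illustrative only: the shape of `self.specification` consumed by the run()
# methods above. Key names are taken from the accesses in this module; every
# value below is a hypothetical placeholder.
_EXAMPLE_SEARCH_SPECIFICATION = {
    'input': {'resource_uri': 'file:///ravens_volume/example/learningData.csv'},
    'problem': {
        'predictors': ['x1', 'x2'],
        'targets': ['y'],
        'categorical': ['x2'],
        'taskType': 'CLASSIFICATION'  # or 'REGRESSION'
    },
    'timeBoundSearch': 60,  # seconds for the whole search
    'timeBoundRun': 10,     # seconds per candidate model
    'rankSolutionsLimit': 10,
    'configuration': {
        'method': 'K_FOLD',  # or 'HOLDOUT'
        'folds': 10,
        'trainTestRatio': .6,
        'stratified': False,
        'shuffle': False,
        'randomSeed': 0
    },
    'performanceMetric': {'metric': 'ACCURACY'}
}
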
def produce(self, specification):
    import h2o
    configuration = specification.get('configuration', {})
    predict_type = configuration.get('predict_type', 'RAW')

    # REFIT on the training data
    train = h2o.import_file(Dataset(specification['train']).get_resource_uri())
    y = self.targets[0]
    if 'CLASSIFICATION' in self.task:
        if train.types[y] == u'real':
            train[y] = train[y].ascharacter()
        train[y] = train[y].asfactor()
    self.fit(train, specification['train'])

    test_dataset = Dataset(specification['input'])
    data = h2o.import_file(test_dataset.get_resource_uri())
    if 'CLASSIFICATION' in self.task:
        if data.types[y] == u'real':
            data[y] = data[y].ascharacter()
        data[y] = data[y].asfactor()

    # retry once
    try:
        predictions = self.model.predict(data).as_data_frame()
    except Exception:
        predictions = self.model.predict(data).as_data_frame()

    if predict_type == 'RAW':
        predictions = predictions[['predict']]
        predictions.columns = [y]
    else:
        # TODO: standardize probability column names
        predictions.drop(columns=['predict'], inplace=True)

    predictions['d3mIndex'] = test_dataset.get_dataframe()['d3mIndex']

    output_directory_path = specification['output']['resource_uri'].replace('file://', '')
    output_path = '/' + os.path.join(
        *output_directory_path.split('/'),
        str(uuid.uuid4()) + '.csv')

    if not os.path.exists(output_directory_path):
        os.makedirs(output_directory_path)

    cwd = os.getcwd()
    try:
        os.chdir('/')
        predictions.to_csv(output_path, index=False)
    finally:
        os.chdir(cwd)

    return {
        'produce': {
            'input': specification['input'],
            'configuration': configuration,
            'data_pointer': output_path
        },
        'search_id': self.search_id,
        'model_id': self.model_id,
        'system': self.system
    }

def score(self, specification):
    import h2o
    configuration = specification['configuration']

    resource_uri = Dataset(specification['input']).get_resource_uri()
    data = h2o.import_file(resource_uri)

    y = self.targets[0]
    if 'CLASSIFICATION' in self.task:
        if data.types[y] == u'real':
            data[y] = data[y].ascharacter()
        data[y] = data[y].asfactor()

    # align predictions with actuals and drop rows with missing values
    results = pandas.DataFrame({
        'predict': self.model.predict(data).as_data_frame()['predict'],
        'actual': data[y].as_data_frame()[y]
    }).dropna()

    if 'CLASSIFICATION' in self.task:
        if data.types[y] == u'real':
            data[y] = data[y].ascharacter()
        results['actual'] = results['actual'].astype(int)

    scores = []
    for metric_schema in specification['performanceMetrics']:
        try:
            scores.append({
                'value': get_metric(metric_schema)(
                    results['actual'],
                    results['predict']),
                'metric': metric_schema,
                'target': y
            })
        except ValueError as err:
            print(f'Could not evaluate metric: {str(metric_schema)}')
            print(err)

    # if configuration.get('stratified'):
    #     # how does h2o know which column to stratify for? weirdness here
    #     folds = data.stratified_kfold_column(n_folds=configuration['folds'])
    # else:
    #     folds = data.kfold_column(n_folds=configuration['folds'])
    #
    # split_scores = defaultdict(list)
    # split_weights = defaultdict(list)
    # for split_id in range(configuration['folds']):
    #     train, test = data[folds != split_id], data[folds == split_id]
    #     self.fit(train)
    #
    #     results = pandas.DataFrame({
    #         'predict': self.model.predict(test).as_data_frame()['predict'],
    #         'actual': test[self.targets[0]].as_data_frame()[self.targets[0]]
    #     }).dropna()
    #
    #     if 'CLASSIFICATION' in self.task:
    #         results['actual'] = results['actual'].astype(int)
    #
    #     for metric_schema in specification['performanceMetrics']:
    #         try:
    #             split_scores[json.dumps(metric_schema)].append(get_metric(metric_schema)(
    #                 results['actual'],
    #                 results['predict']))
    #             split_weights[json.dumps(metric_schema)].append(results.size)
    #         except ValueError as err:
    #             print(f'Could not evaluate metric: {str(metric_schema)}')
    #             print(err)
    #
    # for metric in split_scores:
    #     scores.append({
    #         'value': np.average(split_scores[metric], weights=split_weights[metric]),
    #         'metric': json.loads(metric),
    #         'target': self.targets[0]
    #     })

    return {
        'search_id': self.search_id,
        'model_id': self.model_id,
        'scores': scores,
        'system': self.system
    }