def _update_latest_model_id(config, run_ids):
    """
    This is a helper function for using the `latest` feature in a local environment.
    Updates the local file containing the latest id used for an asset.
    :param config: current flow run configuration as a dictionary
    :param run_ids: list of mlapp identifiers generated in the current flow run
    """
    # prepare latest file
    local_path = settings.get('local_storage_path', 'output')
    latest_file_name = settings.get('latest_file_name', 'latest_ids.json')
    if not os.path.exists(local_path):
        os.makedirs(local_path)
    latest_ids_path = os.path.join(local_path, latest_file_name)

    latest = {}
    try:
        with open(latest_ids_path) as f:
            latest = json.load(f)
    except (IOError, ValueError):
        pass  # file does not exist yet or is not valid JSON

    # iterate over pipelines
    for pipeline, run_id in zip(config['pipelines_configs'], run_ids):
        # check if ran any pipeline where id is being stored
        # TODO: add here asset label level
        # TODO: add here data_id/model_id/reuse_features_id
        if pipeline['job_settings']['pipeline'] in ['train', 'feature_engineering']:
            latest[pipeline['job_settings']['asset_name']] = run_id

    with open(latest_ids_path, 'w') as f:
        json.dump(latest, f)
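# A minimal sketch of the file `_update_latest_model_id` writes, assuming a
# hypothetical asset called 'crash_course' and a made-up run id. The file simply
# maps asset names to the id of their most recent 'train' or
# 'feature_engineering' run, using the default local paths shown above.
import json
import os

latest_ids_path = os.path.join('output', 'latest_ids.json')  # default local_storage_path / latest_file_name
if os.path.exists(latest_ids_path):
    with open(latest_ids_path) as f:
        print(json.load(f))  # e.g. {'crash_course': '6d8f5f9e-...'} (asset name and id are hypothetical)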
def _insert_latest_id_in_config(config):
    """
    This is a helper function for using the `latest` feature in a local environment.
    Updates the configuration about to be run with the latest id stored in the local
    file, keyed by asset name.
    :param config: current flow run configuration as a dictionary
    """
    # prepare latest file
    local_path = settings.get('local_storage_path', 'output')
    latest_file_name = settings.get('latest_file_name', 'latest_ids.json')
    latest_ids_path = os.path.join(local_path, latest_file_name)
    try:
        with open(latest_ids_path) as f:
            latest = json.load(f)
    except (IOError, ValueError):
        latest = {}  # no latest ids stored yet

    # iterate pipelines
    for i, pipeline in enumerate(config.get('pipelines_configs', [])):
        # iterate optional ids
        for id_type in ['model_id', 'data_id', 'reuse_features_id']:
            # check if requested latest
            if pipeline.get('job_settings', {}).get(id_type, None) == 'latest':
                # get current asset name
                asset_name = pipeline['job_settings']['asset_name']
                # check if an id is available
                if asset_name in latest:
                    # TODO: add here asset label level
                    # TODO: add here data_id/model_id/reuse_features_id
                    config['pipelines_configs'][i]['job_settings'][id_type] = latest[asset_name]
                else:
                    # raise exception as the id was not found
                    raise Exception(
                        "Could not find latest `" + id_type + "` for `" + asset_name + "`. \n"
                        "Please update your config with a valid `" + id_type + "`")
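# A minimal sketch of how the `latest` keyword gets resolved, assuming the
# hypothetical asset 'crash_course' already has an entry in latest_ids.json.
# A pipeline config that sets `model_id: 'latest'` is rewritten in place by
# `_insert_latest_id_in_config` to carry the stored id instead of the literal string.
config = {
    'pipelines_configs': [{
        'job_settings': {
            'asset_name': 'crash_course',   # hypothetical asset name
            'pipeline': 'forecast',
            'model_id': 'latest'            # resolved from latest_ids.json before the run
        }
    }]
}
_insert_latest_id_in_config(config)
print(config['pipelines_configs'][0]['job_settings']['model_id'])  # prints the stored run id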
def __init__(self, run_id, pipeline_input, _input: IOManager, _output: IOManager, config, *args, **kwargs):
    """
    :param run_id: unique identifier of the current run
    :param pipeline_input: the pipeline name string or list of stage name strings
    :param _input: IOManager instance with the inputs to the pipeline
    :param _output: IOManager instance to store the outputs of the pipeline to be saved externally
    :param config: configuration dictionary of the pipeline
    :param args:
    :param kwargs:
    """
    for asset_name in AVAILABLE_STAGES:
        if asset_name != BASE_CLASS_NAME:
            AVAILABLE_STAGES[asset_name] = {}

    self.pipeline_name = ''
    # pipeline can be either a list of stages or a string naming a default pipeline
    if isinstance(pipeline_input, list):
        self.stages = pipeline_input
    if isinstance(pipeline_input, str):
        self.pipeline_name = " '" + pipeline_input + "'"
        self.stages = settings.get('pipelines', {}).get(pipeline_input, [])

    self.config = config
    self.run_id = run_id
    self.input_manager = _input
    self.output_manager = _output
    self.asset_name = self.config.get('job_settings', {}).get('asset_name', '')
    self.data_manager_instance = self.create_manager_instance('data')
    self.model_manager_instance = self.create_manager_instance('model')

    # first inputs: each stage gets its own empty state dictionary
    # (a dict comprehension instead of dict.fromkeys, so stages do not share one mutable dict)
    self.state = {stage: {} for stage in self.stages}
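# A minimal sketch of the two accepted `pipeline_input` forms, with hypothetical
# stage names. A string is looked up in settings['pipelines'] as a predefined
# pipeline, while a list is taken verbatim as the ordered stages to run.
predefined = 'train'                                       # resolved via settings.get('pipelines', {})
explicit = ['load_data', 'clean_data', 'transform_data']   # hypothetical stage names, used as-is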
def init(self, handler_type):
    """
    Initialization, should be called only once.
    Populates the `self._handlers` and `self._main_handlers` variables depending on the set environment.
    :param handler_type: used for filtering services by the handler type
    """
    if not self.initialized:
        for service_name in settings.get('services', []):
            service_item = settings['services'][service_name]
            if 'type' not in service_item:
                raise Exception(
                    "'{}' service is missing 'type' key, must be filled in config.py with one of"
                    " the following: database/file_storage/spark".format(service_name))
            if service_item['type'] == handler_type:
                try:
                    self._handlers[service_name] = service_item['handler'](service_item.get('settings', {}))
                    # set it as main
                    if service_item.get('main', False):
                        self._main_handlers.append(service_name)
                except SkipServiceException:
                    pass  # skipping this service
                except Exception as e:
                    if service_item['handler'] is None:
                        raise Exception(
                            "'{}' service of type '{}' is missing a python library installation."
                            .format(service_name, service_item.get('type')))
                    else:
                        raise e
        self.initialized = True
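# A minimal sketch of a services entry that `init` would pick up. The service
# name 'my_db' and its settings are hypothetical; 'handler' would normally be a
# handler class (a callable taking a settings dict), left here as a None placeholder.
from mlapp.config import settings

settings['services'] = {
    'my_db': {
        'type': 'database',                        # only entries matching handler_type are instantiated
        'handler': None,                           # placeholder for a real handler class
        'settings': {'connection_string': '...'},  # hypothetical connection settings
        'main': True                               # marks this service as a main handler of its type
    }
}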
def __init__(self, config, _input: IOManager, _output: IOManager, run_id: str, *args, **kwargs):
    """
    :param config: pipeline configuration as a dictionary
    :param _input: IOManager instance with the inputs to the manager
    :param _output: IOManager instance used to store the outputs
    :param run_id: unique identifier of the current run
    """
    self.local_storage_path = settings.get("local_storage_path", "output")
    self.run_id = run_id
    self.data_settings = config.get('data_settings', {})
    self.model_settings = config.get('model_settings', {})
    self.job_settings = config.get('job_settings', {})
    self.flow_settings = config.get('flow_settings', {})
    self._input_manager = _input
    self._output_manager = _output
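# A minimal sketch of the config dictionary this constructor expects; the keys
# shown are the ones read above, and their inner contents are hypothetical.
config = {
    'data_settings': {'local_file_path': 'data/input.csv'},   # hypothetical
    'model_settings': {'train_percent': 0.8},                 # hypothetical
    'job_settings': {'asset_name': 'crash_course'},           # hypothetical
    'flow_settings': {}
}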
from mlapp.main import MLApp
from mlapp.config import settings
import pandas as pd


if __name__ == "__main__":
    config = {
        'handler_name': 'handler',
        'files': [{
            'table_name': 'table',
            'file_name': 'file_name.csv'
        }]
    }

    mlapp = MLApp({'env_file_path': 'path/to/.env'})

    # instantiate all configured service handlers
    handlers = {}
    for service_name in settings.get('services', []):
        service_item = settings['services'][service_name]
        try:
            handlers[service_name] = service_item['handler'](service_item.get('settings', {}))
        except Exception as e:
            if service_item['handler'] is None:
                raise Exception("{} service is missing a python library installation.".format(service_name))
            else:
                raise e

    # load each CSV file and insert it into its target table via the chosen handler
    for item in config['files']:
        df = pd.read_csv(item['file_name'])
        handlers[config['handler_name']].insert_df(item['table_name'], df)