Example #1
    def _update_latest_model_id(config, run_ids):
        """
        This is a helper function for using the `latest` feature in a local environment.
        Updates the local file containing the latest id used for an asset.
        :param config: current flow run configuration as a dictionary
        :param run_ids: list of mlapp identifiers generated in the current flow run.
        """
        # prepare latest file
        local_path = settings.get('local_storage_path', 'output')
        latest_file_name = settings.get('latest_file_name', 'latest_ids.json')
        if not os.path.exists(local_path):
            os.makedirs(local_path)
        latest_ids_path = os.path.join(local_path, latest_file_name)

        latest = {}
        try:
            with open(latest_ids_path) as f:
                latest = json.load(f)
        except (OSError, ValueError):
            # no latest file yet (or it is unreadable) - start with an empty mapping
            pass

        # iterate over pipelines
        for pipeline, run_id in zip(config['pipelines_configs'], run_ids):
            # check if ran any pipeline where id is being stored
            # TODO: add here asset label level
            # TODO: add here data_id/model_id/reuse_features_id
            if pipeline['job_settings']['pipeline'] in [
                    'train', 'feature_engineering'
            ]:
                latest[pipeline['job_settings']['asset_name']] = run_id

            with open(latest_ids_path, 'w') as f:
                json.dump(latest, f)
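For orientation, a minimal sketch of what this helper produces, assuming an illustrative config and run id (the asset name, id value and default paths below are examples, not values taken from the library):

    # hypothetical inputs: one 'train' pipeline for the asset 'crash_course'
    config = {
        'pipelines_configs': [
            {'job_settings': {'pipeline': 'train', 'asset_name': 'crash_course'}}
        ]
    }
    run_ids = ['3f2c9a10-demo-run-id']

    _update_latest_model_id(config, run_ids)
    # with the default settings, output/latest_ids.json now maps asset name -> latest run id:
    # {"crash_course": "3f2c9a10-demo-run-id"}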
Example #2
    def _insert_latest_id_in_config(config):
        """
        This is a helper function for using the `latest` feature in a local environment.
        Updates the current configuration with the latest id stored in a local file, looked up by asset name.
        :param config: current flow run configuration as a dictionary
        """
        # prepare latest file
        local_path = settings.get('local_storage_path', 'output')
        latest_file_name = settings.get('latest_file_name', 'latest_ids.json')
        latest_ids_path = os.path.join(local_path, latest_file_name)
        try:
            with open(latest_ids_path) as f:
                latest = json.load(f)
        except (OSError, ValueError):
            # no latest file yet (or it is unreadable) - start with an empty mapping
            latest = {}

        # iterate pipelines
        for i, pipeline in enumerate(config.get('pipelines_configs', [])):
            # iterate optional ids
            for id_type in ['model_id', 'data_id', 'reuse_features_id']:
                # check if requested latest
                if pipeline.get('job_settings', {}).get(id_type, None) == 'latest':
                    # get current asset name
                    asset_name = pipeline['job_settings']['asset_name']
                    # check if available id
                    if asset_name in latest:
                        # TODO: add here asset label level
                        # TODO: add here data_id/model_id/reuse_features_id
                        config['pipelines_configs'][i]['job_settings'][id_type] = latest[asset_name]
                    else:
                        # raise exception as not found id
                        raise Exception("Could not find latest `" + id_type + "` for `" + asset_name + "`. \n"
                                        "Please update your config with a valid `" + id_type + "`")
Example #3
    def __init__(self, run_id, pipeline_input, _input: IOManager,
                 _output: IOManager, config, *args, **kwargs):
        """
        :param pipeline_input: the pipeline name string or list of strings
        :param _input: IOmanager instance with input to the pipeline
        :param _output: IOmanager instance to store the outputs of the pipelines to be saved externally
        :param config: config string of the pipeline
        :param args:
        :param kwargs:
        """
        for asset_name in AVAILABLE_STAGES:
            if asset_name != BASE_CLASS_NAME:
                AVAILABLE_STAGES[asset_name] = {}

        self.pipeline_name = ''

        # pipeline can be either list of stages or string of a default pipeline
        if isinstance(pipeline_input, list):
            self.stages = pipeline_input
        if isinstance(pipeline_input, str):
            self.pipeline_name = " '" + pipeline_input + "'"
            self.stages = settings.get('pipelines', {}).get(pipeline_input, [])

        self.config = config
        self.run_id = run_id
        self.input_manager = _input
        self.output_manager = _output
        self.asset_name = self.config.get('job_settings',
                                          {}).get('asset_name', '')

        self.data_manager_instance = self.create_manager_instance('data')
        self.model_manager_instance = self.create_manager_instance('model')

        # per-stage state: each stage starts with its own empty inputs dict
        self.state = {stage: {} for stage in self.stages}
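When `pipeline_input` is a string, the stages are resolved from the `pipelines` entry in `settings`; a sketch of the expected shape (the pipeline and stage names are illustrative):

    # hypothetical settings entry consulted when a pipeline name string is passed
    # to the constructor above (pipeline and stage names are illustrative)
    settings = {
        'pipelines': {
            'train': ['load_data', 'clean_data', 'train_model'],
            'forecast': ['load_data', 'clean_data', 'predict']
        }
    }
    # passing 'train' selects the three stages above, while passing a list such as
    # ['load_data', 'train_model'] uses the stages exactly as given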
Example #4
    def init(self, handler_type):
        """
        Initialization, should be called only once.
        Populates the `self._handlers` and `self._main_handlers` variables depending on the configured services.
        :param handler_type: used for filtering services by the handler type
        """
        if not self.initialized:
            for service_name in settings.get('services', []):
                service_item = settings['services'][service_name]
                if 'type' not in service_item:
                    raise Exception("'{}' service is missing 'type' key, must be filled in config.py with"
                                    " the one of the following: database/file_storage/database/spark".format(service_name))
                if service_item['type'] == handler_type:
                    try:
                        self._handlers[service_name] = service_item['handler'](service_item.get('settings', {}))

                        # set it as main
                        if service_item.get('main', False):
                            self._main_handlers.append(service_name)

                    except SkipServiceException:
                        pass  # skipping this service
                    except Exception as e:
                        if service_item['handler'] is None:
                            raise Exception("'{}' service of type '{}' is missing a python library installation."
                                            .format(service_name, service_item.get('type')))
                        else:
                            raise e
            self.initialized = True
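A sketch of the `settings['services']` shape this method expects; the service name, handler class and connection settings below are illustrative assumptions:

    # hypothetical service definition; only the keys read by init() are shown
    settings = {
        'services': {
            'my-db': {
                'type': 'database',              # matched against `handler_type`
                'main': True,                    # appended to self._main_handlers
                'handler': PostgresHandler,      # illustrative class, called with the 'settings' dict
                'settings': {'hostname': 'localhost', 'port': 5432}
            }
        }
    }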
Example #5
    def __init__(self, config, _input: IOManager, _output: IOManager, run_id: str, *args, **kwargs):
        """
        :param config: configuration of the pipeline as a dictionary
        :param _input: IOManager instance with input to the pipeline
        :param _output: IOManager instance to store the outputs to be saved externally
        :param run_id: unique identifier of the current flow run
        """
        self.local_storage_path = settings.get("local_storage_path", "output")
        self.run_id = run_id
        self.data_settings = config.get('data_settings', {})
        self.model_settings = config.get('model_settings', {})
        self.job_settings = config.get('job_settings', {})
        self.flow_settings = config.get('flow_settings', {})
        self._input_manager = _input
        self._output_manager = _output
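A sketch of the config sections this constructor reads; the keys inside each section are illustrative only:

    # hypothetical config: only the four top-level sections come from the code above
    config = {
        'data_settings': {'data_sources': ['file_name.csv']},
        'model_settings': {'train_percent': 0.8},
        'job_settings': {'asset_name': 'crash_course', 'pipeline': 'train'},
        'flow_settings': {}
    }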
Example #6
from mlapp.main import MLApp
from mlapp.config import settings
import pandas as pd

if __name__ == "__main__":
    config = {
        'handler_name': 'handler',
        'files': [{
            'table_name': 'table',
            'file_name': 'file_name.csv'
        }]
    }

    mlapp = MLApp({'env_file_path': 'path/to/.env'})
    handlers = {}
    for service_name in settings.get('services', []):
        service_item = settings['services'][service_name]
        try:
            handlers[service_name] = service_item['handler'](service_item.get('settings', {}))
        except Exception as e:
            if service_item['handler'] is None:
                raise Exception("{} service is missing a python library installation.".format(service_name))
            else:
                raise e

    for item in config['files']:
        df = pd.read_csv(item['file_name'])
        handlers[config['handler_name']].insert_df(item['table_name'], df)