def get_full_metadata_by_uuid(model_uuid, collection_name=None):
    """Retrieve model parameter metadata for the given model_uuid and collection.
    The returned metadata dictionary will include training run performance metrics and
    training dataset metadata.

    Args:
        model_uuid (str): model unique identifier
        collection_name (str): collection to search (optional; searches all collections if not specified)
    Returns:
        Matching metadata dictionary. Raises MongoQueryException if the query fails.
    """

    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    if collection_name is None:
        collection_name = get_model_collection_by_uuid(model_uuid,
                                                       mlmt_client=mlmt_client)

    return mlmt_client.get_model(collection_name=collection_name,
                                 model_uuid=model_uuid)
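
# Usage sketch (not from the original source; the uuid below is a placeholder and a working
# model tracker environment is assumed): fetch the complete metadata record, including
# training metrics, for a single model.
example_uuid = 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx'
full_meta = get_full_metadata_by_uuid(example_uuid)
if full_meta is not None:
    print(full_meta['training_metrics'])
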
def get_model_collection_by_uuid(model_uuid, mlmt_client=None):
    """Retrieve model collection given a uuid.

    Args:
        model_uuid (str): model uuid

        mlmt_client: Ignored
    Returns:
        Matching collection name
    Raises:
        ValueError if there is no collection containing a model with the given uuid.
    """

    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    collections = mlmt_client.collections.get_collection_names().result()
    for col in collections:
        if not col.startswith('old_'):
            if mlmt_client.count_models(collection_name=col,
                                        model_uuid=model_uuid) > 0:
                return col

    raise ValueError('Collection not found for uuid: ' + model_uuid)
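
# Usage sketch (placeholder uuid, not from the original source): locate the collection that
# holds a given model; collections whose names start with 'old_' are skipped by the search.
collection = get_model_collection_by_uuid('xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx')
print('Model found in collection:', collection)
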
def get_full_metadata(filter_dict, collection_name=None):
    """Retrieve relevant full metadata (including training run metrics) of models matching given criteria.

    Args:
        filter_dict (dict): dictionary to filter on

        collection_name (str): Name of collection to search

    Returns:
        A list of matching full model metadata (including training run metrics) dictionaries. Raises MongoQueryException if the query fails.
    """
    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    if filter_dict is None:
        raise ValueError('Parameter filter_dict cannot be None.')
    if collection_name is None:
        raise ValueError('Parameter collection_name cannot be None.')
    mlmt_client = dsf.initialize_model_tracker()

    query_params = {
        "match_metadata": filter_dict,
    }

    metadata_list = mlmt_client.model.query_model_metadata(
        collection_name=collection_name, query_params=query_params).result()
    return list(metadata_list)
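
# Usage sketch (collection name and uuid are placeholders, not from the original source):
# filter_dict is matched against the stored metadata documents, so the simplest filter is an
# exact match on a metadata field such as model_uuid.
filter_dict = {'model_uuid': 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx'}
matches = get_full_metadata(filter_dict, collection_name='my_collection')
for meta in matches or []:
    print(meta['model_uuid'])
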
def get_metadata_by_uuid(model_uuid, collection_name=None):
    """Retrieve model parameter metadata by model_uuid. The resulting metadata dictionary can
    be passed to parameter_parser.wrapper(); it does not contain performance metrics or
    training dataset metadata.

    Args:
        model_uuid (str): model unique identifier
        collection_name (str): collection to search (optional; searches all collections if not specified)
    Returns:
        Matching metadata dictionary. Raises MongoQueryException if the query fails.
    """

    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    if collection_name is None:
        collection_name = get_model_collection_by_uuid(model_uuid,
                                                       mlmt_client=mlmt_client)

    exclude_fields = [
        "training_metrics", "time_built", "training_dataset.dataset_metadata"
    ]
    return mlmt_client.get_model(collection_name=collection_name,
                                 model_uuid=model_uuid,
                                 exclude_fields=exclude_fields)
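
# Usage sketch (placeholder uuid): retrieve the trimmed-down metadata and feed it to the
# parameter parser, as the docstring above suggests; 'parse' refers to the parameter_parser
# module used elsewhere in this codebase.
meta = get_metadata_by_uuid('xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx')
if meta is not None:
    model_params = parse.wrapper(meta)
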
Example #5
def train_model_from_tracker(model_uuid, output_dir):
    """ Retrain a model saved in the model tracker, but save it to output_dir and don't insert it into the model tracker

    Args:
        model_uuid (str): unique identifier of the model in the model tracker

        output_dir (str): path to output directory

    Returns:
        the model pipeline object with trained model
    """

    if not mlmt_supported:
        logger.debug(
            "Model tracker not supported in your environment; can load models from filesystem only."
        )
        return None

    mlmt_client = dsf.initialize_model_tracker()

    collection_name = mt.get_model_collection_by_uuid(model_uuid,
                                                      mlmt_client=mlmt_client)

    # get metadata from tracker
    config = mt.get_metadata_by_uuid(model_uuid)

    # check if datastore dataset
    try:
        result = dsf.retrieve_dataset_by_datasetkey(
            config['training_dataset']['dataset_key'],
            bucket=config['training_dataset']['bucket'])
        if result is not None:
            config['datastore'] = True
    except Exception:
        # Dataset not found in the datastore; leave the datastore flag unset.
        pass
    # Parse parameters
    params = parse.wrapper(config)
    params.result_dir = output_dir
    # otherwise this will have the same uuid as the source model
    params.model_uuid = None
    # use the same split
    params.previously_split = True
    params.split_uuid = config['splitting_parameters']['split_uuid']
    # specify collection
    params.collection_name = collection_name

    logger.debug("model params %s" % str(params))

    # Create model pipeline
    model = mp.ModelPipeline(params)

    # Train model
    model.train_model()

    return model
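
# Usage sketch (placeholder uuid and output path, not from the original source): retrain an
# existing tracker model into a local directory without registering the new model in the tracker.
retrained_pipeline = train_model_from_tracker('xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
                                              '/tmp/retrained_model')
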
Example #6
    def instantiate_mlmt_client(self, use_production_server=True):
        """Instantiate the mlmt_client.

        Args:
            use_production_server (bool): True if the production server should be
                used, False if the local server should be used. Default True. The
                local server should only be used for testing.
        """

        # =====================================================
        # Set up machine learning model tracker (mlmt) client.
        # =====================================================
        # Toggle True/False to use production server or the forsyth2 personal
        # server.
        # The former should almost always be used, unless testing with code only
        # running on the latter.
        self.ds_client, self.mlmt_client = dsf.initialize_model_tracker(
            use_production_server, self.ds_client)
Example #7
def get_model_collection_by_uuid(model_uuid, mlmt_client=None):
    """Retrieve model collection given a uuid.

    Args:
        model_uuid (str): model uuid

        mlmt_client: Ignored
    Returns:
        Matching collection name
    Raises:
        ValueError if there is no collection containing a model with the given uuid.
    """

    mlmt_client = dsf.initialize_model_tracker()

    collections = mlmt_client.collections.get_collection_names().result()
    for col in collections:
        if mlmt_client.count_models(collection_name=col, model_uuid=model_uuid) > 0:
            return col

    raise ValueError('Collection not found for uuid: ' + model_uuid)
def save_model(pipeline, collection_name='model_tracker', log=True):
    """Save the model.

    Save the model files to the datastore and save the model metadata dict to the Mongo database.

    Args:
        pipeline (ModelPipeline object): the pipeline to use
        collection_name (str): the name of the Mongo DB collection to use
        log (bool): True if log messages should be printed, default True

    Returns:
        None if insertion was successful, raises UnableToTarException, DatastoreInsertionException, MLMTClientInstantiationException
        or MongoInsertionException otherwise
    """

    if pipeline is None:
        raise Exception('pipeline cannot be None.')

    if not mlmt_supported:
        print(
            "Model tracker not supported in your environment; can save models in filesystem only."
        )
        return

    # ModelPipeline.create_model_metadata() should be called before the call to save_model.
    # Get the metadata dictionary from the model pipeline.
    metadata_dict = pipeline.model_metadata
    model_uuid = metadata_dict['model_uuid']
    if model_uuid is None:
        raise ValueError("model_uuid is missing from pipeline metadata.")

    #### Part 1: Save the model tarball ####
    model = pipeline.model_wrapper
    # best_model_dir is an absolute path.
    directory_to_tar = model.best_model_dir
    # Put tar file in a temporary directory that will automatically be destroyed when we're done
    with tempfile.TemporaryDirectory() as tmp_dir:
        tar_file = os.path.join(
            tmp_dir, 'model_{model_uuid}.tar.gz'.format(model_uuid=model_uuid))
        tar_flags = 'czf'
        # Change directory to model_dir so that paths in tarball are relative to model_dir.
        tar_command = 'tar -{tar_flags} {tar_file} -C {directory_to_tar} .'.format(
            tar_flags=tar_flags,
            tar_file=tar_file,
            directory_to_tar=directory_to_tar)
        try:
            subprocess.check_output(tar_command.split())
        except subprocess.CalledProcessError as e:
            pipeline.log.error(
                'Command to create model tarball returned status {return_code}'
                .format(return_code=e.returncode))
            pipeline.log.error('Command was: "{cmd}"'.format(cmd=e.cmd))
            pipeline.log.error(
                'Output was: "{output}"'.format(output=e.output))
            pipeline.log.error(
                'stderr was: "{stderr}"'.format(stderr=e.stderr))
            raise UnableToTarException(
                'Unable to tar {directory_to_tar}.'.format(
                    directory_to_tar=directory_to_tar))
        title = '{model_uuid} model tarball'.format(model_uuid=model_uuid)
        uploaded_results = dsf.upload_file_to_DS(
            bucket=pipeline.params.model_bucket,
            title=title,
            description=title,
            tags=[],
            key_values={
                'model_uuid': model_uuid,
                'file_category': 'ml_model'
            },
            filepath=tmp_dir,
            filename=tar_file,
            dataset_key='model_' + model_uuid + '_tarball',
            client=pipeline.ds_client,
            return_metadata=True)
        if uploaded_results is None:
            raise DatastoreInsertionException(
                'Unable to upload title={title} to datastore.'.format(
                    title=title))
    # Get the dataset_oid for actual metadata file stored in datastore.
    model_dataset_oid = uploaded_results['dataset_oid']
    # By adding dataset_oid to the dict, we can immediately find the datastore file associated with a model.
    metadata_dict['model_parameters']['model_dataset_oid'] = model_dataset_oid

    #### Part 2: Save the model metadata ####
    mlmt_client = dsf.initialize_model_tracker()
    mlmt_client.save_metadata(collection_name=collection_name,
                              model_uuid=metadata_dict['model_uuid'],
                              model_metadata=metadata_dict)
    if log:
        print('Successfully inserted into the database with model_uuid %s.' %
              model_uuid)
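
# Usage sketch (hypothetical objects, not from the original source): 'trained_pipeline' stands
# in for a ModelPipeline that has been trained and has had create_model_metadata() called on it,
# and 'my_collection' is a placeholder collection name.
save_model(trained_pipeline, collection_name='my_collection', log=True)
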
Example #9
def save_model(pipeline, collection_name='model_tracker', log=True):
    """Save the model.

    Save the model files to the datastore and save the model metadata dict to the Mongo database.

    Args:
        pipeline (ModelPipeline object): the pipeline to use

        collection_name (str): the name of the Mongo DB collection to use

        log (bool): True if log messages should be printed, default True

    Returns:
        None if insertion was successful, raises DatastoreInsertionException, MLMTClientInstantiationException
        or MongoInsertionException otherwise
    """

    if pipeline is None:
        raise Exception('pipeline cannot be None.')

    if not mlmt_supported:
        logger.error(
            "Model tracker not supported in your environment; can save models in filesystem only."
        )
        return

    # ModelPipeline.create_model_metadata() should be called before the call to save_model.
    # Get the metadata dictionary from the model pipeline.
    metadata_dict = pipeline.model_metadata
    model_uuid = metadata_dict['model_uuid']
    if model_uuid is None:
        raise ValueError("model_uuid is missing from pipeline metadata.")

    #### Part 1: Save the model tarball in the datastore ####
    model = pipeline.model_wrapper
    # Put tar file in a temporary directory that will automatically be destroyed when we're done
    with tempfile.TemporaryDirectory() as tmp_dir:
        tarball_path = os.path.join(tmp_dir, f"model_{model_uuid}.tar.gz")
        save_model_tarball(pipeline.params.output_dir, tarball_path)

        title = f"{model_uuid} model tarball"
        ds_key = f"model_{model_uuid}_tarball"
        uploaded_results = dsf.upload_file_to_DS(
            bucket=pipeline.params.model_bucket,
            title=title,
            description=title,
            tags=[],
            key_values={
                'model_uuid': model_uuid,
                'file_category': 'ml_model'
            },
            filepath=tmp_dir,
            filename=tarball_path,
            dataset_key=ds_key,
            client=pipeline.ds_client,
            return_metadata=True)
        if uploaded_results is None:
            raise DatastoreInsertionException(
                'Unable to upload title={title} to datastore.'.format(
                    title=title))
    # Get the dataset_oid for actual metadata file stored in datastore.
    model_dataset_oid = uploaded_results['dataset_oid']
    # By adding dataset_oid to the dict, we can immediately find the datastore file associated with a model.
    metadata_dict['model_parameters']['model_dataset_oid'] = model_dataset_oid

    #### Part 2: Save the model metadata in the model tracker ####
    mlmt_client = dsf.initialize_model_tracker()
    mlmt_client.save_metadata(collection_name=collection_name,
                              model_uuid=metadata_dict['model_uuid'],
                              model_metadata=metadata_dict)
    if log:
        logger.info(
            'Successfully inserted into the database with model_uuid %s.' %
            model_uuid)
Example #10
    def __init__(self, params, hyperparam_uuid=None):
        """
        
        Args:
            
            params: The input hyperparameter parameters
            
            hyperparam_uuid: Optional, UUID for hyperparameter run if you want to group this run with a previous run.
            We ended up mainly doing this via collections, so not really used
        """
        self.hyperparam_layers = {'layer_sizes', 'dropouts', 'weight_init_stddevs', 'bias_init_consts'}
        self.hyperparam_keys = {'model_type', 'featurizer', 'splitter', 'learning_rate', 'weight_decay_penalty',
                                'rf_estimators', 'rf_max_features', 'rf_max_depth',
                                'umap_dim', 'umap_targ_wt', 'umap_metric', 'umap_neighbors', 'umap_min_dist',
                                'xgb_learning_rate',
                                'xgb_gamma'}
        self.nn_specific_keys = {'learning_rate', 'layers','weight_decay_penalty'}
        self.rf_specific_keys = {'rf_estimators', 'rf_max_features', 'rf_max_depth'}
        self.xgboost_specific_keys = {'xgb_learning_rate', 'xgb_gamma'}
        self.hyperparam_keys |= self.hyperparam_layers
        self.excluded_keys = excluded_keys
        self.convert_to_float = parse.convert_to_float_list
        self.convert_to_int = parse.convert_to_int_list
        self.params = params
        # simplify NN layer construction
        if (params.layer_nums is not None) and (params.node_nums is not None) and (params.dropout_list is not None):
            self.params.layer_sizes, self.params.dropouts = permutate_NNlayer_combo_params(params.layer_nums,
                                                                                           params.node_nums,
                                                                                           params.dropout_list,
                                                                                           params.max_final_layer_size)
        if hyperparam_uuid is None:
            self.hyperparam_uuid = str(uuid.uuid4())
        else:
            self.hyperparam_uuid = hyperparam_uuid
        self.hyperparams = {}
        self.new_params = {}
        self.layers = {}
        self.param_combos = []
        self.num_rows = {}
        self.log = logging.getLogger("hyperparam_search")
        # Create handlers
        c_handler = logging.StreamHandler()
        log_path = os.path.join(self.params.result_dir, 'logs')
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        f_handler = logging.FileHandler(os.path.join(log_path, '{0}.log'.format(self.hyperparam_uuid)))
        self.out_file = open(os.path.join(log_path, '{0}.json'.format(self.hyperparam_uuid)), 'a')
        c_handler.setLevel(logging.WARNING)
        f_handler.setLevel(logging.INFO)
        # Create formatters and add it to handlers
        c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
        f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        c_handler.setFormatter(c_format)
        f_handler.setFormatter(f_format)
        # Add handlers to the logger
        self.log.addHandler(c_handler)
        self.log.addHandler(f_handler)

        self.mlmt_client = dsf.initialize_model_tracker()

        slurm_path = os.path.join(self.params.result_dir, 'slurm_files')
        if not os.path.exists(slurm_path):
            os.makedirs(slurm_path)
        self.shell_script = os.path.join(self.params.script_dir, 'utils', 'run.sh')
        with open(self.shell_script, 'w') as f:
            hostname = ''.join(list(filter(lambda x: x.isalpha(), socket.gethostname())))
            f.write("#!/bin/bash\n#SBATCH -A {2}\n#SBATCH -N 1\n#SBATCH -p partition={0}\n#SBATCH -t 24:00:00"
                    "\n#SBATCH -p {3}\n#SBATCH --export=ALL\n#SBATCH -D {1}\n".format(hostname, slurm_path,
                    self.params.lc_account,self.params.slurm_partition))
            f.write('start=`date +%s`\necho $3\n$1 $2/pipeline/model_pipeline.py $3\nend=`date +%s`\n'
                    'runtime=$((end-start))\necho "runtime: " $runtime')
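        # For reference, a sketch of the generated run.sh once the format placeholders are
        # filled in (account, partition, hostname, and paths below are illustrative values,
        # not taken from the original source):
        #
        #   #!/bin/bash
        #   #SBATCH -A my_account
        #   #SBATCH -N 1
        #   #SBATCH -p partition=myhostname
        #   #SBATCH -t 24:00:00
        #   #SBATCH -p my_partition
        #   #SBATCH --export=ALL
        #   #SBATCH -D /path/to/result_dir/slurm_files
        #   start=`date +%s`
        #   echo $3
        #   $1 $2/pipeline/model_pipeline.py $3
        #   end=`date +%s`
        #   runtime=$((end-start))
        #   echo "runtime: " $runtime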