Example #1
0
    def get_feature_object_name_and_path(cls, input_path, feature_dir):
        """
        Determine the file name for a feature object of this feature class and
        build the full path under which it is saved. Per the original
        contract, this full path is also used as the feature name attribute.

        The file name is the base name of ``input_path``, passed through
        ``scrub_underscore_suffix`` and then extended with the suffix
        ``"_features_" + cls.class_feature_name``. The file is placed in a
        sub-directory of ``feature_dir`` named ``cls.class_feature_name``;
        that sub-directory is created if it is missing.

        Args:
            input_path (str): path to the input data from processed cycler run
            feature_dir (str): path to the base directory for the feature sets.

        Returns:
            str: the absolute path (including filename) to use for saving the
                feature object
        """
        new_filename = scrub_underscore_suffix(os.path.basename(input_path))

        # Append model_name along with "features" to demarcate
        # different models when saving the feature vectors.
        new_filename = add_suffix_to_filename(
            new_filename, "_features" + "_" + cls.class_feature_name)

        # exist_ok=True removes the race between checking for the directory
        # and creating it (another process may create it in between).
        feature_subdir = os.path.join(feature_dir, cls.class_feature_name)
        os.makedirs(feature_subdir, exist_ok=True)

        return os.path.abspath(os.path.join(feature_subdir, new_filename))
Example #2
0
def process_file_list_from_json(file_list_json,
                                processed_dir='data-share/features/',
                                features_label='full_model',
                                predict_only=False,
                                prediction_type="multi",
                                predicted_quantity="cycle"):
    """
    Featurize each processed cycler run listed in a json file list, write the
    resulting feature objects to disk, and report what happened per run.

    Runs whose loaded ``summary`` does not contain more than 100 entries are
    not featurized; they are reported with result "incomplete" and their
    original input path. All other runs are featurized through
    ``DegradationPredictor.from_processed_cycler_run_file``, dumped into
    ``processed_dir`` and reported with result "success".

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with "mode", "file_list" and "run_list" entries.
            If this string ends with ".json" it is loaded as a file,
            otherwise it is parsed as a json string.
        processed_dir (str): location for the feature files, joined onto the
            BEEP_ROOT environment variable ("/" when unset).
        features_label (str): name of feature generation method; also part
            of the output file name.
        predict_only (bool): whether to calculate predictions or not
            (forwarded to the DegradationPredictor constructor).
        prediction_type (str): single or multi-point predictions; also part
            of the output file name.
        predicted_quantity (str): quantity being predicted - cycle or capacity.

    Returns:
        str: json string of a dictionary with the keys "file_list",
            "run_list", "result_list" and "message_list".

    """
    # A ".json" ending means "load this file"; anything else is json text.
    if file_list_json.endswith(".json"):
        request = loadfn(file_list_json)
    else:
        request = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=request['mode'])

    # Resolve the output directory against the root path
    root_dir = os.environ.get("BEEP_ROOT", "/")
    processed_dir = os.path.join(root_dir, processed_dir)

    input_paths = request['file_list']
    input_run_ids = request['run_list']

    # A run needs strictly more summary entries than this (full model)
    min_cycles = 100

    out_paths = []
    out_runs = []
    out_results = []
    out_messages = []

    def record(out_path, out_run, result, comment):
        # Keep the four parallel output lists in step with each other.
        out_paths.append(out_path)
        out_runs.append(out_run)
        out_results.append(result)
        out_messages.append({'comment': comment, 'error': ''})

    for path, run_id in zip(input_paths, input_run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)

        # Guard: skip runs without enough data to try featurizing
        if len(loadfn(path).summary) <= min_cycles:
            logger.info("run_id=%s Insufficient data for featurization",
                        str(run_id),
                        extra=s)
            record(path, run_id, "incomplete",
                   'Insufficient data for featurization')
            continue

        feature_object = DegradationPredictor.from_processed_cycler_run_file(
            path,
            features_label=features_label,
            predict_only=predict_only,
            prediction_type=prediction_type,
            predicted_quantity=predicted_quantity)

        # The suffix carries the model name and prediction type so feature
        # vectors from different models do not overwrite each other.
        suffix = "_" + features_label + "_" + prediction_type + "_features"
        feature_filename = add_suffix_to_filename(
            scrub_underscore_suffix(os.path.basename(path)), suffix)
        feature_path = os.path.abspath(
            os.path.join(processed_dir, feature_filename))

        dumpfn(feature_object, feature_path)
        record(feature_path, run_id, "success", '')

    output_data = {
        "file_list": out_paths,
        "run_list": out_runs,
        "result_list": out_results,
        "message_list": out_messages
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')
    # Return jsonable file list
    return json.dumps(output_data)
Example #3
0
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/",
                                processed_dir='data-share/predictions/',
                                hyperparameters=None, model_name=None, predict_only=True):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with "mode", "file_list" and "run_list" entries
            (and optionally "dataset_id", used when training).
            If this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string
        model_dir (str): location where models are serialized and stored
        processed_dir (str): location for prediction output files to be
            placed, joined onto the BEEP_ROOT environment variable
            ("/" when unset)
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training; a built-in default set is used when None. Only
            read when predict_only is False
        model_name (str): file name of the serialized model to load when
            predict_only is True, or the name handed to
            DegradationModel.train otherwise. When None and predict_only is
            True, a default "d3batt" model is chosen from the prediction_type
            of the first feature file, but only for projects listed in
            DEFAULT_MODEL_PROJECTS
        predict_only (bool): if True, load an already serialized model;
            if False, train a new model from file_list_json first

    Returns:
        str: json string of a dictionary with the keys "file_list",
            "run_list", "result_list" and "message_list". All four lists are
            empty when predict_only is True, no model_name is given and the
            project has no default model.

    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add BEEP_ROOT to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)
    if predict_only:
        if model_name is not None:
            # An explicitly requested model always wins.
            model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                           serialized_model=model_name)

        elif project_name in DEFAULT_MODEL_PROJECTS:
            # Only this branch needs the first feature file, so it is loaded
            # here rather than unconditionally.
            features = loadfn(file_list[0])
            if features.prediction_type == 'multi':
                default_model = 'd3batt_multi_point.model'
            else:
                default_model = 'd3batt_single_point.model'
            model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                           serialized_model=default_model)

        else:
            # No model requested and no default for this project: report an
            # error event and return empty results.
            output_data = {"file_list": [],
                           "run_list": [],
                           "result_list": [],
                           "message_list": []
                           }

            events.put_analyzing_event(output_data, 'predicting', 'error')

            # Return jsonable file list
            return json.dumps(output_data)

    else:
        if hyperparameters is None:
            hyperparameters = {'random_state': 1,
                               'test_size': .3,
                               'k_fold': 5,
                               'tol': 0.001,
                               'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                               }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(file_list_json, dataset_id=dataset_id,
                                       model_type='linear', regularization_type='elasticnet',
                                       model_name=model_name, hyperparameters=hyperparameters)
        logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info('model=%s run_id=%s predicting=%s', model.name, str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({'comment': '',
                                       'error': ''})

    output_data = {"file_list": processed_paths_list,
                   "run_list": processed_run_list,
                   "result_list": processed_result_list,
                   "message_list": processed_message_list
                   }

    events.put_analyzing_event(output_data, 'predicting', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
Example #4
0
def process_file_list_from_json(
    file_list_json,
    model_dir="/data-share/models/",
    processed_dir="data-share/predictions/",
    hyperparameters=None,
    model_name=None,
    predict_only=True,
):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with "file_list" and "run_list" entries
            (and optionally "dataset_id", used when training).
            If this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string
        model_dir (str): location where models are serialized and stored
        processed_dir (str): location for prediction output files to be
            placed, joined onto the BEEP_PROCESSING_DIR environment variable
            ("/" when unset); created if it does not exist
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training; a built-in default set is used when None. Only
            read when predict_only is False
        model_name (str): file name of the serialized model to load when
            predict_only is True, or the name handed to
            DegradationModel.train otherwise. When None and predict_only is
            True, a default "d3batt" model is chosen from the prediction_type
            of the first feature file, but only for projects listed in
            DEFAULT_MODEL_PROJECTS
        predict_only (bool): if True, load an already serialized model;
            if False, train a new model from file_list_json first

    Returns:
        str: json string of a dictionary with the keys "file_list",
            "run_list", "result_list" and "message_list". All four lists are
            empty when predict_only is True, no model_name is given and the
            project has no default model.

    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow TODO

    # Add BEEP_PROCESSING_DIR to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_PROCESSING_DIR", "/"),
                                 processed_dir)
    # exist_ok=True removes the race between checking for the directory and
    # creating it (another process may create it in between).
    os.makedirs(processed_dir, exist_ok=True)

    file_list = file_list_data["file_list"]
    run_ids = file_list_data["run_list"]
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)
    if predict_only:
        if model_name is not None:
            # An explicitly requested model always wins.
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)

        elif project_name in DEFAULT_MODEL_PROJECTS:
            # Only this branch needs the first feature file, so it is loaded
            # here rather than unconditionally.
            features = loadfn(file_list[0])
            if features.prediction_type == "multi":
                default_model = "d3batt_multi_point.model"
            else:
                default_model = "d3batt_single_point.model"
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=default_model)

        else:
            # No model requested and no default for this project: return
            # empty results.
            output_data = {
                "file_list": [],
                "run_list": [],
                "result_list": [],
                "message_list": [],
            }

            # Return jsonable file list
            return json.dumps(output_data)

    else:
        if hyperparameters is None:
            hyperparameters = {
                "random_state": 1,
                "test_size": 0.3,
                "k_fold": 5,
                "tol": 0.001,
                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
            }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(
            file_list_json,
            dataset_id=dataset_id,
            model_type="linear",
            regularization_type="elasticnet",
            model_name=model_name,
            hyperparameters=hyperparameters,
        )
        logger.warning("fitting=%s dataset=%s",
                       model.name,
                       str(dataset_id),
                       extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info("model=%s run_id=%s predicting=%s",
                    model.name,
                    str(run_id),
                    path,
                    extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction,
                                                   features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({"comment": "", "error": ""})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
    }

    # Return jsonable file list
    return json.dumps(output_data)