def get_feature_object_name_and_path(cls, input_path, feature_dir):
    """
    This function determines how to name the object for a specific feature class
    and creates the full path to save the object. This full path is also used as
    the feature name attribute.

    Args:
        input_path (str): path to the input data from the processed cycler run.
        feature_dir (str): path to the base directory for the feature sets.

    Returns:
        str: the full path (including filename) to use for saving the feature object.
    """
    new_filename = os.path.basename(input_path)
    new_filename = scrub_underscore_suffix(new_filename)

    # Append the feature class name along with "features" to demarcate
    # different feature sets when saving the feature vectors.
    new_filename = add_suffix_to_filename(
        new_filename, "_features" + "_" + cls.class_feature_name)

    if not os.path.isdir(os.path.join(feature_dir, cls.class_feature_name)):
        os.makedirs(os.path.join(feature_dir, cls.class_feature_name))

    feature_path = os.path.join(feature_dir, cls.class_feature_name, new_filename)
    feature_path = os.path.abspath(feature_path)

    return feature_path
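A minimal usage sketch may make the naming convention clearer. Everything below is illustrative: ExampleFeatures is a hypothetical feature class, and the two helper definitions are simplified stand-ins for the real scrub_underscore_suffix and add_suffix_to_filename utilities, so the snippet runs when pasted directly below the function above.

import os


# Simplified stand-ins for the real helpers; the actual implementations live
# elsewhere in the package.
def scrub_underscore_suffix(filename):
    # Drop the trailing "_<suffix>" (e.g. "_structure") before the extension.
    base, ext = os.path.splitext(filename)
    return base.rsplit("_", 1)[0] + ext


def add_suffix_to_filename(filename, suffix):
    base, ext = os.path.splitext(filename)
    return base + suffix + ext


class ExampleFeatures:
    """Hypothetical feature class exposing the naming method above."""
    class_feature_name = "ExampleFeatures"
    get_feature_object_name_and_path = classmethod(get_feature_object_name_and_path)


feature_path = ExampleFeatures.get_feature_object_name_and_path(
    input_path="PredictionDiagnostics_000132_structure.json",
    feature_dir="/tmp/features",
)
print(feature_path)
# -> /tmp/features/ExampleFeatures/PredictionDiagnostics_000132_features_ExampleFeatures.json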
def process_file_list_from_json(file_list_json, processed_dir='data-share/features/',
                                features_label='full_model', predict_only=False,
                                prediction_type="multi", predicted_quantity="cycle"):
    """
    Function to take a json file containing processed cycler run file locations,
    extract features, dump the processed file into a predetermined directory,
    and return a jsonable dict of feature file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string
            ends with ".json", a json file is assumed and loaded;
            otherwise it is interpreted as a json string.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        features_label (str): name of feature generation method.
        predict_only (bool): whether to calculate predictions or not.
        prediction_type (str): single- or multi-point predictions.
        predicted_quantity (str): quantity being predicted - cycle or capacity.

    Returns:
        str: json string of feature files (with key "file_list").
    """
    # Get file list and validity from json; if it ends with .json,
    # assume it's a file, otherwise assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add root path to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"), processed_dir)

    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    required_cycle_num = 100  # for the full model

    for path, run_id in zip(file_list, run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)

        # Check if there is enough data to try featurizing
        if not len(loadfn(path).summary) > required_cycle_num:
            logger.info("run_id=%s Insufficient data for featurization",
                        str(run_id), extra=s)
            processed_paths_list.append(path)
            processed_run_list.append(run_id)
            processed_result_list.append("incomplete")
            processed_message_list.append({
                'comment': 'Insufficient data for featurization',
                'error': ''
            })
        else:
            processed_data = DegradationPredictor.from_processed_cycler_run_file(
                path,
                features_label=features_label,
                predict_only=predict_only,
                prediction_type=prediction_type,
                predicted_quantity=predicted_quantity)

            new_filename = os.path.basename(path)
            new_filename = scrub_underscore_suffix(new_filename)
            # Append features_label and prediction_type along with "features"
            # to demarcate different models when saving the feature vectors.
            new_filename = add_suffix_to_filename(
                new_filename,
                "_" + features_label + "_" + prediction_type + "_features")
            processed_path = os.path.join(processed_dir, new_filename)
            processed_path = os.path.abspath(processed_path)
            dumpfn(processed_data, processed_path)

            processed_paths_list.append(processed_path)
            processed_run_list.append(run_id)
            processed_result_list.append("success")
            processed_message_list.append({'comment': '', 'error': ''})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
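The calling convention is easiest to see with a concrete payload. The sketch below is hypothetical: the structure file path, run id, and the "events_off" mode value are placeholders, and the surrounding environment (loadfn, KinesisEvents, DegradationPredictor, logger) is assumed to be available as in the module above.

import json

# Hypothetical request payload; the path, run id, and mode are placeholders.
featurize_request = json.dumps({
    "mode": "events_off",   # read by the KinesisEvents setup above (assumed value)
    "file_list": ["/data-share/structure/PredictionDiagnostics_000132_structure.json"],
    "run_list": [132],
})

# The argument does not end in ".json", so it is treated as a json string.
feature_files_json = process_file_list_from_json(
    featurize_request,
    processed_dir="data-share/features/",
    features_label="full_model",
    prediction_type="multi",
)
print(json.loads(feature_files_json)["result_list"])  # e.g. ["success"] or ["incomplete"]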
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/",
                                processed_dir='data-share/predictions/',
                                hyperparameters=None, model_name=None,
                                predict_only=True):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions
    into a predetermined directory, and return a jsonable dict of
    prediction file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string
            ends with ".json", a json file is assumed and loaded;
            otherwise it is interpreted as a json string.
        model_dir (str): location where models are serialized and stored.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training.
        model_name (str): name of the serialized model to use for prediction,
            or the name to assign to a newly trained model.
        predict_only (bool): whether to predict with an existing serialized
            model rather than training a new one.

    Returns:
        str: json string of prediction files (with key "file_list").
    """
    # Get file list and validity from json; if it ends with .json,
    # assume it's a file, otherwise assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add BEEP_ROOT to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"), processed_dir)

    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)

    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:
            if features.prediction_type == 'multi':
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model='d3batt_multi_point.model')
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model='d3batt_single_point.model')
        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {"file_list": [],
                           "run_list": [],
                           "result_list": [],
                           "message_list": []
                           }

            events.put_analyzing_event(output_data, 'predicting', 'error')

            # Return jsonable file list
            return json.dumps(output_data)
        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir,
                serialized_model=model_name)
    else:
        if hyperparameters is None:
            hyperparameters = {'random_state': 1,
                               'test_size': .3,
                               'k_fold': 5,
                               'tol': 0.001,
                               'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                               }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(file_list_json,
                                       dataset_id=dataset_id,
                                       model_type='linear',
                                       regularization_type='elasticnet',
                                       model_name=model_name,
                                       hyperparameters=hyperparameters)
        logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info('model=%s run_id=%s predicting=%s',
                    model.name, str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity)

        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({'comment': '', 'error': ''})

    output_data = {"file_list": processed_paths_list,
                   "run_list": processed_run_list,
                   "result_list": processed_result_list,
                   "message_list": processed_message_list
                   }

    events.put_analyzing_event(output_data, 'predicting', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
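A predict-only call follows the same json-in, json-out pattern, but file_list should point at previously written feature files. The sketch below is illustrative only: the feature file path, run id, and mode value are placeholders, and the default d3batt models are only selected when the project name resolves to one of DEFAULT_MODEL_PROJECTS.

import json

# Hypothetical request payload pointing at an existing feature file.
prediction_request = json.dumps({
    "mode": "events_off",   # assumed event mode, consumed by KinesisEvents
    "file_list": ["/data-share/features/PredictionDiagnostics_000132_full_model_multi_features.json"],
    "run_list": [132],
})

# predict_only=True loads a serialized model (a default d3batt model or the
# one named by model_name) instead of training a new one.
predictions_json = process_file_list_from_json(
    prediction_request,
    model_dir="/data-share/models/",
    predict_only=True,
)
print(json.loads(predictions_json)["file_list"])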
def process_file_list_from_json(
    file_list_json,
    model_dir="/data-share/models/",
    processed_dir="data-share/predictions/",
    hyperparameters=None,
    model_name=None,
    predict_only=True,
):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions
    into a predetermined directory, and return a jsonable dict of
    prediction file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string
            ends with ".json", a json file is assumed and loaded;
            otherwise it is interpreted as a json string.
        model_dir (str): location where models are serialized and stored.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training.
        model_name (str): name of the serialized model to use for prediction,
            or the name to assign to a newly trained model.
        predict_only (bool): whether to predict with an existing serialized
            model rather than training a new one.

    Returns:
        str: json string of prediction files (with key "file_list").
    """
    # Get file list and validity from json; if it ends with .json,
    # assume it's a file, otherwise assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow TODO

    # Add BEEP_PROCESSING_DIR to processed_dir
    processed_dir = os.path.join(
        os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir)
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    run_ids = file_list_data["run_list"]
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)

    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:
            if features.prediction_type == "multi":
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_multi_point.model")
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_single_point.model")
        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {
                "file_list": [],
                "run_list": [],
                "result_list": [],
                "message_list": [],
            }

            # Return jsonable file list
            return json.dumps(output_data)
        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)
    else:
        if hyperparameters is None:
            hyperparameters = {
                "random_state": 1,
                "test_size": 0.3,
                "k_fold": 5,
                "tol": 0.001,
                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
            }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(
            file_list_json,
            dataset_id=dataset_id,
            model_type="linear",
            regularization_type="elasticnet",
            model_name=model_name,
            hyperparameters=hyperparameters,
        )
        logger.warning("fitting=%s dataset=%s", model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info("model=%s run_id=%s predicting=%s",
                    model.name, str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity)

        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({"comment": "", "error": ""})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
    }

    # Return jsonable file list
    return json.dumps(output_data)
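For the training branch, a hedged sketch of how the hyperparameter dictionary might be overridden. The file name and model name are hypothetical, the json file is assumed to contain the usual file_list/run_list keys (plus an optional dataset_id), and the hyperparameter values simply mirror the defaults hard-coded in the function rather than tuned recommendations.

# Illustrative hyperparameters for the elastic-net training branch.
custom_hyperparameters = {
    "random_state": 1,
    "test_size": 0.3,
    "k_fold": 5,
    "tol": 0.001,
    "l1_ratio": [0.1, 0.5, 0.9, 1.0],
}

# predict_only=False trains a new linear/elasticnet DegradationModel on the
# listed feature files before writing predictions for each of them.
predictions_json = process_file_list_from_json(
    "feature_file_list.json",              # hypothetical file with file_list/run_list
    model_name="example_project_model",    # hypothetical name for the new model
    hyperparameters=custom_hyperparameters,
    predict_only=False,
)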