def dummy(ctx, test):
    logger.debug(f"Test msg: {test}")
    logger.info("Info msg")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("CRITICAL MESSAGE!")
    if test == "throw_error":
        raise ValueError("Some error!")
    dumpfn({"example": "status"}, ctx.obj.output_status_json)
def main(): """ Main function of this module, takes in arguments of an input and output filename corresponding to featurized run data and creates a predictor object output for analysis/ML processing """ # Parse args and construct initial cycler run logger.info("starting", extra=s) logger.info("Running version=%s", __version__, extra=s) try: args = docopt(__doc__) input_json = args["INPUT_JSON"] if args["--fit"]: print( process_file_list_from_json( input_json, predict_only=False, model_dir=MODEL_DIR ), end="", ) else: print(process_file_list_from_json(input_json, model_dir=MODEL_DIR), end="") except Exception as e: logger.error(str(e), extra=s) raise e logger.info("finish", extra=s) return None
def main():
    logger.info('starting', extra=s)
    logger.info('Running version=%s', __version__, extra=s)
    try:
        args = docopt(__doc__)
        input_json = args['INPUT_JSON']
        print(validate_file_list_from_json(input_json), end="")
    except Exception as e:
        logger.error(str(e), extra=s)
        raise e
    logger.info('finish', extra=s)
    return None
def main(): """Main function for the script""" logger.info("starting", extra=s) logger.info("Running version=%s", __version__, extra=s) try: args = docopt(__doc__) input_json = args["INPUT_JSON"] print(process_csv_file_list_from_json(input_json), end="") except Exception as e: logger.error(str(e), extra=s) raise e logger.info("finish", extra=s) return None
def main(): """Main function of this module, takes in arguments of an input and output filename and uses the input file to create a structured data output for analysis/ML processing. """ logger.info("starting", extra=SERVICE_CONFIG) logger.info("Running version=%s", __version__, extra=SERVICE_CONFIG) try: args = docopt(__doc__) input_json = args["INPUT_JSON"] print(process_file_list_from_json(input_json)) except Exception as e: logger.error(str(e), extra=SERVICE_CONFIG) raise e logger.info("finish", extra=SERVICE_CONFIG) return None
def main(): """ Main function for running of this module as a script Returns: (None) """ logger.info("starting", extra=s) logger.info("Running version=%s", __version__, extra=s) try: args = docopt(__doc__) input_json = args["INPUT_JSON"] print(validate_file_list_from_json(input_json), end="") except Exception as e: logger.error(str(e), extra=s) raise e logger.info("finish", extra=s) return None
def main(): """ Main function of this module, takes in arguments of an input and output filename corresponding to structured cycler run data and creates a predictor object output for analysis/ML processing Returns: None """ # Parse args and construct initial cycler run logger.info('starting', extra=s) logger.info('Running version=%s', __version__, extra=s) try: args = docopt(__doc__) input_json = args['INPUT_JSON'] print(process_file_list_from_json(input_json), end="") except Exception as e: logger.error(str(e), extra=s) raise e logger.info('finish', extra=s) return None
def process_file_list_from_json(file_list_json,
                                processed_dir='data-share/features/',
                                features_label='full_model',
                                predict_only=False,
                                prediction_type="multi",
                                predicted_quantity="cycle"):
    """
    Function to take a json file containing processed cycler run file
    locations, extract features, dump the processed file into a
    predetermined directory, and return a jsonable dict of feature file
    locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string
            ends with ".json", a json file is assumed and loaded;
            otherwise it is interpreted as a json string.
        processed_dir (str): location for processed cycler run output
            files to be placed.
        features_label (str): name of feature generation method.
        predict_only (bool): whether to calculate predictions or not.
        prediction_type (str): single- or multi-point predictions.
        predicted_quantity (str): quantity being predicted - cycle or
            capacity.

    Returns:
        str: json string of feature files (with key "file_list").
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add root path to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"), processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []

    for path, run_id in zip(file_list, run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)
        processed_cycler_run = loadfn(path)

        featurizer_classes = [DeltaQFastCharge, TrajectoryFastCharge]
        for featurizer_class in featurizer_classes:
            featurizer = featurizer_class.from_run(path, processed_dir,
                                                   processed_cycler_run)
            if featurizer:
                dumpfn(featurizer, featurizer.name)
                processed_paths_list.append(featurizer.name)
                processed_run_list.append(run_id)
                processed_result_list.append("success")
                processed_message_list.append({'comment': '', 'error': ''})
                logger.info('Successfully generated %s', featurizer.name, extra=s)
            else:
                processed_paths_list.append(path)
                processed_run_list.append(run_id)
                processed_result_list.append("incomplete")
                processed_message_list.append({
                    'comment': 'Insufficient or incorrect data for featurization',
                    'error': ''
                })
                logger.info('Unable to featurize %s', path, extra=s)

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
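def _example_featurization_payload():
    """Illustrative sketch, not part of the original module: builds the kind
    of file-list JSON string process_file_list_from_json above expects.
    The path, run id, and mode value are hypothetical; "mode" feeds the
    KinesisEvents setup."""
    import json

    payload = {
        "mode": "test",
        "file_list": ["/data-share/structure/example_run_structure.json"],
        "run_list": [1],
    }
    return json.dumps(payload)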
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/", processed_dir='data-share/predictions/', hyperparameters=None, model_name=None, predict_only=True): """ Function to take a json file containing featurized json locations, train a new model if necessary, write files containing predictions into a predetermined directory, and return a jsonable dict of prediction file locations Args: file_list_json (str): json string or json filename corresponding to a dictionary with a file_list attribute, if this string ends with ".json", a json file is assumed and loaded, otherwise interpreted as a json string model_dir (str): location where models are serialized and stored processed_dir (str): location for processed cycler run output files to be placed hyperparameters (dict): dictionary of hyperparameters to optimize/use for training model_name (str): name of feature generation method predict_only (bool): Returns: str: json string of feature files (with key "feature_file_list"). """ # Get file list and validity from json, if ends with .json, # assume it's a file, if not assume it's a json string if file_list_json.endswith(".json"): file_list_data = loadfn(file_list_json) else: file_list_data = json.loads(file_list_json) # Setup Events events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode']) # Add BEEP_ROOT to processed_dir processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"), processed_dir) file_list = file_list_data['file_list'] run_ids = file_list_data['run_list'] processed_run_list = [] processed_result_list = [] processed_message_list = [] processed_paths_list = [] project_name = get_project_name_from_list(file_list) if predict_only: features = loadfn(file_list[0]) if model_name is None and project_name in DEFAULT_MODEL_PROJECTS: if features.prediction_type == 'multi': model = DegradationModel.from_serialized_model(model_dir=model_dir, serialized_model='d3batt_multi_point.model') else: model = DegradationModel.from_serialized_model(model_dir=model_dir, serialized_model='d3batt_single_point.model') elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS: output_data = {"file_list": [], "run_list": [], "result_list": [], "message_list": [] } events.put_analyzing_event(output_data, 'predicting', 'error') # Return jsonable file list return json.dumps(output_data) else: model = DegradationModel.from_serialized_model(model_dir=model_dir, serialized_model=model_name) else: if hyperparameters is None: hyperparameters = {'random_state': 1, 'test_size': .3, 'k_fold': 5, 'tol': 0.001, 'l1_ratio': [.1, .5, .7, .9, .95, .99, 1] } dataset_id = file_list_data.get("dataset_id") model = DegradationModel.train(file_list_json, dataset_id=dataset_id, model_type='linear', regularization_type='elasticnet', model_name=model_name, hyperparameters=hyperparameters) logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id), extra=s) for path, run_id in zip(file_list, run_ids): logger.info('model=%s run_id=%s predicting=%s', model.name, str(run_id), path, extra=s) features = loadfn(path) prediction = model.predict(features) prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity) new_filename = os.path.basename(path) new_filename = scrub_underscore_suffix(new_filename) new_filename = add_suffix_to_filename(new_filename, "_predictions") processed_path = os.path.join(processed_dir, new_filename) processed_path = os.path.abspath(processed_path) dumpfn(prediction_dict, processed_path) # Append file loc to list to be returned 
processed_paths_list.append(processed_path) processed_run_list.append(run_id) processed_result_list.append("success") processed_message_list.append({'comment': '', 'error': ''}) output_data = {"file_list": processed_paths_list, "run_list": processed_run_list, "result_list": processed_result_list, "message_list": processed_message_list } events.put_analyzing_event(output_data, 'predicting', 'complete') # Return jsonable file list return json.dumps(output_data)
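def _example_predict_only_call():
    """Illustrative sketch, not part of the original module: exercise the
    predict-only branch of process_file_list_from_json above against an
    already-serialized model. The paths are hypothetical; the model filename
    is one of the defaults referenced above."""
    import json

    payload = json.dumps({
        "mode": "test",
        "file_list": ["/data-share/features/example_run_features.json"],
        "run_list": [1],
    })
    return process_file_list_from_json(
        payload,
        model_dir="/data-share/models/",
        model_name="d3batt_multi_point.model",
        predict_only=True,
    )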
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file that contains the names of the files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]

        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )

        # Switch for template invocation
        if template == "EXP.000":
            protocol = Procedure.from_exp(
                **protocol_params[["cutoff_voltage", "charge_rate", "discharge_rate"]]
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "diagnosticV2.000":
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            # TODO: should these be separated?
            protocol = Procedure.from_regcyclev2(protocol_params)
            protocol.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)

        # TODO: how are these different?
        elif template in ["diagnosticV3.000", "diagnosticV4.000"]:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            protocol = Procedure.generate_procedure_regcyclev3(index, protocol_params)
            protocol.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "formationV1.mps":
            protocol = Settings.from_file(os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)
        else:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            continue

        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            protocol.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + "_")
        elif ".sdu" in template:
            logger.warning("Schedule file generation not yet implemented", extra=s)
            result = "error"
            message = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented",
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"
    with open(
        os.path.join(output_directory, "names", namefile), "w", newline=""
    ) as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {
            "comment": "Generated {} protocols".format(str(len(new_files))),
            "error": "",
        }

    return new_files, result, message
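# Illustrative sketch of the parameter CSV consumed above. The column names
# match the protocol_params lookups in the loop; the values themselves are
# hypothetical:
#
#   template,project_name,seq_num,cutoff_voltage,charge_rate,discharge_rate
#   EXP.000,ExampleProj,42,4.2,1.0,2.0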
def generate_protocol_files_from_csv(csv_filename, output_directory, **kwargs):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file that contains the names of the files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
        **kwargs: kwargs to ProcedureFile, the object which does the
            protocol file generation
    """
    # Invoke ProcedureFile object from **kwargs
    procedure_file_generator = ProcedureFile(**kwargs)

    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)
    new_files = []
    names = []
    result = ''
    message = {'comment': '', 'error': ''}
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']
        if template not in ["EXP.000", "diagnosticV1.000",
                            "diagnosticV2.000", "diagnosticV3.000"]:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {'comment': 'Unable to find template: ' + template,
                       'error': 'Not Found'}
            continue

        if ".000" in template:
            # Generate primary procedure dictionary
            proc_dict, sp = procedure_file_generator.to_dict(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "{}".format(template)),
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "{}.json".format(template.split('.')[0]))
            )

            # Generate EXP-based proc_dict
            if template == "EXP.000":
                proc_dict = procedure_file_generator.generate_procedure_exp(
                    proc_dict,
                    **protocol_params[["cutoff_voltage", "charge_rate",
                                       "discharge_rate"]])
            elif template == 'diagnosticV2.000':
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR,
                                 "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[
                    diag_params_df['diagnostic_parameter_set'] ==
                    protocol_params['diagnostic_parameter_set']].squeeze()
                proc_dict = procedure_file_generator.generate_procedure_regcyclev2(
                    proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev2(
                    proc_dict, protocol_params["capacity_nominal"], diagnostic_params)
            elif template == 'diagnosticV3.000':
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR,
                                 "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[
                    diag_params_df['diagnostic_parameter_set'] ==
                    protocol_params['diagnostic_parameter_set']].squeeze()
                proc_dict = procedure_file_generator.generate_procedure_regcyclev3(
                    index, proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev3(
                    proc_dict, protocol_params["capacity_nominal"], diagnostic_params)

            filename_prefix = '_'.join(
                [protocol_params["project_name"],
                 '{:06d}'.format(protocol_params["seq_num"])])
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, 'procedures', filename)
            logger.info(filename, extra=s)
            if not os.path.isfile(filename):
                proc_dict = procedure_file_generator.maccor_format_dict(proc_dict)
                procedure_file_generator.dict_to_xml(
                    proc_dict=proc_dict, xml_file=filename, sp=sp)
                new_files.append(filename)
                names.append(filename_prefix + '_')
        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented', extra=s)
            result = "error"
            message = {'comment': 'Schedule file generation is not yet implemented',
                       'error': 'Not Implemented'}

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile), 'w') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {'comment': 'Generated {} protocols'.format(str(len(new_files))),
                   'error': ''}

    return new_files, result, message
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file that contains the names of the files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    successfully_generated_files = []
    file_generation_failures = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]
        protocol = None

        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )

        if ".000" in template:  # Extension for maccor procedure files
            template_fullpath = os.path.join(PROCEDURE_TEMPLATE_DIR, template)
            template_length = template_detection(template_fullpath)
            if "diagnostic_parameter_set" in protocol_params:
                # For parameter sets that include diagnostics, load those values
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
                )
                diagnostic_params = diag_params_df[
                    diag_params_df["diagnostic_parameter_set"]
                    == protocol_params["diagnostic_parameter_set"]
                ].squeeze()

            if template_length == 23 and template == "EXP.000":
                # Length and name for initial procedure files
                protocol = Procedure.from_exp(
                    **protocol_params[["cutoff_voltage", "charge_rate",
                                       "discharge_rate"]]
                )
            elif template_length == 72:
                # Length for V1 and V1 diagnostic templates without ending diagnostics
                protocol = Procedure.from_regcyclev2(protocol_params)
                protocol.add_procedure_diagcyclev2(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            elif template_length == 96:
                # Template length for diagnostic type cycling
                mwf_dir = os.path.join(output_directory, "mwf_files")
                if protocol_params["project_name"] == "RapidC":
                    # Project with charging waveform
                    waveform_name = insert_charging_parametersv1(
                        protocol_params, waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_chargingv1(
                        index, protocol_params, waveform_name,
                        template=template_fullpath)
                elif protocol_params["project_name"] == "Drive":
                    # Project with discharging waveform
                    waveform_name = insert_driving_parametersv1(
                        protocol_params, waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_drivingv1(
                        index, protocol_params, waveform_name,
                        template=template_fullpath)
                else:
                    # Use the default parameterization for
                    # PreDiag/Prediction Diagnostic projects
                    protocol = Procedure.generate_procedure_regcyclev3(
                        index, protocol_params, template=template_fullpath)
                protocol.generate_procedure_diagcyclev3(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            else:
                # Case where it is not possible to match the procedure template
                failure = {
                    "comment": "Unable to find template: " + template,
                    "error": "Not Found",
                }
                file_generation_failures.append(failure)
                warnings.warn("Unsupported file template {}, skipping.".format(template))
                result = "error"
                continue

            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif ".mps" in template and template == "formationV1.mps":
            # Biologic settings template and formation project
            protocol = Settings.from_file(os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)
        elif ".sdu" in template:
            # No schedule file templates implemented
            failure = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented",
            }
            file_generation_failures.append(failure)
            logger.warning("Schedule file generation not yet implemented", extra=s)
            result = "error"
            continue
        else:
            # Unable to match to any known template format
            failure = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            file_generation_failures.append(failure)
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            continue

        logger.info(filename, extra=s)
        protocol.to_file(filename)
        successfully_generated_files.append(filename)
        names.append(filename_prefix + "_")

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"

    names_dir = os.path.join(output_directory, "names")
    os.makedirs(names_dir, exist_ok=True)
    with open(os.path.join(names_dir, namefile), "w", newline="") as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    num_generated_files = len(successfully_generated_files)
    num_generation_failures = len(file_generation_failures)
    num_files = num_generated_files + num_generation_failures

    message = {
        "comment": "Generated {} of {} protocols".format(num_generated_files, num_files),
        "error": ""
    }
    if not result:
        result = "success"
    else:
        message["error"] = "Failed to generate {} of {} protocols".format(
            num_generation_failures, num_files)
        logger.error(message["error"])

    return successfully_generated_files, file_generation_failures, result, message
def train(self, X: pd.DataFrame = None, y: pd.DataFrame = None):
    """Train on 100% of available data.

    Args:
        X (pd.Dataframe): Clean and homogenized learning features. If not
            specified, df defined in __init__ (all training data) is used.
        y (pd.DataFrame): Clean and homogenized targets. If not specified,
            df defined in __init__ (all training data) is used.

    Returns:
        model (BaseEstimator): The sklearn model, fit on training data.
        training_errors (dict): Training errors based on multiple metrics.
    """
    X = X if X is not None else self.X
    y = y if y is not None else self.y

    if not self.multi:
        y = y[self.targets[0]]

    X = self.scaler.fit_transform(X)

    logger.info(
        f"Training on {X.shape[0]} samples with {X.shape[1]} features "
        f"predicting {y.shape[0]}")

    kwargs = {
        "fit_intercept": True,
        "alphas": self.alphas,
        "cv": self.kfold,
        "max_iter": self.max_iter,
        "tol": self.tol,
        "l1_ratio": self.l1_ratio
    }

    if self.model_name == "elasticnet":
        if self.multi:
            cv_class = MultiTaskElasticNetCV
            model_class = MultiTaskElasticNet
        else:
            cv_class = ElasticNetCV
            model_class = ElasticNet
    elif self.model_name == "lasso":
        cv_class = LassoCV
        model_class = Lasso
        kwargs.pop("l1_ratio")
    elif self.model_name == "ridge":
        cv_class = RidgeCV
        model_class = Ridge
        kwargs.pop("l1_ratio")
        kwargs.pop("max_iter")
        kwargs.pop("tol")

        # Ridge has to have alphas set by hand as it has no
        # default alphas
        if not kwargs["alphas"]:
            kwargs["alphas"] = (1e-3, 1e-2, 1e-1, 1, 10, 100, 1000)
    else:
        raise NotImplementedError(f"Unsupported model '{self.model_name}'")

    # Search for optimal hyperparameters
    cv = cv_class(**kwargs)
    cv.fit(X, y)

    # Set optimal hyperparameters and refit
    optimal_hyperparameters = {"alpha": cv.alpha_}
    if self.model_name == "elasticnet":
        optimal_hyperparameters["l1_ratio"] = cv.l1_ratio_

    model_kwargs = {
        "fit_intercept": True,
        "normalize": False,
        "max_iter": self.max_iter,
    }
    model_kwargs.update(optimal_hyperparameters)
    self.optimal_hyperparameters = optimal_hyperparameters

    model = model_class(**model_kwargs)
    model.fit(X, y)
    self.model = model

    y_training = model.predict(X)
    y_training = pd.DataFrame(data=y_training, columns=self.targets)
    training_errors = self._score_arrays(y, y_training)
    return model, training_errors
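def _example_cv_then_refit(X, y):
    """Illustrative sketch, not part of the class above, of the pattern
    train() uses: search hyperparameters with ElasticNetCV, then refit a
    plain ElasticNet on all of the data with the optimum found. The l1_ratio
    grid mirrors the default hyperparameters used elsewhere in this module."""
    from sklearn.linear_model import ElasticNet, ElasticNetCV

    cv = ElasticNetCV(fit_intercept=True, cv=5,
                      l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0])
    cv.fit(X, y)

    # Refit on 100% of the data with the CV-selected hyperparameters
    model = ElasticNet(fit_intercept=True, alpha=cv.alpha_,
                       l1_ratio=cv.l1_ratio_)
    model.fit(X, y)
    return model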
def process_file_list_from_json(file_list_json, processed_dir="data-share/structure/", omit_raw=True): """Function to take a json filename corresponding to a data structure with a 'file_list' and a 'validity' attribute, process each file with a corresponding True validity, dump the processed file into a predetermined directory, and return a jsonable dict of processed cycler run file locations Args: file_list_json (str): json string or json filename corresponding to a dictionary with a file_list and validity attribute, if this string ends with ".json", a json file is assumed and loaded, otherwise interpreted as a json string. processed_dir (str): location for processed cycler run output files to be placed. omit_raw (bool): Omit the raw_data from being saved to file. Creates legacy file structure for all structured datapaths. Returns: (str): json string of processed files (with key "processed_file_list"). Note that this list contains None values for every file that had a corresponding False in the validity list. """ # Get file list and validity from json, if ends with .json, # assume it's a file, if not assume it's a json string if file_list_json.endswith(".json"): file_list_data = loadfn(file_list_json) else: file_list_data = json.loads(file_list_json) # Setup workflow outputs = WorkflowOutputs() # Prepend optional root to output directory processed_dir = os.path.join(os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir) if not os.path.exists(processed_dir): os.makedirs(processed_dir) file_list = file_list_data["file_list"] validities = file_list_data["validity"] run_ids = file_list_data["run_list"] processed_file_list = [] processed_run_list = [] processed_result_list = [] processed_message_list = [] invalid_file_list = [] for filename, validity, run_id in zip(file_list, validities, run_ids): logger.info("run_id=%s structuring=%s", str(run_id), filename, extra=SERVICE_CONFIG) if validity == "valid": # Process datapath and dump to file dp = auto_load(filename) dp.autostructure() # raw_cycler_run = RawCyclerRun.from_file(filename) # processed_cycler_run = raw_cycler_run.to_processed_cycler_run() new_filename, ext = os.path.splitext(os.path.basename(filename)) new_filename = new_filename + ".json" new_filename = add_suffix_to_filename(new_filename, "_structure") structured_run_loc = os.path.join(processed_dir, new_filename) structured_run_loc = os.path.abspath(structured_run_loc) dp.to_json_file(structured_run_loc, omit_raw) # Append file loc to list to be returned processed_file_list.append(structured_run_loc) processed_run_list.append(run_id) processed_result_list.append("success") processed_message_list.append({"comment": "", "error": ""}) else: invalid_file_list.append(filename) output_json = { "file_list": processed_file_list, "run_list": processed_run_list, "result_list": processed_result_list, "message_list": processed_message_list, "invalid_file_list": invalid_file_list, } # Workflow outputs file_list_size = len(output_json["file_list"]) if file_list_size > 1 or file_list_size == 0: logger.warning("{file_list_size} files being validated, should be 1") output_data = { "filename": output_json["file_list"][0], "run_id": output_json["run_list"][0], "result": output_json["result_list"][0], } outputs.put_workflow_outputs(output_data, "structuring") # Return jsonable file list return json.dumps(output_json)
def validate_from_paths(self, paths, record_results=False,
                        skip_existing=False,
                        record_path=DEFAULT_VALIDATION_RECORDS):
    """
    This method streamlines validation of multiple Arbin csv files given a
    list of paths. It can also do bookkeeping of validations by dumping
    results in a json file, locally until a more centralized method is
    implemented.

    Args:
        paths (list): a list of paths to csv files.
        record_results (bool): Whether to record the validation results
            locally or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True.
            (defaults to False)
        record_path (str): path to the json file storing the past
            validation results.

    Returns:
        dict: Results of the validation in the form of key/value pairs,
            where each key corresponds to the filename validated. For each
            file, the results contain a field "validated", True if
            validation was successful or False if not. "errors", "method"
            and "time" are simply the errors encountered during validation,
            the method used for validation, and the time of validation,
            respectively.
    """
    if record_results:
        if os.path.isfile(record_path):
            self.validation_records = loadfn(record_path)
            if skip_existing:
                paths = [
                    path for path in paths
                    if os.path.basename(path) not in self.validation_records
                ]
        else:
            self.validation_records = {}

    results = {}
    for path in tqdm(paths):
        name = os.path.basename(path)
        results[name] = {}
        if re.match(ARBIN_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-arbin-lfp.yaml")
            self.schema = loadfn(schema_filename)
            df = pd.read_csv(path, index_col=0)
            validated, reason = self.validate(df)
            method = "simple_arbin"
        elif re.match(MACCOR_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-maccor-2170.yaml")
            self.schema = loadfn(schema_filename)
            self.allow_unknown = True
            df = pd.read_csv(path, delimiter='\t', skiprows=1)

            # Columns need to be retyped and renamed for validation,
            # conversion will happen during structuring
            df['State'] = df['State'].astype(str)
            df['current'] = df['Amps']

            validated, reason = self.validate(df)
            method = "simple_maccor"
        else:
            validated, reason = False, "File type not recognized"
            method = None

        results[name].update({
            "validated": validated,
            "method": method,
            "errors": reason,
            "time": json.dumps(datetime.now(), indent=4,
                               sort_keys=True, default=str)
        })

        if validated:
            logger.info("%s method=%s errors=%s", name, method, reason, extra=s)
        else:
            logger.warning("%s method=%s errors=%s", name, method, reason, extra=s)

    if record_results:
        self.validation_records.update(results)
        dumpfn(self.validation_records, record_path)

    return results
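# Illustrative usage sketch (the file path is hypothetical, and the class
# name is assumed to be the validator this method belongs to):
#
#   validator = ValidatorBeep()
#   results = validator.validate_from_paths(
#       ["/data-share/raw_cycler_files/example_arbin_run.csv"])
#   print(results["example_arbin_run.csv"]["validated"])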
def process_file_list_from_json(
    file_list_json,
    model_dir="/data-share/models/",
    processed_dir="data-share/predictions/",
    hyperparameters=None,
    model_name=None,
    predict_only=True,
):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into
    a predetermined directory, and return a jsonable dict of prediction file
    locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string ends
            with ".json", a json file is assumed and loaded; otherwise it
            is interpreted as a json string.
        model_dir (str): location where models are serialized and stored.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        hyperparameters (dict): dictionary of hyperparameters to
            optimize/use for training.
        model_name (str): name of feature generation method.
        predict_only (bool): whether to skip training and only predict with
            an already-serialized model.

    Returns:
        str: json string of feature files (with key "feature_file_list").
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow TODO

    # Add BEEP_PROCESSING_DIR to processed_dir
    processed_dir = os.path.join(
        os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir)
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    run_ids = file_list_data["run_list"]
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)

    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:
            if features.prediction_type == "multi":
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_multi_point.model")
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_single_point.model")
        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {
                "file_list": [],
                "run_list": [],
                "result_list": [],
                "message_list": [],
            }

            # Return jsonable file list
            return json.dumps(output_data)
        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)
    else:
        if hyperparameters is None:
            hyperparameters = {
                "random_state": 1,
                "test_size": 0.3,
                "k_fold": 5,
                "tol": 0.001,
                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
            }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(
            file_list_json,
            dataset_id=dataset_id,
            model_type="linear",
            regularization_type="elasticnet",
            model_name=model_name,
            hyperparameters=hyperparameters,
        )
        logger.warning("fitting=%s dataset=%s", model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info("model=%s run_id=%s predicting=%s", model.name,
                    str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(
            prediction, features.nominal_capacity)

        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({"comment": "", "error": ""})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
    }

    # Return jsonable file list
    return json.dumps(output_data)
def process_file_list_from_json(file_list_json,
                                processed_dir='data-share/features/',
                                features_label='full_model',
                                predict_only=False,
                                prediction_type="multi",
                                predicted_quantity="cycle"):
    """
    Function to take a json file containing processed cycler run file
    locations, extract features, dump the processed file into a
    predetermined directory, and return a jsonable dict of feature file
    locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute. If this string
            ends with ".json", a json file is assumed and loaded;
            otherwise it is interpreted as a json string.
        processed_dir (str): location for processed cycler run output
            files to be placed.
        features_label (str): name of feature generation method.
        predict_only (bool): whether to calculate predictions or not.
        prediction_type (str): single- or multi-point predictions.
        predicted_quantity (str): quantity being predicted - cycle or
            capacity.

    Returns:
        str: json string of feature files (with key "file_list").
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add root path to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"), processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    required_cycle_num = 100  # for full model

    for path, run_id in zip(file_list, run_ids):
        logger.info('run_id=%s featurizing=%s', str(run_id), path, extra=s)

        # Check if there is enough data to try featurizing
        if not len(loadfn(path).summary) > required_cycle_num:
            logger.info("run_id=%s Insufficient data for featurization",
                        str(run_id), extra=s)
            processed_paths_list.append(path)
            processed_run_list.append(run_id)
            processed_result_list.append("incomplete")
            processed_message_list.append({
                'comment': 'Insufficient data for featurization',
                'error': ''
            })
        else:
            processed_data = DegradationPredictor.from_processed_cycler_run_file(
                path,
                features_label=features_label,
                predict_only=predict_only,
                prediction_type=prediction_type,
                predicted_quantity=predicted_quantity)
            new_filename = os.path.basename(path)
            new_filename = scrub_underscore_suffix(new_filename)

            # Append model_name along with "features" to demarcate
            # different models when saving the feature vectors.
            new_filename = add_suffix_to_filename(
                new_filename,
                "_" + features_label + "_" + prediction_type + "_features")
            processed_path = os.path.join(processed_dir, new_filename)
            processed_path = os.path.abspath(processed_path)
            dumpfn(processed_data, processed_path)
            processed_paths_list.append(processed_path)
            processed_run_list.append(run_id)
            processed_result_list.append("success")
            processed_message_list.append({'comment': '', 'error': ''})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list
    }

    events.put_analyzing_event(output_data, 'featurizing', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file that contains the names of the files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ''
    message = {'comment': '', 'error': ''}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']

        # Switch for template invocation
        if template == "EXP.000":
            procedure = Procedure.from_exp(**protocol_params[
                ["cutoff_voltage", "charge_rate", "discharge_rate"]])
        elif template == 'diagnosticV2.000':
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            # TODO: should these be separated?
            procedure = Procedure.from_regcyclev2(protocol_params)
            procedure.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params)

        # TODO: how are these different?
        elif template in ['diagnosticV3.000', 'diagnosticV4.000']:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            procedure = Procedure.generate_procedure_regcyclev3(
                index, protocol_params)
            procedure.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params)
        else:
            warnings.warn(
                "Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                'comment': 'Unable to find template: ' + template,
                'error': 'Not Found'
            }
            continue

        filename_prefix = '_'.join([
            protocol_params["project_name"],
            '{:06d}'.format(protocol_params["seq_num"])
        ])
        filename = "{}.000".format(filename_prefix)
        filename = os.path.join(output_directory, 'procedures', filename)
        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            procedure.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + '_')
        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented',
                           extra=s)
            result = "error"
            message = {
                'comment': 'Schedule file generation is not yet implemented',
                'error': 'Not Implemented'
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile), 'w',
              newline='') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {
            'comment': 'Generated {} protocols'.format(str(len(new_files))),
            'error': ''
        }

    return new_files, result, message