def dummy(ctx, test):
    logger.debug(f"Test msg: {test}")
    logger.info("Info msg")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("CRITICAL MESSAGE!")
    if test == "throw_error":
        raise ValueError("Some error!")
    dumpfn({"example": "status"}, ctx.obj.output_status_json)
def from_file(cls, path, metadata_path=None):
    """Load an Arbin file to a datapath.

    Args:
        path (str, Pathlike): Path to the raw data csv.
        metadata_path (str, None): Path to the metadata file, if it cannot
            be inferred from the path of the raw file.

    Returns:
        (ArbinDatapath)
    """
    data = pd.read_csv(path)
    data.rename(str.lower, axis="columns", inplace=True)

    for column, dtype in ARBIN_CONFIG["data_types"].items():
        if column in data:
            if not data[column].isnull().values.any():
                data[column] = data[column].astype(dtype)

    data.rename(ARBIN_CONFIG["data_columns"], axis="columns", inplace=True)

    metadata_path = metadata_path if metadata_path else path.replace(
        ".csv", "_Metadata.csv")

    if os.path.exists(metadata_path):
        metadata = pd.read_csv(metadata_path)
        metadata.rename(str.lower, axis="columns", inplace=True)
        metadata.rename(ARBIN_CONFIG["metadata_fields"], axis="columns",
                        inplace=True)
        # Note the to_dict, which scrubs numpy typing
        metadata = {
            col: item[0] for col, item in metadata.to_dict("list").items()
        }
    else:
        logger.warning(f"No associated metadata file for Arbin: "
                       f"'{metadata_path}'. No metadata loaded.")
        metadata = {}

    # standardizing time format
    data["date_time_iso"] = data["date_time"].apply(
        lambda x: datetime.utcfromtimestamp(x).replace(
            tzinfo=pytz.UTC).isoformat())

    paths = {"raw": path, "metadata": metadata_path if metadata else None}

    return cls(data, metadata, paths)
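# Usage sketch for the loader above (illustrative only; the file name below is
# hypothetical, and ArbinDatapath is the class named in the docstring Returns):
def _example_load_arbin():
    # Expects "cell01_raw.csv" and, optionally, "cell01_raw_Metadata.csv"
    # next to it; a warning is logged if the metadata file is missing.
    return ArbinDatapath.from_file("cell01_raw.csv")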
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/",
                                processed_dir='data-share/predictions/',
                                hyperparameters=None, model_name=None,
                                predict_only=True):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file
    locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        model_dir (str): location where models are serialized and stored.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training.
        model_name (str): name of feature generation method.
        predict_only (bool): whether to skip training and only predict with
            an already-serialized model.

    Returns:
        str: json string of prediction files (with key "file_list").
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add BEEP_ROOT to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)

    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:
            if features.prediction_type == 'multi':
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model='d3batt_multi_point.model')
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model='d3batt_single_point.model')
        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {"file_list": [],
                           "run_list": [],
                           "result_list": [],
                           "message_list": []
                           }
            events.put_analyzing_event(output_data, 'predicting', 'error')

            # Return jsonable file list
            return json.dumps(output_data)
        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)
    else:
        if hyperparameters is None:
            hyperparameters = {'random_state': 1,
                               'test_size': .3,
                               'k_fold': 5,
                               'tol': 0.001,
                               'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                               }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(file_list_json, dataset_id=dataset_id,
                                       model_type='linear',
                                       regularization_type='elasticnet',
                                       model_name=model_name,
                                       hyperparameters=hyperparameters)
        logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id),
                       extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info('model=%s run_id=%s predicting=%s', model.name,
                    str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction,
                                                   features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({'comment': '', 'error': ''})

    output_data = {"file_list": processed_paths_list,
                   "run_list": processed_run_list,
                   "result_list": processed_result_list,
                   "message_list": processed_message_list
                   }

    events.put_analyzing_event(output_data, 'predicting', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
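# Example of the JSON payload consumed by process_file_list_from_json above.
# The values are hypothetical; the keys "mode", "file_list", "run_list", and
# the optional "dataset_id" are the ones the function actually reads.
example_prediction_request = json.dumps({
    "mode": "test",
    "file_list": ["/data-share/features/PROJ_CELL01_features.json"],
    "run_list": [101],
    "dataset_id": None,
})
# result_json = process_file_list_from_json(example_prediction_request,
#                                           predict_only=True)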
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file containing the names of the protocol files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]

        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )

        # Switch for template invocation
        if template == "EXP.000":
            protocol = Procedure.from_exp(
                **protocol_params[["cutoff_voltage", "charge_rate", "discharge_rate"]]
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "diagnosticV2.000":
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            # TODO: should these be separated?
            protocol = Procedure.from_regcyclev2(protocol_params)
            protocol.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)

        # TODO: how are these different?
        elif template in ["diagnosticV3.000", "diagnosticV4.000"]:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            protocol = Procedure.generate_procedure_regcyclev3(index, protocol_params)
            protocol.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "formationV1.mps":
            protocol = Settings.from_file(os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol, protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)
        else:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            continue

        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            protocol.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + "_")

        elif ".sdu" in template:
            logger.warning("Schedule file generation not yet implemented", extra=s)
            result = "error"
            message = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented",
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier.
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"
    with open(
        os.path.join(output_directory, "names", namefile), "w", newline=""
    ) as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {
            "comment": "Generated {} protocols".format(str(len(new_files))),
            "error": "",
        }

    return new_files, result, message
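# Example of a minimal parameter CSV and call for the generator above. The
# values and paths are hypothetical; the column names are the ones read inside
# the loop, and the output directory must already contain "procedures" and
# "names" subfolders, since the function joins paths into them directly.
#
#   template,project_name,seq_num,cutoff_voltage,charge_rate,discharge_rate
#   EXP.000,ProjectA,1,4.2,1.0,1.0
#
def _example_generate_protocols():
    new_files, result, message = generate_protocol_files_from_csv(
        "ProjectA_parameters.csv", output_directory="/tmp/protocol_output")
    print(result, message["comment"])
    return new_files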
def generate_protocol_files_from_csv(csv_filename, output_directory, **kwargs):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file containing the names of the protocol files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
        **kwargs: kwargs to ProcedureFile, the object which does the protocol
            file generation
    """
    # Invoke ProcedureFile object from **kwargs
    procedure_file_generator = ProcedureFile(**kwargs)

    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ''
    message = {'comment': '', 'error': ''}
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']

        if template not in ["EXP.000", "diagnosticV1.000", "diagnosticV2.000",
                            "diagnosticV3.000"]:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {'comment': 'Unable to find template: ' + template,
                       'error': 'Not Found'}
            continue

        if ".000" in template:
            # Generate primary procedure dictionary
            proc_dict, sp = procedure_file_generator.to_dict(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "{}".format(template)),
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "{}.json".format(template.split('.')[0]))
            )

            # Generate EXP-based proc_dict
            if template == "EXP.000":
                proc_dict = procedure_file_generator.generate_procedure_exp(
                    proc_dict,
                    **protocol_params[["cutoff_voltage", "charge_rate",
                                       "discharge_rate"]])
            elif template == 'diagnosticV2.000':
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR,
                                 "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[
                    diag_params_df['diagnostic_parameter_set'] ==
                    protocol_params['diagnostic_parameter_set']].squeeze()
                proc_dict = procedure_file_generator.generate_procedure_regcyclev2(
                    proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev2(
                    proc_dict, protocol_params["capacity_nominal"],
                    diagnostic_params)
            elif template == 'diagnosticV3.000':
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR,
                                 "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[
                    diag_params_df['diagnostic_parameter_set'] ==
                    protocol_params['diagnostic_parameter_set']].squeeze()
                proc_dict = procedure_file_generator.generate_procedure_regcyclev3(
                    index, proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev3(
                    proc_dict, protocol_params["capacity_nominal"],
                    diagnostic_params)

            filename_prefix = '_'.join(
                [protocol_params["project_name"],
                 '{:06d}'.format(protocol_params["seq_num"])])
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, 'procedures', filename)
            logger.info(filename, extra=s)
            if not os.path.isfile(filename):
                proc_dict = procedure_file_generator.maccor_format_dict(proc_dict)
                procedure_file_generator.dict_to_xml(
                    proc_dict=proc_dict, xml_file=filename, sp=sp)
                new_files.append(filename)
                names.append(filename_prefix + '_')

        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented', extra=s)
            result = "error"
            message = {'comment': 'Schedule file generation is not yet implemented',
                       'error': 'Not Implemented'}

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier.
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile), 'w') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {'comment': 'Generated {} protocols'.format(str(len(new_files))),
                   'error': ''}

    return new_files, result, message
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file containing the names of the protocol files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ''
    message = {'comment': '', 'error': ''}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']

        # Switch for template invocation
        if template == "EXP.000":
            procedure = Procedure.from_exp(
                **protocol_params[["cutoff_voltage", "charge_rate",
                                   "discharge_rate"]])
        elif template == 'diagnosticV2.000':
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            # TODO: should these be separated?
            procedure = Procedure.from_regcyclev2(protocol_params)
            procedure.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params)

        # TODO: how are these different?
        elif template in ['diagnosticV3.000', 'diagnosticV4.000']:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            procedure = Procedure.generate_procedure_regcyclev3(
                index, protocol_params)
            procedure.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params)
        else:
            warnings.warn(
                "Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                'comment': 'Unable to find template: ' + template,
                'error': 'Not Found'
            }
            continue

        filename_prefix = '_'.join([
            protocol_params["project_name"],
            '{:06d}'.format(protocol_params["seq_num"])
        ])
        filename = "{}.000".format(filename_prefix)
        filename = os.path.join(output_directory, 'procedures', filename)
        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            procedure.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + '_')

        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented',
                           extra=s)
            result = "error"
            message = {
                'comment': 'Schedule file generation is not yet implemented',
                'error': 'Not Implemented'
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier.
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime(
        "%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile), 'w',
              newline='') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    if not result:
        result = "success"
        message = {
            'comment': 'Generated {} protocols'.format(str(len(new_files))),
            'error': ''
        }

    return new_files, result, message
def __init__(
        self,
        feature_matrix: BEEPFeatureMatrix,
        target_matrix: BEEPFeatureMatrix,
        targets: List[str],
        model_name: str,
        alphas: Union[None, Iterable[float]] = None,
        train_feature_drop_nan_thresh: float = 0.95,
        train_sample_drop_nan_thresh: float = 0.50,
        predict_sample_nan_thresh: float = 0.75,
        drop_nan_training_targets: bool = False,
        impute_strategy: str = "median",
        kfold: int = 5,
        max_iter: int = 1e6,
        tol: float = 1e-4,
        # only relevant for elasticnet
        l1_ratio: Union[Tuple[float], List[float]] = (0.001, 0.1, 0.5, 0.7,
                                                      0.9, 0.95, 1),
        homogenize_features: bool = True
):
    if model_name not in self.ALLOWED_MODELS:
        raise ValueError(
            f"Model {model_name} not supported by {self.__class__.__name__}"
        )
    if len(targets) < 1:
        raise ValueError("At least one target must be specified")

    self.feature_matrix = feature_matrix
    self.target_matrix = target_matrix
    X = self.feature_matrix.matrix.replace([np.inf, -np.inf], np.nan)
    y = self.target_matrix.matrix.replace([np.inf, -np.inf], np.nan)

    if homogenize_features:
        X = self._remove_param_hash_from_features(X)
        y = self._remove_param_hash_from_features(y)

    if X.shape[0] != y.shape[0]:
        raise BEEPMLExperimentError(
            "Can't run experiment on unequal numbers of input samples.")
    if X.shape[0] < X.shape[1]:
        logger.warning(
            f"Number of samples ({X.shape[0]}) less than number of "
            f"features ({X.shape[1]}); may cause overfitting.")

    # Form the clean feature matrix
    X = X.dropna(axis=1, thresh=train_sample_drop_nan_thresh * X.shape[0])
    X = X.dropna(axis=0, thresh=train_sample_drop_nan_thresh * X.shape[1])
    X = self._impute_df(X, method=impute_strategy)
    self.impute_strategy = impute_strategy

    if X.shape[0] < 2 or X.shape[1] < 1:
        raise BEEPMLExperimentError(
            "Cleaned feature matrix has dimensions of less "
            "than 1 feature or less than 2 samples. Try adjusting "
            "the thresholds for cleaning or examine your feature "
            "matrix.")

    # Form the clean target matrix
    missing_targets = [t for t in targets if t not in y.columns]
    if missing_targets:
        raise BEEPMLExperimentError(
            f"Required target columns missing from "
            f"target matrix: {missing_targets}")
    y = y[targets].loc[X.index]
    if y.isna().any().any():
        if drop_nan_training_targets:
            y = y.dropna(axis=0)
        else:
            raise BEEPMLExperimentError(
                "Target matrix contains nans and drop_nan_targets is "
                "set to False.")
    if y.shape[0] < 2:
        raise BEEPMLExperimentError(
            "Target matrix after dropping nans is less than 2 samples.")

    # Ensure there will be an equal number of X samples
    # and y samples
    self.X = X.loc[y.index]
    self.y = y

    # These features must be present in passed dfs for predictions to work
    self.feature_labels = self.X.columns.tolist()

    self.targets = targets
    self.multi = len(self.targets) > 1

    if self.multi and model_name != "elasticnet":
        raise BEEPMLExperimentError(
            f"Model {model_name} not supported for multiple target "
            f"regression.")

    self.model_name = model_name if model_name else "elasticnet"
    self.model = None

    self.train_feature_drop_thresh = train_feature_drop_nan_thresh
    self.train_sample_drop_thresh = train_sample_drop_nan_thresh
    self.predict_sample_nan_thresh = predict_sample_nan_thresh
    self.drop_nan_training_targets = drop_nan_training_targets

    # todo: this is only to help with deserialization, this could cause
    # todo: contamination in judging test scores when used with
    # todo: train_and_score()
    self.scaler = StandardScaler().fit(X)

    self.kfold = kfold
    self.alphas = alphas
    self.max_iter = max_iter
    self.tol = tol
    self.l1_ratio = l1_ratio
    self.optimal_hyperparameters = None
    self.homogenize_features = homogenize_features
def predict(
        self,
        feature_matrix: Union[BEEPFeatureMatrix, pd.DataFrame],
        homogenize_features: Union[None, bool] = None,
):
    """Use the trained model to predict new degradation characteristics based
    on an incoming feature matrix.

    Args:
        feature_matrix (BEEPFeatureMatrix): The feature matrix to use for
            predicting degradation character.
        homogenize_features (bool, None): Whether to homogenize the incoming
            matrix's features. Overrides homogenize_features as set in
            __init__.

    Returns:
        y_pred (pd.DataFrame): The predictions, in dataframe format.
        dropped (list): List of dropped samples, by incoming df index
            (e.g., filename).
    """
    if not self.model:
        raise BEEPMLExperimentError("No model has been trained.")

    # Condense features down to those required, throwing error if not present
    if isinstance(feature_matrix, BEEPFeatureMatrix):
        X = feature_matrix.matrix
    else:
        X = feature_matrix

    # Make sure features will have the same names if homogenize_features,
    # even if featurizers' hyperparameters are different
    homogenize_features = self.homogenize_features \
        if homogenize_features is None else homogenize_features
    if homogenize_features:
        X = self._remove_param_hash_from_features(X)

    missing_features = [
        f for f in self.feature_labels if f not in X.columns
    ]
    extra_features = [f for f in X.columns if f not in self.feature_labels]
    if missing_features:
        raise BEEPMLExperimentError(
            f"{len(missing_features)} features present in training set not "
            f"present in prediction: "
            f"\n{pprint.pformat(missing_features)}")
    if extra_features:
        logger.warning(
            f"{len(extra_features)} extra features not in training set "
            f"present in prediction set due to fitting with nan threshold "
            f"({self.train_feature_drop_thresh}) - "
            f"these will be dropped: \n{pprint.pformat(extra_features)}")

    # Assemble the correct data while retaining all features
    X_old = copy.deepcopy(X)
    X = X[self.feature_labels].dropna(
        axis=0, thresh=self.predict_sample_nan_thresh * X.shape[1])
    X = self._impute_df(X, self.impute_strategy)

    dropped = []
    if X_old.shape[0] != X.shape[0]:
        # Compare against X.index (membership in a DataFrame checks columns)
        dropped = [s for s in X_old.index if s not in X.index]
        logger.warning(
            f"{len(dropped)} samples dropped due to nan sample threshold "
            f"of {self.predict_sample_nan_thresh}. List of those dropped "
            f"indices is returned by .predict().")

    X_indices = X.index
    X = self.scaler.transform(X)
    y_pred = self.model.predict(X)

    # y_pred is an array, so we reattach the same indices
    # (e.g., if idx contains filenames),
    # which is important in case samples were dropped
    y_pred = pd.DataFrame(data=y_pred, columns=self.targets, index=X_indices)
    return y_pred, dropped
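# Usage sketch for predict() above (illustrative only; "trained_experiment"
# stands for an already-fitted instance of the surrounding class, and the
# incoming dataframe is assumed to carry the training feature columns):
def _example_predict(trained_experiment, new_features: pd.DataFrame):
    y_pred, dropped = trained_experiment.predict(new_features)
    if dropped:
        print(f"{len(dropped)} samples were dropped before prediction")
    return y_pred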
def from_file(cls, path, metadata_path=None):
    """Load an Arbin file to a datapath.

    Args:
        path (str, Pathlike): Path to the raw data csv.
        metadata_path (str, None): Path to metadata file, if it cannot be
            inferred from the path of the raw file.

    Returns:
        (ArbinDatapath)
    """
    data = pd.read_csv(path, index_col=0)
    data.rename(str.lower, axis="columns", inplace=True)

    for column, dtype in cls.conversion_config["data_types"].items():
        if column in data:
            if not data[column].isnull().values.any():
                data[column] = data[column].astype(dtype)

    data.rename(cls.conversion_config["data_columns"], axis="columns",
                inplace=True)

    metadata_path = metadata_path if metadata_path else path.replace(
        ".csv", "_Metadata.csv")

    if os.path.exists(metadata_path):
        metadata = pd.read_csv(metadata_path)
        metadata.rename(str.lower, axis="columns", inplace=True)
        metadata.rename(cls.conversion_config["metadata_fields"],
                        axis="columns", inplace=True)
        # Note the to_dict, which scrubs numpy typing
        metadata = {
            col: item[0] for col, item in metadata.to_dict("list").items()
        }
    else:
        logger.warning(f"No associated metadata file for Arbin: "
                       f"'{metadata_path}'. No metadata loaded.")
        metadata = {}

    # standardizing time format
    data["date_time_iso"] = data["date_time"].apply(
        lambda x: datetime.utcfromtimestamp(x).replace(
            tzinfo=pytz.UTC).isoformat())

    paths = {"raw": path, "metadata": metadata_path if metadata else None}

    # Set schema from filename, if possible; otherwise, use default arbin schema
    project_schema = loadfn(PROJECT_SCHEMA)
    name = os.path.basename(path)
    special_schema_filename = project_schema.get(name.split("_")[0],
                                                 {}).get("arbin")
    if special_schema_filename:
        schema = os.path.join(VALIDATION_SCHEMA_DIR, special_schema_filename)
    else:
        schema = os.path.join(VALIDATION_SCHEMA_DIR, "schema-arbin-lfp.yaml")

    return cls(data, metadata, paths=paths, schema=schema)
def process_file_list_from_json(file_list_json,
                                processed_dir="data-share/structure/",
                                omit_raw=True):
    """Function to take a json filename corresponding to a data structure
    with a 'file_list' and a 'validity' attribute, process each file with
    a corresponding True validity, dump the processed file into a
    predetermined directory, and return a jsonable dict of processed
    cycler run file locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list and validity attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        processed_dir (str): location for processed cycler run output
            files to be placed.
        omit_raw (bool): Omit the raw_data from being saved to file. Creates
            legacy file structure for all structured datapaths.

    Returns:
        (str): json string of processed files (with key "processed_file_list").
            Note that this list contains None values for every file that had
            a corresponding False in the validity list.
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow
    outputs = WorkflowOutputs()

    # Prepend optional root to output directory
    processed_dir = os.path.join(
        os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir
    )

    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    validities = file_list_data["validity"]
    run_ids = file_list_data["run_list"]
    processed_file_list = []
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    invalid_file_list = []
    for filename, validity, run_id in zip(file_list, validities, run_ids):
        logger.info("run_id=%s structuring=%s", str(run_id), filename,
                    extra=SERVICE_CONFIG)
        if validity == "valid":
            # Process datapath and dump to file
            dp = auto_load(filename)
            dp.autostructure()

            # raw_cycler_run = RawCyclerRun.from_file(filename)
            # processed_cycler_run = raw_cycler_run.to_processed_cycler_run()
            new_filename, ext = os.path.splitext(os.path.basename(filename))
            new_filename = new_filename + ".json"
            new_filename = add_suffix_to_filename(new_filename, "_structure")
            structured_run_loc = os.path.join(processed_dir, new_filename)
            structured_run_loc = os.path.abspath(structured_run_loc)
            dp.to_json_file(structured_run_loc, omit_raw)

            # Append file loc to list to be returned
            processed_file_list.append(structured_run_loc)
            processed_run_list.append(run_id)
            processed_result_list.append("success")
            processed_message_list.append({"comment": "", "error": ""})
        else:
            invalid_file_list.append(filename)

    output_json = {
        "file_list": processed_file_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
        "invalid_file_list": invalid_file_list,
    }

    # Workflow outputs
    file_list_size = len(output_json["file_list"])
    if file_list_size > 1 or file_list_size == 0:
        logger.warning(f"{file_list_size} files being structured, should be 1")
    output_data = {
        "filename": output_json["file_list"][0],
        "run_id": output_json["run_list"][0],
        "result": output_json["result_list"][0],
    }

    outputs.put_workflow_outputs(output_data, "structuring")

    # Return jsonable file list
    return json.dumps(output_json)
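# Example of the JSON input consumed by the structuring runner above. Paths
# and run ids are hypothetical; "file_list", "run_list", and "validity" are
# the keys the function reads.
example_structuring_request = json.dumps({
    "file_list": ["/data-share/raw/PROJ_CELL01.csv"],
    "run_list": [7],
    "validity": ["valid"],
})
# output_json = process_file_list_from_json(example_structuring_request)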
def __init__(
        self,
        feature_matrix: BEEPFeatureMatrix,
        features: List[str],
        targets: List[str],
        train_feature_drop_nan_thresh: float = 0.75,
        train_sample_drop_nan_thresh: float = 0.50,
        drop_nan_training_targets: bool = True,
        impute_strategy: str = "median",
        n_splits: int = 5,
        homogenize_features: bool = True,
        random_state: int = 10,
        split_columns: List[str] = None,
        exclusion_columns: List[str] = None,
        drop_split_threshold: float = 0.5,
):
    self.feature_matrix = feature_matrix
    if homogenize_features:
        self.feature_matrix.matrix = self._remove_param_hash_from_features(
            self.feature_matrix.matrix)

    # Form the clean feature and target matrices
    missing_columns = [t for t in targets + features
                       if t not in self.feature_matrix.matrix.columns]
    if split_columns is not None:
        missing_columns += [t for t in split_columns
                            if t not in self.feature_matrix.matrix.columns]
    if exclusion_columns is not None:
        missing_columns += [t for t in exclusion_columns
                            if t not in self.feature_matrix.matrix.columns]
    if missing_columns:
        raise BEEPDataSplitterError(
            f"Required columns missing from "
            f"feature matrix: {missing_columns}"
        )

    retain_columns = features + \
        (split_columns if split_columns is not None else []) + \
        (exclusion_columns if exclusion_columns is not None else [])
    X = self.feature_matrix.matrix[retain_columns]
    y = self.feature_matrix.matrix[targets]

    X = X.replace([np.inf, -np.inf], np.nan)
    y = y.replace([np.inf, -np.inf], np.nan)

    # Form the clean feature matrix
    X = X.dropna(axis=1, thresh=train_feature_drop_nan_thresh * X.shape[0])
    X = X.dropna(axis=0, thresh=train_sample_drop_nan_thresh * X.shape[1])
    if exclusion_columns is not None:
        X[exclusion_columns] = X[exclusion_columns].fillna(value=False,
                                                           axis='columns')
    X = self._impute_df(X, method=impute_strategy)
    self.impute_strategy = impute_strategy

    # Create an aggregate column to group splits on by concatenating
    # split column values
    if split_columns is not None:
        X["grouping_column"] = X.apply(
            lambda x: "::".join([str(x[s]) for s in split_columns]), axis=1)
        unique_grouping_values = X["grouping_column"].unique()

    if exclusion_columns is not None:
        if len(exclusion_columns) > 1:
            is_included_condition = reduce(
                lambda c1, c2: c1 & c2,
                [X[e] for e in exclusion_columns[1:]],
                X[exclusion_columns[0]])
        else:
            is_included_condition = X[exclusion_columns[0]]
        X_incl = X[is_included_condition]

        # Check if any entire split should be excluded
        if split_columns is not None:
            exclude_groups = []
            for group in unique_grouping_values:
                X_group = X[X["grouping_column"] == group]
                X_incl_group = X_incl[X_incl["grouping_column"] == group]
                if len(X_incl_group) / len(X_group) < drop_split_threshold:
                    exclude_groups.append(group)
            self.exclude_groups = exclude_groups
            X_incl = X_incl[~X_incl["grouping_column"].isin(exclude_groups)]
        X = X_incl

    if X.shape[0] < X.shape[1]:
        logger.warning(
            f"Number of samples ({X.shape[0]}) less than number of "
            f"features ({X.shape[1]}); may cause overfitting."
        )

    if X.shape[0] < 2 or X.shape[1] < 1:
        raise BEEPDataSplitterError(
            "Cleaned feature matrix has dimensions of less "
            "than 1 feature or less than 2 samples. Try adjusting "
            "the thresholds for cleaning or examine your feature "
            "matrix."
        )

    y = y.loc[X.index]
    if y.isna().any().any():
        if drop_nan_training_targets:
            y = y.dropna(axis=0)
        else:
            raise BEEPDataSplitterError(
                "Target matrix contains nans and drop_nan_targets is "
                "set to False."
            )
    if y.shape[0] < 2:
        raise BEEPDataSplitterError(
            "Target matrix after dropping nans is less than 2 samples."
        )

    # Ensure there will be an equal number of X samples
    # and y samples
    self.X = X.loc[y.index]
    self.y = y

    self.feature_labels = [c for c in self.X.columns if c in features]
    self.targets = targets
    self.multi = len(self.targets) > 1
    self.train_feature_drop_nan_thresh = train_feature_drop_nan_thresh
    self.train_sample_drop_nan_thresh = train_sample_drop_nan_thresh
    self.drop_nan_training_targets = drop_nan_training_targets
    self.homogenize_features = homogenize_features
    self.n_splits = n_splits
    self.random_state = random_state
    self.split_columns = split_columns
    self.datasets = None
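# Usage sketch for the splitter constructor above. Illustrative only: the
# class name "BEEPDataSplitter" is inferred from the BEEPDataSplitterError it
# raises, and every column name below is hypothetical.
def _example_split(feature_matrix: BEEPFeatureMatrix):
    splitter = BEEPDataSplitter(
        feature_matrix=feature_matrix,
        features=["charge_throughput", "initial_capacity"],
        targets=["cycles_to_80pct"],
        n_splits=5,
        split_columns=["seq_num"],
    )
    return splitter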
def validate_from_paths(self, paths, record_results=False, skip_existing=False,
                        record_path=DEFAULT_VALIDATION_RECORDS):
    """
    This method streamlines validation of multiple Arbin csv files given a
    list of paths. It can also do bookkeeping of validations by dumping
    results in a json file, locally until a more centralized method is
    implemented.

    Args:
        paths (list): a list of paths to csv files.
        record_results (bool): Whether to record the validation results
            locally or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True.
            (defaults to False)
        record_path (str): path to the json file storing the past validation
            results.

    Returns:
        dict: Results of the validation in the form of key-value pairs, where
            each key corresponds to the filename validated. For each file,
            the results contain a field "validated", True if validation was
            successful or False if not. "errors", "method" and "time" are
            simply the errors encountered during validation, method used for
            validation, and time of validation, respectively.
    """
    if record_results:
        if os.path.isfile(record_path):
            self.validation_records = loadfn(record_path)
            if skip_existing:
                paths = [
                    path for path in paths
                    if os.path.basename(path) not in self.validation_records
                ]
        else:
            self.validation_records = {}

    results = {}
    for path in tqdm(paths):
        name = os.path.basename(path)
        results[name] = {}
        if re.match(ARBIN_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-arbin-lfp.yaml")
            self.schema = loadfn(schema_filename)
            df = pd.read_csv(path, index_col=0)
            validated, reason = self.validate(df)
            method = "simple_arbin"
        elif re.match(MACCOR_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-maccor-2170.yaml")
            self.schema = loadfn(schema_filename)
            self.allow_unknown = True
            df = pd.read_csv(path, delimiter='\t', skiprows=1)

            # Columns need to be retyped and renamed for validation,
            # conversion will happen during structuring
            df['State'] = df['State'].astype(str)
            df['current'] = df['Amps']

            validated, reason = self.validate(df)
            method = "simple_maccor"
        else:
            validated, reason = False, "File type not recognized"
            method = None

        results[name].update({
            "validated": validated,
            "method": method,
            "errors": reason,
            "time": json.dumps(datetime.now(), indent=4, sort_keys=True,
                               default=str)
        })

        if validated:
            logger.info("%s method=%s errors=%s", name, method, reason,
                        extra=s)
        else:
            logger.warning("%s method=%s errors=%s", name, method, reason,
                           extra=s)

    if record_results:
        self.validation_records.update(results)
        dumpfn(self.validation_records, record_path)

    return results
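# Usage sketch for validate_from_paths above (illustrative only; the file name
# is hypothetical, and SimpleValidator is assumed to provide this method, as
# suggested by validate_file_list_from_json later in this section):
def _example_validate_paths():
    validator = SimpleValidator()
    results = validator.validate_from_paths(
        ["PROJ_CELL01_CH33.csv"], record_results=False)
    for name, res in results.items():
        print(name, res["validated"], res["errors"])
    return results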
def process_file_list_from_json(
        file_list_json,
        model_dir="/data-share/models/",
        processed_dir="data-share/predictions/",
        hyperparameters=None,
        model_name=None,
        predict_only=True,
):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file
    locations.

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        model_dir (str): location where models are serialized and stored.
        processed_dir (str): location for processed cycler run output files
            to be placed.
        hyperparameters (dict): dictionary of hyperparameters to optimize/use
            for training.
        model_name (str): name of feature generation method.
        predict_only (bool): whether to skip training and only predict with
            an already-serialized model.

    Returns:
        str: json string of prediction files (with key "file_list").
    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow TODO

    # Add BEEP_PROCESSING_DIR to processed_dir
    processed_dir = os.path.join(
        os.environ.get("BEEP_PROCESSING_DIR", "/"), processed_dir
    )
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    run_ids = file_list_data["run_list"]
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)

    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:
            if features.prediction_type == "multi":
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_multi_point.model")
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_single_point.model")
        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {
                "file_list": [],
                "run_list": [],
                "result_list": [],
                "message_list": [],
            }

            # Return jsonable file list
            return json.dumps(output_data)
        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)
    else:
        if hyperparameters is None:
            hyperparameters = {
                "random_state": 1,
                "test_size": 0.3,
                "k_fold": 5,
                "tol": 0.001,
                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
            }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(
            file_list_json,
            dataset_id=dataset_id,
            model_type="linear",
            regularization_type="elasticnet",
            model_name=model_name,
            hyperparameters=hyperparameters,
        )
        logger.warning("fitting=%s dataset=%s", model.name, str(dataset_id),
                       extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info("model=%s run_id=%s predicting=%s", model.name,
                    str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction,
                                                   features.nominal_capacity)

        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({"comment": "", "error": ""})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
    }

    # Return jsonable file list
    return json.dumps(output_data)
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from csv filename input by
    reading protocol file input corresponding to each line of the csv file.
    Writes a csv file containing the names of the protocol files generated.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    successfully_generated_files = []
    file_generation_failures = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR

    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]
        protocol = None

        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )

        if ".000" in template:
            # Extension for maccor procedure files
            template_fullpath = os.path.join(PROCEDURE_TEMPLATE_DIR, template)
            template_length = template_detection(template_fullpath)
            if "diagnostic_parameter_set" in protocol_params:
                # For parameter sets that include diagnostics, load those values
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR,
                                 "PreDiag_parameters - DP.csv")
                )
                diagnostic_params = diag_params_df[
                    diag_params_df["diagnostic_parameter_set"]
                    == protocol_params["diagnostic_parameter_set"]
                ].squeeze()

            if template_length == 23 and template == "EXP.000":
                # Length and name for initial procedure files
                protocol = Procedure.from_exp(
                    **protocol_params[["cutoff_voltage", "charge_rate",
                                       "discharge_rate"]]
                )
            elif template_length == 72:
                # Length for V1 and V1 diagnostic templates without ending diagnostics
                protocol = Procedure.from_regcyclev2(protocol_params)
                protocol.add_procedure_diagcyclev2(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            elif template_length == 96:
                # Template length for diagnostic type cycling
                mwf_dir = os.path.join(output_directory, "mwf_files")
                if protocol_params["project_name"] == "RapidC":
                    # Project with charging waveform
                    waveform_name = insert_charging_parametersv1(
                        protocol_params, waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_chargingv1(
                        index, protocol_params, waveform_name,
                        template=template_fullpath)
                elif protocol_params["project_name"] == "Drive":
                    # Project with discharging waveform
                    waveform_name = insert_driving_parametersv1(
                        protocol_params, waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_drivingv1(
                        index, protocol_params, waveform_name,
                        template=template_fullpath)
                else:
                    # Use the default parameterization for
                    # PreDiag/Prediction Diagnostic projects
                    protocol = Procedure.generate_procedure_regcyclev3(
                        index, protocol_params, template=template_fullpath)

                protocol.generate_procedure_diagcyclev3(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            else:
                # Case where it's not possible to match the procedure template
                failure = {
                    "comment": "Unable to find template: " + template,
                    "error": "Not Found",
                }
                file_generation_failures.append(failure)
                warnings.warn("Unsupported file template {}, skipping.".format(template))
                result = "error"
                continue

            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)

        elif ".mps" in template and template == "formationV1.mps":
            # Biologic settings template and formation project
            protocol = Settings.from_file(
                os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)

        elif ".sdu" in template:
            # No schedule file templates implemented
            failure = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented"
            }
            file_generation_failures.append(failure)
            logger.warning("Schedule file generation not yet implemented",
                           extra=s)
            result = "error"
            continue

        else:
            # Unable to match to any known template format
            failure = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            file_generation_failures.append(failure)
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            continue

        logger.info(filename, extra=s)
        protocol.to_file(filename)
        successfully_generated_files.append(filename)
        names.append(filename_prefix + "_")

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier.
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"

    names_dir = os.path.join(output_directory, "names")
    os.makedirs(names_dir, exist_ok=True)

    with open(os.path.join(names_dir, namefile), "w", newline="") as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])

    num_generated_files = len(successfully_generated_files)
    num_generation_failures = len(file_generation_failures)
    num_files = num_generated_files + num_generation_failures

    message = {
        "comment": "Generated {} of {} protocols".format(num_generated_files,
                                                         num_files),
        "error": ""
    }
    if not result:
        result = "success"
    else:
        message["error"] = "Failed to generate {} of {} protocols".format(
            num_generation_failures, num_files)
        logger.error(message["error"])

    return successfully_generated_files, file_generation_failures, result, message
def validate_file_list_from_json(
        file_list_json,
        record_results=False,
        skip_existing=False,
        validator_class=SimpleValidator,
):
    """
    Validates a list of files from json input.

    Args:
        file_list_json (str): input for validation files, should be a json
            string with attribute "file_list" or a filename (e.g.
            something.json) corresponding to a json object with a similar
            attribute.
        record_results (bool): Whether to record the validation results
            locally or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True.
            (defaults to False)
        validator_class (ValidatorBeep or SimpleValidator): validator class
            to use in validation.

    Returns:
        str: json dump of the validator results.
    """
    # Process input json
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow
    outputs = WorkflowOutputs()

    file_list = file_list_data["file_list"]

    validator = validator_class()
    all_results = validator.validate_from_paths(
        file_list,
        record_results=record_results,
        skip_existing=skip_existing,
    )

    # Get validities and recast to strings (valid/invalid) based on result
    validity = [
        all_results[os.path.split(file)[-1]]["validated"] for file in file_list
    ]
    validity = list(map(lambda x: "valid" if x else "invalid", validity))

    # Get errors
    errors = [
        all_results[os.path.split(file)[-1]]["errors"] for file in file_list
    ]
    messages = [{"comment": "", "error": error} for error in errors]

    output_json = {
        "file_list": file_list,
        "run_list": file_list_data["run_list"],
        "validity": validity,
        "message_list": messages,
    }

    # Workflow outputs
    file_list_size = len(output_json["file_list"])
    if file_list_size > 1 or file_list_size == 0:
        logger.warning(f"{file_list_size} files being validated, should be 1")
    output_data = {
        "filename": output_json["file_list"][0],
        "run_id": output_json["run_list"][0],
        "result": output_json["validity"][0],
    }

    outputs.put_workflow_outputs(output_data, "validating")

    return json.dumps(output_json)
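# Example of the JSON input accepted by validate_file_list_from_json above.
# The path and run id are hypothetical; "file_list" and "run_list" are the
# keys the function reads, and the output includes a parallel "validity" list.
example_validation_request = json.dumps({
    "file_list": ["/data-share/raw/PROJ_CELL01.csv"],
    "run_list": [7],
})
# validation_json = validate_file_list_from_json(example_validation_request)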