Example #1
def dummy(ctx, test):
    logger.debug(f"Test msg: {test}")
    logger.info("Info msg")
    logger.warning("Warning message")
    logger.error("Error message")
    logger.critical("CRITICAL MESSAGE!")

    if test == "throw_error":
        raise ValueError("Some error!")

    dumpfn({"example": "status"}, ctx.obj.output_status_json)
Example #2
    def from_file(cls, path, metadata_path=None):
        """Load an Arbin file to a datapath.

        Args:
            path (str, Pathlike): Path to the raw data csv.
            metadata_path (str, None): Path to metadata file, if it
                cannot be inferred from the path of the raw file.

        Returns:
            (ArbinDatapath)
        """
        data = pd.read_csv(path)
        data.rename(str.lower, axis="columns", inplace=True)

        for column, dtype in ARBIN_CONFIG["data_types"].items():
            if column in data:
                if not data[column].isnull().values.any():
                    data[column] = data[column].astype(dtype)

        data.rename(ARBIN_CONFIG["data_columns"], axis="columns", inplace=True)

        metadata_path = metadata_path if metadata_path else path.replace(
            ".csv", "_Metadata.csv")

        if os.path.exists(metadata_path):
            metadata = pd.read_csv(metadata_path)
            metadata.rename(str.lower, axis="columns", inplace=True)
            metadata.rename(ARBIN_CONFIG["metadata_fields"],
                            axis="columns",
                            inplace=True)
            # Note the to_dict, which scrubs numpy typing
            metadata = {
                col: item[0]
                for col, item in metadata.to_dict("list").items()
            }
        else:
            logger.warning(f"No associated metadata file for Arbin: "
                           f"'{metadata_path}'. No metadata loaded.")
            metadata = {}

        # standardizing time format
        data["date_time_iso"] = data["date_time"].apply(
            lambda x: datetime.utcfromtimestamp(x).replace(tzinfo=pytz.UTC
                                                           ).isoformat())

        paths = {"raw": path, "metadata": metadata_path if metadata else None}

        return cls(data, metadata, paths)
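A hypothetical call site for this classmethod, assuming it is bound to the ArbinDatapath class named in the Returns section; the import path and csv name are assumptions for illustration only.

# Hypothetical usage (import path and filename are assumptions, not from the source):
# from beep.structure.arbin import ArbinDatapath
# datapath = ArbinDatapath.from_file("arbin_run.csv")
# The loader also reads "arbin_run_Metadata.csv" if it exists next to the raw
# file; otherwise it logs a warning and proceeds with empty metadata.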
Example #3
def process_file_list_from_json(file_list_json, model_dir="/data-share/models/",
                                processed_dir='data-share/predictions/',
                                hyperparameters=None, model_name=None, predict_only=True):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string
        model_dir (str): location where models are serialized and stored
        processed_dir (str): location for processed cycler run output files
            to be placed
        hyperparameters (dict): dictionary of hyperparameters to optimize/use for training
        model_name (str): name of feature generation method
        predict_only (bool): if True, use an existing serialized model for
            prediction only; if False, train a new model before predicting

    Returns:
        str: json string of prediction file locations (with key "file_list").

    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup Events
    events = KinesisEvents(service='DataAnalyzer', mode=file_list_data['mode'])

    # Add BEEP_ROOT to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_ROOT", "/"),
                                 processed_dir)
    file_list = file_list_data['file_list']
    run_ids = file_list_data['run_list']
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)
    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:

            if features.prediction_type == 'multi':
                model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                               serialized_model='d3batt_multi_point.model')
            else:
                model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                               serialized_model='d3batt_single_point.model')

        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {"file_list": [],
                           "run_list": [],
                           "result_list": [],
                           "message_list": []
                           }

            events.put_analyzing_event(output_data, 'predicting', 'error')

            # Return jsonable file list
            return json.dumps(output_data)

        else:
            model = DegradationModel.from_serialized_model(model_dir=model_dir,
                                                           serialized_model=model_name)

    else:
        if hyperparameters is None:
            hyperparameters = {'random_state': 1,
                               'test_size': .3,
                               'k_fold': 5,
                               'tol': 0.001,
                               'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]
                               }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(file_list_json, dataset_id=dataset_id,
                                       model_type='linear', regularization_type='elasticnet',
                                       model_name=model_name, hyperparameters=hyperparameters)
        logger.warning('fitting=%s dataset=%s', model.name, str(dataset_id), extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info('model=%s run_id=%s predicting=%s', model.name, str(run_id), path, extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction, features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({'comment': '',
                                       'error': ''})

    output_data = {"file_list": processed_paths_list,
                   "run_list": processed_run_list,
                   "result_list": processed_result_list,
                   "message_list": processed_message_list
                   }

    events.put_analyzing_event(output_data, 'predicting', 'complete')

    # Return jsonable file list
    return json.dumps(output_data)
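A hypothetical input payload for this function, with keys inferred from the reads above ("mode", "file_list", "run_list"); the path and values are illustrative.

import json

payload = json.dumps({
    "mode": "test",  # consumed by KinesisEvents
    "file_list": ["/data-share/features/run_000_features.json"],
    "run_list": [0],
})
# prediction_json = process_file_list_from_json(payload, predict_only=True)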
Example #4
def generate_protocol_files_from_csv(csv_filename, output_directory=None):

    """
    Generates a set of protocol files from a csv input file by reading the
    protocol parameters corresponding to each line of the csv file. Also
    writes a csv file listing the names of the generated protocol files.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]
        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )

        # Switch for template invocation
        if template == "EXP.000":
            protocol = Procedure.from_exp(
                **protocol_params[["cutoff_voltage", "charge_rate", "discharge_rate"]]
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "diagnosticV2.000":
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            # TODO: should these be separated?
            protocol = Procedure.from_regcyclev2(protocol_params)
            protocol.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        # TODO: how are these different?
        elif template in ["diagnosticV3.000", "diagnosticV4.000"]:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
            )
            diagnostic_params = diag_params_df[
                diag_params_df["diagnostic_parameter_set"]
                == protocol_params["diagnostic_parameter_set"]
            ].squeeze()

            protocol = Procedure.generate_procedure_regcyclev3(index, protocol_params)
            protocol.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params
            )
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)
        elif template == "formationV1.mps":
            protocol = Settings.from_file(os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol, protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)
        else:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            continue

        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            protocol.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + "_")

        elif ".sdu" in template:
            logger.warning("Schedule file generation not yet implemented", extra=s)
            result = "error"
            message = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented",
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"
    with open(
        os.path.join(output_directory, "names", namefile), "w", newline=""
    ) as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])
    outputfile.close()

    if not result:
        result = "success"
        message = {
            "comment": "Generated {} protocols".format(str(len(new_files))),
            "error": "",
        }

    return new_files, result, message
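A hypothetical parameter csv for the EXP.000 branch above; the column names come from the lookups in the function body, the values are illustrative.

import pandas as pd

params = pd.DataFrame([{
    "template": "EXP.000",
    "project_name": "PROJ",
    "seq_num": 1,
    "cutoff_voltage": 4.2,
    "charge_rate": 1.0,
    "discharge_rate": 1.0,
}])
params.to_csv("protocol_params.csv", index=False)
# Requires existing "procedures" and "names" subdirectories in output_directory:
# new_files, result, message = generate_protocol_files_from_csv(
#     "protocol_params.csv", output_directory=".")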
Example #5
def generate_protocol_files_from_csv(csv_filename, output_directory, **kwargs):
    """
    Generates a set of protocol files from a csv input file by reading the
    protocol parameters corresponding to each line of the csv file. Also
    writes a csv file listing the names of the generated protocol files.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
        **kwargs: kwargs to ProcedureFile, the object which does the protocol
            file generation
    """
    # Invoke ProcedureFile object from **kwargs
    procedure_file_generator = ProcedureFile(**kwargs)

    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ''
    message = {'comment': '',
               'error': ''}
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']
        if template not in ["EXP.000", "diagnosticV1.000", "diagnosticV2.000", "diagnosticV3.000"]:
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {'comment': 'Unable to find template: ' + template,
                       'error': 'Not Found'}
            continue

        if ".000" in template:
            # Generate primary procedure dictionary
            proc_dict, sp = procedure_file_generator.to_dict(
                os.path.join(PROCEDURE_TEMPLATE_DIR, "{}".format(template)),
                os.path.join(PROCEDURE_TEMPLATE_DIR, "{}.json".format(template.split('.')[0]))
            )

            # Generate EXP-based proc_dict
            if template == "EXP.000":
                proc_dict = procedure_file_generator.generate_procedure_exp(
                    proc_dict, **protocol_params[["cutoff_voltage", "charge_rate", "discharge_rate"]])
            elif template == 'diagnosticV2.000':
                diag_params_df = pd.read_csv(os.path.join(PROCEDURE_TEMPLATE_DIR,
                                                          "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[diag_params_df['diagnostic_parameter_set'] ==
                                                   protocol_params['diagnostic_parameter_set']].squeeze()

                proc_dict = procedure_file_generator.generate_procedure_regcyclev2(
                    proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev2(
                    proc_dict, protocol_params["capacity_nominal"], diagnostic_params)
            elif template == 'diagnosticV3.000':
                diag_params_df = pd.read_csv(os.path.join(PROCEDURE_TEMPLATE_DIR,
                                                          "PreDiag_parameters - DP.csv"))
                diagnostic_params = diag_params_df[diag_params_df['diagnostic_parameter_set'] ==
                                                   protocol_params['diagnostic_parameter_set']].squeeze()

                proc_dict = procedure_file_generator.generate_procedure_regcyclev3(index,
                    proc_dict, protocol_params)
                proc_dict = procedure_file_generator.generate_procedure_diagcyclev3(
                    proc_dict, protocol_params["capacity_nominal"], diagnostic_params)

            filename_prefix = '_'.join(
                [protocol_params["project_name"], '{:06d}'.format(protocol_params["seq_num"])])
            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, 'procedures', filename)
            logger.info(filename, extra=s)
            if not os.path.isfile(filename):
                proc_dict = procedure_file_generator.maccor_format_dict(proc_dict)
                procedure_file_generator.dict_to_xml(
                    proc_dict=proc_dict, xml_file=filename, sp=sp)
                new_files.append(filename)
                names.append(filename_prefix + '_')

        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented', extra=s)
            result = "error"
            message = {'comment': 'Schedule file generation is not yet implemented',
                       'error': 'Not Implemented'}

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile), 'w') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])
    outputfile.close()

    if not result:
        result = "success"
        message = {'comment': 'Generated {} protocols'.format(str(len(new_files))),
                   'error': ''}

    return new_files, result, message
Example #6
def generate_protocol_files_from_csv(csv_filename, output_directory=None):
    """
    Generates a set of protocol files from a csv input file by reading the
    protocol parameters corresponding to each line of the csv file. Also
    writes a csv file listing the names of the generated protocol files.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    new_files = []
    names = []
    result = ''
    message = {'comment': '', 'error': ''}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR
    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params['template']

        # Switch for template invocation
        if template == "EXP.000":
            procedure = Procedure.from_exp(**protocol_params[
                ["cutoff_voltage", "charge_rate", "discharge_rate"]])
        elif template == 'diagnosticV2.000':
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            # TODO: should these be separated?
            procedure = Procedure.from_regcyclev2(protocol_params)
            procedure.add_procedure_diagcyclev2(
                protocol_params["capacity_nominal"], diagnostic_params)

        # TODO: how are these different?
        elif template in ['diagnosticV3.000', 'diagnosticV4.000']:
            diag_params_df = pd.read_csv(
                os.path.join(PROCEDURE_TEMPLATE_DIR,
                             "PreDiag_parameters - DP.csv"))
            diagnostic_params = diag_params_df[
                diag_params_df['diagnostic_parameter_set'] ==
                protocol_params['diagnostic_parameter_set']].squeeze()

            procedure = Procedure.generate_procedure_regcyclev3(
                index, protocol_params)
            procedure.generate_procedure_diagcyclev3(
                protocol_params["capacity_nominal"], diagnostic_params)
        else:
            warnings.warn(
                "Unsupported file template {}, skipping.".format(template))
            result = "error"
            message = {
                'comment': 'Unable to find template: ' + template,
                'error': 'Not Found'
            }
            continue

        filename_prefix = '_'.join([
            protocol_params["project_name"],
            '{:06d}'.format(protocol_params["seq_num"])
        ])
        filename = "{}.000".format(filename_prefix)
        filename = os.path.join(output_directory, 'procedures', filename)
        logger.info(filename, extra=s)
        if not os.path.isfile(filename):
            procedure.to_file(filename)
            new_files.append(filename)
            names.append(filename_prefix + '_')

        elif '.sdu' in template:
            logger.warning('Schedule file generation not yet implemented',
                           extra=s)
            result = "error"
            message = {
                'comment': 'Schedule file generation is not yet implemented',
                'error': 'Not Implemented'
            }

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split('_')[0] + '_names_'
    namefile = namefile + datetime.datetime.now().strftime(
        "%Y%m%d_%H%M") + '.csv'
    with open(os.path.join(output_directory, "names", namefile),
              'w',
              newline='') as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])
    outputfile.close()

    if not result:
        result = "success"
        message = {
            'comment': 'Generated {} protocols'.format(str(len(new_files))),
            'error': ''
        }

    return new_files, result, message
Example #7
    def __init__(
            self,
            feature_matrix: BEEPFeatureMatrix,
            target_matrix: BEEPFeatureMatrix,
            targets: List[str],
            model_name: str,
            alphas: Union[None, Iterable[float]] = None,
            train_feature_drop_nan_thresh: float = 0.95,
            train_sample_drop_nan_thresh: float = 0.50,
            predict_sample_nan_thresh: float = 0.75,
            drop_nan_training_targets: bool = False,
            impute_strategy: str = "median",
            kfold: int = 5,
            max_iter: int = 1e6,
            tol: float = 1e-4,
            # only relevant for elasticnet
            l1_ratio: Union[Tuple[float], List[float]] = (0.001, 0.1, 0.5, 0.7,
                                                          0.9, 0.95, 1),
            homogenize_features: bool = True):
        if model_name not in self.ALLOWED_MODELS:
            raise ValueError(
                f"Model {model_name} not supported by {self.__class__.__name__}"
            )

        if len(targets) < 1:
            raise ValueError("At least one target must be specified")

        self.feature_matrix = feature_matrix
        self.target_matrix = target_matrix

        X = self.feature_matrix.matrix.replace([np.inf, -np.inf], np.nan)
        y = self.target_matrix.matrix.replace([np.inf, -np.inf], np.nan)

        if homogenize_features:
            X = self._remove_param_hash_from_features(X)
            y = self._remove_param_hash_from_features(y)

        if X.shape[0] != y.shape[0]:
            raise BEEPMLExperimentError(
                "Can't run experiment on unequal numbers of input samples.")
        if X.shape[0] < X.shape[1]:
            logger.warning(
                f"Number of samples ({X.shape[0]}) less than number of "
                f"features ({X.shape[1]}); may cause overfitting.")

        # Form the clean feature matrix
        X = X.dropna(axis=1, thresh=train_feature_drop_nan_thresh * X.shape[0])
        X = X.dropna(axis=0, thresh=train_sample_drop_nan_thresh * X.shape[1])
        X = self._impute_df(X, method=impute_strategy)
        self.impute_strategy = impute_strategy
        if X.shape[0] < 2 or X.shape[1] < 1:
            raise BEEPMLExperimentError(
                "Cleaned feature matrix has dimensions of less "
                "than 1 feature or less than 2 samples. Try adjusting "
                "the thresholds for cleaning or examine your feature "
                "matrix.")

        # Form the clean target matrix
        missing_targets = [t for t in targets if t not in y.columns]
        if missing_targets:
            raise BEEPMLExperimentError(
                f"Required target columns missing from "
                f"target matrix: {missing_targets}")
        y = y[targets].loc[X.index]
        if y.isna().any().any():
            if drop_nan_training_targets:
                y = y.dropna(axis=0)
            else:
                raise BEEPMLExperimentError(
                    "Target matrix contains nans and drop_nan_targets is "
                    "set to False.")
        if y.shape[0] < 2:
            raise BEEPMLExperimentError(
                "Target matrix after dropping nans is less than 2 samples.")

        # Ensure there will be an equal number of X samples
        # and y samples
        self.X = X.loc[y.index]
        self.y = y

        # These features must be present in passed dfs for predictions to work
        self.feature_labels = self.X.columns.tolist()

        self.targets = targets

        self.multi = len(self.targets) > 1

        if self.multi and model_name != "elasticnet":
            raise BEEPMLExperimentError(
                f"Model {model_name} not supported for multiple target "
                f"regression.")

        self.model_name = model_name if model_name else "elasticnet"
        self.model = None

        self.train_feature_drop_thresh = train_feature_drop_nan_thresh
        self.train_sample_drop_thresh = train_sample_drop_nan_thresh
        self.predict_sample_nan_thresh = predict_sample_nan_thresh
        self.drop_nan_training_targets = drop_nan_training_targets

        # todo: this is only to help with deserialization, this could cause
        # todo: contamination in judging test scores when used with
        # todo: train_and_score()
        self.scaler = StandardScaler().fit(X)
        self.kfold = kfold
        self.alphas = alphas
        self.max_iter = max_iter
        self.tol = tol
        self.l1_ratio = l1_ratio

        self.optimal_hyperparameters = None
        self.homogenize_features = homogenize_features
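The feature/sample cleaning above relies on pandas' thresh argument, which keeps a column (axis=1) or row (axis=0) only if it has at least that many non-null values; a small self-contained illustration follows.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1.0, np.nan, 3.0, 4.0],
    "b": [np.nan, np.nan, np.nan, 4.0],
    "c": [1.0, 2.0, 3.0, 4.0],
})
# Keep columns with at least 75% non-null values: drops "b" (1 of 4 non-null).
print(df.dropna(axis=1, thresh=int(0.75 * df.shape[0])))
# Keep rows with at least 50% non-null values: every row survives here.
print(df.dropna(axis=0, thresh=int(0.5 * df.shape[1])))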
Example #8
    def predict(
        self,
        feature_matrix: Union[BEEPFeatureMatrix, pd.DataFrame],
        homogenize_features: Union[None, bool] = None,
    ):
        """Use the trained model to predict new degradation characteristics
        based on an incoming feature matrix.


        Args:
            feature_matrix (BEEPFeatureMatrix): The feature matrix to use
                for predicting degradation character.
            homogenize_features (bool, None): Whether to homogenize the
                incoming matrix's features. Overrides homogenize_features
                as set in __init__.

        Returns:
            y_pred (pd.DataFrame): The predictions, in dataframe format.
            dropped (list): List of dropped samples, by incoming df
                index (e.g., filename).

        """
        if not self.model:
            raise BEEPMLExperimentError("No model has been trained.")

        # condense features down to those required, throwing error if not present

        if isinstance(feature_matrix, BEEPFeatureMatrix):
            X = feature_matrix.matrix
        else:
            X = feature_matrix

        # make sure features will have the same names if homogenize_features
        # is set, even if the featurizers' hyperparameters are different
        homogenize_features = self.homogenize_features if homogenize_features is None else homogenize_features
        if homogenize_features:
            X = self._remove_param_hash_from_features(X)

        missing_features = [
            f for f in self.feature_labels if f not in X.columns
        ]
        extra_features = [f for f in X.columns if f not in self.feature_labels]
        if missing_features:
            raise BEEPMLExperimentError(
                f"{len(missing_features)} features present in training set not present "
                f"in prediction: "
                f"\n{pprint.pformat(missing_features)}")
        if extra_features:
            logger.warning(
                f"{len(extra_features)} extra features not in training set present in "
                f"prediction set due to fitting with nan threshold ({self.train_feature_drop_thresh}) - "
                f"these will be dropped: \n{pprint.pformat(extra_features)}")

        # Assemble the correct data while retaining all features
        X_old = copy.deepcopy(X)
        X = X[self.feature_labels].dropna(
            axis=0, thresh=self.predict_sample_nan_thresh * X.shape[1])
        X = self._impute_df(X, self.impute_strategy)

        dropped = []
        if X_old.shape[0] != X.shape[0]:
            dropped = [s for s in X_old.index if s not in X.index]
            logger.warning(
                f"{len(dropped)} samples dropped due to nan sample threshold "
                f"of {self.predict_sample_nan_thresh}. List of those dropped "
                f"indices is returned by .predict().")

        X_indices = X.index
        X = self.scaler.transform(X)
        y_pred = self.model.predict(X)

        # y_pred is an array, so we reattach the same indices
        # e.g., if idx contains filenames
        # which is important in case samples were dropped
        y_pred = pd.DataFrame(data=y_pred,
                              columns=self.targets,
                              index=X_indices)
        return y_pred, dropped
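A hypothetical prediction call, assuming a trained instance (named experiment purely for illustration) and an already-built feature matrix fm.

# Hypothetical usage; `experiment` and `fm` are assumed to exist already.
# y_pred, dropped = experiment.predict(fm, homogenize_features=True)
# y_pred has one column per target and is indexed like the surviving input
# rows; `dropped` lists indices removed by the predict_sample_nan_thresh filter.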
Example #9
    def from_file(cls, path, metadata_path=None):
        """Load an Arbin file to a datapath.

        Args:
            path (str, Pathlike): Path to the raw data csv.
            metadata_path (str, None): Path to metadata file, if it
                cannot be inferred from the path of the raw file.

        Returns:
            (ArbinDatapath)
        """
        data = pd.read_csv(path, index_col=0)
        data.rename(str.lower, axis="columns", inplace=True)

        for column, dtype in cls.conversion_config["data_types"].items():
            if column in data:
                if not data[column].isnull().values.any():
                    data[column] = data[column].astype(dtype)

        data.rename(cls.conversion_config["data_columns"],
                    axis="columns",
                    inplace=True)

        metadata_path = metadata_path if metadata_path else path.replace(
            ".csv", "_Metadata.csv")

        if os.path.exists(metadata_path):
            metadata = pd.read_csv(metadata_path)
            metadata.rename(str.lower, axis="columns", inplace=True)
            metadata.rename(cls.conversion_config["metadata_fields"],
                            axis="columns",
                            inplace=True)
            # Note the to_dict, which scrubs numpy typing
            metadata = {
                col: item[0]
                for col, item in metadata.to_dict("list").items()
            }
        else:
            logger.warning(f"No associated metadata file for Arbin: "
                           f"'{metadata_path}'. No metadata loaded.")
            metadata = {}

        # standardizing time format
        data["date_time_iso"] = data["date_time"].apply(
            lambda x: datetime.utcfromtimestamp(x).replace(tzinfo=pytz.UTC
                                                           ).isoformat())

        paths = {"raw": path, "metadata": metadata_path if metadata else None}

        # Set schema from filename, if possible; otherwise, use default arbin schema
        project_schema = loadfn(PROJECT_SCHEMA)
        name = os.path.basename(path)
        special_schema_filename = project_schema.get(name.split("_")[0],
                                                     {}).get("arbin")

        if special_schema_filename:
            schema = os.path.join(VALIDATION_SCHEMA_DIR,
                                  special_schema_filename)
        else:
            schema = os.path.join(VALIDATION_SCHEMA_DIR,
                                  "schema-arbin-lfp.yaml")

        return cls(data, metadata, paths=paths, schema=schema)
Example #10
def process_file_list_from_json(file_list_json,
                                processed_dir="data-share/structure/",
                                omit_raw=True):
    """Function to take a json filename corresponding to a data structure
    with a 'file_list' and a 'validity' attribute, process each file
    with a corresponding True validity, dump the processed file into
    a predetermined directory, and return a jsonable dict of processed
    cycler run file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list and validity attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string.
        processed_dir (str): location for processed cycler run output
            files to be placed.
        omit_raw (bool): Omit the raw_data from being saved to file. Creates
            legacy file structure for all structured datapaths.

    Returns:
        (str): json string of processed files (with key "processed_file_list").
            Note that this list contains None values for every file that
            had a corresponding False in the validity list.

    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow
    outputs = WorkflowOutputs()

    # Prepend optional root to output directory
    processed_dir = os.path.join(os.environ.get("BEEP_PROCESSING_DIR", "/"),
                                 processed_dir)

    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    validities = file_list_data["validity"]
    run_ids = file_list_data["run_list"]
    processed_file_list = []
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    invalid_file_list = []
    for filename, validity, run_id in zip(file_list, validities, run_ids):
        logger.info("run_id=%s structuring=%s",
                    str(run_id),
                    filename,
                    extra=SERVICE_CONFIG)
        if validity == "valid":
            # Process datapath and dump to file

            dp = auto_load(filename)
            dp.autostructure()

            # raw_cycler_run = RawCyclerRun.from_file(filename)
            # processed_cycler_run = raw_cycler_run.to_processed_cycler_run()
            new_filename, ext = os.path.splitext(os.path.basename(filename))
            new_filename = new_filename + ".json"
            new_filename = add_suffix_to_filename(new_filename, "_structure")
            structured_run_loc = os.path.join(processed_dir, new_filename)
            structured_run_loc = os.path.abspath(structured_run_loc)
            dp.to_json_file(structured_run_loc, omit_raw)

            # Append file loc to list to be returned
            processed_file_list.append(structured_run_loc)
            processed_run_list.append(run_id)
            processed_result_list.append("success")
            processed_message_list.append({"comment": "", "error": ""})

        else:
            invalid_file_list.append(filename)

    output_json = {
        "file_list": processed_file_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
        "invalid_file_list": invalid_file_list,
    }

    # Workflow outputs
    file_list_size = len(output_json["file_list"])
    if file_list_size > 1 or file_list_size == 0:
        logger.warning("{file_list_size} files being validated, should be 1")

    output_data = {
        "filename": output_json["file_list"][0],
        "run_id": output_json["run_list"][0],
        "result": output_json["result_list"][0],
    }

    outputs.put_workflow_outputs(output_data, "structuring")

    # Return jsonable file list
    return json.dumps(output_json)
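A hypothetical input payload for this structuring function, with keys inferred from the reads above ("file_list", "validity", "run_list"); the path is illustrative.

import json

payload = json.dumps({
    "file_list": ["/data-share/raw/arbin/run_000.csv"],
    "validity": ["valid"],
    "run_list": [0],
})
# structured_json = process_file_list_from_json(payload, processed_dir="data-share/structure/")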
Example #11
    def __init__(
            self,
            feature_matrix: BEEPFeatureMatrix,
            features: List[str],
            targets: List[str],
            train_feature_drop_nan_thresh: float = 0.75,
            train_sample_drop_nan_thresh: float = 0.50,
            drop_nan_training_targets: bool = True,
            impute_strategy: str = "median",
            n_splits: int = 5,
            homogenize_features: bool = True,
            random_state: int = 10,
            split_columns: List[str] = None,
            exclusion_columns: List[str] = None,
            drop_split_threshold: float = 0.5,
    ):

        self.feature_matrix = feature_matrix

        if homogenize_features:
            self.feature_matrix.matrix = self._remove_param_hash_from_features(self.feature_matrix.matrix)

        # Form the clean feature and target matrices
        missing_columns = [t for t in targets+features if t not in self.feature_matrix.matrix.columns]

        if split_columns is not None:
            missing_columns += [t for t in split_columns if t not in self.feature_matrix.matrix.columns] 
        if exclusion_columns is not None:
            missing_columns += [t for t in exclusion_columns if t not in self.feature_matrix.matrix.columns] 

        if missing_columns:
            raise BEEPDataSplitterError(
                f"Required columns missing from "
                f"feature matrix: {missing_columns}"
            )

        retain_columns = features + (split_columns if split_columns is not None else []) + \
            (exclusion_columns if exclusion_columns is not None else []) 
        X = self.feature_matrix.matrix[retain_columns]
        y = self.feature_matrix.matrix[targets]

        X = X.replace([np.inf, -np.inf], np.nan)
        y = y.replace([np.inf, -np.inf], np.nan)

        # Form the clean feature matrix
        X = X.dropna(axis=1, thresh=train_feature_drop_nan_thresh * X.shape[0])
        X = X.dropna(axis=0, thresh=train_sample_drop_nan_thresh * X.shape[1])

        if exclusion_columns is not None:
            X[exclusion_columns] = X[exclusion_columns].fillna(value=False, axis='columns')

        X = self._impute_df(X, method=impute_strategy)

        self.impute_strategy = impute_strategy

        # Create an aggregate column to group splits on by concatenating split column values
        if split_columns is not None:
            X["grouping_column"] = X.apply(lambda x: "::".join([str(x[s]) for s in split_columns]), axis=1)
            unique_grouping_values = X["grouping_column"].unique()

        if exclusion_columns is not None:

            if len(exclusion_columns) > 1:
                is_included_condition = reduce(lambda c1, c2: c1 & c2, [
                                               X[e] for e in exclusion_columns[1:]], X[exclusion_columns[0]])
            else:
                is_included_condition = X[exclusion_columns[0]]

            X_incl = X[is_included_condition]
            # Check if any entire split should be excluded
            if split_columns is not None:
                exclude_groups = []
                for group in unique_grouping_values:
                    X_group = X[X["grouping_column"] == group]
                    X_incl_group = X_incl[X_incl["grouping_column"] == group]

                    if len(X_incl_group)/len(X_group) < drop_split_threshold:
                        exclude_groups.append(group)

                self.exclude_groups = exclude_groups
                X_incl = X_incl[~X_incl["grouping_column"].isin(exclude_groups)]

            X = X_incl

        if X.shape[0] < X.shape[1]:
            logger.warning(
                f"Number of samples ({X.shape[0]}) less than number of "
                f"features ({X.shape[1]}); may cause overfitting."
            )

        if X.shape[0] < 2 or X.shape[1] < 1:
            raise BEEPDataSplitterError(
                "Cleaned feature matrix has dimensions of less "
                "than 1 feature or less than 2 samples. Try adjusting "
                "the thresholds for cleaning or examine your feature "
                "matrix."
            )

        y = y.loc[X.index]
        if y.isna().any().any():
            if drop_nan_training_targets:
                y = y.dropna(axis=0)
            else:
                raise BEEPDataSplitterError(
                    "Target matrix contains nans and drop_nan_targets is "
                    "set to False."
                )
        if y.shape[0] < 2:
            raise BEEPDataSplitterError(
                "Target matrix after dropping nans is less than 2 samples."
            )

        # Ensure there will be an equal number of X samples
        # and y samples
        self.X = X.loc[y.index]
        self.y = y

        self.feature_labels = [c for c in self.X.columns if c in features]

        self.targets = targets

        self.multi = len(self.targets) > 1

        self.train_feature_drop_nan_thresh = train_feature_drop_nan_thresh
        self.train_sample_drop_nan_thresh = train_sample_drop_nan_thresh
        self.drop_nan_training_targets = drop_nan_training_targets
        self.homogenize_features = homogenize_features
        self.n_splits = n_splits
        self.random_state = random_state
        self.split_columns = split_columns
        self.datasets = None
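The grouping column built above simply concatenates the split-column values per row with "::"; a small illustration of that step in isolation follows (column names are illustrative).

import pandas as pd

df = pd.DataFrame({"charge_rate": [1.0, 2.0], "temperature": [25, 45]})
split_columns = ["charge_rate", "temperature"]
df["grouping_column"] = df.apply(
    lambda x: "::".join([str(x[s]) for s in split_columns]), axis=1)
print(df["grouping_column"].tolist())  # ['1.0::25', '2.0::45']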
Example #12
    def validate_from_paths(self,
                            paths,
                            record_results=False,
                            skip_existing=False,
                            record_path=DEFAULT_VALIDATION_RECORDS):
        """
        This method streamlines validation of multiple Arbin csv files given a list of paths.

        It can also do bookkeeping of validations by dumping results in a json file,
        locally until a more centralized method is implemented.

        Args:
            paths (list): a list of paths to csv files
            record_results (bool): Whether to record the validation results locally or not (defaults to False)
            skip_existing (bool): Whether to skip already validated files. This is done by checking if the
                                    file is in the validation_records. skip_existing only matters if record_results
                                    is True. (defaults to False)
            record_path (str): path to the json file storing the past validation results.
        Returns:
            dict: Results of the validation in the form of key/value pairs, where each key corresponds to the filename
                validated. For each file, the results contain a field "validated", True if validation was successful or
                False if not. "errors", "method" and "time" are simply the errors encountered during validation, method
                used for validation, and time of validation, respectively.

        """
        if record_results:
            if os.path.isfile(record_path):
                self.validation_records = loadfn(record_path)
                if skip_existing:
                    paths = [
                        path for path in paths if os.path.basename(path) not in
                        self.validation_records
                    ]
            else:
                self.validation_records = {}

        results = {}
        for path in tqdm(paths):
            name = os.path.basename(path)
            results[name] = {}
            if re.match(ARBIN_CONFIG['file_pattern'], path):
                schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                               "schema-arbin-lfp.yaml")
                self.schema = loadfn(schema_filename)
                df = pd.read_csv(path, index_col=0)
                validated, reason = self.validate(df)
                method = "simple_arbin"
            elif re.match(MACCOR_CONFIG['file_pattern'], path):
                schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                               "schema-maccor-2170.yaml")
                self.schema = loadfn(schema_filename)
                self.allow_unknown = True
                df = pd.read_csv(path, delimiter='\t', skiprows=1)

                # Columns need to be retyped and renamed for validation,
                # conversion will happen during structuring
                df['State'] = df['State'].astype(str)
                df['current'] = df['Amps']

                validated, reason = self.validate(df)
                method = "simple_maccor"
            else:
                validated, reason = False, "File type not recognized"
                method = None
            results[name].update({
                "validated": validated,
                "method": method,
                "errors": reason,
                "time": json.dumps(datetime.now(),
                                   indent=4,
                                   sort_keys=True,
                                   default=str),
            })

            if validated:
                logger.info("%s method=%s errors=%s",
                            name,
                            method,
                            reason,
                            extra=s)
            else:
                logger.warning("%s method=%s errors=%s",
                               name,
                               method,
                               reason,
                               extra=s)

        if record_results:
            self.validation_records.update(results)
            dumpfn(self.validation_records, record_path)

        return results
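A hypothetical call, using the SimpleValidator class that Example #15 below passes into this same method; the csv path is illustrative and would need to exist on disk.

# Hypothetical usage (validator class taken from Example #15, path illustrative):
# validator = SimpleValidator()
# results = validator.validate_from_paths(["raw/arbin_run_CH29.csv"], record_results=False)
# Each entry of `results` then carries the "validated", "method", "errors",
# and "time" keys built above.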
Example #13
def process_file_list_from_json(
    file_list_json,
    model_dir="/data-share/models/",
    processed_dir="data-share/predictions/",
    hyperparameters=None,
    model_name=None,
    predict_only=True,
):
    """
    Function to take a json file containing featurized json locations,
    train a new model if necessary, write files containing predictions into a
    predetermined directory, and return a jsonable dict of prediction file locations

    Args:
        file_list_json (str): json string or json filename corresponding
            to a dictionary with a file_list attribute,
            if this string ends with ".json", a json file is assumed
            and loaded, otherwise interpreted as a json string
        model_dir (str): location where models are serialized and stored
        processed_dir (str): location for processed cycler run output files
            to be placed
        hyperparameters (dict): dictionary of hyperparameters to optimize/use for training
        model_name (str): name of feature generation method
        predict_only (bool): if True, use an existing serialized model for
            prediction only; if False, train a new model before predicting

    Returns:
        str: json string of prediction file locations (with key "file_list").

    """
    # Get file list and validity from json, if ends with .json,
    # assume it's a file, if not assume it's a json string
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow TODO

    # Add BEEP_PROCESSING_DIR to processed_dir
    processed_dir = os.path.join(os.environ.get("BEEP_PROCESSING_DIR", "/"),
                                 processed_dir)
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    file_list = file_list_data["file_list"]
    run_ids = file_list_data["run_list"]
    processed_run_list = []
    processed_result_list = []
    processed_message_list = []
    processed_paths_list = []
    project_name = get_project_name_from_list(file_list)
    if predict_only:
        features = loadfn(file_list[0])
        if model_name is None and project_name in DEFAULT_MODEL_PROJECTS:

            if features.prediction_type == "multi":
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_multi_point.model")
            else:
                model = DegradationModel.from_serialized_model(
                    model_dir=model_dir,
                    serialized_model="d3batt_single_point.model")

        elif model_name is None and project_name not in DEFAULT_MODEL_PROJECTS:
            output_data = {
                "file_list": [],
                "run_list": [],
                "result_list": [],
                "message_list": [],
            }

            # Return jsonable file list
            return json.dumps(output_data)

        else:
            model = DegradationModel.from_serialized_model(
                model_dir=model_dir, serialized_model=model_name)

    else:
        if hyperparameters is None:
            hyperparameters = {
                "random_state": 1,
                "test_size": 0.3,
                "k_fold": 5,
                "tol": 0.001,
                "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
            }

        dataset_id = file_list_data.get("dataset_id")
        model = DegradationModel.train(
            file_list_json,
            dataset_id=dataset_id,
            model_type="linear",
            regularization_type="elasticnet",
            model_name=model_name,
            hyperparameters=hyperparameters,
        )
        logger.warning("fitting=%s dataset=%s",
                       model.name,
                       str(dataset_id),
                       extra=s)

    for path, run_id in zip(file_list, run_ids):
        logger.info("model=%s run_id=%s predicting=%s",
                    model.name,
                    str(run_id),
                    path,
                    extra=s)
        features = loadfn(path)
        prediction = model.predict(features)
        prediction_dict = model.prediction_to_dict(prediction,
                                                   features.nominal_capacity)
        new_filename = os.path.basename(path)
        new_filename = scrub_underscore_suffix(new_filename)
        new_filename = add_suffix_to_filename(new_filename, "_predictions")
        processed_path = os.path.join(processed_dir, new_filename)
        processed_path = os.path.abspath(processed_path)
        dumpfn(prediction_dict, processed_path)

        # Append file loc to list to be returned
        processed_paths_list.append(processed_path)
        processed_run_list.append(run_id)
        processed_result_list.append("success")
        processed_message_list.append({"comment": "", "error": ""})

    output_data = {
        "file_list": processed_paths_list,
        "run_list": processed_run_list,
        "result_list": processed_result_list,
        "message_list": processed_message_list,
    }

    # Return jsonable file list
    return json.dumps(output_data)
Example #14
def generate_protocol_files_from_csv(csv_filename, output_directory=None):

    """
    Generates a set of protocol files from a csv input file by reading the
    protocol parameters corresponding to each line of the csv file. Also
    writes a csv file listing the names of the generated protocol files.

    Args:
        csv_filename (str): CSV containing protocol file parameters.
        output_directory (str): directory in which to place the output files
    """
    # Read csv file
    protocol_params_df = pd.read_csv(csv_filename)

    successfully_generated_files = []
    file_generation_failures = []
    names = []
    result = ""
    message = {"comment": "", "error": ""}
    if output_directory is None:
        output_directory = PROCEDURE_TEMPLATE_DIR

    for index, protocol_params in protocol_params_df.iterrows():
        template = protocol_params["template"]
        protocol = None
        # Filename for the output
        filename_prefix = "_".join(
            [
                protocol_params["project_name"],
                "{:06d}".format(protocol_params["seq_num"]),
            ]
        )
        if ".000" in template:  # Extension for maccor procedure files
            template_fullpath = os.path.join(PROCEDURE_TEMPLATE_DIR, template)
            template_length = template_detection(template_fullpath)
            if "diagnostic_parameter_set" in protocol_params:  # For parameters include diagnostics load those values
                diag_params_df = pd.read_csv(
                    os.path.join(PROCEDURE_TEMPLATE_DIR, "PreDiag_parameters - DP.csv")
                )
                diagnostic_params = diag_params_df[
                    diag_params_df["diagnostic_parameter_set"]
                    == protocol_params["diagnostic_parameter_set"]
                    ].squeeze()

            if template_length == 23 and template == "EXP.000":  # length and name for initial procedure files
                protocol = Procedure.from_exp(
                    **protocol_params[["cutoff_voltage", "charge_rate", "discharge_rate"]]
                )
            elif template_length == 72:  # length for V1 and V1 diagnostic templates without ending diagnostics
                protocol = Procedure.from_regcyclev2(protocol_params)
                protocol.add_procedure_diagcyclev2(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            elif template_length == 96:  # template length for diagnostic type cycling
                mwf_dir = os.path.join(output_directory, "mwf_files")
                if protocol_params["project_name"] == "RapidC":  # Project with charging waveform
                    waveform_name = insert_charging_parametersv1(protocol_params,
                                                                 waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_chargingv1(index,
                                                                       protocol_params,
                                                                       waveform_name,
                                                                       template=template_fullpath)
                elif protocol_params["project_name"] == "Drive":  # Project with discharging waveform
                    waveform_name = insert_driving_parametersv1(protocol_params,
                                                                waveform_directory=mwf_dir)
                    protocol = Procedure.generate_procedure_drivingv1(index,
                                                                      protocol_params,
                                                                      waveform_name,
                                                                      template=template_fullpath)
                else:  # Use the default parameterization for PreDiag/Prediction Diagnostic projects
                    protocol = Procedure.generate_procedure_regcyclev3(index,
                                                                       protocol_params,
                                                                       template=template_fullpath)
                protocol.generate_procedure_diagcyclev3(
                    protocol_params["capacity_nominal"], diagnostic_params
                )
            else:  # Case where it's not possible to match the procedure template
                failure = {
                    "comment": "Unable to find template: " + template,
                    "error": "Not Found",
                }
                file_generation_failures.append(failure)
                warnings.warn("Unsupported file template {}, skipping.".format(template))
                result = "error"
                continue

            filename = "{}.000".format(filename_prefix)
            filename = os.path.join(output_directory, "procedures", filename)

        elif ".mps" in template and template == "formationV1.mps":  # biologic settings template and formation project
            protocol = Settings.from_file(os.path.join(BIOLOGIC_TEMPLATE_DIR, template))
            protocol = protocol.formation_protocol_bcs(protocol_params)
            filename = "{}.mps".format(filename_prefix)
            filename = os.path.join(output_directory, "settings", filename)
        elif ".sdu" in template:  # No schedule file templates implemented
            failure = {
                "comment": "Schedule file generation is not yet implemented",
                "error": "Not Implemented"
            }
            file_generation_failures.append(failure)
            logger.warning("Schedule file generation not yet implemented", extra=s)
            result = "error"
            continue
        else:  # Unable to match to any known template format
            failure = {
                "comment": "Unable to find template: " + template,
                "error": "Not Found",
            }
            file_generation_failures.append(failure)
            warnings.warn("Unsupported file template {}, skipping.".format(template))
            result = "error"
            continue

        logger.info(filename, extra=s)
        protocol.to_file(filename)
        successfully_generated_files.append(filename)
        names.append(filename_prefix + "_")

    # This block of code produces the file containing all of the run file
    # names produced in this function call. This is to make starting tests easier
    _, namefile = os.path.split(csv_filename)
    namefile = namefile.split("_")[0] + "_names_"
    namefile = namefile + datetime.datetime.now().strftime("%Y%m%d_%H%M") + ".csv"

    names_dir = os.path.join(output_directory, "names")
    os.makedirs(names_dir, exist_ok=True)

    with open(os.path.join(names_dir, namefile), "w", newline="") as outputfile:
        wr = csv.writer(outputfile)
        for name in names:
            wr.writerow([name])
    outputfile.close()

    num_generated_files = len(successfully_generated_files)
    num_generation_failures = len(file_generation_failures)
    num_files = num_generated_files + num_generation_failures

    message = {
        "comment": "Generated {} of {} protocols".format(num_generated_files, num_files),
        "error": ""
    }
    if not result:
        result = "success"
    else:
        message["error"] = "Failed to generate {} of {} protocols".format(num_generation_failures, num_files)
        logger.error(message["error"])

    return successfully_generated_files, file_generation_failures, result, message
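Unlike the earlier versions, this variant also returns the per-file failures; a hypothetical call site (csv path and output directory are illustrative) might look like the following.

# Hypothetical usage:
# new_files, failures, result, message = generate_protocol_files_from_csv(
#     "protocol_params.csv", output_directory="protocol_output")
# print(result, message["comment"])   # e.g. "success Generated 1 of 1 protocols"
# print(len(failures), "failures")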
Example #15
def validate_file_list_from_json(
    file_list_json,
    record_results=False,
    skip_existing=False,
    validator_class=SimpleValidator,
):
    """
    Validates a list of files from json input

    Args:
        file_list_json (str): input for validation files, should be a json string
            with attribute "file_list" or a filename (e. g. something.json)
            corresponding to a json object with a similar attribute.
        record_results (bool): Whether to record the validation results locally
            or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True. (defaults to False)
        validator_class (ValidatorBeep or SimpleValidator): validator class
            to use in validation.

    Returns:
        str: json dump of the validator results.

    """
    # Process input json
    if file_list_json.endswith(".json"):
        file_list_data = loadfn(file_list_json)
    else:
        file_list_data = json.loads(file_list_json)

    # Setup workflow
    outputs = WorkflowOutputs()

    file_list = file_list_data["file_list"]

    validator = validator_class()
    all_results = validator.validate_from_paths(
        file_list,
        record_results=record_results,
        skip_existing=skip_existing,
    )

    # Get validities and recast to strings (valid/invalid) based on result
    validity = [
        all_results[os.path.split(file)[-1]]["validated"] for file in file_list
    ]

    validity = list(map(lambda x: "valid" if x else "invalid", validity))

    # Get errors
    errors = [
        all_results[os.path.split(file)[-1]]["errors"] for file in file_list
    ]
    messages = [{"comment": "", "error": error} for error in errors]
    output_json = {
        "file_list": file_list,
        "run_list": file_list_data["run_list"],
        "validity": validity,
        "message_list": messages,
    }

    # Workflow outputs
    file_list_size = len(output_json["file_list"])
    if file_list_size > 1 or file_list_size == 0:
        logger.warning("{file_list_size} files being validated, should be 1")

    output_data = {
        "filename": output_json["file_list"][0],
        "run_id": output_json["run_list"][0],
        "result": output_json["validity"][0],
    }

    outputs.put_workflow_outputs(output_data, "validating")

    return json.dumps(output_json)
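A hypothetical input payload, with keys inferred from the reads above ("file_list", "run_list"); the path is illustrative.

import json

payload = json.dumps({
    "file_list": ["/data-share/raw/arbin/run_000.csv"],
    "run_list": [0],
})
# validation_json = validate_file_list_from_json(payload, record_results=False)
# json.loads(validation_json)["validity"]  -> ["valid"] or ["invalid"] per input file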