Example #1
import os

import boto3
from tqdm import tqdm

# S3_CACHE (local cache directory) and hook (tqdm progress-callback factory)
# are assumed to be defined elsewhere in this module.


def cache_s3_objs(obj_names,
                  bucket_name="kitware",
                  filter_existing_files=True):
    """
    Quick function to download relevant s3 files to cache.

    Args:
        obj_names ([str]): s3 object keys.
        bucket_name (str): bucket name.
        filter_existing_files (bool): whether or not to filter existing files.

    Returns:
        None

    """

    # make cache dir
    if not os.path.isdir(S3_CACHE):
        os.mkdir(S3_CACHE)

    # Initialize s3 resource
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)

    # Skip objects that are already present in the local cache
    if filter_existing_files:
        obj_names = filter(
            lambda x: not os.path.isfile(os.path.join(S3_CACHE, x)), obj_names)
        obj_names = list(obj_names)

    for obj_name in tqdm(obj_names):
        path, filename = os.path.split(obj_name)
        # Make directory if it doesn't exist
        if not os.path.isdir(os.path.join(S3_CACHE, path)):
            os.makedirs(os.path.join(S3_CACHE, path))

        file_object = s3.Object(bucket_name, obj_name)
        file_size = file_object.content_length
        with tqdm(total=file_size, unit_scale=True, desc=filename) as t:
            bucket.download_file(obj_name,
                                 os.path.join(S3_CACHE, obj_name),
                                 Callback=hook(t))
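
# Minimal usage sketch: the object keys below are placeholders; assumes boto3
# credentials are configured and that S3_CACHE / hook are defined as in the
# module above.
keys = [
    "data/2017-05-12_batch1.csv",
    "data/2017-06-30_batch2.csv",
]
cache_s3_objs(keys, bucket_name="kitware", filter_existing_files=True)
# Each object is now available under S3_CACHE, mirroring its key path.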
Example #2
    def validate_arbin_dataframe(self, df, schema=DEFAULT_ARBIN_SCHEMA):
        """
        Validator for large, cyclic dataframes coming from Arbin.
        Requires a valid Cycle_Index column of type int.
        Designed for performance - will stop at the first encounter of issues.

        Args:
            df (pandas.DataFrame): Arbin output as DataFrame.
            schema (str): Path to the validation schema. Defaults to arbin for now.
        Returns:
            bool: True if validated with out errors. If validation fails, errors
                are listed at ValidatorBeep.errors.
        """

        try:
            schema = loadfn(schema)
            self.arbin_schema = schema
        except Exception as e:
            warnings.warn('Arbin schema could not be found: {}'.format(e))

        df = df.rename(str.lower, axis='columns')

        # Validate the cycle index data and cast to int
        if not self._prevalidate_nonnull_column(df, 'cycle_index'):
            return False
        df.cycle_index = df.cycle_index.astype(int, copy=False)

        # Validation starts here
        self.schema = self.arbin_schema

        for cycle_index, cycle_df in tqdm(df.groupby("cycle_index")):
            cycle_dict = cycle_df.replace(
                {np.nan: 'None'}).to_dict(orient='list')
            result = self.validate(cycle_dict)
            if not result:
                return False
        return True
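
# Minimal usage sketch: assumes a ValidatorBeep instance exposing this method
# and an Arbin CSV export containing a Cycle_Index column; the path is a
# placeholder.
import pandas as pd

validator = ValidatorBeep()
df = pd.read_csv("arbin_cycler_output.csv")
if not validator.validate_arbin_dataframe(df):
    print(validator.errors)  # failures are collected on ValidatorBeep.errors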
Example #3
    def validate_maccor_dataframe(self, df, schema=DEFAULT_MACCOR_SCHEMA):
        """
        Validator for large, cyclic dataframes coming from Maccor.
        Requires a valid Cyc# column of type int.
        Designed for performance - will stop at the first encounter of issues.

        Args:
            df (pandas.DataFrame): Maccor output as DataFrame.
            schema (str): Path to the validation schema. Defaults to maccor for now.
        Returns:
            bool: True if validated with out errors. If validation fails, errors
            are listed at ValidatorBeep.errors.
        """

        try:
            schema = loadfn(schema)
            self.maccor_schema = schema
        except Exception as e:
            warnings.warn("Maccor schema could not be found: {}".format(e))

        df = df.rename(str.lower, axis="columns")

        # Validate the cycle index data and cast to int
        if not self._prevalidate_nonnull_column(df, "cyc#"):
            return False
        df["cyc#"] = df["cyc#"].astype(int, copy=False)

        # Validation starts here
        self.schema = self.maccor_schema

        for cycle_index, cycle_df in tqdm(df.groupby("cyc#")):
            cycle_dict = cycle_df.replace(
                {np.nan: "None"}).to_dict(orient="list")
            result = self.validate(cycle_dict)
            if not result:
                return False
        return True
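
# Minimal usage sketch: assumes a ValidatorBeep instance and a Maccor export
# carrying a "Cyc#" column; the path and read options are placeholders.
import pandas as pd

validator = ValidatorBeep()
df = pd.read_csv("maccor_cycler_output.txt", delimiter="\t", skiprows=1)
if not validator.validate_maccor_dataframe(df):
    print(validator.errors)  # failures are collected on ValidatorBeep.errors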
Example #4
def process_files_json():
    """
    Inspects the BEEP_PROCESSING_DIR directory and renames
    files according to the prescribed system of protocol/date/run ID
    associated with the file metadata.  Since this script operates
    only on filesystem assumptions, no input is required.

    Returns:
        (str): json string corresponding to the locations of the renamed files.
    """
    # chdir into beep root
    pwd = os.getcwd()
    os.chdir(os.environ.get("BEEP_PROCESSING_DIR", "/"))

    if not os.path.exists(SRC_DIR):
        os.makedirs(SRC_DIR)
    if not os.path.exists(DEST_DIR):
        os.makedirs(DEST_DIR)

    meta_list = list(
        filter(lambda x: '_Metadata.csv' in x, os.listdir(SRC_DIR)))
    file_list = list(
        filter(lambda x: '.csv' in x and x not in meta_list,
               os.listdir(SRC_DIR)))
    all_list = list(filter(lambda x: '.csv' in x, os.listdir(SRC_DIR)))

    all_list = sorted(all_list)
    dumpfn(all_list, "all_files.json")

    [file_id, mapdf] = init_map(PROJECT_NAME, DEST_DIR)

    new_file_index = file_id

    for filename in tqdm(sorted(file_list)):
        # If the file has already been renamed, don't add another entry
        if mapdf['filename'].str.contains(filename).sum() > 0:
            continue
        old_file = os.path.join(SRC_DIR, filename)
        new_path = os.path.join(DEST_DIR, PROJECT_NAME)
        shutil.copy(old_file, new_path)  # copy main data file
        shutil.copy(old_file.replace(".csv", '_Metadata.csv'),
                    new_path)  # copy meta data file

        if PROJECT_NAME == 'FastCharge':
            [date, channel_no, strname,
             protocol] = get_parameters_fastcharge(filename, SRC_DIR)
        elif PROJECT_NAME == 'ClosedLoopOED':
            [date, channel_no, strname,
             protocol] = get_parameters_oed(filename, SRC_DIR)
        else:
            raise ValueError(
                "Unsupported PROJECT_NAME: {}".format(PROJECT_NAME))

        df_dup = mapdf.set_index(['protocol', 'date'])
        if (protocol, date) in df_dup.index:
            row = mapdf[(mapdf['protocol'] == protocol)
                        & (mapdf['date'] == date)]
            file_id = row['fid'].iloc[0]
            protocol = row['protocol'].iloc[0]
            date = row['date'].iloc[0]
            strname = row['strname'].iloc[0]
        else:
            file_id = new_file_index
            new_file_index = new_file_index + 1

        new_name = "{}_{}_{}".format(PROJECT_NAME, f'{file_id:06}', channel_no)
        new_file = os.path.join(DEST_DIR, PROJECT_NAME,
                                "{}.csv".format(new_name))

        new_row = pd.DataFrame([[
            file_id, protocol, channel_no, date, strname,
            os.path.abspath(old_file),
            os.path.abspath(new_file)
        ]],
                               columns=METADATA_COLUMN_NAMES)
        mapdf = pd.concat([mapdf, new_row])

        os.rename(os.path.join(DEST_DIR, PROJECT_NAME, filename), new_file)
        os.rename(
            os.path.join(DEST_DIR, PROJECT_NAME,
                         filename).replace(".csv", "_Metadata.csv"),
            new_file.replace(".csv", "_Metadata.csv"))

        mapdf.to_csv(os.path.join(DEST_DIR, PROJECT_NAME,
                                  PROJECT_NAME + "map.csv"),
                     index=False)
    mapdf = mapdf.reset_index(drop=True)
    os.chdir(pwd)
    return json.dumps(mapdf.to_dict("list"))
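
# Minimal usage sketch: the processing root below is a placeholder; assumes
# SRC_DIR under BEEP_PROCESSING_DIR holds the raw csv files together with
# their *_Metadata.csv companions, as the function above expects.
import json
import os

os.environ["BEEP_PROCESSING_DIR"] = "/tmp/beep_processing"
file_map = json.loads(process_files_json())
print(list(file_map))  # column names of the file map, e.g. fid, protocol, date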
Example #5
    def validate_from_paths(self,
                            paths,
                            record_results=False,
                            skip_existing=False,
                            record_path=DEFAULT_VALIDATION_RECORDS):
        """
        This method streamlines validation of multiple Arbin csv files given a list of paths.

        It can also do bookkeeping of validations by dumping results in a json file,
        locally until a more centralized method is implemented.

        Args:
            paths (list): a list of paths to csv files
            record_results (bool): Whether to record the validation results locally or not (defaults to False)
            skip_existing (bool): Whether to skip already validated files. This is done by checking if the
                                    file is in the validation_records. skip_existing only matters if record_results
                                    is True. (defaults to False)
            record_path (str): path to the json file storing the past validation results.
        Returns:
            dict: Results of the validation in the form of a key,value pairs where each key corresponds to the filename
                validated. For each file, the results contain a field "validated", True if validation was successful or
                False if not. "errors", "method" and "time" are simply the errors encountered during validation, method
                used for validation, and time of validation, respectively.

        """
        if record_results:
            if os.path.isfile(record_path):
                self.validation_records = loadfn(record_path)
                if skip_existing:
                    paths = [
                        path for path in paths if os.path.basename(path) not in
                        self.validation_records
                    ]
            else:
                self.validation_records = {}

        results = {}
        for path in tqdm(paths):
            name = os.path.basename(path)
            results[name] = {}
            if re.match(ARBIN_CONFIG['file_pattern'], path):
                schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                               "schema-arbin-lfp.yaml")
                self.schema = loadfn(schema_filename)
                df = pd.read_csv(path, index_col=0)
                validated, reason = self.validate(df)
                method = "simple_arbin"
            elif re.match(MACCOR_CONFIG['file_pattern'], path):
                schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                               "schema-maccor-2170.yaml")
                self.schema = loadfn(schema_filename)
                self.allow_unknown = True
                df = pd.read_csv(path, delimiter='\t', skiprows=1)

                # Columns need to be retyped and renamed for validation;
                # conversion will happen during structuring
                df['State'] = df['State'].astype(str)
                df['current'] = df['Amps']

                validated, reason = self.validate(df)
                method = "simple_maccor"
            else:
                validated, reason = False, "File type not recognized"
                method = None
            results[name].update({
                "validated": validated,
                "method": method,
                "errors": reason,
                "time": json.dumps(datetime.now(), indent=4, sort_keys=True,
                                   default=str)
            })

            if validated:
                logger.info("%s method=%s errors=%s",
                            name,
                            method,
                            reason,
                            extra=s)
            else:
                logger.warning("%s method=%s errors=%s",
                               name,
                               method,
                               reason,
                               extra=s)

        if record_results:
            self.validation_records.update(results)
            dumpfn(self.validation_records, record_path)

        return results
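
# Minimal usage sketch: `beep_validator` stands in for whichever validator
# class defines this method, and the paths are placeholders that would need to
# match the Arbin / Maccor file patterns referenced above.
paths = ["/data/raw/arbin_run_CH33.csv", "/data/raw/maccor_run_CH12.csv"]
results = beep_validator.validate_from_paths(paths, record_results=True,
                                             skip_existing=True)
for name, result in results.items():
    print(name, result["validated"], result["method"], result["errors"])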