def cache_s3_objs(obj_names, bucket_name="kitware", filter_existing_files=True):
    """
    Quick function to download relevant s3 files to the local cache.

    Args:
        obj_names ([str]): s3 object keys.
        bucket_name (str): bucket name.
        filter_existing_files (bool): whether or not to filter out files that
            already exist in the cache.

    Returns:
        None
    """
    # Make cache dir
    if not os.path.isdir(S3_CACHE):
        os.mkdir(S3_CACHE)

    # Initialize s3 resource
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)

    # Skip objects that are already present in the local cache
    if filter_existing_files:
        obj_names = filter(
            lambda x: not os.path.isfile(os.path.join(S3_CACHE, x)), obj_names)
        obj_names = list(obj_names)

    for obj_name in tqdm(obj_names):
        path, filename = os.path.split(obj_name)

        # Make directory if it doesn't exist
        if not os.path.isdir(os.path.join(S3_CACHE, path)):
            os.makedirs(os.path.join(S3_CACHE, path))

        # Download with a per-file progress bar sized to the object's content length
        file_object = s3.Object(bucket_name, obj_name)
        file_size = file_object.content_length
        with tqdm(total=file_size, unit_scale=True, desc=filename) as t:
            bucket.download_file(obj_name, os.path.join(S3_CACHE, obj_name),
                                 Callback=hook(t))
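
# Example usage (a minimal sketch; the object key below is hypothetical and
# this assumes AWS credentials with read access to the bucket are configured):
#
#     cache_s3_objs(
#         ["d3Batt/raw/arbin/example_cell_01.csv"],  # hypothetical key
#         bucket_name="kitware",
#         filter_existing_files=True,
#     )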
def validate_arbin_dataframe(self, df, schema=DEFAULT_ARBIN_SCHEMA):
    """
    Validator for large, cyclic dataframes coming from Arbin.
    Requires a valid Cycle_Index column of type int.
    Designed for performance - will stop at the first encounter of issues.

    Args:
        df (pandas.DataFrame): Arbin output as DataFrame.
        schema (str): Path to the validation schema. Defaults to arbin for now.

    Returns:
        bool: True if validated without errors. If validation fails,
            errors are listed at ValidatorBeep.errors.
    """
    try:
        schema = loadfn(schema)
        self.arbin_schema = schema
    except Exception as e:
        warnings.warn('Arbin schema could not be found: {}'.format(e))

    df = df.rename(str.lower, axis='columns')

    # Validate cycle index data and cast to int
    if not self._prevalidate_nonnull_column(df, 'cycle_index'):
        return False
    df.cycle_index = df.cycle_index.astype(int, copy=False)

    # Validation starts here; check each cycle's data against the schema,
    # replacing NaN with 'None' so the dict can be validated
    self.schema = self.arbin_schema
    for cycle_index, cycle_df in tqdm(df.groupby("cycle_index")):
        cycle_dict = cycle_df.replace({np.nan: 'None'}).to_dict(orient='list')
        result = self.validate(cycle_dict)
        if not result:
            return False
    return True
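
# Example usage (a minimal sketch; the csv path is hypothetical and the read
# call mirrors the Arbin handling in validate_from_paths below):
#
#     v = ValidatorBeep()
#     df = pd.read_csv("example_arbin_output.csv", index_col=0)  # hypothetical file
#     if not v.validate_arbin_dataframe(df):
#         print(v.errors)  # errors accumulated by the validator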
def validate_maccor_dataframe(self, df, schema=DEFAULT_MACCOR_SCHEMA):
    """
    Validator for large, cyclic dataframes coming from Maccor.
    Requires a valid Cyc# column of type int.
    Designed for performance - will stop at the first encounter of issues.

    Args:
        df (pandas.DataFrame): Maccor output as DataFrame.
        schema (str): Path to the validation schema. Defaults to maccor for now.

    Returns:
        bool: True if validated without errors. If validation fails,
            errors are listed at ValidatorBeep.errors.
    """
    try:
        schema = loadfn(schema)
        self.maccor_schema = schema
    except Exception as e:
        warnings.warn("Maccor schema could not be found: {}".format(e))

    df = df.rename(str.lower, axis="columns")

    # Validate cycle index data and cast to int
    if not self._prevalidate_nonnull_column(df, "cyc#"):
        return False
    df["cyc#"] = df["cyc#"].astype(int, copy=False)

    # Validation starts here; check each cycle's data against the schema,
    # replacing NaN with "None" so the dict can be validated
    self.schema = self.maccor_schema
    for cycle_index, cycle_df in tqdm(df.groupby("cyc#")):
        cycle_dict = cycle_df.replace({np.nan: "None"}).to_dict(orient="list")
        result = self.validate(cycle_dict)
        if not result:
            return False
    return True
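
# Example usage (a minimal sketch mirroring the Arbin example above; the file
# path is hypothetical and the tab-delimited read with one skipped header row
# follows the Maccor handling in validate_from_paths below):
#
#     v = ValidatorBeep()
#     df = pd.read_csv("example_maccor_output.csv", delimiter="\t", skiprows=1)  # hypothetical file
#     if not v.validate_maccor_dataframe(df):
#         print(v.errors)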
def process_files_json():
    """
    Inspects the BEEP_PROCESSING_DIR directory and renames files according to
    the prescribed system of protocol/date/run ID associated with the file
    metadata. Since this script operates only on filesystem assumptions,
    no input is required.

    Returns:
        (str): json string corresponding to the locations of the renamed files.
    """
    # chdir into beep root
    pwd = os.getcwd()
    os.chdir(os.environ.get("BEEP_PROCESSING_DIR", "/"))

    if not os.path.exists(SRC_DIR):
        os.makedirs(SRC_DIR)
    if not os.path.exists(DEST_DIR):
        os.makedirs(DEST_DIR)

    # Separate metadata files from the main data files
    meta_list = list(
        filter(lambda x: '_Metadata.csv' in x, os.listdir(SRC_DIR)))
    file_list = list(
        filter(lambda x: '.csv' in x and x not in meta_list,
               os.listdir(SRC_DIR)))
    all_list = list(filter(lambda x: '.csv' in x, os.listdir(SRC_DIR)))

    all_list = sorted(all_list)
    dumpfn(all_list, "all_files.json")

    [file_id, mapdf] = init_map(PROJECT_NAME, DEST_DIR)

    new_file_index = file_id

    for filename in tqdm(sorted(file_list)):
        # If the file has already been renamed, another entry should not be made
        if mapdf['filename'].str.contains(filename).sum() > 0:
            continue
        old_file = os.path.join(SRC_DIR, filename)
        new_path = os.path.join(DEST_DIR, PROJECT_NAME)
        shutil.copy(old_file, new_path)  # copy main data file
        shutil.copy(old_file.replace(".csv", '_Metadata.csv'),
                    new_path)  # copy metadata file

        if PROJECT_NAME == 'FastCharge':
            [date, channel_no, strname, protocol] = \
                get_parameters_fastcharge(filename, SRC_DIR)
        elif PROJECT_NAME == 'ClosedLoopOED':
            [date, channel_no, strname, protocol] = \
                get_parameters_oed(filename, SRC_DIR)
        else:
            raise ValueError(
                "Unsupported PROJECT_NAME: {}".format(PROJECT_NAME))

        # Reuse the file id for files sharing a protocol and date,
        # otherwise assign the next available id
        df_dup = mapdf.set_index(['protocol', 'date'])
        if (protocol, date) in df_dup.index:
            row = mapdf[(mapdf['protocol'] == protocol) &
                        (mapdf['date'] == date)]
            file_id = row['fid'].iloc[0]
            protocol = row['protocol'].iloc[0]
            date = row['date'].iloc[0]
            strname = row['strname'].iloc[0]
        else:
            file_id = new_file_index
            new_file_index = new_file_index + 1

        new_name = "{}_{}_{}".format(PROJECT_NAME, f'{file_id:06}', channel_no)
        new_file = os.path.join(DEST_DIR, PROJECT_NAME,
                                "{}.csv".format(new_name))

        new_row = pd.DataFrame(
            [[file_id, protocol, channel_no, date, strname,
              os.path.abspath(old_file), os.path.abspath(new_file)]],
            columns=METADATA_COLUMN_NAMES)
        mapdf = mapdf.append(new_row)

        # Rename the copied data and metadata files to the new scheme
        os.rename(os.path.join(DEST_DIR, PROJECT_NAME, filename), new_file)
        os.rename(
            os.path.join(DEST_DIR, PROJECT_NAME,
                         filename).replace(".csv", "_Metadata.csv"),
            new_file.replace(".csv", "_Metadata.csv"))

    mapdf.to_csv(os.path.join(DEST_DIR, PROJECT_NAME,
                              PROJECT_NAME + "map.csv"), index=False)
    mapdf = mapdf.reset_index(drop=True)
    os.chdir(pwd)
    return json.dumps(mapdf.to_dict("list"))
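
# Example usage (a minimal sketch; the processing directory is hypothetical
# and is assumed to contain SRC_DIR with paired data and _Metadata.csv files):
#
#     os.environ["BEEP_PROCESSING_DIR"] = "/tmp/beep"  # hypothetical location
#     mapping = json.loads(process_files_json())
#     print(mapping.keys())  # column names from METADATA_COLUMN_NAMES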
def validate_from_paths(self, paths, record_results=False, skip_existing=False,
                        record_path=DEFAULT_VALIDATION_RECORDS):
    """
    This method streamlines validation of multiple Arbin csv files given a
    list of paths. It can also do bookkeeping of validations by dumping
    results in a json file, locally until a more centralized method is
    implemented.

    Args:
        paths (list): a list of paths to csv files.
        record_results (bool): Whether to record the validation results
            locally or not (defaults to False).
        skip_existing (bool): Whether to skip already validated files. This
            is done by checking if the file is in the validation_records.
            skip_existing only matters if record_results is True.
            (defaults to False)
        record_path (str): path to the json file storing the past validation
            results.

    Returns:
        dict: Results of the validation in the form of key/value pairs where
            each key corresponds to the filename validated. For each file,
            the results contain a field "validated", True if validation was
            successful or False if not. "errors", "method" and "time" are
            the errors encountered during validation, the method used for
            validation, and the time of validation, respectively.
    """
    if record_results:
        if os.path.isfile(record_path):
            self.validation_records = loadfn(record_path)
            if skip_existing:
                paths = [
                    path for path in paths
                    if os.path.basename(path) not in self.validation_records
                ]
        else:
            self.validation_records = {}

    results = {}
    for path in tqdm(paths):
        name = os.path.basename(path)
        results[name] = {}
        if re.match(ARBIN_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-arbin-lfp.yaml")
            self.schema = loadfn(schema_filename)
            df = pd.read_csv(path, index_col=0)
            validated, reason = self.validate(df)
            method = "simple_arbin"
        elif re.match(MACCOR_CONFIG['file_pattern'], path):
            schema_filename = os.path.join(VALIDATION_SCHEMA_DIR,
                                           "schema-maccor-2170.yaml")
            self.schema = loadfn(schema_filename)
            self.allow_unknown = True
            df = pd.read_csv(path, delimiter='\t', skiprows=1)

            # Columns need to be retyped and renamed for validation;
            # conversion will happen during structuring
            df['State'] = df['State'].astype(str)
            df['current'] = df['Amps']

            validated, reason = self.validate(df)
            method = "simple_maccor"
        else:
            validated, reason = False, "File type not recognized"
            method = None

        results[name].update({
            "validated": validated,
            "method": method,
            "errors": reason,
            "time": json.dumps(datetime.now(), indent=4, sort_keys=True,
                               default=str)
        })

        if validated:
            logger.info("%s method=%s errors=%s", name, method, reason,
                        extra=s)
        else:
            logger.warning("%s method=%s errors=%s", name, method, reason,
                           extra=s)

    if record_results:
        self.validation_records.update(results)
        dumpfn(self.validation_records, record_path)

    return results
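
# Example usage (a minimal sketch; the path and record file are hypothetical,
# and it is an assumption that validate_from_paths lives on the same validator
# class as the methods above):
#
#     v = ValidatorBeep()
#     results = v.validate_from_paths(
#         ["raw/arbin/example_cell_01.csv"],  # hypothetical; must match ARBIN_CONFIG['file_pattern']
#         record_results=True,
#         skip_existing=True,
#         record_path="validation_records.json",  # hypothetical local record file
#     )
#     print(results["example_cell_01.csv"]["validated"])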