def group(calibrated_path, location_path, out_path):
    """
    Write calibrated data and location files into the output path.

    :param calibrated_path: The input path for calibrated files.
    :type calibrated_path: str
    :param location_path: The input path for location files.
    :type location_path: str
    :param out_path: The output path for writing grouped files.
    :type out_path: str
    :return:
    """
    i = 0
    for file_path in file_crawler.crawl(calibrated_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        source_id = parts[7]
        data_type = parts[8]
        log.debug(f'year: {year} month: {month} day: {day}')
        log.debug(f'source type: {source_type} source_id: {source_id} data type: {data_type}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        if i == 0:  # Only link location once.
            link_location(location_path, target_root)
        # Grab all directories and files under the common path (after the data type).
        target = os.path.join(target_root, data_type, *parts[9:])
        log.debug(f'target: {target}')
        file_linker.link(file_path, target)
        i += 1
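# A minimal sketch of the part extraction above, assuming calibrated files sit
# under a fixed-depth prefix (two directories below the filesystem root)
# followed by source type, year, month, day, source ID, and data type. The
# '/tmp/in' prefix and the filename are hypothetical.
import pathlib

example = pathlib.Path('/tmp/in/prt/2019/01/05/7767/data/prt_7767_2019-01-05.ext')
parts = example.parts
print(parts[3], parts[4], parts[5], parts[6], parts[7], parts[8])
# prt 2019 01 05 7767 data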
def link(paths, group_names, out_path):
    """
    Loop through the files and link into the output directory, including the
    location context group name in the path.

    :param paths: File paths to link.
    :type paths: list
    :param group_names: A list of associated location context group names.
    :type group_names: list
    :param out_path: The output directory for writing.
    :type out_path: str
    :return:
    """
    for path in paths:
        # parse the paths
        file_path = path.get('file_path')
        parts = path.get('path_parts')
        source_type = parts.get('source_type')
        year = parts.get('year')
        month = parts.get('month')
        day = parts.get('day')
        location = parts.get('location')
        data_type = parts.get('data_type')
        remainder = parts.get('remainder')
        # build the output path
        log.debug(f't: {source_type} Y: {year} M: {month} D: {day} '
                  f'loc: {location} type: {data_type} remainder: {remainder}')
        for group_name in group_names:
            target_dir = os.path.join(out_path, source_type, year, month, day,
                                      group_name, location, data_type)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            destination = os.path.join(target_dir, *remainder)
            # link the file
            log.debug(f'source: {file_path} destination: {destination}')
            file_linker.link(file_path, destination)
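# A hypothetical element of `paths` showing the shape the loop above expects:
# the original file path plus its pre-parsed parts. All values are illustrative.
example_path = {
    'file_path': '/tmp/in/prt/2019/01/05/CFGLOC101/data/prt_CFGLOC101_2019-01-05.ext',
    'path_parts': {
        'source_type': 'prt',
        'year': '2019',
        'month': '01',
        'day': '05',
        'location': 'CFGLOC101',
        'data_type': 'data',
        'remainder': ['prt_CFGLOC101_2019-01-05.ext'],
    },
}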
def group(data_path, location_path, out_path):
    """
    Write event data and location files into the output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location file.
    :type location_path: str
    :param out_path: The path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        log.debug(f'trimmed_path: {trimmed_path}')
        parts = trimmed_path.parts
        source_type = parts[0]
        year = parts[1]
        month = parts[2]
        day = parts[3]
        source_id = parts[4]
        filename = parts[5]
        log.debug(f'filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
def group_related(path, out_path):
    """
    Link related data and location files into the output directory.

    :param path: Directory or file path.
    :type path: str
    :param out_path: The output path for related data.
    :type out_path: str
    """
    for file_path in file_crawler.crawl(path):
        trimmed_path = target_path.trim_path(file_path)
        parts = pathlib.Path(trimmed_path).parts
        source_type = parts[0]
        year = parts[1]
        month = parts[2]
        day = parts[3]
        group = parts[4]
        location = parts[5]
        data_type = parts[6]
        remainder = parts[7:]
        base_output_path = os.path.join(out_path, year, month, day, group)
        target = os.path.join(base_output_path, source_type, location, data_type, *remainder)
        log.debug(f'File target: {target}')
        file_linker.link(file_path, target)
def link(paths, group_names, out_path):
    """
    Link the paths into the output directory.

    :param paths: The file paths.
    :type paths: list
    :param group_names: The context group names for the location.
    :type group_names: list
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    for path in paths:
        file_path = path.get('file_path')
        parts = path.get('path_parts')
        source_type = parts.get('source_type')
        source_id = parts.get('source_id')
        data_type = parts.get('data_type')
        filename = parts.get('filename')
        # Build the output path
        for group_name in group_names:
            log.debug(f'source_type: {source_type} id: {source_id} data_type: {data_type} file: {filename}')
            target_dir = os.path.join(out_path, source_type, group_name, source_id, data_type)
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            destination = os.path.join(target_dir, filename)
            # Link the file
            log.debug(f'source: {file_path} destination: {destination}')
            file_linker.link(file_path, destination)
def get_data_files(data_path, out_path, start_date=None, end_date=None):
    """
    Get the data file path keys between start and end dates.

    :param data_path: The path to the data file directory.
    :type data_path: str
    :param out_path: The path to write results.
    :type out_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return: A list of data file path keys.
    """
    keys = []
    for file_path in file_crawler.crawl(data_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        location_name = parts[7]
        data_type = parts[8]
        filename = parts[9]
        if not check_date(year, month, day, start_date, end_date):
            continue
        target_root = os.path.join(out_path, source_type, year, month, day, location_name)
        target_path = os.path.join(target_root, data_type, filename)
        file_linker.link(file_path, target_path)
        key = '/' + source_type + '/' + year + '/' + month + '/' + day + '/' + location_name
        keys.append(key)
    return keys
def group(data_path, location_path, out_path):
    """
    Write data and location files into the output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location files.
    :type location_path: str
    :param out_path: The output path to write grouped files.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        filename = parts[7]
        log.debug(f'data filename: {filename}')
        name = DataFilename(filename)
        source_id = name.source_id()
        log.debug(f'source type: {source_type} source_id: {source_id}')
        log.debug(f'year: {year} month: {month} day: {day}')
        log.debug(f'filename: {filename}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
def link_path(target_dir, empty_file_path, location_name, year, month, day):
    """
    Link the empty file path into the target path.

    :param target_dir: The target directory for writing files.
    :type target_dir: str
    :param empty_file_path: The source empty file path.
    :type empty_file_path: str
    :param location_name: The location name.
    :type location_name: str
    :param year: The file year.
    :type year: str
    :param month: The file month.
    :type month: str
    :param day: The file day.
    :type day: str
    :return:
    """
    file_name = pathlib.Path(empty_file_path).name
    file_name = file_name.replace('location', location_name)
    file_name = file_name.replace('year', year)
    file_name = file_name.replace('month', month)
    file_name = file_name.replace('day', day)
    target_path = os.path.join(target_dir, file_name)
    print(f'target_path: {target_path}')
    file_linker.link(empty_file_path, target_path)
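# The replacements above assume the empty file's name embeds the literal
# tokens 'location', 'year', 'month', and 'day'. A quick sketch with a
# hypothetical empty-file name:
empty_name = 'prt_location_year-month-day.ext'
empty_name = empty_name.replace('location', 'CFGLOC101')
empty_name = empty_name.replace('year', '2019')
empty_name = empty_name.replace('month', '01')
empty_name = empty_name.replace('day', '05')
print(empty_name)  # prt_CFGLOC101_2019-01-05.ext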
def group_data(data_path, out_path):
    """
    Write data and event files into the output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    target_root = None
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        parts = trimmed_path.parts
        year = parts[0]
        month = parts[1]
        day = parts[2]
        group_name = parts[3]
        source_type = parts[4]
        location = parts[5]
        data_type = parts[6]
        filename = parts[7]
        target_root = os.path.join(out_path, year, month, day, group_name)
        data_target_path = os.path.join(target_root, source_type, location, data_type, filename)
        file_linker.link(file_path, data_target_path)
    return target_root
def group(regularized_dir, quality_dir, out_dir):
    """
    Group matching regularized and quality files in the output directory.

    :param regularized_dir: The path containing regularized files.
    :type regularized_dir: str
    :param quality_dir: The path containing quality files.
    :type quality_dir: str
    :param out_dir: The path for writing results.
    :type out_dir: str
    :return:
    """
    regularized_files = load_files(regularized_dir, out_dir)
    quality_files = load_files(quality_dir, out_dir)
    regularized_keys = set(regularized_files.keys())
    quality_keys = set(quality_files.keys())
    log.debug(f'regularized_keys: {regularized_keys}')
    log.debug(f'quality_keys: {quality_keys}')
    common = regularized_keys.intersection(quality_keys)
    log.debug(f'common: {common}')
    for key in common:
        regularized_paths = regularized_files.get(key)
        quality_paths = quality_files.get(key)
        file_linker.link(regularized_paths.get('source'), regularized_paths.get('destination'))
        file_linker.link(quality_paths.get('source'), quality_paths.get('destination'))
def test_link(self):
    source = '/test/input/file.foo'
    target = '/test/output/file.foo'
    self.fs.create_file(source)
    self.assertTrue(os.path.isfile(source))
    self.assertFalse(os.path.isfile(target))
    file_linker.link(source, target)
    self.assertTrue(os.path.isfile(target))
def group(path, out_path):
    """
    Link files into the output directory.

    :param path: File or directory paths.
    :type path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    for file_path in file_crawler.crawl(path):
        target = target_path.get_path(file_path, out_path)
        log.debug(f'target: {target}')
        file_linker.link(file_path, target)
def pad(self):
    """
    Pad the data for the window size.

    :return:
    """
    try:
        manifests = {}
        manifest_file_names = {}
        for file_path in file_crawler.crawl(self.data_path):
            parts = pathlib.Path(file_path).parts
            year = parts[self.year_index]
            month = parts[self.month_index]
            day = parts[self.day_index]
            location = parts[self.location_index]
            sub_dir = parts[self.sub_dir_index]
            if sub_dir in self.sub_dirs_to_process:
                location_path = os.path.join(*parts[0:self.location_index + 1])
                if location not in manifests:
                    manifests[location] = []
                # get data date
                date = datetime.date(int(year), int(month), int(day))
                # get dates in padded range
                dates_in_padded_range = padder_util.get_dates_in_padded_range(date, self.window_size)
                # link file into each date in padded range
                destination_parts = list(parts)
                for index in range(1, len(self.out_dir_parts)):
                    destination_parts[index] = self.out_dir_parts[index]
                for date_in_padded_range in dates_in_padded_range:
                    destination_parts[self.year_index] = str(date_in_padded_range.year)
                    destination_parts[self.month_index] = str(date_in_padded_range.month).zfill(2)
                    destination_parts[self.day_index] = str(date_in_padded_range.day).zfill(2)
                    # generate destination path
                    destination_path = os.path.join(*destination_parts)
                    log.debug(f'source: {file_path}')
                    log.debug(f'destination: {destination_path}')
                    file_linker.link(file_path, destination_path)
                    manifests[location].append(date_in_padded_range)
                    if date_in_padded_range == date:
                        # construct manifest filename
                        manifest_path = os.path.dirname(destination_path)  # remove data filename
                        manifest_file_names[location] = os.path.join(manifest_path, 'manifest.txt')
                        output_writer.write_thresholds(location_path, destination_path)
            else:
                destination_path = os.path.join(self.out_path, *parts[3:])
                file_linker.link(file_path, destination_path)
        output_writer.write_manifests(manifests, manifest_file_names)  # write manifest files
    except Exception:
        exception_type, exception_obj, exception_tb = sys.exc_info()
        log.error("Exception at line " + str(exception_tb.tb_lineno) + ": " + str(sys.exc_info()))
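# padder_util.get_dates_in_padded_range is not shown here; the loop above
# assumes it returns every calendar date within the window around the data
# date. A minimal sketch of that assumed behavior:
import datetime


def example_dates_in_padded_range(date, window_size):
    # Assumed behavior: every date within window_size days of the data date.
    return [date + datetime.timedelta(days=offset)
            for offset in range(-window_size, window_size + 1)]


print(example_dates_in_padded_range(datetime.date(2019, 1, 5), 1))
# [datetime.date(2019, 1, 4), datetime.date(2019, 1, 5), datetime.date(2019, 1, 6)]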
def link_location(location_path, target_root):
    """
    Link the location file into the target root.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory to write the location file.
    :type target_root: str
    :return:
    """
    for file in file_crawler.crawl(location_path):
        location_filename = pathlib.Path(file).name
        target = os.path.join(target_root, 'location', location_filename)
        file_linker.link(file, target)
def link_location(location_path, target_root):
    """
    Link the location file into the target directory.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory path.
    :type target_root: str
    :return:
    """
    for file in file_crawler.crawl(location_path):
        location_filename = pathlib.Path(file).name
        location_target_path = os.path.join(target_root, 'location', location_filename)
        log.debug(f'location_target_path: {location_target_path}')
        file_linker.link(file, location_target_path)
def link_source(file_paths_by_type, out_path):
    """
    Get file paths by data type and link into the output directory.

    :param file_paths_by_type: File paths by data type.
    :type file_paths_by_type: dict
    :param out_path: The output path.
    :type out_path: str
    """
    for path_by_type in file_paths_by_type:
        for data_type in path_by_type:
            file_path = path_by_type.get(data_type)
            parts = pathlib.Path(file_path).parts
            destination = os.path.join(out_path, *parts[3:])
            log.debug(f'source: {file_path} destination: {destination}')
            file_linker.link(file_path, destination)
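# The loop above iterates elements of file_paths_by_type and looks each data
# type up on the element, so the argument behaves like a collection of
# single-entry mappings keyed by data type. A hypothetical input (paths are
# illustrative only):
example_file_paths_by_type = [
    {'data': '/tmp/in/prt/2019/01/05/CFGLOC101/data/prt_CFGLOC101_2019-01-05.ext'},
    {'flags': '/tmp/in/prt/2019/01/05/CFGLOC101/flags/prt_CFGLOC101_2019-01-05_flags.ext'},
]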
def join(pathname, out_path):
    """
    Join paths according to the given pathname and link all matching files
    into the output directory.

    :param pathname: The path pattern to match.
    :type pathname: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    files = [fn for fn in glob.glob(pathname, recursive=True)
             if not os.path.basename(fn).startswith(out_path)
             if os.path.isfile(fn)]
    for file in files:
        log.debug(f'found matching file: {file}')
        target = target_path.get_path(file, out_path)
        log.debug(f'target: {target}')
        file_linker.link(file, target)
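# A hypothetical invocation: link every .ext file found under /tmp/in into
# /tmp/out, matching recursively with a ** glob pattern. The pattern and
# paths are illustrative only.
#
#   join('/tmp/in/**/*.ext', '/tmp/out')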
def group(paths, out_path):
    """
    Link all files into the output directory.

    :param paths: Comma separated list of environment variable names whose values are full directory paths.
    :type paths: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    # Split the comma-separated variable names; a single name yields a one-element list.
    paths = paths.split(',')
    log.debug(f'paths: {paths}')
    for p in paths:
        log.debug(f'path: {p}')
        path = os.environ[p]
        for file_path in file_crawler.crawl(path):
            target = target_path.get_path(file_path, out_path)
            log.debug(f'target: {target}')
            file_linker.link(file_path, target)
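# A hypothetical invocation, assuming the environment variables below each
# name a full input directory (variable names and paths are illustrative):
#
#   os.environ['CALIBRATED_PATH'] = '/tmp/in/calibrated'
#   os.environ['LOCATION_PATH'] = '/tmp/in/location'
#   group('CALIBRATED_PATH,LOCATION_PATH', '/tmp/out')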
def filter_directory(in_path, filter_dirs, out_path):
    """
    Link the target directory into the output directory.

    :param in_path: The input path.
    :type in_path: str
    :param filter_dirs: The directories to filter.
    :type filter_dirs: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    parsed_dirs = parse_dirs(filter_dirs)
    for root, dirs, files in os.walk(in_path):
        for name in dirs:
            if not name.startswith('.') and name in parsed_dirs:
                source = os.path.join(root, name)
                destination = target_path.get_path(source, out_path)
                file_linker.link(source, destination)
def write_ancillary_data(out_dir, root):
    """
    Write any additional files present in the input directory beyond data and
    thresholds into the output directory.

    :param out_dir: The output directory for writing results.
    :type out_dir: str
    :param root: The threshold root directory.
    :type root: str
    :return:
    """
    parent_dir = pathlib.Path(root).parent
    for file_path in file_crawler.crawl(parent_dir):
        file_path = str(file_path)
        if 'data' not in file_path and 'threshold' not in file_path:
            parts = pathlib.Path(file_path).parts
            trimmed_path = os.path.join(*parts[3:])
            output_path = os.path.join(out_dir, trimmed_path)
            file_linker.link(file_path, output_path)
def write_thresholds(source_path, destination_path):
    """
    Write the threshold file.

    :param source_path: The threshold file path.
    :type source_path: str
    :param destination_path: The path to write the file.
    :type destination_path: str
    :return:
    """
    threshold_dir = 'threshold'
    threshold_filename = 'thresholds.json'
    threshold_file = os.path.join(source_path, threshold_dir, threshold_filename)
    if pathlib.Path(threshold_file).exists():
        path = pathlib.Path(destination_path).parent.parent
        threshold_out = os.path.join(path, threshold_dir, threshold_filename)
        log.debug(f'Threshold file: {threshold_file}')
        log.debug(f'Threshold out: {threshold_out}')
        file_linker.link(threshold_file, threshold_out)
def write_thresholds(source_path, destination_path):
    """
    Write thresholds if they exist in the source repository.

    :param source_path: The source path for the threshold file.
    :type source_path: str
    :param destination_path: The destination path to write results.
    :type destination_path: str
    :return:
    """
    threshold_dir = 'threshold'
    threshold_filename = 'thresholds.json'
    source_dir = pathlib.Path(source_path).parent.parent
    destination_dir = pathlib.Path(destination_path).parent.parent
    source = os.path.join(source_dir, threshold_dir, threshold_filename)
    if os.path.exists(source):
        destination = os.path.join(destination_dir, threshold_dir, threshold_filename)
        log.debug(f'linking {source} to {destination}')
        file_linker.link(source, destination)
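# Both write_thresholds variants place the output threshold file relative to
# parent.parent of the destination path, which assumes the destination data
# file sits two levels below the sensor root (for example <root>/data/<file>).
# A sketch with a hypothetical destination path:
import os
import pathlib

example_destination = '/tmp/out/prt/2019/01/05/CFGLOC101/data/prt_CFGLOC101_2019-01-05.ext'
example_root = pathlib.Path(example_destination).parent.parent  # .../CFGLOC101
print(os.path.join(example_root, 'threshold', 'thresholds.json'))
# /tmp/out/prt/2019/01/05/CFGLOC101/threshold/thresholds.json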
def upload(self):
    """
    Link the source files into the output directory.

    :return:
    """
    try:
        for root, dirs, files in os.walk(self.dataPath):
            for filename in files:
                if not filename.startswith('.'):
                    sourcePath = os.path.join(root, filename)
                    parts = pathlib.Path(sourcePath).parts
                    # date
                    filenameParts = filename.split(self.filenameDelimiter)
                    dateTime = filenameParts[self.dateIndex]
                    # loc
                    loc = filenameParts[self.locIndex]
                    # construct target filename
                    targetParts = [
                        self.outPath,
                        self.outputName,
                        dateTime,
                        loc,
                        filenameParts[len(filenameParts) - 2],
                        filenameParts[len(filenameParts) - 1]
                    ]
                    targetFilename = self.filenameDelimiter.join(targetParts[1:])
                    targetPath = os.path.join(*targetParts[:len(targetParts) - 2], targetFilename)
                    # symlink to target
                    print("sourcepath = " + sourcePath)
                    print("targetpath = " + targetPath)
                    file_linker.link(sourcePath, targetPath)
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        log.error("Exception at line " + str(exc_tb.tb_lineno) + ": " + str(sys.exc_info()))
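# The upload step pulls the date and location out of the delimited filename at
# configured indices. The sketch below assumes an underscore delimiter with
# the date at index 1 and the location at index 2; the delimiter, indices,
# and filename are all hypothetical instance configuration.
example_filename = 'surfacewater_2019-01-05_CFGLOC101_depth_data.ext'
example_parts = example_filename.split('_')
print(example_parts[1], example_parts[2])  # 2019-01-05 CFGLOC101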
def group_events(event_path, target_root):
    """
    Group the event files into the target directory.

    :param event_path: The path to the event files.
    :type event_path: str
    :param target_root: The root output path.
    :type target_root: str
    :return:
    """
    reference_group = pathlib.Path(target_root).name
    for file_path in file_crawler.crawl(event_path):
        trimmed_path = target_path.trim_path(file_path)
        parts = pathlib.Path(trimmed_path).parts
        source_type = parts[0]
        group_name = parts[1]
        source_id = parts[2]
        data_type = parts[3]
        filename = parts[4]
        event_target = os.path.join(target_root, source_type, source_id, data_type, filename)
        log.debug(f'event_target: {event_target}')
        if group_name == reference_group:
            file_linker.link(file_path, event_target)
def process(data_path, out_path):
    """
    Load events from the asset data path.

    :param data_path: The data path.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        parts = trimmed_path.parts
        source_type = parts[0]
        source_id = parts[1]
        filename = parts[2]
        log.debug(f'source filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        output_filename = source_type + '_' + source_id + '_events.json'
        output_path = os.path.join(out_path, source_type, source_id, output_filename)
        log.debug(f'output_path: {output_path}')
        if not os.path.exists(output_path):
            file_linker.link(file_path, output_path)
def pad(self):
    """
    Pad the data to the calculated window size.

    :return:
    """
    try:
        max_window_size_by_date_and_location = {}
        min_data_rate_by_date_and_location = {}
        manifests = {}
        manifest_file_names = {}
        for root, dirs, files in os.walk(self.data_path):
            for filename in files:
                if not filename.startswith('.'):
                    file_path = os.path.join(root, filename)
                    parts = pathlib.Path(file_path).parts
                    subdir = parts[self.sub_dir_index]
                    if subdir in self.sub_dirs_to_output:
                        year = parts[self.year_index]
                        month = parts[self.month_index]
                        day = parts[self.day_index]
                        config_location = parts[self.config_location_index]
                        date_location_key = year + month + day + config_location
                        config_location_path = os.path.join(*parts[0:self.config_location_index + 1])
                        if config_location not in manifests:
                            manifests[config_location] = []
                        # get min of all data rates (to ensure adequate window coverage)
                        if date_location_key not in min_data_rate_by_date_and_location:
                            location_path = os.path.join(config_location_path, 'location')
                            location_files = [f for f in os.listdir(location_path) if f.endswith('.json')]
                            location_file = os.path.join(location_path, location_files[0])
                            min_data_rate_by_date_and_location[date_location_key] = \
                                padder_util.get_min_data_rate(location_file)
                        data_rate = min_data_rate_by_date_and_location[date_location_key]
                        # get max of all window sizes
                        if date_location_key not in max_window_size_by_date_and_location:
                            threshold_path = os.path.join(config_location_path, 'threshold')
                            threshold_files = [f for f in os.listdir(threshold_path) if f.endswith('.json')]
                            threshold_file = os.path.join(threshold_path, threshold_files[0])
                            log.debug(f'thresholdFile: {threshold_file}')
                            max_window_size_by_date_and_location[date_location_key] = \
                                padder_util.get_max_window_size(threshold_file, data_rate)
                        window_size = max_window_size_by_date_and_location[date_location_key]
                        # get data date
                        date = datetime.date(int(year), int(month), int(day))
                        # calculate pad size
                        pad_size = padder_util.calculate_pad_size(window_size)
                        # get dates in padded range
                        dates_in_padded_range = padder_util.get_dates_in_padded_range(date, pad_size)
                        # link file into each date in padded range
                        destination_parts = list(parts)
                        for idx in range(1, len(self.out_dir_parts)):
                            destination_parts[idx] = self.out_dir_parts[idx]
                        for date_in_padded_range in dates_in_padded_range:
                            destination_parts[self.year_index] = str(date_in_padded_range.year)
                            destination_parts[self.month_index] = str(date_in_padded_range.month).zfill(2)
                            destination_parts[self.day_index] = str(date_in_padded_range.day).zfill(2)
                            # generate destination path
                            destination_path = os.path.join(*destination_parts)
                            log.debug(f'source: {file_path}')
                            log.debug(f'destination: {destination_path}')
                            file_linker.link(file_path, destination_path)
                            manifests[config_location].append(date_in_padded_range)
                            if date_in_padded_range == date:
                                # construct manifest filename
                                manifest_path = os.path.dirname(destination_path)  # remove data file name
                                manifest_file_names[config_location] = os.path.join(manifest_path, 'manifest.txt')
                                output_writer.write_thresholds(config_location_path, destination_path)
        output_writer.write_manifests(manifests, manifest_file_names)  # write manifest files
    except Exception:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        log.error("Exception at line " + str(exc_tb.tb_lineno) + ": " + str(sys.exc_info()))
def process_location_files(location_path, keys, out_path, output_directories, empty_data_path,
                           empty_flags_path, empty_uncertainty_data_path, start_date=None, end_date=None):
    """
    Process the location files.

    :param location_path: The path to the location file.
    :type location_path: str
    :param keys: The path keys to the data files.
    :type keys: list
    :param out_path: The path to write results.
    :type out_path: str
    :param output_directories: The output directories to write.
    :type output_directories: list
    :param empty_data_path: Path to the empty data files.
    :type empty_data_path: str
    :param empty_flags_path: Path to the empty flag files.
    :type empty_flags_path: str
    :param empty_uncertainty_data_path: Path to the empty uncertainty data file.
    :type empty_uncertainty_data_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return:
    """
    for file_path in file_crawler.crawl(location_path):
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        named_location_name = parts[7]
        filename = parts[8]
        if not check_date(year, month, day, start_date, end_date):
            continue
        target_root = os.path.join(out_path, source_type, year, month, day, named_location_name)
        # link the location file into the output directory
        location_target = os.path.join(target_root, 'location', filename)
        file_linker.link(file_path, location_target)
        # create an empty calibration file in the target directory but do not overwrite
        calibration_target = os.path.join(target_root, 'calibration')
        os.makedirs(calibration_target, exist_ok=True)
        # create key to find corresponding data for the sensor and date
        key = '/' + source_type + '/' + year + '/' + month + '/' + day + '/' + named_location_name
        if key not in keys:
            # key not found, create empty directories and files
            print(f'Key not found {key}')
            for directory in output_directories:
                target_dir = os.path.join(target_root, directory)
                if directory == 'data':
                    link_path(target_dir, empty_data_path, named_location_name, year, month, day)
                elif directory == 'flags':
                    link_path(target_dir, empty_flags_path, named_location_name, year, month, day)
                elif directory == 'uncertainty_data':
                    link_path(target_dir, empty_uncertainty_data_path, named_location_name, year, month, day)
                elif directory == 'uncertainty_coef':
                    os.makedirs(target_dir, exist_ok=True)
def analyze(data_dir, out_dir):
    """
    Analyze time series data to calculate additional time padding required for
    processing with thresholds.

    :param data_dir: The data directory.
    :type data_dir: str
    :param out_dir: The output directory.
    :type out_dir: str
    :return:
    """
    out_dir_parts = list(pathlib.Path(out_dir).parts)
    manifest_file = 'manifest.txt'
    try:
        for root, dirs, files in os.walk(data_dir):
            for filename in files:
                if filename == manifest_file:
                    # read manifest
                    dates = [date.rstrip() for date in open(os.path.join(root, filename))]
                    # check for existence of complete manifest
                    dates_not_found = []
                    for date in dates:
                        dates_not_found.append(date)
                    for date in dates:
                        for data_file in os.listdir(root):
                            log.debug(f'data_file: {data_file}')
                            if data_file != manifest_file:
                                data_file_date = MergedDataFilename(data_file).date()
                                log.debug(f'checking data file date: {data_file_date} and '
                                          f'manifest date {date} in {dates_not_found}')
                                if date in data_file_date and date in dates_not_found:
                                    log.debug(f'found data for: {date}')
                                    dates_not_found.remove(date)
                    # if complete, symlink to output repository
                    if not dates_not_found:
                        for data_file in os.listdir(root):
                            # TODO: The root is 'data', need to go one directory up.
                            if data_file != manifest_file:
                                source_path = os.path.join(root, data_file)
                                destination_parts = list(pathlib.Path(source_path).parts)
                                for index in range(1, len(out_dir_parts)):
                                    destination_parts[index] = out_dir_parts[index]
                                destination_path = os.path.join(*destination_parts)
                                log.debug(f'linking {source_path} to {destination_path}')
                                file_linker.link(source_path, destination_path)
                                write_thresholds(source_path, destination_path)
                        # Go up one directory and get any ancillary files to write.
                        write_ancillary_data(out_dir, root)
    except Exception:
        exception_type, exception_obj, exception_tb = sys.exc_info()
        log.error("Exception at line " + str(exception_tb.tb_lineno) + ": " + str(sys.exc_info()))
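# The completeness check above removes a manifest date once any data filename
# in the directory contains it; files are linked only when dates_not_found
# ends up empty. A minimal sketch with hypothetical dates and filenames:
example_dates = ['2019-01-04', '2019-01-05', '2019-01-06']
example_data_files = ['prt_7767_2019-01-04.ext', 'prt_7767_2019-01-05.ext']
example_not_found = [d for d in example_dates
                     if not any(d in f for f in example_data_files)]
print(example_not_found)  # ['2019-01-06'] -> incomplete, nothing is linked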