def group_related(path, out_path):
    """
    Link related data and location files into the output directory.

    :param path: Directory or file path.
    :type path: str
    :param out_path: The output path for related data.
    :type out_path: str
    """
    for file_path in file_crawler.crawl(path):
        trimmed_path = target_path.trim_path(file_path)
        # Trimmed layout: source_type/year/month/day/group/location/data_type/...
        (source_type, year, month, day, group,
         location, data_type, *remainder) = pathlib.Path(trimmed_path).parts
        # Rebase onto the output path, moving source_type below the group level.
        target = os.path.join(out_path, year, month, day, group,
                              source_type, location, data_type, *remainder)
        log.debug(f'File target: {target}')
        file_linker.link(file_path, target)
def linkmerge(in_path, out_path, dedup_threshold):
    """
    Group parquet files by their source ID: symlink files whose source ID is
    unique, and merge the rest into a single parquet file.

    :param in_path: The input path containing parquet files.
    :param out_path: The output path for links and merged files.
    :param dedup_threshold: The duplication threshold for dictionary compression.
    """
    files_by_source = {}
    for parquet_file_path in file_crawler.crawl(in_path):
        # The source ID is the third underscore-delimited token of the filename.
        file_source_id = parquet_file_path.name.split('_')[2]
        files_by_source.setdefault(file_source_id, []).append(parquet_file_path)
    for source_id, source_files in files_by_source.items():
        # Multiple files for the source ID are merged into one parquet file.
        if len(source_files) > 1:
            write_merged_parquet(inputfiles=source_files, in_path=in_path, out_path=out_path)
            continue
        # A single file for the source ID is simply symlinked into place.
        inpath = source_files[0]
        # Drop the leading token of the filename.
        inpath_new_filename = '_'.join(inpath.name.split('_')[1:])
        # Strip off / pfs /IN_PATH (3 parts)
        stripped_inpath = pathlib.PurePosixPath().joinpath(*inpath.parts[3:])
        outpath = pathlib.Path(os.path.join(out_path, stripped_inpath.parent, inpath_new_filename))
        if not os.path.exists(outpath.parent):
            log.info(f"{outpath.parent} directory not found, creating")
            os.makedirs(outpath.parent)
        log.info(f"Linking {inpath} to {outpath}")
        os.symlink(inpath, outpath)
def group(data_path, location_path, out_path):
    """
    Write event data and location files into output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location file.
    :type location_path: str
    :param out_path: The path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        log.debug(f'trimmed_path: {trimmed_path}')
        # Trimmed layout: source_type/year/month/day/source_id/filename
        parts = trimmed_path.parts
        source_type = parts[0]
        year = parts[1]
        month = parts[2]
        day = parts[3]
        source_id = parts[4]
        filename = parts[5]
        # Log the actual filename (previously an f-string with no placeholder).
        log.debug(f'filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
def get_data_files(data_path, out_path, start_date=None, end_date=None):
    """
    Get the data file path keys between start and end dates.

    :param data_path: The path to the data file directory.
    :type data_path: str
    :param out_path: The path to write results.
    :type out_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return: list of data files.
    """
    keys = []
    for file_path in file_crawler.crawl(data_path):
        # Path layout: /pfs/REPO/source_type/year/month/day/location/data_type/file
        parts = file_path.parts
        source_type, year, month, day = parts[3], parts[4], parts[5], parts[6]
        location_name, data_type, filename = parts[7], parts[8], parts[9]
        # Skip files outside the requested date range.
        if not check_date(year, month, day, start_date, end_date):
            continue
        target_root = os.path.join(out_path, source_type, year, month, day, location_name)
        file_linker.link(file_path, os.path.join(target_root, data_type, filename))
        # Key identifies the sensor and date for later lookup.
        keys.append(f'/{source_type}/{year}/{month}/{day}/{location_name}')
    return keys
def group(calibrated_path, location_path, out_path):
    """
    Write calibrated data and location files into the output path.

    :param calibrated_path: The input path for calibrated files.
    :type calibrated_path: str
    :param location_path: The input path for location files.
    :type location_path: str
    :param out_path: The output path for writing grouped files.
    :type out_path: str
    :return:
    """
    location_linked = False
    for file_path in file_crawler.crawl(calibrated_path):
        # Path layout: /pfs/REPO/source_type/year/month/day/source_id/data_type/...
        parts = file_path.parts
        source_type = parts[3]
        year, month, day = parts[4], parts[5], parts[6]
        source_id = parts[7]
        data_type = parts[8]
        log.debug(f'year: {year} month: {month} day: {day}')
        log.debug(
            f'source type: {source_type} source_id: {source_id} data type: {data_type}'
        )
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        if not location_linked:
            # Only link location once.
            link_location(location_path, target_root)
            location_linked = True
        # Grab all directories and files under the common path (after the data type).
        target = os.path.join(target_root, data_type, *parts[9:])
        log.debug(f'target: {target}')
        file_linker.link(file_path, target)
def group(data_path, location_path, out_path):
    """
    Write data and location files into the output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param location_path: The path to the location files.
    :type location_path: str
    :param out_path: The output path to write grouped files.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        # Path layout: /pfs/REPO/source_type/year/month/day/filename
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        filename = parts[7]
        # Log the actual filename (previously an f-string with no placeholder).
        log.debug(f'data filename: {filename}')
        # The source ID is parsed out of the data filename.
        name = DataFilename(filename)
        source_id = name.source_id()
        log.debug(f'source type: {source_type} source_id: {source_id}')
        log.debug(f'year: {year} month: {month} day: {day}')
        log.debug(f'filename: {filename}')
        target_root = os.path.join(out_path, source_type, year, month, day, source_id)
        link_location(location_path, target_root)
        data_target_path = os.path.join(target_root, 'data', filename)
        log.debug(f'data_target_path: {data_target_path}')
        file_linker.link(file_path, data_target_path)
def group_data(data_path, out_path):
    """
    Write data and event files into output path.

    :param data_path: The path to the data files.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return: The last target root written, or None if no files were crawled.
    """
    target_root = None
    for file_path in file_crawler.crawl(data_path):
        # Trimmed layout: year/month/day/group/source_type/location/data_type/filename
        (year, month, day, group_name, source_type,
         location, data_type, filename, *_) = target_path.trim_path(file_path).parts
        target_root = os.path.join(out_path, year, month, day, group_name)
        data_target_path = os.path.join(target_root, source_type, location,
                                        data_type, filename)
        file_linker.link(file_path, data_target_path)
    return target_root
def get_empty_file_paths(empty_files_path):
    """
    Get the paths to the collection of empty files.

    :param empty_files_path: The path to the directory containing empty files.
    :type empty_files_path: str
    :return: dict of file paths.
    """
    empty_data_path = None
    empty_flags_path = None
    empty_uncertainty_data_path = None
    for file_path in file_crawler.crawl(empty_files_path):
        parts = pathlib.Path(file_path).parts
        # Strip the leading /pfs/REPO prefix (first three parts).
        trimmed = parts[3:]
        directory_name = trimmed[1]
        if 'data' == directory_name:
            empty_data_path = file_path
        elif 'flags' == directory_name:
            empty_flags_path = file_path
        elif 'uncertainty_data' == directory_name:
            empty_uncertainty_data_path = file_path
    # Fail fast if any required empty file is missing. Use sys.exit rather than
    # the interactive builtin exit() for consistency with the rest of the module
    # (exit() is provided by site and not guaranteed in all environments).
    if empty_data_path is None:
        log.error('Empty data file not found.')
        sys.exit(1)
    if empty_flags_path is None:
        log.error('Empty flags file not found.')
        sys.exit(1)
    if empty_uncertainty_data_path is None:
        log.error('Empty uncertainty data file not found.')
        sys.exit(1)
    return {'empty_data_path': empty_data_path,
            'empty_flags_path': empty_flags_path,
            'empty_uncertainty_data_path': empty_uncertainty_data_path}
def group(path, out_path):
    """
    Link files into the output directory.

    :param path: File or directory paths.
    :type path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    for source_file in file_crawler.crawl(path):
        # Compute the destination for this file, then link it into place.
        target = target_path.get_path(source_file, out_path)
        log.debug(f'target: {target}')
        file_linker.link(source_file, target)
def link_location(location_path, target_root):
    """
    Link the location file into the target root.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory to write the location file.
    :type target_root: str
    :return:
    """
    for location_file in file_crawler.crawl(location_path):
        # Every location file lands under <target_root>/location/.
        filename = pathlib.Path(location_file).name
        file_linker.link(location_file, os.path.join(target_root, 'location', filename))
def pad(self):
    """
    Pad the data for the window size.

    Links each crawled data file into every date of its padded date range
    (computed by padder_util from the configured window size), records the
    linked dates per location, and writes manifest and threshold files.
    Files outside self.sub_dirs_to_process are linked through unchanged.

    :return:
    """
    try:
        # location -> list of dates linked for that location
        manifests = {}
        # location -> path of the manifest.txt to write
        manifest_file_names = {}
        for file_path in file_crawler.crawl(self.data_path):
            parts = pathlib.Path(file_path).parts
            # Path positions are configured on the instance (year/month/day/
            # location/sub_dir indexes) — assumed to match the input repo layout.
            year = parts[self.year_index]
            month = parts[self.month_index]
            day = parts[self.day_index]
            location = parts[self.location_index]
            sub_dir = parts[self.sub_dir_index]
            if sub_dir in self.sub_dirs_to_process:
                location_path = os.path.join(*parts[0:self.location_index + 1])
                if location not in manifests:
                    manifests[location] = []
                # get data date
                date = datetime.date(int(year), int(month), int(day))
                # get dates in padded range
                dates_in_padded_range = padder_util.get_dates_in_padded_range(date, self.window_size)
                # link file into each date in padded range
                destination_parts = list(parts)
                # Overwrite the leading path elements with the output directory parts
                # (index 0 is kept as-is).
                for index in range(1, len(self.out_dir_parts)):
                    destination_parts[index] = self.out_dir_parts[index]
                for date_in_padded_range in dates_in_padded_range:
                    destination_parts[self.year_index] = str(date_in_padded_range.year)
                    destination_parts[self.month_index] = str(date_in_padded_range.month).zfill(2)
                    destination_parts[self.day_index] = str(date_in_padded_range.day).zfill(2)
                    # generate destination path
                    destination_path = os.path.join(*destination_parts)
                    log.debug(f'source: {file_path}')
                    log.debug(f'destination: {destination_path}')
                    file_linker.link(file_path, destination_path)
                    manifests[location].append(date_in_padded_range)
                    if date_in_padded_range == date:
                        # construct manifest filename
                        manifest_path = os.path.dirname(destination_path)  # remove data filename
                        manifest_file_names[location] = os.path.join(manifest_path, 'manifest.txt')
                        # Thresholds are written only for the original (unpadded) date.
                        output_writer.write_thresholds(location_path, destination_path)
            else:
                # Not a sub dir to process: link through to the output path,
                # dropping the leading /pfs/REPO prefix (first three parts).
                destination_path = os.path.join(self.out_path, *parts[3:len(parts) + 1])
                file_linker.link(file_path, destination_path)
        output_writer.write_manifests(manifests, manifest_file_names)  # write manifest files
    except Exception:
        # NOTE(review): broad catch logs the failure line but swallows the
        # exception — callers cannot detect failure. Consider re-raising.
        exception_type, exception_obj, exception_tb = sys.exc_info()
        log.error("Exception at line " + str(exception_tb.tb_lineno) + ": " + str(sys.exc_info()))
def link_location(location_path, target_root):
    """
    Link the location file into the target directory.

    :param location_path: The location file path.
    :type location_path: str
    :param target_root: The target directory path.
    :type target_root: str
    :return:
    """
    for location_file in file_crawler.crawl(location_path):
        # Place each location file under <target_root>/location/.
        location_target_path = os.path.join(target_root, 'location',
                                            pathlib.Path(location_file).name)
        log.debug(f'location_target_path: {location_target_path}')
        file_linker.link(location_file, location_target_path)
def process(source_path, group, out_path):
    """
    Link source files into the output directory with the related location
    group in the path. There must be only one location file under the
    source path.

    :param source_path: The input path.
    :type source_path: str
    :param group: The group to match in the location files.
    :type group: str
    :param out_path: The output path.
    :type out_path: str
    """
    paths = []
    group_names = []
    for file_path in file_crawler.crawl(source_path):
        # Path layout: /pfs/REPO/source_type/year/month/day/location/data_type/...
        parts = pathlib.Path(file_path).parts
        path_parts = {
            "source_type": parts[3],
            "year": parts[4],
            "month": parts[5],
            "day": parts[6],
            "location": parts[7],
            "data_type": parts[8],
            "remainder": parts[9:]  # everything after the data type
        }
        # add the original file path and the path parts to the path list
        paths.append({"file_path": file_path, "path_parts": path_parts})
        # get the location context group name from the location file
        if path_parts["data_type"] == 'location':
            group_names = location_file_context.get_matching_items(file_path, group)
    if len(group_names) == 0:
        # location context group name was not found!
        log.error('No location directory found.')
    else:
        # context group name found, link all the files into the output directory
        link(paths, group_names, out_path)
def write_ancillary_data(out_dir, root):
    """
    Write any additional files present in the input directory beyond data
    and thresholds into the output directory.

    :param out_dir: The output directory for writing results.
    :type out_dir: str
    :param root: The threshold root directory.
    :type root: str
    :return:
    """
    parent_dir = pathlib.Path(root).parent
    for crawled_path in file_crawler.crawl(parent_dir):
        path_str = str(crawled_path)
        # Skip anything already handled as data or thresholds.
        if 'data' in path_str or 'threshold' in path_str:
            continue
        parts = pathlib.Path(path_str).parts
        # Rebase onto out_dir, dropping the leading /pfs/REPO prefix.
        trimmed_path = os.path.join(*parts[3:])
        file_linker.link(path_str, os.path.join(out_dir, trimmed_path))
def group(paths, out_path):
    """
    Link all files into the output directory.

    :param paths: Comma separated list of environment variable names whose
        values are full directory paths.
    :type paths: str
    :param out_path: The output path for writing results.
    :type out_path: str
    """
    # Always split on ',': with no separator present this yields a single-element
    # list. The previous conditional split left a bare string when there was no
    # comma, so the loop below iterated it character-by-character and looked up
    # one-letter environment variables.
    paths = paths.split(',')
    log.debug(f'paths: {paths}')
    for p in paths:
        log.debug(f'path: {p}')
        path = os.environ[p]
        for file_path in file_crawler.crawl(path):
            target = target_path.get_path(file_path, out_path)
            log.debug(f'target: {target}')
            file_linker.link(file_path, target)
def process(source_path, group, out_path):
    """
    Link source files into the output directory with the related location
    group in the path. There must be only one location file under the
    source path.

    :param source_path: The input path.
    :type source_path: str
    :param group: The group to match in the location files.
    :type group: str
    :param out_path: The output path.
    :type out_path: str
    :return
    """
    paths = []
    group_names = []
    for file_path in file_crawler.crawl(source_path):
        # Trimmed layout: source_type/source_id/data_type/filename
        trimmed = pathlib.Path(target_path.trim_path(file_path)).parts
        path_parts = {
            "source_type": trimmed[0],
            "source_id": trimmed[1],
            "data_type": trimmed[2],
            "filename": trimmed[3]
        }
        paths.append({"file_path": file_path, "path_parts": path_parts})
        # Get the full group name from the location file
        if path_parts["data_type"] == 'location':
            group_names = location_file_context.get_matching_items(file_path, group)
    if len(group_names) == 0:
        log.error('No location directory found.')
    else:
        link(paths, group_names, out_path)
def filter(self, in_path, out_path, context):
    """
    Group files in the input directory by context.

    :param in_path: The input path.
    :type in_path: str
    :param out_path: The output path.
    :type out_path: str
    :param context: The context to match.
    :type context: str
    """
    sources = {}
    for file_path in file_crawler.crawl(in_path):
        parts = pathlib.Path(file_path).parts
        source_id = parts[self.source_id_index]
        data_type = parts[self.data_type_index]
        log.debug(f'source_id: {source_id} data_type: {data_type}')
        # Collect each file under its source ID, keyed by data type.
        sources.setdefault(source_id, []).append({data_type: file_path})
    self.group_sources(sources, context, out_path)
def process(data_path, out_path):
    """
    Load events from the asset data path.

    :param data_path: The data path.
    :type data_path: str
    :param out_path: The output path for writing results.
    :type out_path: str
    :return:
    """
    for file_path in file_crawler.crawl(data_path):
        trimmed_path = target_path.trim_path(file_path)
        # Trimmed layout: source_type/source_id/filename
        parts = trimmed_path.parts
        source_type = parts[0]
        source_id = parts[1]
        filename = parts[2]
        # Log the actual filename (previously an f-string with no placeholder).
        log.debug(f'source filename: {filename}')
        log.debug(f'source type: {source_type} source_id: {source_id}')
        output_filename = source_type + '_' + source_id + '_events.json'
        output_path = os.path.join(out_path, source_type, source_id, output_filename)
        log.debug(f'output_path: {output_path}')
        # Do not overwrite an existing events file.
        if not os.path.exists(output_path):
            file_linker.link(file_path, output_path)
def group_events(event_path, target_root):
    """
    Group the event files into the target directory.

    :param event_path: The path to the event files.
    :type event_path: str
    :param target_root: The root output path.
    :type target_root: str
    :return:
    """
    # The reference group is the final component of the target root.
    reference_group = pathlib.Path(target_root).name
    for file_path in file_crawler.crawl(event_path):
        trimmed_path = target_path.trim_path(file_path)
        # Trimmed layout: source_type/group/source_id/data_type/filename
        (source_type, group_name, source_id,
         data_type, filename, *_) = pathlib.Path(trimmed_path).parts
        event_target = os.path.join(target_root, source_type, source_id,
                                    data_type, filename)
        log.debug(f'event_target: {event_target}')
        # Only link events that belong to the reference group.
        if group_name == reference_group:
            file_linker.link(file_path, event_target)
def process_location_files(location_path, keys, out_path, output_directories, empty_data_path,
                           empty_flags_path, empty_uncertainty_data_path,
                           start_date=None, end_date=None):
    """
    Process the location files.

    Links each location file into the output tree and, when no data exists
    for the sensor/date key, fills the output directories with links to the
    provided empty files.

    :param location_path: The path to the location file.
    :type location_path: str
    :param keys: The path keys to the data files.
    :type keys: list
    :param out_path: The path to write results.
    :type out_path: str
    :param output_directories: The output directories to write.
    :type output_directories: list
    :param empty_data_path: Path to the empty data files.
    :type empty_data_path: str
    :param empty_flags_path: Path to the empty flag files.
    :type empty_flags_path: str
    :param empty_uncertainty_data_path: Path to the empty uncertainty data file.
    :type empty_uncertainty_data_path: str
    :param start_date: The start date.
    :type start_date: datetime object
    :param end_date: The end date.
    :type end_date: datetime object
    :return:
    """
    for file_path in file_crawler.crawl(location_path):
        # Path layout assumed: /pfs/REPO/source_type/year/month/day/location/file
        # — TODO confirm against the crawler's input repo structure.
        parts = file_path.parts
        source_type = parts[3]
        year = parts[4]
        month = parts[5]
        day = parts[6]
        named_location_name = parts[7]
        filename = parts[8]
        # Skip files outside the requested date range.
        if not check_date(year, month, day, start_date, end_date):
            continue
        target_root = os.path.join(out_path, source_type, year, month, day, named_location_name)
        # link the location file into the output directory
        location_target = os.path.join(target_root, 'location', filename)
        file_linker.link(file_path, location_target)
        # create an empty calibration file in the target directory but do not overwrite
        calibration_target = os.path.join(target_root, 'calibration')
        os.makedirs(calibration_target, exist_ok=True)
        # create key to find corresponding data for the sensor and date
        key = '/' + source_type + '/' + year + '/' + month + '/' + day + '/' + named_location_name
        if key not in keys:
            # key not found, create empty directories and files
            print(f'Key not found {key}')
            for directory in output_directories:
                target_dir = os.path.join(target_root, directory)
                if directory == 'data':
                    link_path(target_dir, empty_data_path, named_location_name, year, month, day)
                elif directory == 'flags':
                    link_path(target_dir, empty_flags_path, named_location_name, year, month, day)
                elif directory == 'uncertainty_data':
                    link_path(target_dir, empty_uncertainty_data_path, named_location_name, year, month, day)
                elif directory == 'uncertainty_coef':
                    # No empty file exists for coefficients; just ensure the directory.
                    os.makedirs(target_dir, exist_ok=True)
def convert(in_path, out_path, dedup_threshold):
    """
    Convert .avro files in in_path into .parquet files in out_path.

    :param in_path: The input path for the .avro files.
    :type in_path: str
    :param out_path: The output path to write .parquet files.
    :type out_path: str
    :param dedup_threshold: The duplication percentage for dictionary compression.
    :type dedup_threshold: float
    :return:
    """
    for avro_file_path in file_crawler.crawl(in_path):
        log.info(f"Opening Avro file {avro_file_path}")
        if not is_avro(str(avro_file_path)):
            log.error(f"error: {avro_file_path} is not an Avro file")
            sys.exit(1)
        with open(avro_file_path, "rb") as open_file:
            avro_data = reader(open_file)
            # Get the ordered list of field names from the avro schema
            avro_file_schema = avro_data.metadata['avro.schema']
            log.debug(f"avro_file_schema: {avro_file_schema}")
            avro_schema = avro_data.writer_schema
            log.debug(f"avro_schema: {avro_schema}")
            # Read Avro file into Pandas dataframe
            data_frame = pd.DataFrame(
                data=avro_data,
                # Preserve column ordering
                columns=[x['name'] for x in avro_schema['fields']])
            log.debug(f"Data Frame info: {data_frame}")
            # Get a list of columns with hashable types
            # NOTE(review): data_frame[x][0] assumes at least one row — an empty
            # Avro file would raise here; confirm inputs are never empty.
            log.debug(f"All Columns: {[x for x in data_frame.columns]}")
            hashable_cols = [
                x for x in data_frame.columns
                if isinstance(data_frame[x][0], Hashable)
            ]
            log.debug(f"Hashable columns from the data_frame: {hashable_cols}")
            # Find columns with high duplication (> dedup_threshold) for use with
            # dictionary encoding
            dupcols = [
                x.encode('UTF-8') for x in hashable_cols
                if (data_frame[x].duplicated().sum() /
                    (int(data_frame[x].size) - 1)) > dedup_threshold
            ]
            log.debug(f"Columns to dedup: {dupcols}")
            # Carry the original Avro schema through as Parquet metadata.
            table = pa.Table.from_pandas(data_frame).replace_schema_metadata({
                'parquet.avro.schema': avro_file_schema,
                'writer.model.name': 'avro'
            })
            # Mirror the input path under out_path, dropping the /pfs/REPO prefix.
            parts = avro_file_path.parts
            parquet_file_path = pathlib.Path(os.path.join(out_path, *parts[3:]))
            parquet_file_path.parent.mkdir(parents=True, exist_ok=True)
            # Swap the .avro extension for .parquet.
            parquet_file_path = os.path.splitext(parquet_file_path)[0] + '.parquet'
            log.info(f"Writing parquet file: {parquet_file_path}")
            pq.write_table(table,
                           parquet_file_path,
                           compression='gzip',
                           use_dictionary=dupcols,
                           compression_level=5,
                           coerce_timestamps='ms',
                           allow_truncated_timestamps=False)