def test_find_input_files_sortkeys(self):
    """Scanning with a sort-key expression yields the keys '2004' and '2005'."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
    found = file_scanner.find_input_files(source)

    # Compare as sets so the check does not depend on the scan order.
    self.assertSetEqual({'2004', '2005'}, {item.sort_key for item in found})
def test_find_input_files_deep(self):
    """A ``**/ex*.xlsx`` glob under the project root matches exactly one file."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xlsx")
    matches = file_scanner.find_input_files(ScanSource(include=pattern))

    self.assertEqual(len(matches), 1)

    # Only the tail of the path is compared so the check is independent of
    # where the project happens to be checked out.
    only_match = matches[0]
    self.assertEqual('mples/example-B-2004.xlsx', only_match.filename[-25:])
def find_sources(
        *args: Union[str, ScanSource, List],
        data_sources: List[SourceConfig],
        column_report_filename: Union[str, None] = None,
        file_source: Union[ExcelFileSource, None] = None
) -> List[SheetWithHeaders]:
    """
    Search the filesystem for sources and try to automatically discover tables and match columns.

    :param args: Files to scan - can include wildcard characters (glob patterns)
    :param data_sources: Configuration for tables and columns
    :param column_report_filename: Optional generation of a report summarising matches. This can be edited and
        fed back into :func:`~fddc.annex_a.merger.read_sources` function.
    :param file_source: Forwarded to ``workbook_util.find_worksheets`` for every candidate file. When omitted,
        a new ``ExcelFileSource`` is created for this call.
    :return: discovered sources
    """
    # Build the default per call: a default written as ``ExcelFileSource()``
    # in the signature is evaluated once, at definition time, and the same
    # instance would then be shared by every call.
    if file_source is None:
        file_source = ExcelFileSource()

    input_files = __to_scan_source(args)

    # First we scan the input files section for all inputs
    files: List[FileSource] = []
    for scan_source in input_files:
        files += file_scanner.find_input_files(scan_source)
    logger.info("Found {} candidate input files".format(len(files)))

    # We then scan the input files for data sources
    file_sources: List[WorkSheetDetail] = []
    for file in files:
        file_sources += workbook_util.find_worksheets(file, file_source=file_source)
    logger.info("Found {} candidate data sources".format(len(file_sources)))

    # Match datasources based on configuration
    matched_sheets, unmatched_sheets = matcher.match_data_sources(
        file_sources, data_sources)

    # Match headers to column configuration
    sheet_with_columns: List[SheetWithHeaders] = matcher.match_columns(
        matched_sheets)

    # Write column report
    if column_report_filename is not None:
        matcher_report.column_report(sheet_with_columns, unmatched_sheets,
                                     column_report_filename)

    return sheet_with_columns
def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """
    Rebuild sheet and column matches from a set of report entries.

    Entries are handled in three ways:

    * An entry without a ``sheetname`` names a file that still has to be
      scanned: the file is located with ``file_scanner.find_input_files`` and
      its worksheets are matched against ``data_sources``.
    * An entry with a ``sheetname`` is grouped by
      ``(filename, sort_key, sheetname, table)``. A group that lists no column
      names but has a table is matched to the source configuration of that
      name and its columns are matched via ``matcher.match_columns``.
    * An entry with both ``column_name`` and ``header_name`` is an explicit
      mapping and is turned directly into a ``MatchedColumn``.

    :param match_input: Report entries. A ``str`` or ``pandas.DataFrame`` is
        first converted with ``parse_report``.
    :param data_sources: Configuration for tables and columns
    :return: tuple ``(sheet_with_headers, unmatched_list)``
    :raises StopIteration: if a sheet, table, column or header referenced by
        an entry cannot be found
    """
    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    def _table_key(entry):
        # Identity of one table within one sheet of one file.
        return entry.filename, entry.sort_key, entry.sheetname, entry.table

    def _resolve(key):
        # Look up the worksheet and the source configuration a key refers to.
        file, sort_key, sheetname, table = key
        worksheets = workbook_util.find_worksheets(
            FileSource(file, sort_key=sort_key))
        sheet_detail = next(
            iter([w for w in worksheets if w.sheetname == sheetname]))
        source_config = next(
            iter([d for d in data_sources if d.name == table]))
        return sheet_detail, source_config

    files_to_scan = set()
    columns_per_table = dict()
    mapping_dict: Dict[Tuple, List[MatchInput]] = dict()

    # ``entry`` rather than ``input``: the latter shadows the builtin.
    for entry in match_input:
        if entry.sheetname is None:
            # First we look for unscanned files
            sort_key = None
            if entry.sort_key is not None:
                sort_key = [f'/.*/{entry.sort_key}/']
            scan_source = ScanSource(include=entry.filename, sort_keys=sort_key)
            files_to_scan.update(file_scanner.find_input_files(scan_source))
        else:
            # Then we build a lookup of files and tables to see if any tables
            # have no columns listed
            columns = columns_per_table.setdefault(_table_key(entry), set())
            if entry.column_name is not None:
                columns.add(entry.column_name)

        if entry.column_name is not None and entry.header_name is not None:
            mapping_dict.setdefault(_table_key(entry), []).append(entry)

    matched_list: List[MatchedSheet] = []
    unmatched_list: List[WorkSheetDetail] = []

    for file in files_to_scan:
        worksheets = workbook_util.find_worksheets(file)

        # Match datasources based on configuration
        matched, unmatched = matcher.match_data_sources(
            worksheets, data_sources)
        matched_list += matched
        unmatched_list += unmatched

    # Tables named in the report without any columns get automatic column
    # matching below.
    for key, columns in columns_per_table.items():
        table = key[3]
        if columns or table is None:
            continue
        sheet_detail, source_config = _resolve(key)
        matched_list.append(
            MatchedSheet(sheet_detail=sheet_detail, source_config=source_config))

    # Match headers to column configuration
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_list)

    # Explicit column/header mappings from the report are used as given.
    for key, mapping_list in mapping_dict.items():
        sheet_detail, source_config = _resolve(key)
        sheet = MatchedSheet(sheet_detail=sheet_detail,
                             source_config=source_config)

        column_list: List[MatchedColumn] = []
        for mapping in mapping_list:
            column_config = next(
                iter([
                    c for c in source_config.columns
                    if c.name == mapping.column_name
                ]))
            header_config = next(
                iter([
                    h for h in sheet_detail.headers
                    if h.value == mapping.header_name
                ]))
            column_list.append(
                MatchedColumn(column=column_config, header=header_config))

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet, columns=column_list,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_list
def test_find_input_files_empty(self):
    """A path that matches nothing produces an empty result list."""
    missing_path = os.path.join(PROJECT_ROOT, "oh-no-I-do-not-exist.xlsx")
    found = file_scanner.find_input_files(ScanSource(include=missing_path))
    self.assertEqual(found, [])
def test_find_input_files_multiext(self):
    """The ``ex*.xls*`` glob is expected to match two files."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    source = ScanSource(include=pattern)
    found = file_scanner.find_input_files(source)
    self.assertEqual(2, len(found))
def test_find_input_files_sortkeys(self):
    """The first scanned file carries the sort key '2004'."""
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
    found = file_scanner.find_input_files(source)

    first = found[0]
    self.assertEqual('2004', first.sort_key)