def test_find_input_files_sortkeys(self):
    # Scan for the example workbooks with a sort-key expression and check
    # that the set of sort keys attached to the results is exactly
    # {'2004', '2005'}.
    pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
    source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
    found = file_scanner.find_input_files(source)
    self.assertSetEqual({'2004', '2005'}, {item.sort_key for item in found})
Esempio n. 2
0
    def test_find_input_files_deep(self):
        # The recursive ``**/ex*.xlsx`` pattern is expected to yield exactly
        # one file, whose name ends with the example workbook path below.
        pattern = os.path.join(PROJECT_ROOT, "**/ex*.xlsx")
        found = file_scanner.find_input_files(ScanSource(include=pattern))
        self.assertEqual(len(found), 1)

        only_match = found[0]
        # Only the trailing 25 characters are compared.
        tail = only_match.filename[-25:]
        self.assertEqual('mples/example-B-2004.xlsx', tail)
Esempio n. 3
0
def find_sources(
    *args: Union[str, ScanSource, List],
    data_sources: List[SourceConfig],
    column_report_filename: Union[str, None] = None,
    file_source: ExcelFileSource = ExcelFileSource()
) -> List[SheetWithHeaders]:
    """
    Search the filesystem for sources and try to automatically discover tables and match columns.

    :param args: Files to scan - can include wildcard characters (glob patterns)
    :param data_sources: Configuration for tables and columns
    :param column_report_filename: Optional generation of a report summarising matches. This can be edited and
                                   fed back into :func:`~fddc.annex_a.merger.read_sources` function.
                                   No report is written when this is ``None``.
    :param file_source: Passed through to ``workbook_util.find_worksheets`` for every candidate file.
                        NOTE(review): the default instance is created once, when this function is defined,
                        and is shared by every call that omits the argument - confirm that sharing an
                        ``ExcelFileSource`` between calls is intended.
    :return: discovered sources
    """
    input_files = __to_scan_source(args)

    # First we scan the input files section for all inputs
    files: List[FileSource] = []
    for scan_source in input_files:
        files.extend(file_scanner.find_input_files(scan_source))

    # Lazy %-style arguments: the message is only formatted if the record is emitted
    logger.info("Found %d candidate input files", len(files))

    # We then scan the input files for data sources
    file_sources: List[WorkSheetDetail] = []
    for file in files:
        file_sources.extend(
            workbook_util.find_worksheets(file, file_source=file_source))

    logger.info("Found %d candidate data sources", len(file_sources))

    # Match datasources based on configuration
    matched_sheets, unmatched_sheets = matcher.match_data_sources(
        file_sources, data_sources)

    # Match headers to column configuration
    sheet_with_columns: List[SheetWithHeaders] = matcher.match_columns(
        matched_sheets)

    # Write column report
    if column_report_filename is not None:
        matcher_report.column_report(sheet_with_columns, unmatched_sheets,
                                     column_report_filename)

    return sheet_with_columns
def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """
    Build the list of sheets with matched headers from a set of match entries.

    Each entry is handled according to which of its fields are set:

    * ``sheetname is None`` - the entry's filename is scanned with
      ``file_scanner.find_input_files`` and every worksheet found in the
      resulting files is matched against ``data_sources``.
    * ``sheetname`` set but no ``column_name`` recorded for that
      (filename, sort_key, sheetname, table) combination, and ``table`` set -
      the worksheet is paired with the data source of that name and its
      columns are matched by ``matcher.match_columns``.
    * ``column_name`` and ``header_name`` both set - the named column is
      paired with the named header directly.

    :param match_input: The entries to process. A ``str`` or ``pd.DataFrame`` is first
                        converted with ``parse_report``.
    :param data_sources: Configuration for tables and columns
    :return: tuple of (sheets with headers, worksheets that matched no data source)
    :raises StopIteration: if a referenced sheet name, table name, column name or header
                           value cannot be found (the lookups use ``next`` without a default)
    """
    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    files_to_scan = set()
    columns_per_table = dict()
    mapping_dict: Dict[Tuple, List[MatchInput]] = dict()

    # 'entry' rather than 'input' so the builtin is not shadowed
    for entry in match_input:
        key = (entry.filename, entry.sort_key, entry.sheetname, entry.table)

        if entry.sheetname is None:
            # First we look for unscanned files
            sort_key = None
            if entry.sort_key is not None:
                sort_key = [f'/.*/{entry.sort_key}/']
            scan_source = ScanSource(include=entry.filename,
                                     sort_keys=sort_key)
            files_to_scan.update(file_scanner.find_input_files(scan_source))
        else:
            # Then we build a lookup of files and tables to see if any tables have no columns listed
            columns = columns_per_table.setdefault(key, set())
            if entry.column_name is not None:
                columns.add(entry.column_name)

        if entry.column_name is not None and entry.header_name is not None:
            mapping_dict.setdefault(key, []).append(entry)

    matched_list: List[MatchedSheet] = []
    unmatched_list: List[WorkSheetDetail] = []

    for file in files_to_scan:
        worksheets = workbook_util.find_worksheets(file)
        # Match datasources based on configuration
        matched, unmatched = matcher.match_data_sources(
            worksheets, data_sources)
        matched_list += matched
        unmatched_list += unmatched

    # Tables listed without any columns are matched automatically further down
    for key, columns in columns_per_table.items():
        file, sort_key, sheetname, table = key
        if columns or table is None:
            continue
        worksheet_list = workbook_util.find_worksheets(
            FileSource(file, sort_key=sort_key))
        worksheet = next(
            w for w in worksheet_list if w.sheetname == sheetname)
        source_config = next(d for d in data_sources if d.name == table)
        matched_list.append(
            MatchedSheet(sheet_detail=worksheet,
                         source_config=source_config))

    # Match headers to column configuration
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_list)

    # Explicit column/header pairs bypass automatic matching
    for key, mapping_list in mapping_dict.items():
        file, sort_key, sheetname, table = key
        worksheet_list = workbook_util.find_worksheets(
            FileSource(file, sort_key=sort_key))
        sheet_detail = next(
            w for w in worksheet_list if w.sheetname == sheetname)
        source_config = next(d for d in data_sources if d.name == table)
        sheet = MatchedSheet(sheet_detail=sheet_detail,
                             source_config=source_config)

        column_list: List[MatchedColumn] = []

        for mapping in mapping_list:
            column_config = next(
                c for c in source_config.columns
                if c.name == mapping.column_name)
            header_config = next(
                h for h in sheet_detail.headers
                if h.value == mapping.header_name)
            column_list.append(
                MatchedColumn(column=column_config, header=header_config))

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet,
                             columns=column_list,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_list
 def test_find_input_files_empty(self):
     # A path that matches nothing must produce an empty list.
     missing = os.path.join(PROJECT_ROOT, "oh-no-I-do-not-exist.xlsx")
     found = file_scanner.find_input_files(ScanSource(include=missing))
     self.assertEqual(found, [])
 def test_find_input_files_multiext(self):
     # The ``xls*`` suffix wildcard is expected to pick up two files.
     pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
     found = file_scanner.find_input_files(ScanSource(include=pattern))
     self.assertEqual(2, len(found))
Esempio n. 7
0
 def test_find_input_files_sortkeys(self):
     # Scan with a sort-key expression and check the sort key attached to
     # the first result.
     pattern = os.path.join(PROJECT_ROOT, "**/ex*.xls*")
     source = ScanSource(include=pattern, sort_keys=[r'/.*?(\d+).*/\1/i'])
     found = file_scanner.find_input_files(source)
     first = found[0]
     self.assertEqual('2004', first.sort_key)