Python match_columns Examples, fddc.annex_a.merger.matcher.match_columns Python Examples

Example #1

0

Show file

File: workflow.py Project: CSCDP/AnnexA_CiNCensus_Cleaner

def find_sources(
    *args: Union[str, ScanSource, List],
    data_sources: List[SourceConfig],
    column_report_filename: str = None,
    file_source: ExcelFileSource = ExcelFileSource()
) -> List[SheetWithHeaders]:
    """
    Scan the filesystem for input files, discover the worksheets they contain
    and match their headers against the configured columns.

    :param args: Files to scan - can include wildcard characters (glob patterns)
    :param data_sources: Configuration for tables and columns
    :param column_report_filename: Optional generation of a report summarising matches. This can be edited and
                                   fed back into :func:`~fddc.annex_a.merger.read_sources` function.
    :param file_source: Passed on to ``workbook_util.find_worksheets`` for every file that is scanned
    :return: discovered sources
    """
    # Step 1: expand every scan source into the concrete files it refers to
    candidate_files: List[FileSource] = [
        found
        for scan_source in __to_scan_source(args)
        for found in file_scanner.find_input_files(scan_source)
    ]
    logger.info("Found {} candidate input files".format(len(candidate_files)))

    # Step 2: list the worksheets available in each of those files
    candidate_sheets: List[WorkSheetDetail] = [
        worksheet
        for candidate in candidate_files
        for worksheet in workbook_util.find_worksheets(candidate,
                                                       file_source=file_source)
    ]
    logger.info("Found {} candidate data sources".format(len(candidate_sheets)))

    # Step 3: pair worksheets with configured data sources, then headers with columns
    matched_sheets, unmatched_sheets = matcher.match_data_sources(
        candidate_sheets, data_sources)
    sheet_with_columns: List[SheetWithHeaders] = matcher.match_columns(
        matched_sheets)

    # Step 4: the report is only produced when a filename was supplied
    if column_report_filename is not None:
        matcher_report.column_report(sheet_with_columns, unmatched_sheets,
                                     column_report_filename)

    return sheet_with_columns

Example #2

0

Show file

File: test_matcher_columns.py Project: CSCDP/AnnexA_CiNCensus_Cleaner

    def test_column_report(self):
        """The column report lists the expected columns and column/header pairs."""
        matched_sheets = matcher.match_columns([self._get_test_sheet()])
        report = fddc.annex_a.merger.matcher_report.column_report(
            matched_sheets)

        # Layout of the report frame
        expected_report_columns = [
            'filename',
            'sort_key',
            'header_starts',
            'sheetname',
            'table',
            'column_name',
            'header_name',
        ]
        self.assertEqual(expected_report_columns, report.columns.tolist())

        # One row per configured column / discovered header pairing
        expected_column_names = ['Header 1', 'Header X', 'Header Y', np.nan]
        self.assertEqual(report.column_name.tolist(), expected_column_names)

        expected_header_names = ['Header 1', 'Header   X', '', 'Header T']
        self.assertEqual(report.header_name.tolist(), expected_header_names)

Example #3

0

Show file

File: matcher_report.py Project: CSCDP/AnnexA_CiNCensus_Cleaner

def process_report(match_input: Union[Iterable[MatchInput], pd.DataFrame, str],
                   data_sources: List[SourceConfig]):
    """
    Rebuild sheet and column matches from the entries of a match report.

    Entries without a sheet name trigger a scan of the files they reference;
    entries with a sheet name but no listed columns are matched automatically;
    entries naming both a column and a header are mapped exactly as listed.

    :param match_input: Report entries. A string or DataFrame is first passed
                        through ``parse_report``; anything else is iterated as-is.
    :param data_sources: Configuration for tables and columns
    :return: tuple of (sheets with their matched headers, the unmatched
             worksheets returned by ``matcher.match_data_sources``)
    :raises StopIteration: if a lookup for a sheet, table, column or header
                           named in the report finds nothing
    """

    def table_key(entry):
        # Identifies one table within one sheet of one file
        return (entry.filename, entry.sort_key, entry.sheetname, entry.table)

    def locate(key):
        # Re-read the file's worksheets and pick out the sheet and table config named by the key
        filename, sort_key, sheetname, table = key
        candidates = workbook_util.find_worksheets(
            FileSource(filename, sort_key=sort_key))
        detail = next(w for w in candidates if w.sheetname == sheetname)
        config = next(d for d in data_sources if d.name == table)
        return detail, config

    if isinstance(match_input, (str, pd.DataFrame)):
        match_input = parse_report(match_input)

    pending_files = set()
    listed_columns = {}
    explicit_mappings: Dict[Tuple, List[MatchInput]] = {}

    for entry in match_input:
        if entry.sheetname is not None:
            # Track which columns are listed per table so empty tables can be spotted later
            seen = listed_columns.setdefault(table_key(entry), set())
            if entry.column_name is not None:
                seen.add(entry.column_name)
        else:
            # No sheet given: the file itself still has to be scanned
            sort_keys = None
            if entry.sort_key is not None:
                sort_keys = [f'/.*/{entry.sort_key}/']
            pending_files.update(
                file_scanner.find_input_files(
                    ScanSource(include=entry.filename, sort_keys=sort_keys)))

        if entry.column_name is None or entry.header_name is None:
            continue
        explicit_mappings.setdefault(table_key(entry), []).append(entry)

    matched_list: List[MatchedSheet] = []
    unmatched_list: List[WorkSheetDetail] = []

    # Unscanned files: match their worksheets against the configuration
    for source_file in pending_files:
        matched, unmatched = matcher.match_data_sources(
            workbook_util.find_worksheets(source_file), data_sources)
        matched_list.extend(matched)
        unmatched_list.extend(unmatched)

    # Tables listed without any columns: queue them for automatic column matching
    for key, seen in listed_columns.items():
        *_, table = key
        if seen or table is None:
            continue
        detail, config = locate(key)
        matched_list.append(
            MatchedSheet(sheet_detail=detail, source_config=config))

    # Match headers to column configuration
    sheet_with_headers: List[SheetWithHeaders] = matcher.match_columns(
        matched_list)

    # Explicit column/header pairs from the report are applied verbatim
    for key, mappings in explicit_mappings.items():
        detail, config = locate(key)
        sheet = MatchedSheet(sheet_detail=detail, source_config=config)

        column_list: List[MatchedColumn] = [
            MatchedColumn(
                column=next(c for c in config.columns
                            if c.name == mapping.column_name),
                header=next(h for h in detail.headers
                            if h.value == mapping.header_name),
            )
            for mapping in mappings
        ]

        sheet_with_headers.append(
            SheetWithHeaders(sheet=sheet,
                             columns=column_list,
                             unmatched_columns=[]))

    return sheet_with_headers, unmatched_list

Example #4

0

Show file

File: test_matcher_columns.py Project: CSCDP/AnnexA_CiNCensus_Cleaner

    def test_match_multiple_column(self):
        """Matching a single test sheet yields exactly one result, checked via assert_sheet."""
        source_sheet = self._get_test_sheet()
        matched = matcher.match_columns([source_sheet])

        self.assertEqual(len(matched), 1)
        self.assert_sheet(matched[0], source_sheet)