def _xsv_parse_success_with_numeric_headers(
    temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]
):
    """
    Check that an xSV file whose parameter ID header row contains numeric looking
    values parses successfully and that those IDs come back as string keys.

    A uniquely named file is written into temp_dir using sep as the delimiter and is
    then passed to parser.
    """
    input_ = temp_dir / str(uuid.uuid4())
    lines = [
        "Data type: some_type; Columns: 4; Version: 1\n",
        f"1{sep} 2.0{sep} 3{sep} 4.1\n",  # test trimming
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
    ]
    with open(input_, "w") as handle:
        handle.write("".join(lines))

    actual = parser(input_)

    expected_row = frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9})
    expected = ParseResults(frozendict({
        "some_type": ParseResult(SpecificationSource(input_), (expected_row,)),
    }))
    assert actual == expected
def _xsv_parse_success(
    temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]
):
    """
    Check the standard successful xSV parse: surrounding whitespace is trimmed,
    numeric cells come back as numbers, and missing cells (with or without
    whitespace) come back as None.

    A uniquely named file is written into temp_dir using sep as the delimiter and is
    then passed to parser.
    """
    input_ = temp_dir / str(uuid.uuid4())
    lines = [
        f"Data type: some_type; Columns: 4; Version: 1{sep}{sep}{sep}\n",
        f"spec1{sep} spec2{sep}   spec3   {sep} spec4\n",  # test trimming
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val1 {sep} val2 {sep} 7 {sep} 3.2\n",  # test trimming
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
        f"val5 {sep}{sep}{sep} 42.42\n",  # test missing values w/o whitespace
        f"val6 {sep} {sep} {sep} 3.14\n",  # test missing values w/ whitespace
    ]
    with open(input_, "w") as handle:
        handle.write("".join(lines))

    actual = parser(input_)

    keys = ("spec1", "spec2", "spec3", "spec4")
    expected_values = (
        ("val1", "val2", 7, 3.2),
        ("val3", "val4", 1, 8.9),
        ("val5", None, None, 42.42),
        ("val6", None, None, 3.14),
    )
    expected_rows = tuple(
        frozendict(dict(zip(keys, values))) for values in expected_values
    )
    expected = ParseResults(frozendict({
        "some_type": ParseResult(SpecificationSource(input_), expected_rows),
    }))
    assert actual == expected
def _process_excel_tab(
    excel: pandas.ExcelFile, spcsrc: SpecificationSource
) -> tuple[O[str], O[ParseResult]]:
    """
    Parse a single tab of an Excel file.

    excel - the open Excel file.
    spcsrc - the source being parsed; spcsrc.tab names the sheet to read.

    Returns (None, None) when the sheet holds fewer than 3 rows below the row pandas
    consumes as the column header. Otherwise returns the data type taken from the
    header together with a ParseResult holding one mapping of parameter ID to value
    for each non-empty data row.

    No exceptions are caught here; anything raised by the helper functions propagates
    to the caller.
    """
    df = excel.parse(sheet_name=spcsrc.tab)
    if df.shape[0] < 3:
        # might as well not error check headers in sheets with no data
        return (None, None)
    # at this point we know that at least 4 lines are present - expecting the data type
    # header, parameter ID header, display name header, and at least one data row
    header = df.columns.get_level_values(0)[0]
    datatype, columns = _parse_header(header, spcsrc, _VERSION)

    rows = df.itertuples(index=False, name=None)
    id_row = _process_excel_row(next(rows), 2, columns, spcsrc)
    param_ids = _normalize_headers(id_row, 2, spcsrc)
    # The display name row is run through the same row processing but its content
    # is not used in the result.
    _process_excel_row(next(rows), 3, columns, spcsrc)

    results = []
    for line_number, raw_row in enumerate(rows, start=4):
        row = _process_excel_row(raw_row, line_number, columns, spcsrc)
        if all(pandas.isna(cell) for cell in row):
            continue  # skip empty rows
        results.append(frozendict({
            param_ids[j]: _normalize_pandas(cell) for j, cell in enumerate(row)
        }))
    return datatype, ParseResult(spcsrc, tuple(results))
def _parse(
    paths: tuple[Path, ...],
    file_type_resolver: Callable[[Path], FileTypeResolution],
) -> ParseResults:
    """
    Parse every path with the parser chosen by file_type_resolver and merge the
    per-file results into one ParseResults.

    paths - the files to parse.
    file_type_resolver - maps a path to a resolution that carries either an
        unsupported type name or a parser for the file.

    If any file has an unsupported type, fails to parse, or repeats a data type
    already seen in an earlier file, only the collected errors are returned.
    """
    merged = {}
    problems = []
    for path in paths:
        resolution = file_type_resolver(path)
        if resolution.unsupported_type:
            problems.append(Error(
                ErrorType.PARSE_FAIL,
                f"{resolution.unsupported_type} is not a supported file type "
                "for import specifications",
                SpecificationSource(path),
            ))
            continue

        outcome = resolution.parser(path)
        if outcome.errors:
            problems.extend(outcome.errors)
            continue

        incoming = outcome.results
        for data_type in incoming:
            if data_type not in merged:
                merged[data_type] = incoming[data_type]
                continue
            # The same data type was already supplied by an earlier source.
            problems.append(Error(
                ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
                f"Data type {data_type} appears in two importer specification sources",
                merged[data_type].source,
                incoming[data_type].source,
            ))

    if problems:
        return ParseResults(errors=tuple(problems))
    return ParseResults(frozendict(merged))
def _parse_xsv(path: Path, sep: str) -> ParseResults:
    """
    Parse a delimiter separated text file into a ParseResults.

    path - the file to parse.
    sep - the column delimiter handed to the csv reader.

    The first three lines are treated as the data type / version header, the
    parameter ID header, and a third header line whose content is not used. Every
    following non-empty line is a data row and must have the column count stated in
    the header. Failures (missing file, directory, non-text content, parse problems)
    are reported as errors in the returned ParseResults rather than raised.
    """
    source = SpecificationSource(path)
    try:
        mime = magic.from_file(str(path), mime=True)
        if mime not in _MAGIC_TEXT_FILES:
            return _error(
                Error(ErrorType.PARSE_FAIL, "Not a text file: " + mime, source))
        with open(path, newline='') as handle:
            reader = csv.reader(handle, delimiter=sep)  # let parser handle quoting
            type_header = _csv_next(
                reader, 1, None, source, "Missing data type / version header")
            datatype, columns = _parse_header(type_header[0], source, _VERSION)
            id_header = _csv_next(reader, 2, columns, source, "Missing 2nd header line")
            param_ids = _normalize_headers(id_header, 2, source)
            _csv_next(reader, 3, columns, source, "Missing 3rd header line")

            records = []
            for line_number, row in enumerate(reader, start=4):
                if not row:
                    continue  # skip empty rows
                found = len(row)
                if found != columns:
                    # could collect errors (first 10?) and throw an exception with a list
                    # lets wait and see if that's really needed
                    raise _ParseException(Error(
                        ErrorType.INCORRECT_COLUMN_COUNT,
                        f"Incorrect number of items in line {line_number}, "
                        f"expected {columns}, got {found}",
                        source))
                records.append(frozendict({
                    param_ids[j]: _normalize_xsv(cell) for j, cell in enumerate(row)
                }))
            if not records:
                raise _ParseException(
                    Error(ErrorType.PARSE_FAIL, "No non-header data in file", source))
            return ParseResults(
                frozendict({datatype: ParseResult(source, tuple(records))}))
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=source))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory", source))
    except _ParseException as err:
        # The exception's first argument is the Error describing the failure.
        return _error(err.args[0])
def test_excel_parse_success():
    """
    Parse the xls and xlsx variants of a workbook that contains
    * 3 tabs with data, covering
      * numeric headers
      * empty cells
      * empty rows
      * whitespace only cells
    * 2 tabs with no data
      * 1 tab with a single row, which should be ignored
      * 1 tab with two rows, which should be ignored
    * one completely empty tab
    and check that only the three data tabs appear in the result.
    """
    for extension in ("xls", "xlsx"):
        workbook = _get_test_file("testtabs3full2nodata1empty." + extension)
        actual = parse_excel(workbook)

        tab1_rows = tuple(
            frozendict({"header1": h1, "header2": h2, "header3": h3})
            for h1, h2, h3 in (
                ("foo", 1, 6.7),
                ("bar", 2, 8.9),
                ("baz", None, 3.4),
                ("bat", 4, None),
            )
        )
        tab2_rows = (frozendict({"h1": "golly gee", "2": 42, "h3": "super"}),)
        tab3_rows = (frozendict({"head1": "some data", "head2": 1}),)

        expected = ParseResults(frozendict({
            "type1": ParseResult(SpecificationSource(workbook, "tab1"), tab1_rows),
            "type2": ParseResult(SpecificationSource(workbook, "tab2"), tab2_rows),
            "type3": ParseResult(SpecificationSource(workbook, "tab3"), tab3_rows),
        }))
        assert actual == expected
def parse_excel(path: Path) -> ParseResults:
    """
    Parse the provided Excel file. xls and xlsx files are supported.

    Each tab is parsed independently. Tabs that yield no data type are skipped, and
    a data type found in more than one tab is reported as an error against both
    tabs. If any error was collected only the errors are returned; a file with no
    data in any tab is reported as a parse failure.
    """
    file_source = SpecificationSource(path)
    errors = []
    results = {}
    first_tab_for = {}  # data type -> name of the tab it was first found in
    try:
        with pandas.ExcelFile(path) as workbook:
            for tab in workbook.sheet_names:
                tab_source = SpecificationSource(path, tab)
                try:
                    datatype, result = _process_excel_tab(workbook, tab_source)
                    if not datatype:
                        continue
                    if datatype not in results:
                        first_tab_for[datatype] = tab
                        results[datatype] = result
                        continue
                    errors.append(Error(
                        ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
                        f"Found datatype {datatype} in multiple tabs",
                        SpecificationSource(path, first_tab_for[datatype]),
                        tab_source,
                    ))
                except _ParseException as err:
                    # Record the failure and keep going with the remaining tabs.
                    errors.append(err.args[0])
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=file_source))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory", file_source))
    except ValueError as e:
        if "Excel file format cannot be determined" in str(e):
            return _error(Error(
                ErrorType.PARSE_FAIL,
                "Not a supported Excel file type",
                source_1=file_source))
        raise e  # bail out, not sure what's wrong, not sure how to test either

    if errors:
        return ParseResults(errors=tuple(errors))
    if results:
        return ParseResults(frozendict(results))
    return _error(
        Error(ErrorType.PARSE_FAIL, "No non-header data in file", file_source))
def _xsv_parse_success_with_internal_and_trailing_empty_lines(
    temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]
):
    """
    Check that empty lines between data rows and at the end of an xSV file are
    ignored, leaving only the two populated data rows in the result.

    A uniquely named file is written into temp_dir using sep as the delimiter and is
    then passed to parser.
    """
    input_ = temp_dir / str(uuid.uuid4())
    lines = [
        "Data type: other_type; Columns: 4; Version: 1\n",
        f"spec1{sep} spec2{sep} spec3{sep} spec4\n",
        f"Spec 1{sep} Spec 2{sep} Spec 3{sep} Spec 4\n",
        f"val3 {sep} val4{sep} 1{sep} 8.9\n",
        "\n",
        f"val1 {sep} val2{sep} 7 {sep} 3.2\n",
        "\n",
        "\n",
        "\n",
    ]
    with open(input_, "w") as handle:
        handle.write("".join(lines))

    actual = parser(input_)

    expected_rows = (
        frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}),
        frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}),
    )
    expected = ParseResults(frozendict({
        "other_type": ParseResult(SpecificationSource(input_), expected_rows),
    }))
    assert actual == expected