Example #1
0
def test_parse_import_specification_resolver_exception():
    """
    Test the "oh shit" scenario: a completely unexpected error escapes to the
    top level catch block and we bail out of parsing entirely.
    """
    resolver, logger, parser1 = _get_mocks(3)

    # first file resolves normally, second blows up with an arbitrary error
    resolver.side_effect = [_ftr(parser1), ArithmeticError("crapsticks")]

    # errors from parsers that did run must not leak into the result
    parser1.return_value = ParseResults(errors=(Error(ErrorType.OTHER, 'foo'),))

    res = parse_import_specifications(
        (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger)

    assert res == ParseResults(errors=(Error(ErrorType.OTHER, "crapsticks"),))

    resolver.assert_has_calls(
        [call(Path("myfile.xlsx")), call(Path("somefile.csv"))])
    parser1.assert_called_once_with(Path("myfile.xlsx"))
    # ArithmeticError("a") != ArithmeticError("a"), so assert_called_once_with
    # can't compare exceptions; check the logged exception manually instead
    assert_exception_correct(logger.call_args[0][0],
                             ArithmeticError("crapsticks"))
Example #2
0
def parse_excel(path: Path) -> ParseResults:
    """
    Parse the provided Excel file.
    xls and xlsx files are supported.

    :param path: the path of the Excel file to parse.
    :returns: a ParseResults containing one parse result per data type found
        in the file's tabs, or the errors encountered while parsing.
    """
    spcsrc = SpecificationSource(path)
    errors = []
    try:
        with pandas.ExcelFile(path) as ex:
            results = {}
            # maps a data type to the first tab it was found in, so a
            # duplicate specification can report both locations
            datatype_to_tab = {}
            for tab in ex.sheet_names:
                spcsrc_tab = SpecificationSource(path, tab)
                try:
                    datatype, result = _process_excel_tab(ex, spcsrc_tab)
                    if not datatype:
                        continue  # tab contained no specification; skip it
                    elif datatype in results:
                        errors.append(
                            Error(
                                ErrorType.
                                MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
                                f"Found datatype {datatype} in multiple tabs",
                                SpecificationSource(path,
                                                    datatype_to_tab[datatype]),
                                spcsrc_tab,
                            ))
                    else:
                        datatype_to_tab[datatype] = tab
                        results[datatype] = result
                except _ParseException as e:
                    # collect per-tab errors so one bad tab doesn't abort
                    # parsing of the remaining tabs
                    errors.append(e.args[0])
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory",
                  spcsrc))
    except ValueError as e:
        if "Excel file format cannot be determined" in str(e):
            return _error(
                Error(ErrorType.PARSE_FAIL,
                      "Not a supported Excel file type",
                      source_1=spcsrc))
        # bail out, not sure what's wrong, not sure how to test either.
        # bare `raise` preserves the original traceback rather than
        # appending this frame again as `raise e` would
        raise
    if errors:
        return ParseResults(errors=tuple(errors))
    elif results:
        return ParseResults(frozendict(results))
    else:
        return _error(
            Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc))
Example #3
0
def test_Error_init_w_FILE_NOT_FOUND_success():
    # minimal construction
    err = Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo"))

    assert err.error == ErrorType.FILE_NOT_FOUND
    assert err.source_1 == spcsrc("foo")
    assert err.message is None
    assert err.source_2 is None

    # fully specified construction
    err = Error(ErrorType.FILE_NOT_FOUND, message="bar", source_1=spcsrc("foo"))

    assert err.error == ErrorType.FILE_NOT_FOUND
    assert err.source_1 == spcsrc("foo")
    assert err.message == "bar"
    assert err.source_2 is None
Example #4
0
def test_Error_init_w_PARSE_FAIL_success():
    err = Error(ErrorType.PARSE_FAIL, message="foo", source_1=spcsrc("foo2"))

    assert err.error == ErrorType.PARSE_FAIL
    assert err.source_1 == spcsrc("foo2")
    assert err.message == "foo"
    assert err.source_2 is None
def test_format_import_spec_errors_one_error():
    # a single OTHER error with no source file and an empty path mapping
    formatted = format_import_spec_errors([Error(ErrorType.OTHER, "foobar")], {})

    assert formatted == [{
        "type": "unexpected_error",
        "message": "foobar",
        "file": None,
    }]
Example #6
0
def test_Error_init_w_OTHER_success():
    # minimal construction
    err = Error(ErrorType.OTHER, message="foo")

    assert err.error == ErrorType.OTHER
    assert err.message == "foo"
    assert err.source_1 is None
    assert err.source_2 is None

    # fully specified construction
    err = Error(ErrorType.OTHER, message="foo", source_1=spcsrc("wooo"))

    assert err.error == ErrorType.OTHER
    assert err.message == "foo"
    assert err.source_1 == spcsrc("wooo")
    assert err.source_2 is None
Example #7
0
def test_Error_init_w_NO_FILES_PROVIDED_success():
    # this error type takes no message or sources
    err = Error(ErrorType.NO_FILES_PROVIDED)

    assert err.error == ErrorType.NO_FILES_PROVIDED
    assert err.message is None
    assert err.source_1 is None
    assert err.source_2 is None
Example #8
0
def _parse_xsv(path: Path, sep: str) -> ParseResults:
    """
    Parse a single character-separated-value (e.g. CSV / TSV) import
    specification file.

    :param path: the file to parse.
    :param sep: the value separator character, e.g. a comma or a tab.
    :returns: a ParseResults containing the parsed rows for the single data
        type in the file, or the errors encountered while parsing.
    """
    spcsrc = SpecificationSource(path)
    try:
        # reject non-text files up front; the csv module would happily
        # produce garbage from binary input
        filetype = magic.from_file(str(path), mime=True)
        if filetype not in _MAGIC_TEXT_FILES:
            return _error(
                Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype,
                      spcsrc))
        with open(path, newline='') as input_:
            rdr = csv.reader(input_,
                             delimiter=sep)  # let parser handle quoting
            # header line 1: data type & schema version
            dthd = _csv_next(rdr, 1, None, spcsrc,
                             "Missing data type / version header")
            datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION)
            # header line 2: one parameter ID per column
            hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line")
            param_ids = _normalize_headers(hd1, 2, spcsrc)
            # header line 3 is only checked for presence / column count;
            # its contents are discarded here
            _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line")
            results = []
            for i, row in enumerate(rdr, start=4):
                if row:  # skip empty rows
                    if len(row) != columns:
                        # could collect errors (first 10?) and throw an exception with a list
                        # lets wait and see if that's really needed
                        raise _ParseException(
                            Error(
                                ErrorType.INCORRECT_COLUMN_COUNT,
                                f"Incorrect number of items in line {i}, " +
                                f"expected {columns}, got {len(row)}", spcsrc))
                    # map each parameter ID to its normalized cell value
                    results.append(
                        frozendict({
                            param_ids[j]: _normalize_xsv(row[j])
                            for j in range(len(row))
                        }))
        if not results:
            raise _ParseException(
                Error(ErrorType.PARSE_FAIL, "No non-header data in file",
                      spcsrc))
        return ParseResults(
            frozendict({datatype: ParseResult(spcsrc, tuple(results))}))
    except FileNotFoundError:
        return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc))
    except IsADirectoryError:
        return _error(
            Error(ErrorType.PARSE_FAIL, "The given path is a directory",
                  spcsrc))
    except _ParseException as e:
        # the Error is always the first argument of a _ParseException
        return _error(e.args[0])
Example #9
0
def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success():
    err = Error(ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo",
                spcsrc("foo2"), spcsrc("yay"))

    assert err.error == ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE
    assert err.message == "foo"
    assert err.source_1 == spcsrc("foo2")
    assert err.source_2 == spcsrc("yay")
Example #10
0
def _parse_header(header: str, spec_source: SpecificationSource,
                  maximum_version: int) -> tuple[str, int]:
    """
    Parse a data type / version header line.

    Returns a (data type, column count) tuple. Raises a _ParseException
    wrapping a PARSE_FAIL error if the header is malformed or declares a
    schema version larger than maximum_version.
    """
    m = _HEADER_REGEX.fullmatch(header)
    if not m:
        raise _ParseException(
            Error(
                ErrorType.PARSE_FAIL,
                f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"',
                spec_source))
    version = int(m[3])
    if version > maximum_version:
        raise _ParseException(
            Error(
                ErrorType.PARSE_FAIL,
                f"Schema version {version} is larger than maximum processable "
                + f"version {maximum_version}", spec_source))
    return m[1], int(m[2])
Example #11
0
def test_Error_init_w_INCORRECT_COLUMN_COUNT_success():
    err = Error(ErrorType.INCORRECT_COLUMN_COUNT,
                message="42",
                source_1=spcsrc("somefile"))

    assert err.error == ErrorType.INCORRECT_COLUMN_COUNT
    assert err.source_1 == spcsrc("somefile")
    assert err.message == "42"
    assert err.source_2 is None
Example #12
0
def _csv_next(
        input_: Any,  # Any really means a csv reader object
        line_number: int,
        expected_line_count: Union[None, int],  # None = skip columns check
        spec_source: SpecificationSource,
        error: str) -> list[str]:
    """
    Pull the next line from a csv reader, raising a _ParseException wrapping
    a PARSE_FAIL error (with the given error message) if the reader is
    exhausted, or an INCORRECT_COLUMN_COUNT error if a column check was
    requested and the line has the wrong number of entries.
    """
    try:
        line = next(input_)
    except StopIteration:
        raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source))
    # guard clause: accept the line when no check is requested or it matches
    if not expected_line_count or len(line) == expected_line_count:
        return line
    raise _ParseException(
        Error(
            ErrorType.INCORRECT_COLUMN_COUNT,
            f"Incorrect number of items in line {line_number}, " +
            f"expected {expected_line_count}, got {len(line)}",
            spec_source))
Example #13
0
def _normalize_headers(headers: list[Any], line_number: int,
                       spec_source: SpecificationSource) -> list[str]:
    """
    Strip whitespace from a row of header values, raising a _ParseException
    wrapping a PARSE_FAIL error on any missing (NaN / empty) or duplicate
    header entry.
    """
    normalized = [None if pandas.isna(s) else str(s).strip() for s in headers]
    seen = set()
    for pos, name in enumerate(normalized, start=1):
        if not name:
            raise _ParseException(
                Error(
                    ErrorType.PARSE_FAIL,
                    f"Missing header entry in row {line_number}, position {pos}",
                    spec_source))
        if name in seen:
            raise _ParseException(
                Error(ErrorType.PARSE_FAIL,
                      f"Duplicate header name in row {line_number}: {name}",
                      spec_source))
        seen.add(name)
    return normalized
Example #14
0
def test_parse_import_specification_unsupported_type_and_parser_error():
    """
    This test really tests 4 things:
    1. a parser returning an error and that error showing up in the final results
    2. an invalid file type being submitted and having an error show up in the final results
    3. results from a parser being ignored if an error is produced
    4. errors from multiple sources being integrated into the final results
    It's not possible to split the test up further and still test #4
    """
    resolver, logger, parser1, parser2 = _get_mocks(4)

    resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(notype="JPEG")]

    # parser1 produces errors; parser2's good results must be dropped
    # because errors are present elsewhere
    parser1.return_value = ParseResults(errors=(
        Error(ErrorType.OTHER, 'foo'),
        Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")),
    ))
    parser2.return_value = ParseResults(frozendict(
        {"foo": ParseResult(spcsrc("a"), (frozendict({"a": "b"}),))}))

    res = parse_import_specifications(
        (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.jpeg")), resolver,
        logger)

    assert res == ParseResults(errors=(
        Error(ErrorType.OTHER, "foo"),
        Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")),
        Error(ErrorType.PARSE_FAIL,
              "JPEG is not a supported file type for import specifications",
              spcsrc(Path("x.jpeg"))),
    ))

    resolver.assert_has_calls([
        call(Path("myfile.xlsx")),
        call(Path("somefile.csv")),
        call(Path("x.jpeg")),
    ])
    parser1.assert_called_once_with(Path("myfile.xlsx"))
    parser2.assert_called_once_with(Path("somefile.csv"))
    logger.assert_not_called()
Example #15
0
def _process_excel_row(row: tuple[Any, ...], rownum: int, expected_columns: int,
                       spcsrc: SpecificationSource) -> tuple[Any, ...]:
    """
    Trim trailing NaN (empty) cells from an excel row.

    :param row: the row of cell values. (Annotation fixed: `tuple[Any]` means
        a 1-tuple, and slicing a tuple yields a tuple, not a list.)
    :param rownum: the row number, used in error messages.
    :param expected_columns: the number of columns the row should have.
    :param spcsrc: the source of the row, used in error messages.
    :returns: the row with trailing empty cells removed.
    :raises _ParseException: if the row has more than expected_columns cells
        after trimming, i.e. a trailing surplus cell is non-empty.
    """
    while len(row) > expected_columns:
        if pandas.isna(row[-1]):  # inefficient, but premature optimization...
            row = row[:-1]
        else:
            raise _ParseException(
                Error(
                    ErrorType.INCORRECT_COLUMN_COUNT,
                    f"Incorrect number of items in line {rownum}, " +
                    f"expected {expected_columns}, got {len(row)}", spcsrc))
    return row
def test_format_import_spec_errors_all_the_errors_with_tabs():
    errs = [
        Error(ErrorType.PARSE_FAIL, "foobar1", _ss("file1", "tab1")),
        Error(ErrorType.INCORRECT_COLUMN_COUNT, "foobar2",
              _ss("file2", "tab2")),
        Error(ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar3",
              _ss("file3", "tab3"), _ss("file4", "tab4")),
    ]
    # map each input path fileN to the display path fN expected in the output
    path_map = {Path(f"file{i}"): Path(f"f{i}") for i in range(1, 5)}

    expected = [
        {
            "type": "cannot_parse_file",
            "message": "foobar1",
            "file": "f1",
            "tab": "tab1",
        },
        {
            "type": "incorrect_column_count",
            "message": "foobar2",
            "file": "f2",
            "tab": "tab2",
        },
        {
            "type": "multiple_specifications_for_data_type",
            "message": "foobar3",
            "file_1": "f3",
            "tab_1": "tab3",
            "file_2": "f4",
            "tab_2": "tab4",
        },
    ]
    assert format_import_spec_errors(errs, path_map) == expected
def test_format_import_spec_errors_all_the_errors_no_tabs():
    errs = [
        Error(ErrorType.OTHER, "foobar1", _ss("file1")),
        Error(ErrorType.PARSE_FAIL, "foobar2", _ss("file2")),
        Error(ErrorType.INCORRECT_COLUMN_COUNT, "foobar3", _ss("file3")),
        Error(ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar4",
              _ss("file4"), _ss("file5")),
        Error(ErrorType.NO_FILES_PROVIDED),
        Error(ErrorType.FILE_NOT_FOUND, source_1=_ss("file6")),
    ]
    # map each input path fileN to the display path fN expected in the output
    path_map = {Path(f"file{i}"): Path(f"f{i}") for i in range(1, 7)}

    expected = [
        {
            "type": "unexpected_error",
            "message": "foobar1",
            "file": "f1",
        },
        {
            "type": "cannot_parse_file",
            "message": "foobar2",
            "file": "f2",
            "tab": None,
        },
        {
            "type": "incorrect_column_count",
            "message": "foobar3",
            "file": "f3",
            "tab": None,
        },
        {
            "type": "multiple_specifications_for_data_type",
            "message": "foobar4",
            "file_1": "f4",
            "tab_1": None,
            "file_2": "f5",
            "tab_2": None,
        },
        {
            "type": "no_files_provided",
        },
        {
            "type": "cannot_find_file",
            "file": "f6",
        },
    ]
    assert format_import_spec_errors(errs, path_map) == expected
Example #18
0
def test_FileTypeResolution_init_w_parser_success():
    def parser(path):
        return ParseResults(errors=(Error(ErrorType.OTHER, "foo"),))

    ftr = FileTypeResolution(parser)

    assert ftr.parser is parser  # Here only identity equality makes sense
    assert ftr.unsupported_type is None
Example #19
0
def error_init_fail(errortype: O[ErrorType], message: O[str],
                    source_1: O[SpecificationSource],
                    source_2: O[SpecificationSource], expected: Exception):
    """
    Helper: assert that constructing an Error with the given arguments raises
    an exception matching `expected`.
    """
    with raises(Exception) as excinfo:
        Error(errortype, message, source_1, source_2)
    assert_exception_correct(excinfo.value, expected)
Example #20
0
def test_parse_import_specifications_fail_no_paths():
    # with no files at all the resolver and logger are never consulted
    results = parse_import_specifications((), lambda p: None, lambda e: None)

    assert results == ParseResults(
        errors=(Error(ErrorType.NO_FILES_PROVIDED),))
Example #21
0
def test_parse_import_specification_multiple_specs_and_parser_error():
    """
    This test really tests 4 things:
    1. a parser returning an error and that error showing up in the final results
    2. two specifications for the same data type being submitted and having an error show up
       in the final results
    3. results from a parser being ignored if an error is produced
    4. errors from multiple sources being integrated into the final results
    It's not possible to split the test up further and still test #4
    """
    resolver, logger, parser1, parser2, parser3 = _get_mocks(5)

    resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(parser3)]

    # the actual parsed row contents are irrelevant to this test
    row = (frozendict({"a": "b"}),)

    # check that other errors are also returned, and the results are ignored
    parser1.return_value = ParseResults(errors=(
        Error(ErrorType.OTHER, "other"),
        Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")),
    ))
    parser2.return_value = ParseResults(frozendict({
        "foo": ParseResult(spcsrc("a1"), row),
        "bar": ParseResult(spcsrc("b1"), row),
        "baz": ParseResult(spcsrc("c1"), row),
    }))
    parser3.return_value = ParseResults(frozendict({
        "foo2": ParseResult(spcsrc("a2"), row),
        "bar": ParseResult(spcsrc("b2"), row),
        "baz": ParseResult(spcsrc("c2"), row),
    }))

    res = parse_import_specifications(
        (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.tsv")), resolver,
        logger)

    assert res == ParseResults(errors=(
        Error(ErrorType.OTHER, "other"),
        Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")),
        Error(ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
              "Data type bar appears in two importer specification sources",
              spcsrc("b1"), spcsrc("b2")),
        Error(ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE,
              "Data type baz appears in two importer specification sources",
              spcsrc("c1"), spcsrc("c2")),
    ))

    resolver.assert_has_calls([
        call(Path("myfile.xlsx")),
        call(Path("somefile.csv")),
        call(Path("x.tsv")),
    ])
    parser1.assert_called_once_with(Path("myfile.xlsx"))
    parser2.assert_called_once_with(Path("somefile.csv"))
    parser3.assert_called_once_with(Path("x.tsv"))
    logger.assert_not_called()
Example #22
0
def test_FileTypeResolution_init_fail():
    # NOTE(review): "Exectly" [sic] presumably mirrors the message raised by
    # FileTypeResolution — confirm before "fixing" the spelling here
    errmsg = "Exectly one of parser or unsupported_type must be supplied"
    parse_res = ParseResults(errors=(Error(ErrorType.OTHER, "foo"),))

    fileTypeResolution_init_fail(None, None, ValueError(errmsg))
    fileTypeResolution_init_fail(lambda path: parse_res, "mp-2",
                                 ValueError(errmsg))
Example #23
0
        ParseResult(source, result)
    assert_exception_correct(got.value, expected)


# Happy-path ParseResults `results` fixture: a single data type parsed from
# one tab of "some_file". Used by the ParseResults tests below.
PR_RESULTS = frozendict({
    "data_type":
    ParseResult(
        spcsrc("some_file", "tab"),
        (frozendict({
            "fasta_file": "foo.fa",
            "do_thing": 1
        }), )  # make a tuple!
    )
})

# Error-path fixture: a pair of Errors. Presumably consumed by ParseResults
# error tests not visible in this chunk — verify before removing.
PR_ERROR = (Error(ErrorType.OTHER, message="foo"),
            Error(ErrorType.PARSE_FAIL,
                  message="bar",
                  source_1=spcsrc("some_file", "tab3")))


def test_ParseResults_init_w_results_success():
    # copy the fixture so equality checks can't pass via object identity
    duplicate = frozendict(PR_RESULTS)

    parse_results = ParseResults(PR_RESULTS)

    assert parse_results.errors is None
    assert parse_results.results == duplicate
    assert parse_results == ParseResults(duplicate)