コード例 #1
0
def test_it_raises_a_value_error_for_a_file_name_that_does_not_exist_on_disk():
    """Parsing a path that is absent from disk must fail with ``ValueError``.

    The error text is expected to mention ``parse``.
    """
    nonexistent_path = os.path.join(DATA_DIR, "missing.csv")

    with pytest.raises(ValueError) as excinfo:
        # Exhaust the iterable in case the error only surfaces on iteration.
        list(fables.parse(io=nonexistent_path))

    assert "parse" in str(excinfo.value)
コード例 #2
0
ファイル: test_api.py プロジェクト: payscale/fables
def test_parse_accepts_a_stream_file_name():
    """A ``stream_file_name`` given alongside a stream names the first table."""
    expected_name = "fables.txt"
    unnamed_stream = io.BytesIO(b"a,b\n1,2\n3,4\n")

    results = list(
        fables.parse(io=unnamed_stream, stream_file_name=expected_name)
    )

    first_table = results[0].tables[0]
    assert first_table.name == expected_name
コード例 #3
0
def test_it_parses_nested_encrypted_files():
    """Parse password-protected files nested inside encrypted archives.

    Fixture layout and passwords:

    nested_encrypted.zip       - pw: feebles
        encrypted_xlsx_csv.zip - pw: foobles
            basic.csv
            encrypted.xlsx     - pw: fables
        encrypted.xls          - pw: fables
    """
    archive_path = os.path.join(DATA_DIR, "nested_encrypted.zip")
    # The outer archive is keyed by its full path, inner files by bare name.
    passwords = {
        archive_path: "feebles",
        "encrypted_xlsx_csv.zip": "foobles",
        "encrypted.xlsx": "fables",
        "encrypted.xls": "fables",
    }

    parse_results = list(fables.parse(io=archive_path, passwords=passwords))
    assert len(parse_results) == 3

    for leaf_name in ["basic.csv", "encrypted.xlsx", "encrypted.xls"]:
        matches = [pr for pr in parse_results if pr.name.endswith(leaf_name)]
        result = matches[0]

        assert len(result.errors) == 0
        assert len(result.tables) == 1

        table = result.tables[0]
        assert table.name.endswith(leaf_name)
        # Only the csv table is expected to carry no sheet name.
        if table.name.endswith("csv"):
            assert table.sheet is None
        else:
            assert table.sheet == "Sheet1"
        pd.testing.assert_frame_equal(table.df, AB_DF, check_dtype=False)
コード例 #4
0
def test_force_numeric(file_name, force_numeric, pandas_kwargs, expected_df):
    """Compare the single parsed table against ``expected_df``.

    ``force_numeric`` and ``pandas_kwargs`` are forwarded to ``fables.parse``
    unchanged. Fixture contents are either

    a,,
    ,,
    ,,
    x,y,z
    001,a,b
    002,a,b
    003,a,b

    or

    x,y,z
    001,a,b
    002,a,b
    003,a,b
    """
    fixture_path = os.path.join(DATA_DIR, file_name)
    results = list(
        fables.parse(
            fixture_path,
            force_numeric=force_numeric,
            pandas_kwargs=pandas_kwargs,
        )
    )
    assert len(results) == 1

    only_result = results[0]
    assert len(only_result.errors) == 0

    parsed_tables = only_result.tables
    assert len(parsed_tables) == 1

    pd.testing.assert_frame_equal(
        parsed_tables[0].df, expected_df, check_dtype=False
    )
コード例 #5
0
def test_it_parses_files_in_a_zip_in_a_zip():
    """Parse a zip that contains another zip plus a sibling xls file.

    Fixture layout:

    nested.zip
        basic.zip
            basic.csv
            basic.xlsx
        basic.xls
    """
    outer_zip = "nested.zip"
    inner_zip = "basic.zip"

    parse_results = list(fables.parse(io=os.path.join(DATA_DIR, outer_zip)))
    assert len(parse_results) == 3

    def first_result_named(name):
        # First parse result whose name equals `name`; IndexError if absent.
        return [pr for pr in parse_results if pr.name == name][0]

    xls_name = os.path.join(outer_zip, "basic.xls")
    # NOTE: paths inside the zipfile lib always use unix paths
    csv_name = os.path.join(inner_zip, "basic.csv")
    xlsx_name = os.path.join(inner_zip, "basic.xlsx")

    xls_result = first_result_named(xls_name)
    csv_result = first_result_named(csv_name)
    xlsx_result = first_result_named(xlsx_name)

    _validate_side_xls_file(xls_result, xls_name)
    _validate_basic_csv_and_basic_xlsx_together(
        [csv_result, xlsx_result],
        [csv_name, xlsx_name],
    )
コード例 #6
0
def test_it_parses_a_xls_with_many_sheets():
    """A two-sheet ``.xls`` yields one result holding one table per sheet."""
    workbook_path = os.path.join(DATA_DIR, "two_sheets.xls")

    results = list(fables.parse(io=workbook_path))
    assert len(results) == 1

    result = results[0]
    assert result.name == workbook_path

    sheet_tables = result.tables
    sheet_errors = result.errors
    assert len(sheet_tables) == 2
    assert len(sheet_errors) == 0

    first_sheet, second_sheet = sheet_tables
    # Both tables carry the workbook path as name and differ only by sheet.
    for table, sheet_name in ((first_sheet, "Sheet1"), (second_sheet, "Sheet2")):
        assert table.name == workbook_path
        assert table.sheet == sheet_name

    pd.testing.assert_frame_equal(first_sheet.df, AB_DF, check_dtype=False)

    second_sheet_expected = pd.DataFrame(
        columns=["c", "d"],
        data=[[5, 6], [7, 8]],
    )
    pd.testing.assert_frame_equal(
        second_sheet.df, second_sheet_expected, check_dtype=False
    )
コード例 #7
0
def test_it_parses_files_using_pandas_kwargs(file_name, pandas_kwargs):
    """Check that ``pandas_kwargs`` controls how ``N/A`` cells are parsed.

    Expected result when pandas kwarg keep_default_na is False:
    a,b,c
    1,2,N/A
    4,5,N/A

    Expected result when pandas kwarg keep_default_na is True
    (stated by the original author to be the default):
    a,b,c
    1,2,NaN
    4,5,NaN
    """
    fixture_path = os.path.join(DATA_DIR, file_name)
    results = list(fables.parse(fixture_path, pandas_kwargs=pandas_kwargs))
    assert len(results) == 1

    only_result = results[0]
    assert len(only_result.errors) == 0

    parsed_tables = only_result.tables
    assert len(parsed_tables) == 1
    actual_df = parsed_tables[0].df

    # Only the value expected in column "c" differs between the two cases.
    if pandas_kwargs == {}:
        expected_c = np.nan
    else:
        assert pandas_kwargs == {"keep_default_na": False}
        expected_c = "N/A"

    expected_df = pd.DataFrame(
        columns=["a", "b", "c"],
        data=[[1, 2, expected_c], [4, 5, expected_c]],
    )
    pd.testing.assert_frame_equal(actual_df, expected_df, check_dtype=False)
コード例 #8
0
ファイル: test_api.py プロジェクト: thomasjohns/fables
def test_parse_raises_value_error_when_no_io_or_tree_is_given():
    """Calling parse with neither ``io`` nor ``tree`` raises ``ValueError``.

    The message is expected to mention ``parse``, ``io`` and ``tree``.
    """
    with pytest.raises(ValueError) as excinfo:
        list(fables.parse(io=None, tree=None))

    message = str(excinfo.value)
    for fragment in ("parse", "io", "tree"):
        assert fragment in message
コード例 #9
0
ファイル: test_api.py プロジェクト: thomasjohns/fables
def test_node_stream_property_returns_at_byte_0_after_parse():
    """After parsing a detected node, its stream position is back at byte 0.

    Checked both on the private ``_stream`` attribute and on the object
    yielded by the ``stream`` context manager.
    """
    source = io.BytesIO(b"a,b\n1,2\n3,4\n")
    node = fables.detect(source)

    # Consume every parse result; the results themselves are not inspected.
    for _unused in fables.parse(tree=node):
        pass

    assert node._stream.tell() == 0
    with node.stream as reopened:
        assert reopened.tell() == 0
コード例 #10
0
def test_it_raises_a_type_error_for_stream_not_read_in_bytes_mode():
    """A stream opened in text mode is rejected with ``TypeError``.

    The message is expected to mention ``parse`` and ``io.BufferedIOBase``.
    """
    csv_path = os.path.join(DATA_DIR, "basic.csv")

    # Text mode ("r") is deliberate: it is the condition under test.
    with open(csv_path, "r") as text_stream, pytest.raises(TypeError) as excinfo:
        for _unused in fables.parse(io=text_stream):
            pass

    message = str(excinfo.value)
    assert "parse" in message
    assert "io.BufferedIOBase" in message
コード例 #11
0
def _it_parses_flat_files_in_a_basic_zip(zip_file, zip_path):
    """Parse the zip at ``zip_path`` and validate its csv and xlsx children.

    ``zip_file`` is the name used as the prefix of the expected child names;
    ``zip_path`` is what is handed to ``fables.parse``.
    """
    # NOTE: The name fields on files inside a zip files do not
    #       retain the full path.
    expected_names = [
        os.path.join(zip_file, "basic.csv"),
        os.path.join(zip_file, "basic.xlsx"),
    ]

    results = list(fables.parse(io=zip_path))
    _validate_basic_csv_and_basic_xlsx_together(results, expected_names)
コード例 #12
0
def test_it_parses_all_files_in_a_directory():
    """Passing a directory path parses the files it contains.

    Fixture layout:

    sub_dir/
        basic.csv
        basic.xlsx
    """
    directory = os.path.join(DATA_DIR, "sub_dir")
    expected_names = [
        os.path.join(directory, leaf) for leaf in ("basic.csv", "basic.xlsx")
    ]

    results = list(fables.parse(io=directory))
    _validate_basic_csv_and_basic_xlsx_together(results, expected_names)
コード例 #13
0
def test_stream_parse_is_the_same_as_disk_parse():
    """Parsing an open binary file handle matches parsing the same path."""
    csv_path = os.path.join(DATA_DIR, "basic.csv")

    with open(csv_path, "rb") as binary_handle:
        from_stream = list(fables.parse(io=binary_handle))
    from_disk = list(fables.parse(io=csv_path))

    # Same number of results, and same number of tables in the first result.
    assert len(from_stream) == len(from_disk)
    assert len(from_stream[0].tables) == len(from_disk[0].tables)

    stream_result, disk_result = from_stream[0], from_disk[0]
    assert stream_result.name == csv_path
    assert disk_result.name == csv_path

    stream_table, disk_table = stream_result.tables[0], disk_result.tables[0]
    assert stream_table.name == disk_table.name
    pd.testing.assert_frame_equal(
        stream_table.df, disk_table.df, check_dtype=False
    )
コード例 #14
0
def test_it_creates_a_parse_error_for_malformed_csv():
    """A malformed csv yields no tables and exactly one recorded parse error.

    The error is expected to carry the pandas tokenizer message, the
    ``pd.errors.ParserError`` type, and the file path as its name.
    """
    csv_path = os.path.join(DATA_DIR, "malformed.csv")
    expected_message = (
        "Error tokenizing data. C error: Expected 2 fields in line 3, saw 3\n"
    )

    results = list(fables.parse(io=csv_path))
    assert len(results) == 1

    result = results[0]
    assert len(result.tables) == 0
    assert len(result.errors) == 1

    failure = result.errors[0]
    assert failure.message == expected_message
    assert failure.exception_type is pd.errors.ParserError
    assert failure.name == csv_path
コード例 #15
0
def test_it_parses_files_in_an_encrypted_zip_with_password():
    """Parse an encrypted zip when its password is supplied by file name.

    Fixture layout:

    encrypted.zip
        basic.csv
        basic.xlsx
    """
    archive_name = "encrypted.zip"
    archive_path = os.path.join(DATA_DIR, archive_name)
    expected_names = [
        os.path.join(archive_name, "basic.csv"),
        os.path.join(archive_name, "basic.xlsx"),
    ]

    # The password is keyed by the bare archive name, not the full path.
    results = list(
        fables.parse(io=archive_path, passwords={archive_name: "fables"})
    )
    _validate_basic_csv_and_basic_xlsx_together(results, expected_names)
コード例 #16
0
def _it_parses_a_csv(csv_name, expected_df):
    """Parse ``csv_name`` and check it gives one error-free table.

    The single table must be named ``csv_name``, have no sheet, and hold a
    frame equal to ``expected_df`` (dtypes are not compared).
    """
    results = list(fables.parse(io=csv_name))
    assert len(results) == 1

    result = results[0]
    assert result.name == csv_name
    assert len(result.errors) == 0
    assert len(result.tables) == 1

    only_table = result.tables[0]
    assert only_table.name == csv_name
    assert only_table.sheet is None

    pd.testing.assert_frame_equal(
        only_table.df, expected_df, check_dtype=False
    )

    # Re-checked after the frame comparison, as in the original helper.
    assert not results[0].errors
コード例 #17
0
def test_it_finds_no_tables_in_an_invalid_csv_plain_text_file():
    """A text file that is not valid csv gives one parser error and no tables.

    Invalid plain text csv content
    ---------------
    header
    there are two hard things in computer science
    off by one errors, naming things, and cache invalidation
    """
    text_path = os.path.join(DATA_DIR, "invalid_plain_text.txt")

    results = list(fables.parse(io=text_path))
    assert len(results) == 1

    result = results[0]
    assert len(result.tables) == 0
    assert len(result.errors) == 1

    failure = result.errors[0]
    assert failure.name == text_path
    assert failure.exception_type is pd.errors.ParserError
    assert "Error tokenizing data." in failure.message
コード例 #18
0
def test_it_finds_tables_in_a_valid_csv_plain_text_file():
    """A text file that reads as one-column csv is parsed into one table.

    Valid plain text csv content
    ---------------
    some text
    some other text
    """
    text_path = os.path.join(DATA_DIR, "valid_plain_text.txt")

    results = list(fables.parse(io=text_path))
    assert len(results) == 1

    result = results[0]
    assert len(result.tables) == 1
    assert len(result.errors) == 0

    only_table = result.tables[0]
    assert only_table.name == text_path
    assert only_table.sheet is None

    # First line becomes the header, second line the only data row.
    expected_df = pd.DataFrame(
        columns=["some text"],
        data=[["some other text"]],
    )
    pd.testing.assert_frame_equal(
        only_table.df, expected_df, check_dtype=False
    )
コード例 #19
0
def _it_parses_an_excel_file_with_one_sheet(excel_name,
                                            expected_df,
                                            passwords=None):
    """Parse ``excel_name`` and check it gives one error-free "Sheet1" table.

    Args:
        excel_name: Path handed to ``fables.parse``; also the expected name
            of both the parse result and its table.
        expected_df: Frame the parsed table must equal (dtypes are not
            compared).
        passwords: Optional password mapping forwarded to ``fables.parse``.
            When omitted, an empty dict is passed, as before.
    """
    # A `{}` default would be one dict object shared by every call; create a
    # fresh one per call so nothing can leak between tests.
    if passwords is None:
        passwords = {}

    parse_results = list(fables.parse(io=excel_name, passwords=passwords))

    assert len(parse_results) == 1

    parse_result = parse_results[0]
    assert parse_result.name == excel_name
    tables = parse_result.tables
    errors = parse_result.errors
    assert len(tables) == 1
    assert len(errors) == 0

    table = tables[0]
    assert table.name == excel_name
    assert table.sheet == "Sheet1"

    pd.testing.assert_frame_equal(table.df, expected_df, check_dtype=False)
コード例 #20
0
def test_it_parses_a_xlsx_with_only_one_cell_filled():
    """An xlsx with a single filled cell parses to a header-only table.

    The expected frame has one column, ``a``, and no rows.
    """
    workbook_path = os.path.join(DATA_DIR, "only_one_cell_filled.xlsx")

    results = list(fables.parse(io=workbook_path))
    assert len(results) == 1

    result = results[0]
    assert result.name == workbook_path

    parsed_tables = result.tables
    parsed_errors = result.errors
    assert len(parsed_tables) == 1
    assert len(parsed_errors) == 0

    only_table = parsed_tables[0]
    assert only_table.sheet == "Sheet1"
    assert only_table.name == workbook_path

    empty_with_header = pd.DataFrame(columns=["a"], data=[])
    pd.testing.assert_frame_equal(
        only_table.df, empty_with_header, check_dtype=False
    )
コード例 #21
0
def test_it_creates_a_parse_error_for_no_valid_headers(file_name):
    """The fixture ``file_name`` yields no tables and one ``ValueError`` error.

    The recorded error must be named after the file path and mention
    "Error during pre-header row removal".
    """
    target_path = os.path.join(DATA_DIR, file_name)

    results = list(fables.parse(io=target_path))
    assert len(results) == 1

    result = results[0]
    assert result.name == target_path

    parsed_tables = result.tables
    assert len(parsed_tables) == 0

    parsed_errors = result.errors
    assert len(parsed_errors) == 1

    failure = parsed_errors[0]
    assert failure.name == target_path
    assert failure.exception_type is ValueError
    assert "Error during pre-header row removal" in failure.message
コード例 #22
0
def test_it_creates_a_parse_error_for_corrupt_file():
    """A corrupt xlsx yields no tables and one XML ``ParseError``.

    To reproduce the fixture, open an xlsx file in a text editor and delete
    some characters from the worksheet xml file.
    """
    workbook_path = os.path.join(DATA_DIR, "corrupt.xlsx")

    results = list(fables.parse(io=workbook_path))
    assert len(results) == 1

    result = results[0]
    assert result.name == workbook_path

    parsed_tables = result.tables
    parsed_errors = result.errors
    assert len(parsed_tables) == 0
    assert len(parsed_errors) == 1

    failure = parsed_errors[0]
    assert failure.name == workbook_path
    assert failure.exception_type is xml.etree.ElementTree.ParseError
    assert "not well-formed" in failure.message
コード例 #23
0
def test_it_removes_columns_that_have_no_headers_and_have_only_null_data_before_parsing(file_name):
    """The fixture ``file_name`` parses to one three-column, error-free table.

    Expected columns are ``this``, ``here's`` and ``here's.1``, with two
    data rows.
    """
    target_path = os.path.join(DATA_DIR, file_name)

    results = list(fables.parse(io=target_path))
    assert len(results) == 1

    result = results[0]
    assert result.name == target_path

    parsed_tables = result.tables
    assert len(parsed_tables) == 1

    parsed_errors = result.errors
    assert len(parsed_errors) == 0

    expected_rows = [
        ["that", "something", "some"],
        ["the other", "else", "other stuff"],
    ]
    expected_df = pd.DataFrame(
        columns=["this", "here's", "here's.1"],
        data=expected_rows,
    )
    pd.testing.assert_frame_equal(
        parsed_tables[0].df, expected_df, check_dtype=False
    )
コード例 #24
0
def test_it_finds_no_tables_in_a_png_file():
    """Parsing a PNG image produces no parse results at all."""
    image_path = os.path.join(DATA_DIR, "terminal.png")

    results = list(fables.parse(io=image_path))

    assert len(results) == 0