Example #1
0
def s3_base(worker_id):
    """
    Fixture for mocking S3 interaction.

    Locally: launches a moto server (S3 endpoint) in a separate process and
    yields its URL, using one port per pytest-xdist worker.
    On CI: yields the url for the motoserver/moto container service instead,
    or skips entirely on platforms where that service is unavailable.
    """
    pytest.importorskip("s3fs")
    pytest.importorskip("boto3")

    with tm.ensure_safe_environment_variables():
        # temporary workaround as moto fails for botocore >= 1.11 otherwise,
        # see https://github.com/spulec/moto/issues/1924 & 1952
        os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
        os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
        if is_ci_environment():
            if is_platform_arm() or is_platform_mac() or is_platform_windows():
                # NOT RUN on Windows/MacOS/ARM, only Ubuntu
                # - subprocess in CI can cause timeouts
                # - Github Actions do not support
                #   container services for the above OSs
                # - CircleCI will probably hit the Docker rate pull limit
                pytest.skip("S3 tests do not have a corresponding service in "
                            "Windows, MacOS or ARM platforms")
            else:
                # CI provides a moto container service listening on this port.
                yield "http://localhost:5000"
        else:
            requests = pytest.importorskip("requests")
            pytest.importorskip("moto", minversion="1.3.14")
            pytest.importorskip("flask")  # server mode needs flask too

            # Launching moto in server mode, i.e., as a separate process
            # with an S3 endpoint on localhost

            # One port per xdist worker: "master" (non-distributed run) maps
            # to 5555; workers "gw0", "gw1", ... map to 5550, 5551, ...
            worker_id = "5" if worker_id == "master" else worker_id.lstrip(
                "gw")
            endpoint_port = f"555{worker_id}"
            endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"

            # pipe to null to avoid logging in terminal
            with subprocess.Popen(
                    shlex.split(f"moto_server s3 -p {endpoint_port}"),
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
            ) as proc:

                # Poll for up to ~5 seconds until the server starts accepting
                # connections; if it never comes up, tests using the fixture
                # fail on their own when they try to connect.
                timeout = 5
                while timeout > 0:
                    try:
                        # OK to go once server is accepting connections
                        r = requests.get(endpoint_uri)
                        if r.ok:
                            break
                    except Exception:
                        pass
                    timeout -= 0.1
                    time.sleep(0.1)
                yield endpoint_uri

                # Tear the server down once the dependent test finishes.
                proc.terminate()
Example #2
0
def s3so(worker_id):
    """Return ``storage_options`` pointing at the moto S3 endpoint for this worker."""
    if not is_ci_environment():
        # xdist workers are named "gw0", "gw1", ...; a non-distributed run
        # uses "master".  Mirror the port scheme of the s3_base fixture.
        suffix = "5" if worker_id == "master" else worker_id.lstrip("gw")
        endpoint = f"http://127.0.0.1:555{suffix}/"
    else:
        # CI talks to the motoserver/moto container service on a fixed port.
        endpoint = "http://localhost:5000/"
    return {"client_kwargs": {"endpoint_url": endpoint}}
Example #3
0
def test_close_file_handle_on_invalid_usecols(all_parsers):
    # GH 45384: a usecols failure must not leak an open file handle.
    parser = all_parsers

    expected_error = ValueError
    if parser.engine == "pyarrow":
        pyarrow = pytest.importorskip("pyarrow")
        # pyarrow surfaces the missing column as its own exception type.
        expected_error = pyarrow.lib.ArrowKeyError
        if is_ci_environment() and (is_platform_windows() or is_platform_mac()):
            # GH#45547 causes timeouts on windows/mac builds
            pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2")
        with tm.assert_produces_warning(False), pytest.raises(
            expected_error, match="col3"
        ):
            parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)
Example #4
0

def test_unsuported_compression(parser):
    # An unknown compression scheme must be rejected with a clear error.
    with tm.ensure_clean() as path:
        with pytest.raises(ValueError, match="Unrecognized compression type"):
            read_xml(path, parser=parser, compression="7z")


# STORAGE OPTIONS


@pytest.mark.network
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
@pytest.mark.skipif(
    is_ci_environment(),
    reason="2022.1.17: Hanging on the CI min versions build.",
)
@tm.network
def test_s3_parser_consistency():
    # Python Software Foundation (2019 IRS-990 RETURN)
    s3 = "s3://irs-form-990/201923199349319487_public.xml"

    df_lxml = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="lxml",
        storage_options={"anon": True},
    )
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    parser = c_parser_only
    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)

    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")

        out = parser.read_csv(data_file)
        expected = DataFrame({"a": [1]})
        tm.assert_frame_equal(out, expected)


@pytest.mark.single_cpu
@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.")
def test_bytes_exceed_2gb(c_parser_only):
    # see gh-16798
    #
    # Read from a "CSV" that has a column larger than 2GB.
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a low_memory test")

    # 2100 rows of 1 MiB each -> a single column just over 2 GiB.
    rows = ("x" * (1 << 20) for _ in range(2100))
    csv = StringIO("strings\n" + "\n".join(rows))
    df = parser.read_csv(csv)
    assert not df.empty

Example #6
0
from pandas.compat import (
    is_ci_environment,
    is_platform_mac,
    is_platform_windows,
)
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm

# TODO(GH#44584): Mark these as pytest.mark.single_cpu
# Module-wide marker: every test in this module is skipped on Windows/MacOS
# CI runs (see the reason string for the observed failure modes there).
pytestmark = pytest.mark.skipif(
    is_ci_environment() and (is_platform_windows() or is_platform_mac()),
    reason="On GHA CI, Windows can fail with "
    "'Windows fatal exception: stack overflow' "
    "and MacOS can timeout",
)


@td.skip_if_no("numba")
@pytest.mark.filterwarnings("ignore:\n")
class TestEWM:
    def test_invalid_update(self):
        df = DataFrame({"a": range(5), "b": range(5)})
        online_ewm = df.head(2).ewm(0.5).online()
        with pytest.raises(
                ValueError,
                match=
Example #7
0
class TestS3:
    """Reading/writing pandas data against a (mocked) S3 service.

    All tests take the ``s3so`` storage-options fixture pointing at the moto
    endpoint; ``tips_df`` is the expected contents of the tips.csv fixture.
    """

    @td.skip_if_no("s3fs")
    def test_parse_public_s3_bucket(self, tips_df, s3so):

        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

        # Read public file from bucket with not-public contents
        df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3n_bucket(self, tips_df, s3so):

        # Read from AWS s3 as "s3n" URL
        df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3a_bucket(self, tips_df, s3so):
        # Read from AWS s3 as "s3a" URL
        df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_nrows(self, tips_df, s3so):
        """nrows must be honored for each supported compression."""
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_parse_public_s3_bucket_chunked(self, tips_df, s3so):
        # Read with a chunksize
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                "s3://pandas-test/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them
                    # properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            with read_csv(
                "s3://pandas-test/tips.csv" + ext,
                chunksize=chunksize,
                compression=comp,
                engine="python",
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                for i_chunk in [0, 1, 2]:
                    # Read a couple of chunks and make sure we see them properly.
                    df = df_reader.get_chunk()
                    assert isinstance(df, DataFrame)
                    assert not df.empty
                    true_df = tips_df.iloc[
                        chunksize * i_chunk : chunksize * (i_chunk + 1)
                    ]
                    tm.assert_frame_equal(true_df, df)

    def test_parse_public_s3_bucket_python(self, tips_df, s3so):
        """Full read via the Python engine, all compressions."""
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_infer_s3_compression(self, tips_df, s3so):
        """compression='infer' must pick the codec from the S3 key suffix."""
        for ext in ["", ".gz", ".bz2"]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                compression="infer",
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(df, tips_df)

    def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so):
        """nrows via the Python engine, all compressions."""
        for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
            df = read_csv(
                "s3://pandas-test/tips.csv" + ext,
                engine="python",
                nrows=10,
                compression=comp,
                storage_options=s3so,
            )
            assert isinstance(df, DataFrame)
            assert not df.empty
            tm.assert_frame_equal(tips_df.iloc[:10], df)

    def test_read_s3_fails(self, s3so):
        msg = "The specified bucket does not exist"
        with pytest.raises(OSError, match=msg):
            read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)

        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        with pytest.raises(OSError, match=msg):
            read_csv("s3://cant_get_it/file.csv")

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_csv_fails(self, tips_df, s3so):
        # GH 32486
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_csv(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
            )

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    @td.skip_if_no("pyarrow")
    def test_write_s3_parquet_fails(self, tips_df, s3so):
        # GH 27679
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        error = (FileNotFoundError, botocore.exceptions.ClientError)

        with pytest.raises(error, match="The specified bucket does not exist"):
            tips_df.to_parquet(
                "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
                storage_options=s3so,
            )

    @pytest.mark.single_cpu
    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
        # see gh-16135: read_csv should accept a boto3 body wrapped in BytesIO

        s3_object = s3_resource.meta.client.get_object(
            Bucket="pandas-test", Key="tips.csv"
        )

        with BytesIO(s3_object["Body"].read()) as buffer:
            result = read_csv(buffer, encoding="utf8")
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.single_cpu
    @pytest.mark.skipif(
        is_ci_environment(),
        reason="This test can hang in our CI min_versions build "
        "and leads to '##[error]The runner has "
        "received a shutdown signal...' in GHA. GH: 45651",
    )
    def test_read_csv_chunked_download(self, s3_resource, caplog, s3so):
        # 8 MB, S3FS uses 5MB chunks
        import s3fs

        df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode("utf-8"))

        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)

        # Possibly some state leaking in between tests.
        # If we don't clear this cache, we saw `GetObject operation: Forbidden`.
        # Presumably the s3fs instance is being cached, with the directory listing
        # from *before* we add the large-file.csv in the pandas-test bucket.
        s3fs.S3FileSystem.clear_instance_cache()

        with caplog.at_level(logging.DEBUG, logger="s3fs"):
            read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so)
            # log of fetch_range (start, stop)
            assert (0, 5505024) in (x.args[-2:] for x in caplog.records)

    def test_read_s3_with_hash_in_key(self, tips_df, s3so):
        # GH 25945: '#' in the key must not be treated as a URL fragment
        result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so)
        tm.assert_frame_equal(tips_df, result)

    @td.skip_if_no("pyarrow")
    def test_read_feather_s3_file_path(self, feather_file, s3so):
        # GH 29055
        expected = read_feather(feather_file)
        res = read_feather(
            "s3://pandas-test/simple_dataset.feather", storage_options=s3so
        )
        tm.assert_frame_equal(expected, res)
Example #8
0
    # No non-mapping subtypes, class
    with pytest.raises(TypeError, match=msg):
        com.standardize_mapping(list)

    fill = {"bad": "data"}
    assert com.standardize_mapping(fill) == dict

    # Convert instance to type
    assert com.standardize_mapping({}) == dict

    dd = collections.defaultdict(list)
    assert isinstance(com.standardize_mapping(dd), partial)


@pytest.mark.xfail(is_ci_environment() and not IS64,
                   reason="Failing on 32 bit Python CI job")
def test_git_version():
    # GH 21295: the build should embed the full 40-character commit hash.
    commit = pd.__git_version__
    assert len(commit) == 40
    assert not set(commit).difference(string.hexdigits)


def test_version_tag():
    version = Version(pd.__version__)
    try:
        version > Version("0.0.1")
    except TypeError:
        raise ValueError(
            "No git tags exist, please sync tags between upstream and your repo"
Example #9
0
    stdout = capsys.readouterr().out

    # check valid json is printed to the console if as_json is True
    result = json.loads(stdout)

    # Basic check that each version element is found in output
    expected = {
        "system": _get_sys_info(),
        "dependencies": _get_dependency_info(),
    }

    assert result == expected


@pytest.mark.xfail(
    is_ci_environment() and not IS64, reason="Failing on 32 bit Python CI job"
)
def test_show_versions_console(capsys):
    # gh-32041
    # gh-32041
    pd.show_versions(as_json=False)
    result = capsys.readouterr().out

    # check header
    assert "INSTALLED VERSIONS" in result

    # check full commit hash
    assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result)

    # check required dependency
    # 2020-12-09 npdev has "dirty" in the tag
Example #10
0
    @pytest.mark.xfail(
        reason="ufunc 'invert' not supported for the input types")
    def test_construct_empty_dataframe(self, dtype):
        # Delegates to the base extension-test implementation; currently
        # expected to fail for this dtype (see the xfail reason above).
        super().test_construct_empty_dataframe(dtype)

    @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
    def test_empty(self, dtype):
        # Base-class test, expected to fail until _from_sequence honors the
        # requested dtype (see the xfail reason above).
        super().test_empty(dtype)


class TestReduce(base.BaseNoReduceTests):
    def test_reduce_series_boolean(self):
        # Deliberate no-op override: disables the inherited "reductions must
        # raise" check for boolean reductions on this extension type.
        pass


@pytest.mark.skipif(
    is_ci_environment() and is_platform_windows(),
    reason="Causes stack overflow on Windows CI",
)
class TestReduceBoolean(base.BaseBooleanReduceTests):
    # Runs the inherited boolean-reduction tests unchanged; skipped on
    # Windows CI (see the reason above).
    pass


def test_is_bool_dtype(data):
    assert is_bool_dtype(data)
    assert pd.core.common.is_bool_indexer(data)
    s = pd.Series(range(len(data)))
    result = s[data]
    expected = s[np.asarray(data)]
    tm.assert_series_equal(result, expected)
Example #11
0
class TestSafeSort:
    """Tests for ``safe_sort``: sorting values and remapping codes to match."""

    @pytest.mark.parametrize(
        "arg, exp",
        [
            [[3, 1, 2, 0, 4], [0, 1, 2, 3, 4]],
            [list("baaacb"),
             np.array(list("aaabbc"), dtype=object)],
            [[], []],
        ],
    )
    def test_basic_sort(self, arg, exp):
        # Plain value sorting, no codes involved.
        result = safe_sort(arg)
        expected = np.array(exp)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("verify", [True, False])
    @pytest.mark.parametrize(
        "codes, exp_codes, na_sentinel",
        [
            [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4], -1],
            [[0, 1, 1, 2, 3, 0, 99, 4], [3, 1, 1, 2, 0, 3, 99, 4], 99],
            [[], [], -1],
        ],
    )
    def test_codes(self, verify, codes, exp_codes, na_sentinel):
        # Codes are remapped so that each entry still points at the same
        # value after sorting; na_sentinel entries pass through unchanged.
        values = [3, 1, 2, 0, 4]
        expected = np.array([0, 1, 2, 3, 4])

        result, result_codes = safe_sort(values,
                                         codes,
                                         na_sentinel=na_sentinel,
                                         verify=verify)
        expected_codes = np.array(exp_codes, dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    @pytest.mark.skipif(
        is_platform_windows() and is_ci_environment(),
        reason="In CI environment can crash thread with: "
        "Windows fatal exception: access violation",
    )
    @pytest.mark.parametrize("na_sentinel", [-1, 99])
    def test_codes_out_of_bound(self, na_sentinel):
        # Codes outside the valid range are mapped to na_sentinel.
        values = [3, 1, 2, 0, 4]
        expected = np.array([0, 1, 2, 3, 4])

        # out of bound indices
        codes = [0, 101, 102, 2, 3, 0, 99, 4]
        result, result_codes = safe_sort(values,
                                         codes,
                                         na_sentinel=na_sentinel)
        expected_codes = np.array(
            [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4],
            dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    @pytest.mark.parametrize("box",
                             [lambda x: np.array(x, dtype=object), list])
    def test_mixed_integer(self, box):
        # Mixed int/str input sorts with ints grouped before strings.
        values = box(["b", 1, 0, "a", 0, "b"])
        result = safe_sort(values)
        expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

    def test_mixed_integer_with_codes(self):
        values = np.array(["b", 1, 0, "a"], dtype=object)
        codes = [0, 1, 2, 3, 0, -1, 1]
        result, result_codes = safe_sort(values, codes)
        expected = np.array([0, 1, "a", "b"], dtype=object)
        expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        tm.assert_numpy_array_equal(result_codes, expected_codes)

    def test_unsortable(self):
        # GH 13714: values that cannot be compared must raise, not crash.
        arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
        msg = "'[<>]' not supported between instances of .*"
        with pytest.raises(TypeError, match=msg):
            safe_sort(arr)

    @pytest.mark.parametrize(
        "arg, codes, err, msg",
        [
            [1, None, TypeError, "Only list-like objects are allowed"],
            [[0, 1, 2], 1, TypeError, "Only list-like objects or None"],
            [[0, 1, 2, 1], [0, 1], ValueError, "values should be unique"],
        ],
    )
    def test_exceptions(self, arg, codes, err, msg):
        # Invalid argument types/shapes raise with specific messages.
        with pytest.raises(err, match=msg):
            safe_sort(values=arg, codes=codes)

    @pytest.mark.parametrize(
        "arg, exp",
        [[[1, 3, 2], [1, 2, 3]], [[1, 3, np.nan, 2], [1, 2, 3, np.nan]]])
    def test_extension_array(self, arg, exp):
        # ExtensionArrays sort in-kind; NaN sorts to the end.
        a = array(arg, dtype="Int64")
        result = safe_sort(a)
        expected = array(exp, dtype="Int64")
        tm.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("verify", [True, False])
    @pytest.mark.parametrize("na_sentinel", [-1, 99])
    def test_extension_array_codes(self, verify, na_sentinel):
        a = array([1, 3, 2], dtype="Int64")
        result, codes = safe_sort(a, [0, 1, na_sentinel, 2],
                                  na_sentinel=na_sentinel,
                                  verify=verify)
        expected_values = array([1, 2, 3], dtype="Int64")
        expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
        tm.assert_extension_array_equal(result, expected_values)
        tm.assert_numpy_array_equal(codes, expected_codes)