Example #1
0
    def test_es_if_exists_replace(self):
        """Exercise ``es_if_exists="replace"`` for creation and for overwriting.

        The same destination index is written three times; after every write
        the index contents must equal the frame that was just uploaded.
        """
        # Arguments shared by every upload in this test.
        upload_kwargs = dict(
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="replace",
            es_refresh=True,
        )

        # 'replace' must allow for creation: what is read back equals the input.
        created = pandas_to_eland(pd_df2, **upload_kwargs).to_pandas()
        assert_frame_equal(pd_df2, created)

        # 'replace' must swap out both the existing mapping and the entries.
        replaced = pandas_to_eland(pd_df, **upload_kwargs)
        assert_pandas_eland_frame_equal(pd_df, replaced)

        # Uploading the first frame again must reproduce the first result.
        restored = pandas_to_eland(pd_df2, **upload_kwargs).to_pandas()
        assert_frame_equal(created, restored)
Example #2
0
    def test_es_if_exists_append_mapping_mismatch(self):
        """Appending a frame that conflicts with the index mapping raises ``ValueError``.

        The error message must list every incompatibility, and the data
        uploaded by the first call must be left untouched.
        """
        destination = dict(
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )
        first_upload = pandas_to_eland(pd_df, es_refresh=True, **destination)

        with pytest.raises(ValueError) as excinfo:
            pandas_to_eland(pd_df2, **destination)

        expected_message = (
            "DataFrame dtypes and Elasticsearch index mapping aren't compatible:\n"
            "- 'b' is missing from DataFrame columns\n"
            "- 'c' is missing from DataFrame columns\n"
            "- 'd' is missing from DataFrame columns\n"
            "- 'Z' is missing from ES index mapping\n"
            "- 'a' column type ('keyword') not compatible with ES index mapping type ('long')"
        )
        assert str(excinfo.value) == expected_message

        # The failed append must not have modified the index.
        assert_pandas_eland_frame_equal(pd_df, first_upload)
    def test_head_0(self):
        """``head(0)`` on the eland flights frame matches pandas ``head(0)``."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        eland_empty = eland_frame.head(0)
        pandas_empty = pandas_frame.head(0)

        assert_pandas_eland_frame_equal(pandas_empty, eland_empty)
Example #4
0
    def test_es_if_exists_append_es_type_coerce_error(self):
        """An appended value that does not fit the ``byte`` override raises ``BulkIndexError``."""
        destination = dict(
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
        )

        # Upload with column 'a' explicitly typed as 'byte'.
        uploaded = pandas_to_eland(
            pd_df,
            es_refresh=True,
            es_type_overrides={"a": "byte"},
            **destination,
        )
        assert_pandas_eland_frame_equal(pd_df, uploaded)

        # Single row whose 'a' value (128) is too large for 'byte'.
        out_of_range_row = {"a": [128], "b": [-1.0], "c": ["A"], "d": [dt]}
        pd_df_short = pd.DataFrame(out_of_range_row, index=["3"])

        with pytest.raises(BulkIndexError) as excinfo:
            pandas_to_eland(pd_df_short, **destination)

        # The value 128 must be what caused the index error.
        assert "Value [128] is out of range for a byte" in str(excinfo.value)
Example #5
0
    def test_generate_es_mappings(self):
        """Check the ES mapping generated for a mixed-dtype frame, then round-trip it.

        First the mapping produced by ``FieldMappings._generate_es_mappings``
        is compared with the expected field types; then the frame is uploaded
        to a scratch index, compared with the original, and the index deleted.
        """
        columns = {
            "A": np.random.rand(3),
            "B": 1,
            "C": "foo",
            "D": pd.Timestamp("20190102"),
            "E": [1.0, 2.0, 3.0],
            "F": False,
            "G": [1, 2, 3],
        }
        df = pd.DataFrame(data=columns, index=["0", "1", "2"])

        # Expected Elasticsearch field type for each column.
        es_types = {
            "A": "double",
            "B": "long",
            "C": "keyword",
            "D": "date",
            "E": "double",
            "F": "boolean",
            "G": "long",
        }
        expected_mappings = {
            "mappings": {
                "properties": {
                    field: {"type": es_type} for field, es_type in es_types.items()
                }
            }
        }

        mappings = FieldMappings._generate_es_mappings(df)
        assert expected_mappings == mappings

        # Now create the index and compare the uploaded data with the source frame.
        index_name = "eland_test_generate_es_mappings"
        uploaded = ed.pandas_to_eland(
            df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
        )
        assert_pandas_eland_frame_equal(df, uploaded.head())

        ES_TEST_CLIENT.indices.delete(index=index_name)
Example #6
0
    def test_flights_filter_columns_like(self, like):
        """``filter(like=...)`` returns the same frame from eland as from pandas."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        eland_filtered = eland_small.filter(like=like)
        pandas_filtered = pandas_small.filter(like=like)

        assert_pandas_eland_frame_equal(pandas_filtered, eland_filtered)
Example #7
0
    def test_flights_filter_columns_items(self, items):
        """``filter(items=...)`` returns the same frame from eland as from pandas."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        eland_filtered = eland_small.filter(items=items)
        pandas_filtered = pandas_small.filter(items=items)

        assert_pandas_eland_frame_equal(pandas_filtered, eland_filtered)
Example #8
0
    def test_getitem_attribute_list(self):
        """Selecting a list of columns with ``[]`` matches between eland and pandas."""
        wanted = ["OriginAirportID", "AvgTicketPrice", "Carrier"]

        # Work on the first 42 rows of each frame.
        eland_rows = self.ed_flights().head(42)
        pandas_rows = self.pd_flights().head(42)

        eland_selection = eland_rows[wanted]
        pandas_selection = pandas_rows[wanted]

        assert_pandas_eland_frame_equal(pandas_selection, eland_selection)
Example #9
0
    def test_flights_select_dtypes(self):
        """``select_dtypes(include=np.number)`` matches between eland and pandas."""
        pandas_small = self.pd_flights_small()
        eland_small = self.ed_flights_small()

        pandas_numeric = pandas_small.select_dtypes(include=np.number)
        eland_numeric = eland_small.select_dtypes(include=np.number)

        assert_pandas_eland_frame_equal(pandas_numeric, eland_numeric)
Example #10
0
    def test_flights_filter_index_items(self, items):
        """``filter(items=..., axis=0)`` (row labels) matches between eland and pandas."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        eland_filtered = eland_small.filter(items=items, axis=0)
        pandas_filtered = pandas_small.filter(items=items, axis=0)

        assert_pandas_eland_frame_equal(pandas_filtered, eland_filtered)
Example #11
0
    def test_flights_filter_columns_regex(self, regex):
        """``filter(regex=...)`` returns the same frame from eland as from pandas."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        eland_filtered = eland_small.filter(regex=regex)
        pandas_filtered = pandas_small.filter(regex=regex)

        assert_pandas_eland_frame_equal(pandas_filtered, eland_filtered)
Example #12
0
    def test_isna(self):
        """Filtering rows by ``isna()`` on ``geoip.region_name`` matches pandas."""
        column = "geoip.region_name"

        eland_frame = self.ed_ecommerce()
        # The pandas reference frame is derived from the eland frame itself.
        pandas_frame = eland_to_pandas(eland_frame)

        eland_missing = eland_frame[eland_frame[column].isna()]
        pandas_missing = pandas_frame[pandas_frame[column].isna()]

        assert_pandas_eland_frame_equal(pandas_missing, eland_missing)
Example #13
0
    def test_notna(self):
        """Filtering rows by ``notna()`` matches pandas for every name in ``self.columns``."""
        eland_frame = self.ed_ecommerce()
        # The pandas reference frame is derived from the eland frame itself.
        pandas_frame = eland_to_pandas(eland_frame)

        for name in self.columns:
            eland_present = eland_frame[eland_frame[name].notna()]
            pandas_present = pandas_frame[pandas_frame[name].notna()]
            assert_pandas_eland_frame_equal(pandas_present, eland_present)
    def test_select_dtypes_exclude_number(self):
        """``select_dtypes(exclude=[np.number])`` matches pandas on the first 103 rows."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        eland_non_numeric = eland_frame.select_dtypes(exclude=[np.number])
        pandas_non_numeric = pandas_frame.select_dtypes(exclude=[np.number])

        # Only the leading 103 rows of each result are compared.
        pandas_sample = pandas_non_numeric.head(103)
        eland_sample = eland_non_numeric.head(103)
        assert_pandas_eland_frame_equal(pandas_sample, eland_sample)
Example #15
0
    def test_getitem_query(self):
        """Boolean-mask row selection on an eland frame matches pandas.

        A small frame is uploaded to a scratch index, several boolean masks
        (column vs. scalar, column vs. column, and an ``&`` combination) are
        applied to both the pandas and the eland frame, and the results are
        compared. The scratch index is deleted at the end.
        """
        # Examples from:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html
        pd_df = pd.DataFrame(
            {
                "A": range(1, 6),
                "B": range(10, 0, -2),
                "C": range(10, 5, -1),
            },
            index=["0", "1", "2", "3", "4"],
        )
        # >>> pd_df
        #    A   B   C
        # 0  1  10  10
        # 1  2   8   9
        # 2  3   6   8
        # 3  4   4   7
        # 4  5   2   6

        # Now create index
        index_name = "eland_test_query"

        ed_df = ed.pandas_to_eland(
            pd_df,
            ES_TEST_CLIENT,
            index_name,
            es_if_exists="replace",
            es_refresh=True,
        )

        assert_pandas_eland_frame_equal(pd_df, ed_df)

        pd_df.info()
        ed_df.info()

        pd_q1 = pd_df[pd_df.A > 2]
        pd_q2 = pd_df[pd_df.A > pd_df.B]
        pd_q3 = pd_df[pd_df.B == pd_df.C]

        ed_q1 = ed_df[ed_df.A > 2]
        ed_q2 = ed_df[ed_df.A > ed_df.B]
        ed_q3 = ed_df[ed_df.B == ed_df.C]

        assert_pandas_eland_frame_equal(pd_q1, ed_q1)
        assert_pandas_eland_frame_equal(pd_q2, ed_q2)
        assert_pandas_eland_frame_equal(pd_q3, ed_q3)

        pd_q4 = pd_df[(pd_df.A > 2) & (pd_df.B > 3)]
        ed_q4 = ed_df[(ed_df.A > 2) & (ed_df.B > 3)]

        assert_pandas_eland_frame_equal(pd_q4, ed_q4)

        # Pass the index by keyword: newer Elasticsearch Python clients do not
        # accept positional arguments for API methods.
        ES_TEST_CLIENT.indices.delete(index=index_name)
Example #16
0
    def test_flights_drop_all_columns(self):
        """Dropping every column gives matching frames for both ``drop`` spellings."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        # Column labels are taken from the eland frame and used for both drops.
        all_columns = eland_small.columns

        pandas_by_labels = pandas_small.drop(labels=all_columns, axis=1)
        pandas_by_columns = pandas_small.drop(columns=all_columns)

        eland_by_labels = eland_small.drop(labels=all_columns, axis=1)
        eland_by_columns = eland_small.drop(columns=all_columns)

        assert_pandas_eland_frame_equal(pandas_by_labels, eland_by_labels)
        assert_pandas_eland_frame_equal(pandas_by_columns, eland_by_columns)

        # The remaining column indexes must agree as well.
        assert eland_by_labels.columns.equals(pandas_by_labels.columns)
        assert eland_by_columns.columns.equals(pandas_by_columns.columns)
Example #17
0
    def test_flights_small_drop(self):
        """``drop`` by column name and by row label matches between eland and pandas."""
        eland_small = self.ed_flights_small()
        pandas_small = self.pd_flights_small()

        # Column list noted in the original test:
        # ['AvgTicketPrice', 'Cancelled', 'Carrier', 'Dest', 'DestAirportID',
        #        'DestCityName', 'DestCountry', 'DestLocation', 'DestRegion',
        #        'DestWeather', 'DistanceKilometers', 'DistanceMiles', 'FlightDelay',
        #        'FlightDelayMin', 'FlightDelayType', 'FlightNum', 'FlightTimeHour',
        #        'FlightTimeMin', 'Origin', 'OriginAirportID', 'OriginCityName',
        #        'OriginCountry', 'OriginLocation', 'OriginRegion', 'OriginWeather',
        #        'dayOfWeek', 'timestamp']
        pandas_axis_drop = pandas_small.drop(["Carrier", "DestCityName"], axis=1)
        pandas_columns_drop = pandas_small.drop(columns=["Carrier", "DestCityName"])

        eland_axis_drop = eland_small.drop(["Carrier", "DestCityName"], axis=1)
        eland_columns_drop = eland_small.drop(columns=["Carrier", "DestCityName"])

        assert_pandas_eland_frame_equal(pandas_axis_drop, eland_axis_drop)
        assert_pandas_eland_frame_equal(pandas_columns_drop, eland_columns_drop)

        # Drop rows by index label.
        pandas_rows_drop = pandas_small.drop(["1", "2"])
        eland_rows_drop = eland_small.drop(["1", "2"])

        assert_pandas_eland_frame_equal(pandas_rows_drop, eland_rows_drop)
Example #18
0
    def test_es_if_exists_append(self):
        """Appending a second frame to an existing index extends it.

        The first upload overrides column 'a' to ``short``; the second upload
        relies on append mode to coerce its values to that type. The final
        eland frame must equal the two pandas frames stacked together.
        """
        df1 = pandas_to_eland(
            pd_df,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
            es_refresh=True,
            # We use 'short' here specifically so that the
            # assumed type of 'long' is coerced into a 'short'
            # by append mode.
            es_type_overrides={"a": "short"},
        )
        assert_pandas_eland_frame_equal(pd_df, df1)
        assert df1.shape == (3, 4)

        pd_df2 = pd.DataFrame(
            {
                "a": [4, 5, 6],
                "b": [-1.0, -2.0, -3.0],
                "c": ["A", "B", "C"],
                "d": [dt, dt - timedelta(1), dt - timedelta(2)],
            },
            index=["3", "4", "5"],
        )
        df2 = pandas_to_eland(
            pd_df2,
            es_client=ES_TEST_CLIENT,
            es_dest_index="test-index",
            es_if_exists="append",
            es_refresh=True,
        )

        # Assert that the second pandas dataframe is actually appended
        assert df2.shape == (6, 4)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat stacks the two frames the same way.
        pd_df3 = pd.concat([pd_df, pd_df2])
        assert_pandas_eland_frame_equal(pd_df3, df2)
    def test_tail_head(self):
        """Alternating ``tail``/``head`` chains give matching frames at every step."""
        eland_current = self.ed_flights()
        pandas_current = self.pd_flights()

        # tail(10) -> head(8) -> tail(5) -> head(4), each applied to the previous result.
        for method_name, count in (("tail", 10), ("head", 8), ("tail", 5), ("head", 4)):
            eland_current = getattr(eland_current, method_name)(count)
            pandas_current = getattr(pandas_current, method_name)(count)
            assert_pandas_eland_frame_equal(pandas_current, eland_current)
    def test_head(self):
        """``head`` matches pandas, including when re-applied to a ``head(10)`` result."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        eland_first_10 = eland_frame.head(10)
        pandas_first_10 = pandas_frame.head(10)
        assert_pandas_eland_frame_equal(pandas_first_10, eland_first_10)

        # Re-apply head to the 10-row result: once asking for fewer rows (8),
        # once asking for more rows than it holds (20).
        for count in (8, 20):
            eland_again = eland_first_10.head(count)
            pandas_again = pandas_first_10.head(count)
            assert_pandas_eland_frame_equal(pandas_again, eland_again)
    def test_tail(self):
        """``tail`` matches pandas, including when re-applied to a ``tail(10)`` result."""
        eland_frame = self.ed_flights()
        pandas_frame = self.pd_flights()

        eland_last_10 = eland_frame.tail(10)
        pandas_last_10 = pandas_frame.tail(10)
        assert_pandas_eland_frame_equal(pandas_last_10, eland_last_10)

        # Re-apply tail to the 10-row result: once asking for fewer rows (8),
        # once asking for more rows than it holds (20).
        for count in (8, 20):
            eland_again = eland_last_10.tail(count)
            pandas_again = pandas_last_10.tail(count)
            assert_pandas_eland_frame_equal(pandas_again, eland_again)