Example #1
0
def test_check_permissions():
    data = pd.DataFrame({
        "time_stamp": [
            pd.Timestamp("2021-06-09 09:30:06.008"),
            pd.Timestamp("2021-06-09 10:29:07.009"),
            pd.Timestamp("2021-06-09 09:29:08.010"),
        ],
        "data": [10, 20, 30],
        "string": ["ab", "cd", "ef"],
    })
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])

    mlrun.db.FileRunDB.verify_authorization = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError(""))

    try:
        fs.preview(
            data_set1,
            data,
            entity_columns=[Entity("string")],
            timestamp_key="time_stamp",
        )
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    features = ["fs1.*"]
    feature_vector = fs.FeatureVector("test", features)
    try:
        fs.get_offline_features(feature_vector,
                                entity_timestamp_column="time_stamp")
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.get_online_feature_service(feature_vector)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.deploy_ingestion_service(featureset=data_set1)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        data_set1.purge_targets()
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass
Example #2
0
    def test_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0]["value"] == 3
        svc.close()

        fs.ingest(fset, df2)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0] is None

        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()
Example #3
0
    def test_unaggregated_columns(self):
        test_base_time = datetime(2020, 12, 1, 17, 33, 15)

        data = pd.DataFrame({
            "time": [test_base_time, test_base_time - pd.Timedelta(minutes=1)],
            "first_name": ["moshe", "yosi"],
            "last_name": ["cohen", "levi"],
            "bid": [2000, 10],
        })

        name = f"measurements_{uuid.uuid4()}"

        # write to kv
        data_set = fs.FeatureSet(name, entities=[Entity("first_name")])

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
        )

        fs.ingest(data_set, data, return_df=True)

        features = [f"{name}.bids_sum_1h", f"{name}.last_name"]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "moshe"}])
        expected = {"bids_sum_1h": 2000.0, "last_name": "cohen"}
        assert resp[0] == expected
        svc.close()
Example #4
0
    def test_override_false(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        fs.ingest(fset, df1, targets=[ParquetTarget()])

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()

        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
Example #5
0
    def test_multiple_entities(self):
        name = f"measurements_{uuid.uuid4()}"
        current_time = pd.Timestamp.now()
        data = pd.DataFrame(
            {
                "time": [
                    current_time,
                    current_time - pd.Timedelta(minutes=1),
                    current_time - pd.Timedelta(minutes=2),
                    current_time - pd.Timedelta(minutes=3),
                    current_time - pd.Timedelta(minutes=4),
                    current_time - pd.Timedelta(minutes=5),
                ],
                "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
                "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
                "bid": [2000, 10, 11, 12, 2500, 14],
            }
        )

        # write to kv
        data_set = fs.FeatureSet(
            name, entities=[Entity("first_name"), Entity("last_name")]
        )

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
            emit_policy=EmitAfterMaxEvent(1),
        )
        fs.infer_metadata(
            data_set,
            data,  # source
            entity_columns=["first_name", "last_name"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )

        data_set.plot(
            str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
        )
        fs.ingest(data_set, data, return_df=True)

        features = [
            f"{name}.bids_sum_1h",
        ]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
        assert resp[0]["bids_sum_1h"] == 47.0

        svc.close()
Example #6
0
    def _get_online_features(self, features, features_size):
        # test real-time query
        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
        resp = svc.get([{"ticker": "AAPL"}])
        assert (resp[0]["name"] == "Apple Inc" and resp[0]["exchange"]
                == "NASDAQ"), "unexpected online result"
        resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
        assert (len(resp2[0]) == features_size -
                1), "unexpected online vector size"  # -1 label
        svc.close()
Example #7
0
    def test_none_value(self):
        data = pd.DataFrame(
            {"first_name": ["moshe", "yossi"], "bid": [2000, 10], "bool": [True, None]}
        )

        # write to kv
        data_set = fs.FeatureSet("tests2", entities=[Entity("first_name")])
        fs.ingest(data_set, data, return_df=True)
        features = ["tests2.*"]
        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)

        resp = svc.get([{"first_name": "yossi"}])
        assert resp[0] == {"bid": 10, "bool": None}

        svc.close()
Example #8
0
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]
    features_size = (len(features) + 1 + 1
                     )  # (*) returns 2 features, label adds 1 feature

    resp = fs.get_offline_features(
        features,
        entity_rows=trades,
        entity_timestamp_column="time",
        label_feature="stock-quotes.xx",
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features), "unexpected num of requested features"
    assert (len(vector.status.features) == features_size
            ), "unexpected num of returned features"
    assert len(vector.status.stats
               ) == features_size, "unexpected num of feature stats"
    assert vector.status.label_column == "xx", "unexpected label_column name"

    df = resp.to_dataframe()
    columns = trades.shape[1] + features_size - 2  # - 2 keys
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (resp[0]["name"] == "Apple Inc"
            and resp[0]["exchange"] == "NASDAQ"), "unexpected online result"
    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert (len(resp2[0]) == features_size -
            1), "unexpected online vector size"  # -1 label
    svc.close()
Example #9
0
    def test_overwrite_specified_nosql_path(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

        fset = fs.FeatureSet(name="overwrite-spec-path",
                             entities=[fs.Entity("name")])
        features = ["overwrite-spec-path.*"]
        fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

        fs.ingest(fset, df1, targets=targets)

        fs.ingest(fset, df2, targets=targets)

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        resp = svc.get(entity_rows=[{"name": "ABC"}])
        assert resp[0] is None
        svc.close()
Example #10
0
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]

    resp = fs.get_offline_features(features,
                                   entity_rows=trades,
                                   entity_timestamp_column="time")
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features), "unexpected num of requested features"
    # stocks (*) returns 2 features
    assert (len(vector.status.features) == len(features) +
            1), "unexpected num of returned features"
    assert (len(vector.status.stats) == len(features) +
            1), "unexpected num of feature stats"

    df = resp.to_dataframe()
    columns = trades.shape[1] + len(features) + 1
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    print(resp)
    resp = svc.get([{"ticker": "AAPL"}])
    assert (resp[0]["ticker"] == "AAPL"
            and resp[0]["exchange"] == "NASDAQ"), "unexpected online result"
    svc.close()
Example #11
0
    def test_schedule_on_filtered_by_time(self, partitioned):
        name = f"sched-time-{str(partitioned)}"

        now = datetime.now()

        path = "v3io:///bigdata/bla.parquet"
        fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }).to_parquet(path=path, filesystem=fsys)

        cron_trigger = "*/2 * * * *"

        source = ParquetSource("myparquet",
                               path=path,
                               time_field="time",
                               schedule=cron_trigger)

        feature_set = fs.FeatureSet(
            name=name,
            entities=[fs.Entity("first_name")],
            timestamp_key="time",
            engine="spark",
        )

        if partitioned:
            targets = [
                NoSqlTarget(),
                ParquetTarget(
                    name="tar1",
                    path="v3io:///bigdata/fs1/",
                    partitioned=True,
                    partition_cols=["time"],
                ),
            ]
        else:
            targets = [
                ParquetTarget(name="tar2",
                              path="v3io:///bigdata/fs2/",
                              partitioned=False),
                NoSqlTarget(),
            ]

        fs.ingest(
            feature_set,
            source,
            run_config=fs.RunConfig(local=False),
            targets=targets,
            spark_context=self.spark_service,
        )
        # ingest starts every second minute and it takes ~90 seconds to finish.
        if (now.minute % 2) == 0:
            sleep(60 - now.second + 60 + 90)
        else:
            sleep(60 - now.second + 90)

        features = [f"{name}.*"]
        vec = fs.FeatureVector("sched_test-vec", features)

        svc = fs.get_online_feature_service(vec)

        resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 2000

        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }).to_parquet(path=path)

        sleep(120)
        resp = svc.get([
            {
                "first_name": "yosi"
            },
            {
                "first_name": "moshe"
            },
            {
                "first_name": "katya"
            },
            {
                "first_name": "dina"
            },
            {
                "first_name": "uri"
            },
        ])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 50
        assert resp[2] is None
        assert resp[3]["data"] == 10
        assert resp[4] is None

        svc.close()

        # check offline
        resp = fs.get_offline_features(vec)
        assert len(resp.to_dataframe() == 4)
        assert "uri" not in resp.to_dataframe(
        ) and "katya" not in resp.to_dataframe()