Esempio n. 1
0
    def test_parquet_target_vector_overwrite(self):
        """Retrieve a feature vector into the same parquet target twice.

        After each ingestion, both the returned offline features and the
        parquet file read back from the target path must equal the frame that
        was just ingested (indexed by the ``name`` entity).
        """
        first_batch = pd.DataFrame(
            {"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]}
        )
        feature_set = fs.FeatureSet(
            name="fvec-parquet-fset", entities=[fs.Entity("name")]
        )
        fs.ingest(feature_set, first_batch)

        vector = fs.FeatureVector("fvec-parquet", features=["fvec-parquet-fset.*"])
        target = ParquetTarget()

        # First retrieval: the target file is written for the first time.
        first_result = fs.get_offline_features(vector, target=target)
        first_written = pd.read_parquet(target._target_path)
        first_expected = first_batch.set_index(keys="name").sort_index()
        assert first_expected.equals(first_result.to_dataframe().sort_index())
        assert first_expected.equals(first_written.sort_index())

        # Second retrieval into the very same target object.
        second_batch = pd.DataFrame(
            {"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]}
        )
        fs.ingest(feature_set, second_batch)
        second_result = fs.get_offline_features(vector, target=target)
        second_written = pd.read_parquet(target._target_path)
        second_expected = second_batch.set_index(keys="name").sort_index()
        assert second_expected.equals(second_result.to_dataframe().sort_index())
        assert second_expected.equals(second_written.sort_index())
Esempio n. 2
0
    def test_unaggregated_columns(self):
        """Serve an aggregated feature and a plain column from one feature set.

        Ingests two bids keyed by ``first_name``, then asks the online service
        for the ``bids_sum_1h`` aggregate together with the raw ``last_name``
        column and checks the values returned for ``moshe``.
        """
        test_base_time = datetime(2020, 12, 1, 17, 33, 15)

        data = pd.DataFrame({
            "time": [test_base_time, test_base_time - pd.Timedelta(minutes=1)],
            "first_name": ["moshe", "yosi"],
            "last_name": ["cohen", "levi"],
            "bid": [2000, 10],
        })

        # A unique name keeps repeated runs from colliding on the same set.
        name = f"measurements_{uuid.uuid4()}"

        # write to kv
        data_set = fs.FeatureSet(name, entities=[Entity("first_name")])

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
        )

        fs.ingest(data_set, data, return_df=True)

        features = [f"{name}.bids_sum_1h", f"{name}.last_name"]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)
        try:
            resp = svc.get([{"first_name": "moshe"}])
            expected = {"bids_sum_1h": 2000.0, "last_name": "cohen"}
            assert resp[0] == expected
        finally:
            # Close the service even when the assertion above fails.
            svc.close()
Esempio n. 3
0
    def test_serverless_ingest(self):
        """Ingest a CSV source into a CSV target through a local run config.

        Checks that the target file is produced and that, once the
        ``timestamp`` entry is removed, the statistics keys match the feature
        keys exactly.
        """
        entity_name = "patient_id"
        feature_set = fs.FeatureSet(
            "measurements",
            entities=[Entity(entity_name)],
            timestamp_key="timestamp",
        )

        target_path = os.path.relpath(str(self.results_path / "mycsv.csv"))
        source_path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        csv_source = CSVSource("mycsv", path=source_path)
        csv_targets = [CSVTarget("mycsv", path=target_path)]

        # Remove any leftover file so the existence check below is meaningful.
        if os.path.exists(target_path):
            os.remove(target_path)

        inference = fs.InferOptions.schema() + fs.InferOptions.Stats
        fs.ingest(
            feature_set,
            csv_source,
            csv_targets,
            infer_options=inference,
            run_config=fs.RunConfig(local=True),
        )
        assert os.path.exists(target_path), "result file was not generated"

        features = sorted(feature_set.spec.features.keys())
        stats = sorted(feature_set.status.stats.keys())
        print(features)
        print(stats)
        # The timestamp has stats but is not listed among the features.
        stats.remove("timestamp")
        assert features == stats, "didnt infer stats for all features"
Esempio n. 4
0
    def test_override_false(self):
        """Exercise ``overwrite=False`` ingestion and its restrictions.

        Appending with ``overwrite=False`` must keep the earlier rows in the
        offline data, a later ingest to an explicit parquet target must leave
        only the newly ingested rows offline while the online service still
        serves a previously ingested key, and ``overwrite=False`` together with
        a CSV target must raise ``MLRunInvalidArgumentError``.
        """
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        # Append: both batches are expected offline afterwards.
        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        # Default (overwriting) ingest into a parquet target only.
        fs.ingest(fset, df1, targets=[ParquetTarget()])

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        svc = fs.get_online_feature_service(fvec)
        try:
            resp = svc.get(entity_rows=[{"name": "PQR"}])
            assert resp[0]["value"] == 6
        finally:
            # Close the service even when the assertion above fails.
            svc.close()

        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
Esempio n. 5
0
    def test_read_csv(self):
        """Ingest ``stocks`` into a CSV target and read it back through storey.

        The file is read with ``ReadCSV`` and reduced to a DataFrame, which is
        then compared with the expected header row and data rows.
        """
        from storey import ReadCSV, ReduceToDataFrame, build_flow

        # Build "<random>.csv"; joining ".csv" as its own path component would
        # instead create a hidden file named ".csv" inside a random directory.
        csv_path = str(self.results_path / f"{_generate_random_name()}.csv")
        targets = [CSVTarget("mycsv", path=csv_path)]
        stocks_set = fs.FeatureSet(
            "tests", entities=[Entity("ticker", ValueType.STRING)])
        try:
            fs.ingest(stocks_set,
                      stocks,
                      infer_options=fs.InferOptions.default(),
                      targets=targets)

            # reading csv file
            controller = build_flow(
                [ReadCSV(csv_path), ReduceToDataFrame()]).run()
            termination_result = controller.await_termination()

            expected = pd.DataFrame({
                0: ["ticker", "MSFT", "GOOG", "AAPL"],
                1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
                2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
            })

            assert termination_result.equals(
                expected), f"{termination_result}\n!=\n{expected}"
        finally:
            # Clean up on failure too, without masking the original error
            # when the file was never written.
            if os.path.exists(csv_path):
                os.remove(csv_path)
Esempio n. 6
0
    def test_feature_set_db(self):
        """Save a feature set, look it up in the run DB, then delete it.

        Deleting right after ingestion is expected to raise
        ``MLRunPreconditionFailedError``; once the targets are purged the
        deletion goes through and the set is no longer listed.
        """
        name = "stocks_test"
        entities = [Entity("ticker", ValueType.STRING)]
        stocks_set = fs.FeatureSet(name, entities=entities)
        fs.preview(stocks_set, stocks)
        stocks_set.save()

        run_db = mlrun.get_run_db()
        listed = run_db.list_feature_sets(self.project_name, name)
        assert len(listed) == 1, "bad number of results"

        fetched = fs.get_feature_set(name, self.project_name)
        assert fetched.metadata.name == name, "bad feature set response"

        fs.ingest(stocks_set, stocks)
        with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
            fs.delete_feature_set(name, self.project_name)

        stocks_set.purge_targets()
        fs.delete_feature_set(name, self.project_name)

        remaining = run_db.list_feature_sets(self.project_name, name)
        assert not remaining, "Feature set should be deleted"
Esempio n. 7
0
    def test_purge(self):
        """Ingest into several target kinds and verify purging them.

        The first round hands every target to ``verify_purge``; after a second
        ingestion, all targets except the last one (the NoSQL target) are
        handed to it.
        """
        entity_name = "patient_id"
        feature_set = fs.FeatureSet(
            "purge", entities=[Entity(entity_name)], timestamp_key="timestamp"
        )
        csv_path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource("mycsv", path=csv_path, time_field="timestamp")

        default_csv = CSVTarget()
        explicit_csv = CSVTarget(
            name="specified-path", path="v3io:///bigdata/csv-purge-test.csv"
        )
        partitioned_parquet = ParquetTarget(
            partitioned=True, partition_cols=["timestamp"]
        )
        nosql = NoSqlTarget()
        targets = [default_csv, explicit_csv, partitioned_parquet, nosql]

        feature_set.set_targets(targets=targets, with_defaults=False)

        # Round one: purge everything.
        fs.ingest(feature_set, source)
        verify_purge(feature_set, targets)

        # Round two: purge all but the trailing NoSQL target.
        fs.ingest(feature_set, source)
        verify_purge(feature_set, targets[:-1])
Esempio n. 8
0
    def test_ingest_dataframe_index(self):
        """Ingest a frame whose entity is the (named) DataFrame index."""
        # NOTE(review): the single row is a set literal, not a dict - confirm
        # this is the intended input.
        indexed_df = pd.DataFrame([{"x", "y"}])
        indexed_df.index.name = "idx"

        feature_set = fs.FeatureSet("myfset", entities=[Entity("idx")])
        parquet_targets = [ParquetTarget()]
        fs.ingest(
            feature_set,
            indexed_df,
            parquet_targets,
            infer_options=fs.InferOptions.default(),
        )
Esempio n. 9
0
def test_check_permissions():
    """Check that the feature-store calls below propagate access-denied errors.

    ``FileRunDB.verify_authorization`` is replaced by a mock that always raises
    ``MLRunAccessDeniedError``; every call exercised here must surface that
    error. The replacement is scoped with ``patch.object`` so the original
    method is restored when the test ends and later tests are not affected.
    """
    data = pd.DataFrame({
        "time_stamp": [
            pd.Timestamp("2021-06-09 09:30:06.008"),
            pd.Timestamp("2021-06-09 10:29:07.009"),
            pd.Timestamp("2021-06-09 09:29:08.010"),
        ],
        "data": [10, 20, 30],
        "string": ["ab", "cd", "ef"],
    })
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])

    deny_all = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError(""))

    with unittest.mock.patch.object(
            mlrun.db.FileRunDB, "verify_authorization", deny_all):
        # pytest.raises fails the test when nothing is raised, and unlike a
        # bare ``assert False`` it is not stripped when running with -O.
        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            fs.preview(
                data_set1,
                data,
                entity_columns=[Entity("string")],
                timestamp_key="time_stamp",
            )

        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())

        features = ["fs1.*"]
        feature_vector = fs.FeatureVector("test", features)
        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            fs.get_offline_features(feature_vector,
                                    entity_timestamp_column="time_stamp")

        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            fs.get_online_feature_service(feature_vector)

        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            fs.deploy_ingestion_service(featureset=data_set1)

        with pytest.raises(mlrun.errors.MLRunAccessDeniedError):
            data_set1.purge_targets()
Esempio n. 10
0
def verify_target_list_fail(targets, with_defaults=None):
    """Assert that an invalid target list is rejected.

    Both ``FeatureSet.set_targets`` and ``fs.ingest`` must raise
    ``MLRunInvalidArgumentError`` when given ``targets``.

    :param targets: target list that is expected to be rejected.
    :param with_defaults: forwarded to ``set_targets`` whenever it is not
        ``None`` (so an explicit ``False`` is honoured); ``None`` lets
        ``set_targets`` use its own default.
    """
    feature_set = fs.FeatureSet(name="target-list-fail", entities=[fs.Entity("ticker")])

    set_targets_kwargs = {"targets": targets}
    # Test against None, not truthiness: a falsy-but-explicit value such as
    # False must still reach set_targets.
    if with_defaults is not None:
        set_targets_kwargs["with_defaults"] = with_defaults

    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        feature_set.set_targets(**set_targets_kwargs)
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(feature_set, quotes, targets=targets)
Esempio n. 11
0
    def test_multiple_entities(self):
        """Aggregate bids for a feature set keyed by two entities.

        Six bids are ingested under the (``first_name``, ``last_name``) key and
        the online ``bids_sum_1h`` value for ("yosi", "levi") is expected to be
        the sum of that pair's four bids.
        """
        name = f"measurements_{uuid.uuid4()}"
        current_time = pd.Timestamp.now()
        data = pd.DataFrame(
            {
                "time": [
                    current_time,
                    current_time - pd.Timedelta(minutes=1),
                    current_time - pd.Timedelta(minutes=2),
                    current_time - pd.Timedelta(minutes=3),
                    current_time - pd.Timedelta(minutes=4),
                    current_time - pd.Timedelta(minutes=5),
                ],
                "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
                "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
                "bid": [2000, 10, 11, 12, 2500, 14],
            }
        )

        # write to kv
        data_set = fs.FeatureSet(
            name, entities=[Entity("first_name"), Entity("last_name")]
        )

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
            emit_policy=EmitAfterMaxEvent(1),
        )
        fs.infer_metadata(
            data_set,
            data,  # source
            entity_columns=["first_name", "last_name"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )

        data_set.plot(
            str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
        )
        fs.ingest(data_set, data, return_df=True)

        features = [
            f"{name}.bids_sum_1h",
        ]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)
        try:
            resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
            # 10 + 11 + 12 + 14
            assert resp[0]["bids_sum_1h"] == 47.0
        finally:
            # Close the service even when the assertion above fails.
            svc.close()
Esempio n. 12
0
    def test_ingest_twice_with_nulls(self):
        """Append a row with a null string to a partitioned parquet target.

        A first row is ingested and read back offline; a second row, whose
        ``my_string`` is ``None``, is then ingested with ``overwrite=False``
        into a fresh ``FeatureSet`` object of the same name, and the offline
        result is expected to contain both rows.
        """
        name = f"test_ingest_twice_with_nulls_{uuid.uuid4()}"
        key = "key"

        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey1", "hello", pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        # The frame is passed with its default index; the former
        # ``df.set_index("my_string")`` call discarded its result and so never
        # changed ``df``.
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)
        assert resp1.to_dict() == {
            "my_string": {"mykey1": "hello"},
            "my_time": {"mykey1": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello"}}

        # Second round: same feature-set name, a new key, and a null value.
        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey2", None, pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source, overwrite=False)
        assert resp1.to_dict() == {
            "my_string": {"mykey2": None},
            "my_time": {"mykey2": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello", "mykey2": None}}
Esempio n. 13
0
    def test_overwrite_single_parquet_file(self):
        """Ingesting with ``overwrite=False`` into a single parquet file must fail."""
        first_batch = pd.DataFrame(
            {"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]}
        )
        second_batch = pd.DataFrame(
            {"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]}
        )

        # The target path names one concrete file rather than a directory.
        single_file_target = ParquetTarget(
            path="v3io:///bigdata/overwrite-pq-spec/my.parquet"
        )
        targets = [single_file_target]

        feature_set = fs.FeatureSet(
            name="overwrite-pq-spec-path", entities=[fs.Entity("name")]
        )

        fs.ingest(feature_set, first_batch, targets=targets)
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(feature_set, second_batch, targets=targets, overwrite=False)
Esempio n. 14
0
    def test_non_partitioned_target_in_dir(self):
        """A parquet target given a directory-like path must write one plain file."""
        csv_path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource("mycsv", path=csv_path)

        path = str(self.results_path / _generate_random_name())
        target = ParquetTarget(path=path)

        feature_set = fs.FeatureSet(
            name="test", entities=[Entity("patient_id")], timestamp_key="timestamp"
        )
        fs.ingest(feature_set, source, targets=[target])

        # Exactly one entry is expected, and it must be a file, not a directory.
        written = os.listdir(path)
        assert len(written) == 1
        written_file = path + "/" + written[0]
        assert not os.path.isdir(written_file)
        os.remove(written_file)
Esempio n. 15
0
    def test_overwrite(self):
        """Check which targets reflect a second ingestion.

        The first ingest writes CSV, parquet and NoSQL targets explicitly. The
        second ingest passes no targets; afterwards the CSV file is still
        expected to hold the first frame, while the parquet file and the online
        service are expected to hold only the second frame.
        """
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        try:
            resp = svc.get(entity_rows=[{"name": "GHI"}])
            assert resp[0]["value"] == 3
        finally:
            # Close the service even when the assertion above fails.
            svc.close()

        fs.ingest(fset, df2)

        # The CSV target is still compared against the first frame.
        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        try:
            resp = svc.get(entity_rows=[{"name": "GHI"}])
            assert resp[0] is None

            resp = svc.get(entity_rows=[{"name": "PQR"}])
            assert resp[0]["value"] == 6
        finally:
            svc.close()
Esempio n. 16
0
    def test_csv_time_columns(self):
        """Round-trip a CSV with two datetime columns through ingestion.

        One column is declared as the source ``time_field`` and the other via
        ``parse_dates``; the ingested frame must equal the original frame
        indexed by ``key``.
        """
        import tempfile

        df = pd.DataFrame(
            {
                "key": ["key1", "key2"],
                "time_stamp": [
                    datetime(2020, 11, 1, 17, 33, 15),
                    datetime(2020, 10, 1, 17, 33, 15),
                ],
                "another_time_column": [
                    datetime(2020, 9, 1, 17, 33, 15),
                    datetime(2020, 8, 1, 17, 33, 15),
                ],
            }
        )

        # A private temporary directory avoids collisions on a fixed /tmp path
        # between concurrent runs and is removed automatically, even on failure.
        with tempfile.TemporaryDirectory() as tmp_dir:
            csv_path = os.path.join(tmp_dir, "multiple_time_columns.csv")
            df.to_csv(path_or_buf=csv_path, index=False)
            source = CSVSource(
                path=csv_path,
                time_field="time_stamp",
                parse_dates=["another_time_column"],
            )

            measurements = fs.FeatureSet(
                "fs", entities=[Entity("key")], timestamp_key="time_stamp"
            )
            resp = fs.ingest(measurements, source)
            df.set_index("key", inplace=True)
            assert_frame_equal(df, resp)
Esempio n. 17
0
    def test_none_value(self):
        """A ``None`` cell must be served as ``None`` by the online service."""
        data = pd.DataFrame(
            {"first_name": ["moshe", "yossi"], "bid": [2000, 10], "bool": [True, None]}
        )

        # write to kv
        data_set = fs.FeatureSet("tests2", entities=[Entity("first_name")])
        fs.ingest(data_set, data, return_df=True)
        features = ["tests2.*"]
        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)
        try:
            resp = svc.get([{"first_name": "yossi"}])
            assert resp[0] == {"bid": 10, "bool": None}
        finally:
            # Close the service even when the assertion above fails.
            svc.close()
Esempio n. 18
0
    def _ingest_quotes_featureset(self):
        """Build, infer and ingest the ``stock-quotes`` feature set.

        The graph chains a ``MyMap`` step, an ``Extend`` step, a ``Filter`` step
        and a ``FeaturesetValidator``; two aggregations are added on top.
        Metadata is inferred first (checking the ``zz`` column), then the
        quotes are ingested and the aggregation stats are checked.
        """
        quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])

        # Build the graph one step at a time; each ``to`` continues from the
        # step returned by the previous call.
        step = quotes_set.graph.to("MyMap", multiplier=3)
        step = step.to("storey.Extend", _fn="({'z': event['bid'] * 77})")
        step = step.to("storey.Filter", "filter", _fn="(event['bid'] > 51.92)")
        step.to(FeaturesetValidator())

        quotes_set.add_aggregation(
            "asks", "ask", ["sum", "max"], ["1h", "5h"], "10m"
        )
        quotes_set.add_aggregation("bids", "bid", ["min", "max"], ["1h"], "10m")

        inferred = fs.infer_metadata(
            quotes_set,
            quotes,
            entity_columns=["ticker"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )
        self._logger.info(f"quotes spec: {quotes_set.spec.to_yaml()}")
        assert inferred["zz"].mean() == 9, "map didnt set the zz column properly"
        quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

        plot_path = str(self.results_path / "pipe.png")
        quotes_set.plot(plot_path, rankdir="LR", with_targets=True)

        ingested = fs.ingest(quotes_set, quotes, return_df=True)
        self._logger.info(f"output df:\n{ingested}")
        assert quotes_set.status.stats.get("asks_sum_1h"), "stats not created"
Esempio n. 19
0
 def test_basic_remote_spark_ingest(self):
     """Ingest a remote parquet source with the spark engine, non-locally."""
     entity_name = "patient_id"
     feature_set = fs.FeatureSet(
         "measurements",
         entities=[fs.Entity(entity_name)],
         timestamp_key="timestamp",
         engine="spark",
     )
     remote_path = self.get_remote_pq_source_path()
     parquet_source = ParquetSource("myparquet", path=remote_path)
     remote_run = fs.RunConfig(local=False)
     fs.ingest(
         feature_set,
         parquet_source,
         return_df=True,
         spark_context=self.spark_service,
         run_config=remote_run,
     )
Esempio n. 20
0
def prepare_feature_set(name: str, entity: str, data: pd.DataFrame, timestamp_key=None):
    """Create a feature set over ``data`` and ingest it with default inference.

    :param name: name given to the feature set.
    :param entity: entity column name, also passed to the DataFrame source.
    :param data: frame wrapped in a ``DataFrameSource``.
    :param timestamp_key: optional timestamp key for both source and set.
    :return: tuple of the feature set and the value returned by ``fs.ingest``.
    """
    source = mlrun.datastore.sources.DataFrameSource(data, entity, timestamp_key)

    entities = [fs.Entity(entity)]
    fset = fs.FeatureSet(name, entities=entities, timestamp_key=timestamp_key)
    fset.set_targets()

    ingested = fs.ingest(fset, source, infer_options=fs.InferOptions.default())
    return fset, ingested
Esempio n. 21
0
    def test_error_flow(self):
        """Ingesting a pandas frame with the spark engine remotely must be rejected."""
        people = pd.DataFrame({
            "name": ["Jean", "Jacques", "Pierre"],
            "last_name": ["Dubois", "Dupont", "Lavigne"],
        })

        spark_set = fs.FeatureSet(
            "measurements", entities=[fs.Entity("name")], engine="spark"
        )

        expected_error = mlrun.errors.MLRunInvalidArgumentError
        with pytest.raises(expected_error):
            fs.ingest(
                spark_set,
                people,
                return_df=True,
                spark_context=self.spark_service,
                run_config=fs.RunConfig(local=False),
            )
Esempio n. 22
0
    def test_overwrite_specified_nosql_path(self):
        """A second ingest into an explicit NoSQL path must replace the first.

        After both ingestions the online service is expected to serve a key of
        the second frame and to return ``None`` for a key of the first.
        """
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

        fset = fs.FeatureSet(name="overwrite-spec-path",
                             entities=[fs.Entity("name")])
        features = ["overwrite-spec-path.*"]
        fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

        fs.ingest(fset, df1, targets=targets)

        fs.ingest(fset, df2, targets=targets)

        svc = fs.get_online_feature_service(fvec)
        try:
            resp = svc.get(entity_rows=[{"name": "PQR"}])
            assert resp[0]["value"] == 6
            resp = svc.get(entity_rows=[{"name": "ABC"}])
            assert resp[0] is None
        finally:
            # Close the service even when an assertion above fails.
            svc.close()
Esempio n. 23
0
 def test_ingest_with_timestamp(self):
     """Ingest a CSV with a declared time field and check the first timestamp.

     The ``timestamp`` value of the first ingested row must equal the expected
     ``datetime``.
     """
     key = "patient_id"
     measurements = fs.FeatureSet("measurements",
                                  entities=[Entity(key)],
                                  timestamp_key="timestamp")
     source = CSVSource(
         "mycsv",
         path=os.path.relpath(str(self.assets_path / "testdata.csv")),
         time_field="timestamp",
     )
     resp = fs.ingest(measurements, source)
     # Take the first row by position. ``series[0]`` is a label lookup on an
     # integer index and a deprecated positional fallback otherwise, so
     # ``iloc`` states the intent unambiguously.
     first_timestamp = resp["timestamp"].iloc[0]
     assert first_timestamp == datetime.fromisoformat(
         "2020-12-01 17:24:15.906352")
Esempio n. 24
0
    def test_offline_features_filter_non_partitioned(self):
        """Filter offline features by a start/end time window.

        Three rows are ingested; with the window 09:30 to 10:30 on 2021-06-09
        the offline result is expected to contain two rows.
        """
        timestamps = [
            pd.Timestamp("2021-06-09 09:30:06.008"),
            pd.Timestamp("2021-06-09 10:29:07.009"),
            pd.Timestamp("2021-06-09 09:29:08.010"),
        ]
        data = pd.DataFrame({
            "time_stamp": timestamps,
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        })

        feature_set = fs.FeatureSet("fs1", entities=[Entity("string")])
        fs.ingest(feature_set, data, infer_options=fs.InferOptions.default())

        vector = fs.FeatureVector("vector", ["fs1.*"])
        window_start = datetime(2021, 6, 9, 9, 30)
        window_end = datetime(2021, 6, 9, 10, 30)
        resp = fs.get_offline_features(
            vector,
            entity_timestamp_column="time_stamp",
            start_time=window_start,
            end_time=window_end,
        )
        filtered = resp.to_dataframe()
        assert len(filtered) == 2
Esempio n. 25
0
def handler(context, event):
    """Ingest the event(s) carried by ``event.body`` into ``context.fset``.

    The JSON body is either a single event object or a table with ``headers``
    and ``values`` keys, in which case every row of ``values`` becomes one
    event. Each event is passed through ``enrich_even_details``; events for
    which it returns ``None`` are skipped, the rest are ingested one row at a
    time with ``overwrite=False``. Default inference options are used only
    while ``context.need_to_infer`` is set, and the flag is cleared here.
    """
    context.logger.debug(event.body)
    payload = json.loads(event.body)

    if context.need_to_infer:
        options = fs.InferOptions.default()
        context.need_to_infer = False
    else:
        options = fs.InferOptions.Null

    if "headers" in payload and "values" in payload:
        # Tabular payload: pair every row with the shared header list.
        headers = payload["headers"]
        events = [dict(zip(headers, row)) for row in payload["values"]]
    else:
        events = [payload]

    for enriched in map(enrich_even_details, events):
        if enriched is None:
            continue

        enriched[TIMESTAMP] = datetime.strptime(enriched["when"], ISO_8061_UTC)

        if enriched.get("class"):
            # class is illegal column name in pandas df
            model_class = enriched["class"]
            enriched[MODEL_CLASS] = model_class
            del enriched["class"]

        # Build a one-row frame: every value becomes a single-element column.
        single_row = pd.DataFrame({k: [v] for k, v in enriched.items()})
        fs.ingest(
            context.fset,
            single_row,
            infer_options=options,
            return_df=False,
            overwrite=False,
        )
Esempio n. 26
0
    def test_split_graph(self):
        """Ingest through a graph that has a main branch and a side branch.

        With a second step hanging off the graph root and no targets set,
        metadata inference must raise ``MLRunPreconditionFailedError``. After a
        CSV target is attached to the side step and a default final state is
        named, the default parquet output, the ingest result and the inferred
        frame are compared with the expected default frame, and the side CSV
        with the expected side frame.
        """
        quotes_set = fs.FeatureSet("stock-quotes",
                                   entities=[fs.Entity("ticker")])

        quotes_set.graph.to("MyMap", "somemap1", field="multi1",
                            multiplier=3).to(
                                "storey.Extend",
                                _fn="({'extra': event['bid'] * 77})").to(
                                    "storey.Filter",
                                    "filter",
                                    _fn="(event['bid'] > 70)").to(
                                        FeaturesetValidator())

        side_step_name = "side-step"
        quotes_set.graph.to("storey.Extend",
                            name=side_step_name,
                            _fn="({'extra2': event['bid'] * 17})")
        with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
            fs.infer_metadata(quotes_set, quotes)

        non_default_target_name = "side-target"
        quotes_set.set_targets(
            targets=[
                CSVTarget(name=non_default_target_name,
                          after_state=side_step_name)
            ],
            default_final_state="FeaturesetValidator",
        )

        quotes_set.plot(with_targets=True)

        inf_out = fs.infer_metadata(quotes_set, quotes)
        ing_out = fs.ingest(quotes_set, quotes, return_df=True)

        default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
        side_file_path = quotes_set.get_target_path(non_default_target_name)

        side_file_out = pd.read_csv(side_file_path)
        default_file_out = pd.read_parquet(default_file_path)
        self._split_graph_expected_default.set_index("ticker", inplace=True)
        expected_default = self._split_graph_expected_default

        # ``all(frame)`` iterates the column labels, not the cell values, so
        # it never checked the comparison; reduce the boolean frame itself.
        assert (expected_default == default_file_out.round(2)).all().all()
        assert (expected_default == ing_out.round(2)).all().all()
        assert (expected_default == inf_out.round(2)).all().all()

        expected_side = self._split_graph_expected_side.sort_index(axis=1)
        assert (
            expected_side == side_file_out.sort_index(axis=1).round(2)
        ).all().all()
Esempio n. 27
0
    def _ingest_stocks_featureset(self):
        """Ingest ``stocks`` with default inference and check the resulting set.

        Verifies that a feature description can be set, that the returned frame
        has as many rows as ``stocks``, and that stats exist for ``exchange``.
        """
        entities = [Entity("ticker", ValueType.STRING)]
        stocks_set = fs.FeatureSet("stocks", entities=entities)
        ingested = fs.ingest(
            stocks_set, stocks, infer_options=fs.InferOptions.default()
        )
        self._logger.info(f"output df:\n{ingested}")

        stocks_set["name"].description = "some name"
        self._logger.info(f"stocks spec: {stocks_set.to_yaml()}")

        stored_description = stocks_set.spec.features["name"].description
        assert stored_description == "some name", "description was not set"
        assert len(ingested) == len(stocks), "dataframe size doesnt match"
        assert stocks_set.status.stats["exchange"], "stats not created"
Esempio n. 28
0
    def test_time_with_timezone(self):
        """Timezone-aware timestamps must come back from ingestion unchanged."""
        aware_time = datetime(2021, 6, 30, 15, 9, 35, tzinfo=timezone.utc)
        data = pd.DataFrame({
            "time": [aware_time, aware_time],
            "first_name": ["katya", "dina"],
            "bid": [2000, 10],
        })
        feature_set = fs.FeatureSet("fs4", entities=[Entity("first_name")])

        ingested = fs.ingest(feature_set, data, return_df=True)

        # The ingest result is compared with the input indexed by the entity.
        expected = data.set_index("first_name")
        assert_frame_equal(ingested, expected)
Esempio n. 29
0
    def test_sync_pipeline(self):
        """Run a one-step graph with the pandas engine.

        After the ``myfunc1`` handler step, the feature set must expose exactly
        one feature, ``exchange`` must not be among the features, and the row
        count must match ``stocks``.
        """
        stocks_set = fs.FeatureSet(
            "stocks-sync",
            entities=[Entity("ticker", ValueType.STRING)],
            engine="pandas",
        )
        stocks_set.graph.to(name="s1", handler="myfunc1")

        ingested = fs.ingest(stocks_set, stocks)
        self._logger.info(f"output df:\n{ingested}")

        feature_names = list(stocks_set.spec.features.keys())
        assert len(feature_names) == 1, "wrong num of features"
        assert "exchange" not in feature_names, "field was not dropped"
        assert len(ingested) == len(stocks), "dataframe size doesnt match"
Esempio n. 30
0
    def test_filtering_parquet_by_time(self):
        """Restrict a parquet source to a time window and count the rows.

        The window start is given as a ``datetime`` and the end as a string;
        ten rows are expected in the ingested result.
        """
        entity_name = "patient_id"
        feature_set = fs.FeatureSet(
            "measurements",
            entities=[Entity(entity_name)],
            timestamp_key="timestamp",
        )
        parquet_path = os.path.relpath(str(self.assets_path / "testdata.parquet"))
        windowed_source = ParquetSource(
            "myparquet",
            path=parquet_path,
            time_field="timestamp",
            start_time=datetime(2020, 12, 1, 17, 33, 15),
            end_time="2020-12-01 17:33:16",
        )

        ingested = fs.ingest(feature_set, windowed_source, return_df=True)
        assert len(ingested) == 10