Example #1
    def test_override_false(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        # overwrite=False appends df2 to what is already stored in the targets
        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        # overwrite defaults to True, so the parquet target now holds only df1
        fs.ingest(fset, df1, targets=[ParquetTarget()])

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        # the NoSQL (online) target was not part of the last ingest, so it still serves df2
        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()

        # appending (overwrite=False) to a CSV target is not supported and should fail
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
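The examples on this page are excerpts from MLRun feature-store tests and omit their import headers. A plausible set of imports, assuming an older MLRun release in which fs.ingest, fs.get_offline_features and fs.get_online_feature_service are module-level functions (verify the module paths against your installed version), would be:

import os

import pandas as pd
import pytest

import mlrun
import mlrun.feature_store as fs
from mlrun.data_types import ValueType                      # Example #3
from mlrun.datastore.sources import CSVSource               # Examples #2, #4, #5
from mlrun.datastore.targets import (
    CSVTarget,
    NoSqlTarget,
    ParquetTarget,
    TargetTypes,                                            # Example #7
)
from mlrun.feature_store import Entity
from mlrun.feature_store.steps import FeaturesetValidator   # Example #7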
Example #2
    def test_purge(self):
        key = "patient_id"
        fset = fs.FeatureSet("purge",
                             entities=[Entity(key)],
                             timestamp_key="timestamp")
        path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource(
            "mycsv",
            path=path,
            time_field="timestamp",
        )
        targets = [
            CSVTarget(),
            CSVTarget(name="specified-path",
                      path="v3io:///bigdata/csv-purge-test.csv"),
            ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
            NoSqlTarget(),
        ]
        fset.set_targets(
            targets=targets,
            with_defaults=False,
        )
        fs.ingest(fset, source)

        verify_purge(fset, targets)

        fs.ingest(fset, source)

        targets_to_purge = targets[:-1]
        verify_purge(fset, targets_to_purge)
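verify_purge is a helper defined alongside these tests and not shown here. Based on how it is called, it presumably purges the given targets and then checks that they are really gone; a hypothetical sketch (the names and checks below are assumptions, not the original helper):

def verify_purge(fset, targets):
    # Hypothetical sketch only; the real helper in the test module may
    # perform additional checks (e.g. on the target files themselves).
    target_names = [t.name for t in targets]
    fset.purge_targets(target_names=target_names)
    fset.reload(update_spec=False)
    remaining = [t.name for t in fset.status.targets]
    for name in target_names:
        # a purged target should no longer appear in the feature set status
        assert name not in remaining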
Example #3
    def test_read_csv(self):
        from storey import ReadCSV, ReduceToDataFrame, build_flow

        csv_path = str(self.results_path / _generate_random_name() / ".csv")
        targets = [CSVTarget("mycsv", path=csv_path)]
        stocks_set = fs.FeatureSet(
            "tests", entities=[Entity("ticker", ValueType.STRING)])
        fs.ingest(stocks_set,
                  stocks,
                  infer_options=fs.InferOptions.default(),
                  targets=targets)

        # read the CSV written by the target back into a single DataFrame
        controller = build_flow([ReadCSV(csv_path), ReduceToDataFrame()]).run()
        termination_result = controller.await_termination()

        expected = pd.DataFrame({
            0: ["ticker", "MSFT", "GOOG", "AAPL"],
            1: ["name", "Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
            2: ["exchange", "NASDAQ", "NASDAQ", "NASDAQ"],
        })

        assert termination_result.equals(
            expected), f"{termination_result}\n!=\n{expected}"
        os.remove(csv_path)
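The stocks fixture and _generate_random_name are defined at module level and not shown; _generate_random_name presumably just returns a random string for the output path. The shape of stocks can be read directly off the expected DataFrame above:

# Assumed module-level fixture, reconstructed from the expected output
stocks = pd.DataFrame(
    {
        "ticker": ["MSFT", "GOOG", "AAPL"],
        "name": ["Microsoft Corporation", "Alphabet Inc", "Apple Inc"],
        "exchange": ["NASDAQ", "NASDAQ", "NASDAQ"],
    }
)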
Example #4
    def test_serverless_ingest(self):
        key = "patient_id"
        measurements = fs.FeatureSet("measurements",
                                     entities=[Entity(key)],
                                     timestamp_key="timestamp")
        target_path = os.path.relpath(str(self.results_path / "mycsv.csv"))
        source = CSVSource("mycsv",
                           path=os.path.relpath(
                               str(self.assets_path / "testdata.csv")))
        targets = [CSVTarget("mycsv", path=target_path)]
        if os.path.exists(target_path):
            os.remove(target_path)

        fs.ingest(
            measurements,
            source,
            targets,
            infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
            run_config=fs.RunConfig(local=True),
        )
        assert os.path.exists(target_path), "result file was not generated"
        features = sorted(measurements.spec.features.keys())
        stats = sorted(measurements.status.stats.keys())
        print(features)
        print(stats)
        stats.remove("timestamp")
        assert features == stats, "didn't infer stats for all features"
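fs.InferOptions.schema() + fs.InferOptions.Stats appears to combine flag values that ask ingestion to infer the schema (entities, features, index) and also collect statistics. The timestamp key gets a stats entry but is not registered as a feature, which is why it is removed from stats before the two key lists are compared.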
Example #5
def test_serverless_ingest():
    init_store()
    key = "patient_id"

    measurements = fs.FeatureSet("measurements",
                                 entities=[Entity(key)],
                                 timestamp_key="timestamp")
    target_path = os.path.relpath(results_dir + "mycsv.csv")
    source = CSVSource("mycsv",
                       path=os.path.relpath(local_dir + "testdata.csv"))
    targets = [CSVTarget("mycsv", path=target_path)]
    if os.path.exists(target_path):
        os.remove(target_path)

    run_ingestion_job(
        measurements,
        source,
        targets,
        name="test_ingest",
        infer_options=fs.InferOptions.schema() + fs.InferOptions.Stats,
        parameters={},
        function=None,
        local=True,
    )
    assert os.path.exists(target_path), "result file was not generated"
    features = sorted(measurements.spec.features.keys())
    stats = sorted(measurements.status.stats.keys())
    print(features)
    print(stats)
    stats.remove("timestamp")
    assert features == stats, "didn't infer stats for all features"

    print(measurements.to_yaml())
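This is the module-level variant of Example #4 from an older test suite: instead of fs.ingest(..., run_config=fs.RunConfig(local=True)) it drives the ingestion through run_ingestion_job with explicit name/parameters/function arguments. init_store, results_dir and local_dir are assumed to be test-module setup (store initialization and local paths) and are not shown here.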
Example #6
    def test_csv_parquet_index_alignment(self):
        targets = [CSVTarget()]
        csv_align_set, _ = prepare_feature_set("csv-align",
                                               "ticker",
                                               quotes,
                                               timestamp_key="time",
                                               targets=targets)
        csv_df = csv_align_set.to_dataframe()

        features = ["csv-align.*"]
        csv_vec = fs.FeatureVector("csv-align-vector", features)
        resp = fs.get_offline_features(csv_vec)
        csv_vec_df = resp.to_dataframe()

        targets = [ParquetTarget()]
        parquet_align_set, _ = prepare_feature_set("parquet-align",
                                                   "ticker",
                                                   quotes,
                                                   timestamp_key="time",
                                                   targets=targets)
        parquet_df = parquet_align_set.to_dataframe()
        features = ["parquet-align.*"]
        parquet_vec = fs.FeatureVector("parquet-align-vector", features)
        resp = fs.get_offline_features(parquet_vec)
        parquet_vec_df = resp.to_dataframe()

        assert all(csv_df == parquet_df)
        assert all(csv_vec_df == parquet_vec_df)
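prepare_feature_set and the quotes DataFrame come from the surrounding test module; quotes is a small per-ticker bid/ask table with a time column (Example #7 filters on event['bid']). The helper presumably creates the feature set, attaches the requested targets and ingests the data; a hypothetical sketch, not the original helper:

def prepare_feature_set(name, entity, data, timestamp_key=None, targets=None):
    # Hypothetical sketch; the real helper may differ in inference options
    # and in what it returns as the second value.
    fset = fs.FeatureSet(name, entities=[fs.Entity(entity)], timestamp_key=timestamp_key)
    if targets:
        fset.set_targets(targets=targets, with_defaults=False)
    df = fs.ingest(fset, data, infer_options=fs.InferOptions.default())
    return fset, df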
Example #7
    def test_split_graph(self):
        quotes_set = fs.FeatureSet("stock-quotes",
                                   entities=[fs.Entity("ticker")])

        quotes_set.graph.to("MyMap", "somemap1", field="multi1",
                            multiplier=3).to(
                                "storey.Extend",
                                _fn="({'extra': event['bid'] * 77})").to(
                                    "storey.Filter",
                                    "filter",
                                    _fn="(event['bid'] > 70)").to(
                                        FeaturesetValidator())

        side_step_name = "side-step"
        quotes_set.graph.to("storey.Extend",
                            name=side_step_name,
                            _fn="({'extra2': event['bid'] * 17})")
        with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
            fs.infer_metadata(quotes_set, quotes)

        non_default_target_name = "side-target"
        quotes_set.set_targets(
            targets=[
                CSVTarget(name=non_default_target_name,
                          after_state=side_step_name)
            ],
            default_final_state="FeaturesetValidator",
        )

        quotes_set.plot(with_targets=True)

        inf_out = fs.infer_metadata(quotes_set, quotes)
        ing_out = fs.ingest(quotes_set, quotes, return_df=True)

        default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
        side_file_path = quotes_set.get_target_path(non_default_target_name)

        side_file_out = pd.read_csv(side_file_path)
        default_file_out = pd.read_parquet(default_file_path)
        self._split_graph_expected_default.set_index("ticker", inplace=True)

        assert all(
            self._split_graph_expected_default == default_file_out.round(2))
        assert all(self._split_graph_expected_default == ing_out.round(2))
        assert all(self._split_graph_expected_default == inf_out.round(2))

        assert all(
            self._split_graph_expected_side.sort_index(
                axis=1) == side_file_out.sort_index(axis=1).round(2))
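MyMap is referenced by class name in the first graph step and is defined in the test module. Given the field="multi1" and multiplier=3 arguments, a hypothetical implementation as a storey step (an assumption, not the original class) might look like:

from storey import MapClass

class MyMap(MapClass):
    # Hypothetical sketch of the custom step named in the graph above.
    def __init__(self, field="multi", multiplier=1, **kwargs):
        super().__init__(**kwargs)
        self._field = field
        self._multiplier = multiplier

    def do(self, event):
        # add a derived column computed from the bid price
        event[self._field] = event["bid"] * self._multiplier
        return event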
Example #8
    def test_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0]["value"] == 3
        svc.close()

        # re-ingest df2 without an explicit target list (default overwrite)
        fs.ingest(fset, df2)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0] is None

        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()
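Note the asymmetry after the second ingest: fs.ingest(fset, df2) is called without an explicit target list, so apparently only the feature set's default targets (parquet and NoSQL) are rewritten. The CSV file produced by the first ingest still holds df1, while the parquet target and the online store now reflect df2, which is why GHI is no longer found and PQR is served; this is exactly what the final assertions verify.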