Example #1
0
    def test_multiple_entities(self):
        """Ingest rows keyed by two entities and read an aggregate back online.

        Builds a uniquely named feature set whose entities are ``first_name``
        and ``last_name``, adds a ``bids`` aggregation (sum and max of ``bid``,
        window "1h", period "10m"), ingests six rows and then queries the
        online feature service for the ("yosi", "levi") key.
        """
        # unique name per run so repeated runs do not share a feature set
        name = f"measurements_{uuid.uuid4()}"
        current_time = pd.Timestamp.now()
        data = pd.DataFrame(
            {
                # six rows, one minute apart, newest first
                "time": [
                    current_time - pd.Timedelta(minutes=offset)
                    for offset in range(6)
                ],
                "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
                "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
                "bid": [2000, 10, 11, 12, 2500, 14],
            }
        )

        # write to kv
        data_set = fs.FeatureSet(
            name, entities=[Entity("first_name"), Entity("last_name")]
        )

        data_set.add_aggregation(
            name="bids",
            column="bid",
            operations=["sum", "max"],
            windows="1h",
            period="10m",
            emit_policy=EmitAfterMaxEvent(1),
        )
        fs.infer_metadata(
            data_set,
            data,  # source
            entity_columns=["first_name", "last_name"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )

        data_set.plot(
            str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
        )
        fs.ingest(data_set, data, return_df=True)

        features = [
            f"{name}.bids_sum_1h",
        ]

        vector = fs.FeatureVector("my-vec", features)
        svc = fs.get_online_feature_service(vector)
        try:
            resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
            # the four yosi/levi bids in the input: 10 + 11 + 12 + 14
            assert resp[0]["bids_sum_1h"] == 47.0
        finally:
            # release the service even when the assertion above fails
            svc.close()
Example #2
0
    def test_split_graph(self):
        """Check that a graph with a side branch feeds both its targets.

        The main branch chains MyMap -> storey.Extend -> storey.Filter ->
        FeaturesetValidator. A second storey.Extend step ("side-step") is
        attached directly to the graph and is given its own CSV target, while
        the default targets are bound to the "FeaturesetValidator" step.
        """
        quotes_set = fs.FeatureSet("stock-quotes", entities=[fs.Entity("ticker")])

        quotes_set.graph.to("MyMap", "somemap1", field="multi1", multiplier=3).to(
            "storey.Extend", _fn="({'extra': event['bid'] * 77})"
        ).to("storey.Filter", "filter", _fn="(event['bid'] > 70)").to(
            FeaturesetValidator()
        )

        side_step_name = "side-step"
        quotes_set.graph.to(
            "storey.Extend",
            name=side_step_name,
            _fn="({'extra2': event['bid'] * 17})",
        )
        # before targets / a default final state are set, inference must fail
        with pytest.raises(mlrun.errors.MLRunPreconditionFailedError):
            fs.infer_metadata(quotes_set, quotes)

        non_default_target_name = "side-target"
        quotes_set.set_targets(
            targets=[
                CSVTarget(name=non_default_target_name, after_state=side_step_name)
            ],
            default_final_state="FeaturesetValidator",
        )

        quotes_set.plot(with_targets=True)

        inf_out = fs.infer_metadata(quotes_set, quotes)
        ing_out = fs.ingest(quotes_set, quotes, return_df=True)

        default_file_path = quotes_set.get_target_path(TargetTypes.parquet)
        side_file_path = quotes_set.get_target_path(non_default_target_name)

        side_file_out = pd.read_csv(side_file_path)
        default_file_out = pd.read_parquet(default_file_path)
        # NOTE(review): this re-indexes the object held on self in place; if
        # that object is shared between tests, a second run would no longer
        # find a "ticker" column - confirm against the fixture definition.
        self._split_graph_expected_default.set_index("ticker", inplace=True)

        # Iterating a DataFrame yields its column labels, so the builtin
        # all(frame_a == frame_b) never inspects the compared values. Reduce
        # the boolean comparison frame through its values instead.
        assert (
            (self._split_graph_expected_default == default_file_out.round(2))
            .values.all()
        ), "default target file differs from the expected frame"
        assert (
            (self._split_graph_expected_default == ing_out.round(2)).values.all()
        ), "ingest output differs from the expected frame"
        assert (
            (self._split_graph_expected_default == inf_out.round(2)).values.all()
        ), "infer_metadata output differs from the expected frame"

        # columns are sorted on both sides so their order does not matter
        assert (
            (
                self._split_graph_expected_side.sort_index(axis=1)
                == side_file_out.sort_index(axis=1).round(2)
            ).values.all()
        ), "side target file differs from the expected frame"
Example #3
0
    def _ingest_quotes_featureset(self):
        """Build the "stock-quotes" feature set, infer its metadata and ingest ``quotes``.

        The graph chains MyMap -> storey.Extend -> storey.Filter ->
        FeaturesetValidator, and two aggregations ("asks" and "bids") are
        registered before inference and ingestion.
        """
        quotes_set = FeatureSet("stock-quotes", entities=[Entity("ticker")])

        # assemble the processing graph one step at a time
        mapped = quotes_set.graph.to("MyMap", multiplier=3)
        extended = mapped.to("storey.Extend", _fn="({'z': event['bid'] * 77})")
        filtered = extended.to(
            "storey.Filter", "filter", _fn="(event['bid'] > 51.92)"
        )
        filtered.to(FeaturesetValidator())

        # (name, column, operations, windows); both use a "10m" period
        aggregation_specs = (
            ("asks", "ask", ["sum", "max"], ["1h", "5h"]),
            ("bids", "bid", ["min", "max"], ["1h"]),
        )
        for agg_name, column, operations, windows in aggregation_specs:
            quotes_set.add_aggregation(agg_name, column, operations, windows, "10m")

        inferred = fs.infer_metadata(
            quotes_set,
            quotes,
            entity_columns=["ticker"],
            timestamp_key="time",
            options=fs.InferOptions.default(),
        )
        self._logger.info(f"quotes spec: {quotes_set.spec.to_yaml()}")
        assert inferred["zz"].mean() == 9, "map didnt set the zz column properly"
        quotes_set["bid"].validator = MinMaxValidator(min=52, severity="info")

        plot_path = str(self.results_path / "pipe.png")
        quotes_set.plot(plot_path, rankdir="LR", with_targets=True)
        ingested = fs.ingest(quotes_set, quotes, return_df=True)
        self._logger.info(f"output df:\n{ingested}")
        assert quotes_set.status.stats.get("asks_sum_1h"), "stats not created"
Example #4
0
def test_serverless_ingest():
    """Run a local ingestion task from a CSV source into a CSV target.

    Metadata is inferred from the test CSV first; any stale target file is
    removed so the final existence check reflects this run only.
    """
    init_store()

    measurements = fs.FeatureSet("measurements", entities=[Entity("patient_id")])
    input_csv = local_dir + "testdata.csv"
    preview = fs.infer_metadata(
        measurements,
        pd.read_csv(input_csv),
        timestamp_key="timestamp",
        options=fs.InferOptions.default(),
    )
    print(preview.head(5))

    target_path = os.path.relpath(results_dir + "mycsv.csv")
    source = CSVSource("mycsv", path=os.path.relpath(input_csv))
    targets = [CSVTarget("mycsv", path=target_path)]
    # start from a clean slate: drop the output of any previous run
    if os.path.exists(target_path):
        os.remove(target_path)

    task_options = dict(
        name="test_ingest",
        infer_options=fs.InferOptions.Null,
        parameters={},
        function=None,
        local=True,
    )
    run_ingestion_task(measurements, source, targets, **task_options)
    assert os.path.exists(target_path), "result file was not generated"
Example #5
0
    def test_feature_set_db(self):
        """Save a feature set, then list, fetch and delete it through the DB."""
        name = "stocks_test"
        project = self.project_name

        ticker_entity = Entity("ticker", ValueType.STRING)
        stocks_set = fs.FeatureSet(name, entities=[ticker_entity])
        fs.infer_metadata(stocks_set, stocks)
        stocks_set.save()
        db = mlrun.get_run_db()

        # exactly one feature set should be listed under this name
        listed = db.list_feature_sets(project, name)
        assert len(listed) == 1, "bad number of results"

        fetched = fs.get_feature_set(name, project)
        assert fetched.metadata.name == name, "bad feature set response"

        # after deletion the listing must come back empty
        fs.delete_feature_set(name, project)
        remaining = db.list_feature_sets(project, name)
        assert not remaining, "Feature set should be deleted"
Example #6
0
def test_feature_set_db():
    """Save a feature set and read it back through the run DB."""
    init_store()

    name = "stocks_test"
    ticker_entity = Entity("ticker", ValueType.STRING)
    stocks_set = fs.FeatureSet(name, entities=[ticker_entity])
    fs.infer_metadata(stocks_set, stocks)
    stocks_set.save()
    db = mlrun.get_run_db()

    # listing with an empty project string and the set name must find one entry
    listed = db.list_feature_sets("", name)
    assert len(listed) == 1, "bad number of results"

    fetched = db.get_feature_set(name)
    assert fetched.metadata.name == name, "bad feature set response"
Example #7
0
def verify_ingest(
    base_data, keys, infer=False, targets=None, infer_options=fs.InferOptions.default()
):
    """Ingest ``base_data`` into a fresh feature set and compare the result row by row.

    :param base_data:     source frame; it is copied or re-indexed, never modified
                          (presumably a pandas DataFrame - it must provide
                          ``copy`` and ``set_index``)
    :param keys:          entity column name, or a list of names
    :param infer:         when true, ``keys`` are passed to ``fs.infer_metadata``
                          as entity columns and the index is set only after
                          ingestion; otherwise the data is indexed by ``keys``
                          before ingestion
    :param targets:       optional targets, applied with ``with_defaults=False``
    :param infer_options: forwarded to ``fs.ingest``
    """
    if isinstance(keys, str):
        keys = [keys]
    feature_set = fs.FeatureSet("my-feature-set")
    if infer:
        data = base_data.copy()
        fs.infer_metadata(feature_set, data, entity_columns=keys)
    else:
        data = base_data.set_index(keys=keys)
    if targets:
        feature_set.set_targets(targets=targets, with_defaults=False)
    df = fs.ingest(feature_set, data, infer_options=infer_options)

    assert len(df) == len(data)
    if infer:
        # index the local copy by the keys before comparing it with the output
        data.set_index(keys=keys, inplace=True)

    # read each value array once instead of once per row
    actual_rows = df.values
    expected_rows = data.values
    for idx, (actual, expected) in enumerate(zip(actual_rows, expected_rows)):
        assert all(actual == expected), f"row {idx} differs after ingest"