Example #1
    def test_same_target_type(self):
        parquet_path1 = str(self.results_path / _generate_random_name() /
                            "par1.parquet")
        parquet_path2 = str(self.results_path / _generate_random_name() /
                            "par2.parquet")

        targets = [
            ParquetTarget(name="parquet1", path=parquet_path1),
            ParquetTarget(name="parquet2", path=parquet_path2),
        ]
        feature_set, _ = prepare_feature_set("same-target-type",
                                             "ticker",
                                             quotes,
                                             timestamp_key="time",
                                             targets=targets)
        parquet1 = pd.read_parquet(
            feature_set.get_target_path(name="parquet1"))
        parquet2 = pd.read_parquet(
            feature_set.get_target_path(name="parquet2"))

        assert all(parquet1 == quotes.set_index("ticker"))
        assert all(parquet1 == parquet2)

        os.remove(parquet_path1)
        os.remove(parquet_path2)
Example #2
    def test_ingest_twice_with_nulls(self):
        name = f"test_ingest_twice_with_nulls_{uuid.uuid4()}"
        key = "key"

        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey1", "hello", pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        df.set_index("my_string")
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)
        assert resp1.to_dict() == {
            "my_string": {"mykey1": "hello"},
            "my_time": {"mykey1": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello"}}

        measurements = fs.FeatureSet(
            name, entities=[Entity(key)], timestamp_key="my_time"
        )
        columns = [key, "my_string", "my_time"]
        df = pd.DataFrame(
            [["mykey2", None, pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
        )
        df.set_index("my_string")
        source = DataFrameSource(df)
        measurements.set_targets(
            targets=[ParquetTarget(partitioned=True)], with_defaults=False,
        )
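        # ingest a second batch with overwrite=False so the new row is appended to the existing partitioned target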
        resp1 = fs.ingest(measurements, source, overwrite=False)
        assert resp1.to_dict() == {
            "my_string": {"mykey2": None},
            "my_time": {"mykey2": pd.Timestamp("2019-01-26 14:52:37")},
        }

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp2 = fs.get_offline_features(vector)
        resp2 = resp2.to_dataframe()
        assert resp2.to_dict() == {"my_string": {"mykey1": "hello", "mykey2": None}}
Example #3
    def test_purge(self):
        key = "patient_id"
        fset = fs.FeatureSet("purge",
                             entities=[Entity(key)],
                             timestamp_key="timestamp")
        path = os.path.relpath(str(self.assets_path / "testdata.csv"))
        source = CSVSource(
            "mycsv",
            path=path,
            time_field="timestamp",
        )
        targets = [
            CSVTarget(),
            CSVTarget(name="specified-path",
                      path="v3io:///bigdata/csv-purge-test.csv"),
            ParquetTarget(partitioned=True, partition_cols=["timestamp"]),
            NoSqlTarget(),
        ]
        fset.set_targets(
            targets=targets,
            with_defaults=False,
        )
        fs.ingest(fset, source)

        verify_purge(fset, targets)

        fs.ingest(fset, source)

        targets_to_purge = targets[:-1]
        verify_purge(fset, targets_to_purge)
Example #4
    def test_override_false(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        df3 = pd.concat([df1, df2])

        fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["override-false.*"]
        fvec = fs.FeatureVector("override-false-vec", features=features)

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        fs.ingest(fset, df2, overwrite=False)

        off2 = fs.get_offline_features(fvec).to_dataframe()
        assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

        fs.ingest(fset, df1, targets=[ParquetTarget()])
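        # the offline (parquet) target now holds df1 again, while the online (nosql) store still serves the df2 values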

        off1 = fs.get_offline_features(fvec).to_dataframe()
        assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()

        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

        fset.set_targets(targets=[CSVTarget()])
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df1, overwrite=False)
Example #5
    def test_parquet_target_vector_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        fset = fs.FeatureSet(name="fvec-parquet-fset", entities=[fs.Entity("name")])
        fs.ingest(fset, df1)

        features = ["fvec-parquet-fset.*"]
        fvec = fs.FeatureVector("fvec-parquet", features=features)

        target = ParquetTarget()
        off1 = fs.get_offline_features(fvec, target=target)
        dfout1 = pd.read_parquet(target._target_path)

        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(off1.to_dataframe().sort_index())
        )
        assert df1.set_index(keys="name").sort_index().equals(dfout1.sort_index())

        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
        fs.ingest(fset, df2)
        off2 = fs.get_offline_features(fvec, target=target)
        dfout2 = pd.read_parquet(target._target_path)
        assert (
            df2.set_index(keys="name")
            .sort_index()
            .equals(off2.to_dataframe().sort_index())
        )
        assert df2.set_index(keys="name").sort_index().equals(dfout2.sort_index())
Example #6
    def test_csv_parquet_index_alignment(self):
        targets = [CSVTarget()]
        csv_align_set, _ = prepare_feature_set("csv-align",
                                               "ticker",
                                               quotes,
                                               timestamp_key="time",
                                               targets=targets)
        csv_df = csv_align_set.to_dataframe()

        features = ["csv-align.*"]
        csv_vec = fs.FeatureVector("csv-align-vector", features)
        resp = fs.get_offline_features(csv_vec)
        csv_vec_df = resp.to_dataframe()

        targets = [ParquetTarget()]
        parquet_align_set, _ = prepare_feature_set("parquet-align",
                                                   "ticker",
                                                   quotes,
                                                   timestamp_key="time",
                                                   targets=targets)
        parquet_df = parquet_align_set.to_dataframe()
        features = ["parquet-align.*"]
        parquet_vec = fs.FeatureVector("parquet-align-vector", features)
        resp = fs.get_offline_features(parquet_vec)
        parquet_vec_df = resp.to_dataframe()

        assert all(csv_df == parquet_df)
        assert all(csv_vec_df == parquet_vec_df)
Example #7
    def test_ingest_dataframe_index(self):
        orig_df = pd.DataFrame([{"x", "y"}])
        orig_df.index.name = "idx"

        fset = fs.FeatureSet("myfset", entities=[Entity("idx")])
        fs.ingest(
            fset, orig_df, [ParquetTarget()], infer_options=fs.InferOptions.default()
        )
Example #8
    def test_forced_columns_target(self):
        columns = ["time", "ask"]
        targets = [ParquetTarget(columns=columns)]
        quotes_set, _ = prepare_feature_set(
            "forced-columns", "ticker", quotes, timestamp_key="time", targets=targets
        )

        df = pd.read_parquet(quotes_set.get_target_path())
        assert all(df.columns.values == columns)
Example #9
    def test_overwrite_single_parquet_file(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        targets = [
            ParquetTarget(path="v3io:///bigdata/overwrite-pq-spec/my.parquet")
        ]

        fset = fs.FeatureSet(name="overwrite-pq-spec-path",
                             entities=[fs.Entity("name")])

        fs.ingest(fset, df1, targets=targets)
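        # appending (overwrite=False) to a target that points at a single parquet file is expected to fail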
        with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
            fs.ingest(fset, df2, targets=targets, overwrite=False)
Example #10
    def test_ordered_pandas_asof_merge(self):
        targets = [ParquetTarget(), NoSqlTarget()]
        left_set, left = prepare_feature_set(
            "left", "ticker", trades, timestamp_key="time", targets=targets
        )
        right_set, right = prepare_feature_set(
            "right", "ticker", quotes, timestamp_key="time", targets=targets
        )

        features = ["left.*", "right.*"]
        feature_vector = fs.FeatureVector("test_fv", features, description="test FV")
        res = fs.get_offline_features(feature_vector, entity_timestamp_column="time")
        res = res.to_dataframe()
        assert res.shape[0] == left.shape[0]
Example #11
    def test_non_partitioned_target_in_dir(self):
        source = CSVSource(
            "mycsv", path=os.path.relpath(str(self.assets_path / "testdata.csv"))
        )
        path = str(self.results_path / _generate_random_name())
        target = ParquetTarget(path=path)

        fset = fs.FeatureSet(
            name="test", entities=[Entity("patient_id")], timestamp_key="timestamp"
        )
        fs.ingest(fset, source, targets=[target])

        list_files = os.listdir(path)
        assert len(list_files) == 1 and not os.path.isdir(path + "/" + list_files[0])
        os.remove(path + "/" + list_files[0])
Example #12
    def test_target_list_validation(self):
        targets = [ParquetTarget()]
        verify_target_list_fail(targets, with_defaults=True)

        targets = [ParquetTarget(path="path1"), ParquetTarget(path="path2")]
        verify_target_list_fail(targets, with_defaults=False)

        targets = [ParquetTarget(name="parquet1"), ParquetTarget(name="parquet2")]
        verify_target_list_fail(targets)

        targets = [
            ParquetTarget(name="same-name", path="path1"),
            ParquetTarget(name="same-name", path="path2"),
        ]
        verify_target_list_fail(targets, with_defaults=False)

        targets = [
            ParquetTarget(name="parquet1", path="same-path"),
            ParquetTarget(name="parquet2", path="same-path"),
        ]
        verify_target_list_fail(targets)
Example #13
    def test_overwrite(self):
        df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
        df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

        fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
        fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

        features = ["overwrite-fs.*"]
        fvec = fs.FeatureVector("overwrite-vec", features=features)

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0]["value"] == 3
        svc.close()

        fs.ingest(fset, df2)
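        # after this second ingest the parquet and nosql targets hold df2, while the CSV written by the first ingest still holds df1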

        csv_path = fset.get_target_path(name="csv")
        csv_df = pd.read_csv(csv_path)
        assert (
            df1.set_index(keys="name")
            .sort_index()
            .equals(csv_df.set_index(keys="name").sort_index())
        )

        parquet_path = fset.get_target_path(name="parquet")
        parquet_df = pd.read_parquet(parquet_path)
        assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

        svc = fs.get_online_feature_service(fvec)
        resp = svc.get(entity_rows=[{"name": "GHI"}])
        assert resp[0] is None

        resp = svc.get(entity_rows=[{"name": "PQR"}])
        assert resp[0]["value"] == 6
        svc.close()
Example #14
def test_backwards_compatibility_step_vs_state():
    quotes_set = fs.FeatureSet("post-aggregation",
                               entities=[fs.Entity("ticker")])
    agg_step = quotes_set.add_aggregation("ask", ["sum", "max"], "1h", "10m")
    agg_step.to("MyMap", "somemap1", field="multi1", multiplier=3)
    quotes_set.set_targets(
        targets=[ParquetTarget("parquet1", after_state="somemap1")],
        with_defaults=False,
    )

    feature_set_dict = quotes_set.to_dict()
    # Make sure we're backwards compatible
    feature_set_dict["spec"]["graph"]["states"] = feature_set_dict["spec"][
        "graph"].pop("steps")
    feature_set_dict["spec"]["targets"][0]["after_state"] = feature_set_dict[
        "spec"]["targets"][0].pop("after_step")

    from_dict_feature_set = fs.FeatureSet.from_dict(feature_set_dict)
    assert (deepdiff.DeepDiff(from_dict_feature_set.to_dict(),
                              quotes_set.to_dict()) == {})
Example #15
    def create_feature_set(self):
        feature_set = fs.FeatureSet("monitoring",
                                    entities=[ENDPOINT_ID],
                                    timestamp_key=TIMESTAMP)
        feature_set.graph.to(
            "ProcessEndpointEvent",
            kv_container=self.kv_container,
            kv_path=self.kv_path,
            v3io_access_key=self.v3io_access_key,
        ).to("storey.Filter", "filter_none",
             _fn="(event is not None)").to("storey.FlatMap",
                                           "flatten_events",
                                           _fn="(event)").to(
                                               "MapFeatureNames",
                                               name="MapFeatureNames",
                                               kv_container=self.kv_container,
                                               kv_path=self.kv_path,
                                               access_key=self.v3io_access_key,
                                               infer_columns_from_data=True,
                                           )
        # kv and tsdb branch
        feature_set.add_aggregation(
            ENDPOINT_ID,
            ["count"],
            self.aggregate_count_windows,
            self.aggregate_count_period,
            name=PREDICTIONS,
            after="MapFeatureNames",
            step_name="Aggregates",
        )
        feature_set.add_aggregation(
            LATENCY,
            ["avg"],
            self.aggregate_avg_windows,
            self.aggregate_avg_period,
        )
        feature_set.graph.add_step(
            "storey.steps.SampleWindow",
            name="sample",
            after="Aggregates",
            window_size=self.sample_window,
            key=ENDPOINT_ID,
        )
        # kv
        feature_set.graph.add_step("ProcessBeforeKV",
                                   name="ProcessBeforeKV",
                                   after="sample")
        feature_set.graph.add_step(
            "WriteToKV",
            name="WriteToKV",
            after="ProcessBeforeKV",
            container=self.kv_container,
            table=self.kv_path,
        )
        feature_set.graph.add_step(
            "InferSchema",
            name="InferSchema",
            after="WriteToKV",
            v3io_access_key=self.v3io_access_key,
            v3io_framesd=self.v3io_framesd,
            container=self.kv_container,
            table=self.kv_path,
        )
        # tsdb
        feature_set.graph.add_step("ProcessBeforeTSDB",
                                   name="ProcessBeforeTSDB",
                                   after="sample")
        feature_set.graph.add_step(
            "FilterAndUnpackKeys",
            name="FilterAndUnpackKeys1",
            after="ProcessBeforeTSDB",
            keys=[BASE_METRICS],
        )
        feature_set.graph.add_step(
            "storey.TSDBTarget",
            name="tsdb1",
            after="FilterAndUnpackKeys1",
            path=self.tsdb_path,
            rate="10/m",
            time_col=TIMESTAMP,
            container=self.tsdb_container,
            access_key=self.v3io_access_key,
            v3io_frames=self.v3io_framesd,
            index_cols=[ENDPOINT_ID, RECORD_TYPE],
            max_events=self.tsdb_batching_max_events,
            timeout_secs=self.tsdb_batching_timeout_secs,
            key=ENDPOINT_ID,
        )
        feature_set.graph.add_step(
            "FilterAndUnpackKeys",
            name="FilterAndUnpackKeys2",
            after="ProcessBeforeTSDB",
            keys=[ENDPOINT_FEATURES],
        )
        feature_set.graph.add_step(
            "storey.TSDBTarget",
            name="tsdb2",
            after="FilterAndUnpackKeys2",
            path=self.tsdb_path,
            rate="10/m",
            time_col=TIMESTAMP,
            container=self.tsdb_container,
            access_key=self.v3io_access_key,
            v3io_frames=self.v3io_framesd,
            index_cols=[ENDPOINT_ID, RECORD_TYPE],
            max_events=self.tsdb_batching_max_events,
            timeout_secs=self.tsdb_batching_timeout_secs,
            key=ENDPOINT_ID,
        )
        feature_set.graph.add_step(
            "FilterAndUnpackKeys",
            name="FilterAndUnpackKeys3",
            after="ProcessBeforeTSDB",
            keys=[CUSTOM_METRICS],
        )
        feature_set.graph.add_step(
            "storey.Filter",
            "FilterNotNone",
            after="FilterAndUnpackKeys3",
            _fn="(event is not None)",
        )
        feature_set.graph.add_step(
            "storey.TSDBTarget",
            name="tsdb3",
            after="FilterNotNone",
            path=self.tsdb_path,
            rate="10/m",
            time_col=TIMESTAMP,
            container=self.tsdb_container,
            access_key=self.v3io_access_key,
            v3io_frames=self.v3io_framesd,
            index_cols=[ENDPOINT_ID, RECORD_TYPE],
            max_events=self.tsdb_batching_max_events,
            timeout_secs=self.tsdb_batching_timeout_secs,
            key=ENDPOINT_ID,
        )

        # parquet branch
        feature_set.graph.add_step(
            "ProcessBeforeParquet",
            name="ProcessBeforeParquet",
            after="MapFeatureNames",
            _fn="(event)",
        )
        storage_options = dict(
            v3io_access_key=self.model_monitoring_access_key,
            v3io_api=self.v3io_api)

        pq_target = ParquetTarget(
            path=self.parquet_path,
            after_step="ProcessBeforeParquet",
            key_bucketing_number=0,
            time_partitioning_granularity="hour",
            max_events=self.parquet_batching_max_events,
            flush_after_seconds=self.parquet_batching_timeout_secs,
            storage_options=storage_options,
            attributes={"infer_columns_from_data": True},
        )

        feature_set.set_targets(
            targets=[pq_target],
            with_defaults=False,
            default_final_step="ProcessBeforeParquet",
        )
        return feature_set
Example #16
    def test_ingest_with_column_conversion(self):
        orig_df = source = pd.DataFrame(
            {
                "time_stamp": [
                    pd.Timestamp("2002-04-01 04:32:34.000"),
                    pd.Timestamp("2002-04-01 15:05:37.000"),
                    pd.Timestamp("2002-03-31 23:46:07.000"),
                ],
                "ssrxbtok": [488441267876, 438975336749, 298802679370],
                "nkxuonfx": [0.241233, 0.160264, 0.045345],
                "xzvipbmo": [True, False, None],
                "bikyseca": ["ONE", "TWO", "THREE"],
                "napxsuhp": [True, False, True],
                "oegndrxe": [
                    pd.Timestamp("2002-04-01 04:32:34.000"),
                    pd.Timestamp("2002-04-01 05:06:34.000"),
                    pd.Timestamp("2002-04-01 05:38:34.000"),
                ],
                "aatxnkgx": [-227504700006, -470002151801, -33193685176],
                "quupyoxi": ["FOUR", "FIVE", "SIX"],
                "temdojgz": [0.570031, 0.677182, 0.276053],
            },
            index=None,
        )

        fset = fs.FeatureSet(
            "rWQTKqbhje",
            timestamp_key="time_stamp",
            entities=[
                Entity("{}".format(k["name"])) for k in [
                    {
                        "dtype": "float",
                        "null_values": False,
                        "name": "temdojgz",
                        "df_dtype": "float64",
                    },
                    {
                        "dtype": "str",
                        "null_values": False,
                        "name": "bikyseca",
                        "df_dtype": "object",
                    },
                    {
                        "dtype": "float",
                        "null_values": False,
                        "name": "nkxuonfx",
                        "df_dtype": "float64",
                    },
                ]
            ],
        )

        fset.graph.to(name="s1", handler="my_func")
        ikjqkfcz = ParquetTarget(path="v3io:///bigdata/ifrlsjvxgv",
                                 partitioned=False)
        fs.ingest(fset, source, targets=[ikjqkfcz])

        features = ["rWQTKqbhje.*"]
        vector = fs.FeatureVector("WPAyrYux", features)
        vector.spec.with_indexes = False
        resp = fs.get_offline_features(vector)
        off_df = resp.to_dataframe()
        del orig_df["time_stamp"]
        if None in list(orig_df.index.names):
            orig_df.set_index(["temdojgz", "bikyseca", "nkxuonfx"],
                              inplace=True)
        orig_df = orig_df.sort_values(
            by=["temdojgz", "bikyseca", "nkxuonfx"]).sort_index(axis=1)
        off_df = off_df.sort_values(
            by=["temdojgz", "bikyseca", "nkxuonfx"]).sort_index(axis=1)
        pd.testing.assert_frame_equal(
            off_df,
            orig_df,
            check_dtype=True,
            check_index_type=True,
            check_column_type=True,
            check_like=True,
            check_names=True,
        )
Example #17
    def test_ingest_partitioned_by_key_and_time(
        self, key_bucketing_number, partition_cols, time_partitioning_granularity
    ):
        key = "patient_id"
        name = f"measurements_{uuid.uuid4()}"
        measurements = fs.FeatureSet(name, entities=[Entity(key)])
        source = CSVSource(
            "mycsv",
            path=os.path.relpath(str(self.assets_path / "testdata.csv")),
            time_field="timestamp",
        )
        measurements.set_targets(
            targets=[
                ParquetTarget(
                    partitioned=True,
                    key_bucketing_number=key_bucketing_number,
                    partition_cols=partition_cols,
                    time_partitioning_granularity=time_partitioning_granularity,
                )
            ],
            with_defaults=False,
        )
        resp1 = fs.ingest(measurements, source)

        features = [
            f"{name}.*",
        ]
        vector = fs.FeatureVector("myvector", features)
        resp = fs.get_offline_features(vector)
        resp2 = resp.to_dataframe()

        assert resp1.to_dict() == resp2.to_dict()

        file_system = fsspec.filesystem("v3io")
        kind = TargetTypes.parquet
        path = f"{get_default_prefix_for_target(kind)}/sets/{name}-latest"
        path = path.format(name=name, kind=kind, project="system-test-project")
        dataset = pq.ParquetDataset(path, filesystem=file_system,)
        partitions = [key for key, _ in dataset.pieces[0].partition_keys]

        if key_bucketing_number is None:
            expected_partitions = []
        elif key_bucketing_number == 0:
            expected_partitions = ["igzpart_key"]
        else:
            expected_partitions = [f"igzpart_hash{key_bucketing_number}_key"]
        expected_partitions += partition_cols or []
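        # with no explicit partitioning options, ParquetTarget falls back to hourly time-based partitioning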
        if all(
            value is None
            for value in [
                key_bucketing_number,
                partition_cols,
                time_partitioning_granularity,
            ]
        ):
            time_partitioning_granularity = "hour"
        if time_partitioning_granularity:
            for unit in ["year", "month", "day", "hour"]:
                expected_partitions.append(f"igzpart_{unit}")
                if unit == time_partitioning_granularity:
                    break

        assert partitions == expected_partitions

        resp = fs.get_offline_features(
            vector,
            start_time=datetime(2020, 12, 1, 17, 33, 15),
            end_time=datetime(2020, 12, 1, 17, 33, 16),
            entity_timestamp_column="timestamp",
        )
        resp2 = resp.to_dataframe()
        assert len(resp2) == 10
Example #18
def feature_selection(context,
                      df_artifact,
                      k: int = 5,
                      min_votes: float = 0.5,
                      label_column: str = None,
                      stat_filters: list = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters: dict = {'LinearSVC': 'LinearSVC',
                                             'LogisticRegression': 'LogisticRegression',
                                             'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores: bool = True,
                      sample_ratio: float = None,
                      output_vector_name: str = None,
                      ignore_type_errors: bool = False,
                      is_feature_vector: bool = False):
    """Applies selected feature selection statistical functions
    or models on our 'df_artifact'.

    Each statistical function or model will vote for it's best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:           the function context.
    
    :param k:                 number of top features to select from each statistical
                              function or model.
                              
    :param min_votes:         minimal number of votes (from a model or by statistical
                              function) needed for a feature to be selected.
                              Can be specified by percentage of votes or absolute
                              number of votes.
                              
    :param label_column:      ground-truth (y) labels.
    
    :param stat_filters:      statistical functions to apply to the features
                              (from sklearn.feature_selection).
                              
    :param model_filters:     models to use for feature evaluation, can be specified by
                              model name (ex. LinearSVC), formalized json (contains 'CLASS',
                              'FIT', 'META') or a path to such json file.
                              
    :param max_scaled_scores: produce feature scores table scaled with max_scaler.

    :param sample_ratio: percentage of the dataset the user whishes to compute the feature selection process on.
    
    :param output_vector_name: creates a new feature vector containing only the identifies features.
    
    :param ignore_type_errors: skips datatypes that are neither float or int within the feature vector.
    
    :param is_feature_vector: bool stating if the data is passed as a feature vector.
    """
        
    # Check if df.meta is valid, if it is, look for a feature vector
    if df_artifact.meta:
        if df_artifact.meta.kind == mlrun.api.schemas.ObjectKind.feature_vector:
            is_feature_vector = True
    
    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split('.')[1]
        else:
            raise ValueError('No label_column was given, please add a label_column.')
    
    # Use the feature vector as dataframe
    df = df_artifact.as_df()
    
    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(f'K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K.')
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')
        
    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = df.groupby(label_column).apply(lambda x: x.sample(frac=sample_ratio)).reset_index(drop=True)
        df = df.dropna()
        
    # Set feature vector and labels
    y = df.pop(label_column)
    X = df
    
    if object in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float nor int.")
        
    # Create selected statistical estimators
    stat_functions_list = {stat_name: SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k=k)
                           for stat_name in stat_filters}
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()
                
    for stat_name, stat_func in stat_functions_list.items():
        try:
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns,
                                   columns=[stat_name],
                                   data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
            
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')
            
        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            scores = select_from_model.estimator_.coef_[0]
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            scores = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns,
                               columns=[model_name],
                               data=scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')
    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores',
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [1 if x in selected_features_agg[test_name] else 0 for x in X.columns]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[result_matrix_df.num_votes >= votes_needed].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')
    
    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:

        # Selecting the top K features by vote count
        selected_features = result_matrix_df.sort_values(by='num_votes', ascending=False).head(k).index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description='feature vector composed strictly of our top features',
        )

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result('top_features_vector', top_features_fv.uri)
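
For reference, a minimal sketch of how the handler above might be invoked through MLRun. The file name, dataset URI, and parameter values below are assumptions for illustration, not part of the original example.

import mlrun

# Hypothetical setup: wrap the module containing feature_selection() as an MLRun job.
fn = mlrun.code_to_function(
    name="feature-selection",
    filename="feature_selection.py",  # assumed file holding the handler above
    kind="job",
    image="mlrun/mlrun",
)

# Run the handler on a previously logged dataset; 'df_artifact' is delivered as a DataItem.
run = fn.run(
    handler="feature_selection",
    inputs={"df_artifact": "store://feature-vectors/default/patient-features"},  # hypothetical URI
    params={
        "k": 5,
        "min_votes": 0.5,
        "label_column": "label",
        "output_vector_name": "top-features",
    },
    local=True,
)

# The URI of the new feature vector (if one was created) is logged as a run result.
print(run.outputs.get("top_features_vector"))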
Example #19
    def test_schedule_on_filtered_by_time(self, partitioned):
        name = f"sched-time-{str(partitioned)}"

        now = datetime.now()

        path = "v3io:///bigdata/bla.parquet"
        fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }).to_parquet(path=path, filesystem=fsys)

        cron_trigger = "*/2 * * * *"

        source = ParquetSource("myparquet",
                               path=path,
                               time_field="time",
                               schedule=cron_trigger)

        feature_set = fs.FeatureSet(
            name=name,
            entities=[fs.Entity("first_name")],
            timestamp_key="time",
            engine="spark",
        )

        if partitioned:
            targets = [
                NoSqlTarget(),
                ParquetTarget(
                    name="tar1",
                    path="v3io:///bigdata/fs1/",
                    partitioned=True,
                    partition_cols=["time"],
                ),
            ]
        else:
            targets = [
                ParquetTarget(name="tar2",
                              path="v3io:///bigdata/fs2/",
                              partitioned=False),
                NoSqlTarget(),
            ]

        fs.ingest(
            feature_set,
            source,
            run_config=fs.RunConfig(local=False),
            targets=targets,
            spark_context=self.spark_service,
        )
        # ingest starts every second minute and it takes ~90 seconds to finish.
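        # wait until just past the next even-minute trigger, then give the scheduled ingest ~90 seconds to complete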
        if (now.minute % 2) == 0:
            sleep(60 - now.second + 60 + 90)
        else:
            sleep(60 - now.second + 90)

        features = [f"{name}.*"]
        vec = fs.FeatureVector("sched_test-vec", features)

        svc = fs.get_online_feature_service(vec)

        resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 2000

        pd.DataFrame({
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }).to_parquet(path=path)

        sleep(120)
        resp = svc.get([
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ])
        assert resp[0]["data"] == 10
        assert resp[1]["data"] == 50
        assert resp[2] is None
        assert resp[3]["data"] == 10
        assert resp[4] is None

        svc.close()

        # check offline
        resp = fs.get_offline_features(vec)
        off_df = resp.to_dataframe()
        assert len(off_df) == 4
        assert "uri" not in off_df and "katya" not in off_df