def test_csv_parquet_index_alignment(self):
    targets = [CSVTarget()]
    csv_align_set, _ = prepare_feature_set(
        "csv-align", "ticker", quotes, timestamp_key="time", targets=targets
    )
    csv_df = csv_align_set.to_dataframe()

    features = ["csv-align.*"]
    csv_vec = fs.FeatureVector("csv-align-vector", features)
    resp = fs.get_offline_features(csv_vec)
    csv_vec_df = resp.to_dataframe()

    targets = [ParquetTarget()]
    parquet_align_set, _ = prepare_feature_set(
        "parquet-align", "ticker", quotes, timestamp_key="time", targets=targets
    )
    parquet_df = parquet_align_set.to_dataframe()

    features = ["parquet-align.*"]
    parquet_vec = fs.FeatureVector("parquet-align-vector", features)
    resp = fs.get_offline_features(parquet_vec)
    parquet_vec_df = resp.to_dataframe()

    assert all(csv_df == parquet_df)
    assert all(csv_vec_df == parquet_vec_df)
def test_ingest_twice_with_nulls(self):
    name = f"test_ingest_twice_with_nulls_{uuid.uuid4()}"
    key = "key"

    measurements = fs.FeatureSet(
        name, entities=[Entity(key)], timestamp_key="my_time"
    )
    columns = [key, "my_string", "my_time"]
    df = pd.DataFrame(
        [["mykey1", "hello", pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
    )
    df.set_index("my_string")
    source = DataFrameSource(df)
    measurements.set_targets(
        targets=[ParquetTarget(partitioned=True)],
        with_defaults=False,
    )
    resp1 = fs.ingest(measurements, source)
    assert resp1.to_dict() == {
        "my_string": {"mykey1": "hello"},
        "my_time": {"mykey1": pd.Timestamp("2019-01-26 14:52:37")},
    }

    features = [
        f"{name}.*",
    ]
    vector = fs.FeatureVector("myvector", features)
    resp2 = fs.get_offline_features(vector)
    resp2 = resp2.to_dataframe()
    assert resp2.to_dict() == {"my_string": {"mykey1": "hello"}}

    measurements = fs.FeatureSet(
        name, entities=[Entity(key)], timestamp_key="my_time"
    )
    columns = [key, "my_string", "my_time"]
    df = pd.DataFrame(
        [["mykey2", None, pd.Timestamp("2019-01-26 14:52:37")]], columns=columns
    )
    df.set_index("my_string")
    source = DataFrameSource(df)
    measurements.set_targets(
        targets=[ParquetTarget(partitioned=True)],
        with_defaults=False,
    )
    resp1 = fs.ingest(measurements, source, overwrite=False)
    assert resp1.to_dict() == {
        "my_string": {"mykey2": None},
        "my_time": {"mykey2": pd.Timestamp("2019-01-26 14:52:37")},
    }

    features = [
        f"{name}.*",
    ]
    vector = fs.FeatureVector("myvector", features)
    resp2 = fs.get_offline_features(vector)
    resp2 = resp2.to_dataframe()
    assert resp2.to_dict() == {"my_string": {"mykey1": "hello", "mykey2": None}}
def test_unaggregated_columns(self):
    test_base_time = datetime(2020, 12, 1, 17, 33, 15)

    data = pd.DataFrame(
        {
            "time": [test_base_time, test_base_time - pd.Timedelta(minutes=1)],
            "first_name": ["moshe", "yosi"],
            "last_name": ["cohen", "levi"],
            "bid": [2000, 10],
        }
    )

    name = f"measurements_{uuid.uuid4()}"

    # write to kv
    data_set = fs.FeatureSet(name, entities=[Entity("first_name")])
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
    )
    fs.ingest(data_set, data, return_df=True)

    features = [f"{name}.bids_sum_1h", f"{name}.last_name"]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "moshe"}])
    expected = {"bids_sum_1h": 2000.0, "last_name": "cohen"}
    assert resp[0] == expected
    svc.close()
def test_parquet_target_vector_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    fset = fs.FeatureSet(name="fvec-parquet-fset", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["fvec-parquet-fset.*"]
    fvec = fs.FeatureVector("fvec-parquet", features=features)

    target = ParquetTarget()
    off1 = fs.get_offline_features(fvec, target=target)
    dfout1 = pd.read_parquet(target._target_path)

    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(off1.to_dataframe().sort_index())
    )
    assert df1.set_index(keys="name").sort_index().equals(dfout1.sort_index())

    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    fs.ingest(fset, df2)
    off2 = fs.get_offline_features(fvec, target=target)
    dfout2 = pd.read_parquet(target._target_path)

    assert (
        df2.set_index(keys="name")
        .sort_index()
        .equals(off2.to_dataframe().sort_index())
    )
    assert df2.set_index(keys="name").sort_index().equals(dfout2.sort_index())
def test_override_false(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})
    df3 = pd.concat([df1, df2])

    fset = fs.FeatureSet(name="override-false", entities=[fs.Entity("name")])
    fs.ingest(fset, df1)

    features = ["override-false.*"]
    fvec = fs.FeatureVector("override-false-vec", features=features)

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    fs.ingest(fset, df2, overwrite=False)

    off2 = fs.get_offline_features(fvec).to_dataframe()
    assert df3.set_index(keys="name").sort_index().equals(off2.sort_index())

    fs.ingest(fset, df1, targets=[ParquetTarget()])

    off1 = fs.get_offline_features(fvec).to_dataframe()
    assert df1.set_index(keys="name").sort_index().equals(off1.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()

    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, targets=[CSVTarget()], overwrite=False)

    fset.set_targets(targets=[CSVTarget()])
    with pytest.raises(mlrun.errors.MLRunInvalidArgumentError):
        fs.ingest(fset, df1, overwrite=False)
def _get_offline_vector(self, features, features_size):
    vector = fs.FeatureVector("myvector", features, "stock-quotes.xx")
    resp = fs.get_offline_features(
        vector,
        entity_rows=trades,
        entity_timestamp_column="time",
    )
    assert len(vector.spec.features) == len(
        features
    ), "unexpected num of requested features"
    assert (
        len(vector.status.features) == features_size
    ), "unexpected num of returned features"
    assert (
        len(vector.status.stats) == features_size
    ), "unexpected num of feature stats"
    assert vector.status.label_column == "xx", "unexpected label_column name"

    df = resp.to_dataframe()
    columns = trades.shape[1] + features_size - 2  # - 2 keys
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(str(self.results_path / "query.parquet"))

    # check simple api without join with other df
    resp = fs.get_offline_features(vector)
    df = resp.to_dataframe()
    assert df.shape[1] == features_size, "unexpected num of returned df columns"
def test_check_permissions():
    data = pd.DataFrame(
        {
            "time_stamp": [
                pd.Timestamp("2021-06-09 09:30:06.008"),
                pd.Timestamp("2021-06-09 10:29:07.009"),
                pd.Timestamp("2021-06-09 09:29:08.010"),
            ],
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        }
    )
    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])

    mlrun.db.FileRunDB.verify_authorization = unittest.mock.Mock(
        side_effect=mlrun.errors.MLRunAccessDeniedError("")
    )

    try:
        fs.preview(
            data_set1,
            data,
            entity_columns=[Entity("string")],
            timestamp_key="time_stamp",
        )
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    features = ["fs1.*"]
    feature_vector = fs.FeatureVector("test", features)

    try:
        fs.get_offline_features(feature_vector, entity_timestamp_column="time_stamp")
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.get_online_feature_service(feature_vector)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        fs.deploy_ingestion_service(featureset=data_set1)
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass

    try:
        data_set1.purge_targets()
        assert False
    except mlrun.errors.MLRunAccessDeniedError:
        pass
def test_multiple_entities(self):
    name = f"measurements_{uuid.uuid4()}"
    current_time = pd.Timestamp.now()
    data = pd.DataFrame(
        {
            "time": [
                current_time,
                current_time - pd.Timedelta(minutes=1),
                current_time - pd.Timedelta(minutes=2),
                current_time - pd.Timedelta(minutes=3),
                current_time - pd.Timedelta(minutes=4),
                current_time - pd.Timedelta(minutes=5),
            ],
            "first_name": ["moshe", "yosi", "yosi", "yosi", "moshe", "yosi"],
            "last_name": ["cohen", "levi", "levi", "levi", "cohen", "levi"],
            "bid": [2000, 10, 11, 12, 2500, 14],
        }
    )

    # write to kv
    data_set = fs.FeatureSet(
        name, entities=[Entity("first_name"), Entity("last_name")]
    )
    data_set.add_aggregation(
        name="bids",
        column="bid",
        operations=["sum", "max"],
        windows="1h",
        period="10m",
        emit_policy=EmitAfterMaxEvent(1),
    )
    fs.infer_metadata(
        data_set,
        data,  # source
        entity_columns=["first_name", "last_name"],
        timestamp_key="time",
        options=fs.InferOptions.default(),
    )
    data_set.plot(
        str(self.results_path / "pipe.png"), rankdir="LR", with_targets=True
    )
    fs.ingest(data_set, data, return_df=True)

    features = [
        f"{name}.bids_sum_1h",
    ]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "yosi", "last_name": "levi"}])
    assert resp[0]["bids_sum_1h"] == 47.0

    svc.close()
def test_ordered_pandas_asof_merge(self):
    left_set, left = prepare_feature_set(
        "left", "ticker", trades, timestamp_key="time"
    )
    right_set, right = prepare_feature_set(
        "right", "ticker", quotes, timestamp_key="time"
    )

    features = ["left.*", "right.*"]
    feature_vector = fs.FeatureVector("test_fv", features, description="test FV")
    res = fs.get_offline_features(feature_vector, entity_timestamp_column="time")
    res = res.to_dataframe()
    assert res.shape[0] == left.shape[0]
def _get_online_features(self, features, features_size):
    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["name"] == "Apple Inc" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"
    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert (
        len(resp2[0]) == features_size - 1
    ), "unexpected online vector size"  # -1 label
    svc.close()
def test_overwrite(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    fset = fs.FeatureSet(name="overwrite-fs", entities=[fs.Entity("name")])
    fs.ingest(fset, df1, targets=[CSVTarget(), ParquetTarget(), NoSqlTarget()])

    features = ["overwrite-fs.*"]
    fvec = fs.FeatureVector("overwrite-vec", features=features)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df1.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0]["value"] == 3
    svc.close()

    fs.ingest(fset, df2)

    csv_path = fset.get_target_path(name="csv")
    csv_df = pd.read_csv(csv_path)
    assert (
        df1.set_index(keys="name")
        .sort_index()
        .equals(csv_df.set_index(keys="name").sort_index())
    )

    parquet_path = fset.get_target_path(name="parquet")
    parquet_df = pd.read_parquet(parquet_path)
    assert df2.set_index(keys="name").sort_index().equals(parquet_df.sort_index())

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "GHI"}])
    assert resp[0] is None

    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    svc.close()
def test_none_value(self):
    data = pd.DataFrame(
        {"first_name": ["moshe", "yossi"], "bid": [2000, 10], "bool": [True, None]}
    )

    # write to kv
    data_set = fs.FeatureSet("tests2", entities=[Entity("first_name")])
    fs.ingest(data_set, data, return_df=True)
    features = ["tests2.*"]
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"first_name": "yossi"}])
    assert resp[0] == {"bid": 10, "bool": None}

    svc.close()
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]
    features_size = (
        len(features) + 1 + 1
    )  # (*) returns 2 features, label adds 1 feature

    resp = fs.get_offline_features(
        features,
        entity_rows=trades,
        entity_timestamp_column="time",
        label_feature="stock-quotes.xx",
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features
    ), "unexpected num of requested features"
    assert (
        len(vector.status.features) == features_size
    ), "unexpected num of returned features"
    assert (
        len(vector.status.stats) == features_size
    ), "unexpected num of feature stats"
    assert vector.status.label_column == "xx", "unexpected label_column name"

    df = resp.to_dataframe()
    columns = trades.shape[1] + features_size - 2  # - 2 keys
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["name"] == "Apple Inc" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"
    resp2 = svc.get([{"ticker": "AAPL"}], as_list=True)
    assert (
        len(resp2[0]) == features_size - 1
    ), "unexpected online vector size"  # -1 label
    svc.close()
def test_feature_vector_db(self):
    name = "fvec-test"
    fvec = fs.FeatureVector(name=name)

    db = mlrun.get_run_db()

    # TODO: Using to_dict due to a bug in httpdb api which will be fixed in another PR
    db.create_feature_vector(
        feature_vector=fvec.to_dict(), project=self.project_name
    )

    vecs = db.list_feature_vectors(self.project_name, name)
    assert len(vecs) == 1, "bad number of results"

    feature_vec = db.get_feature_vector(name, self.project_name)
    assert feature_vec.metadata.name == name, "bad feature set response"

    fs.delete_feature_vector(name, self.project_name)
    vecs = db.list_feature_vectors(self.project_name, name)
    assert not vecs, "Feature vector should be deleted"
def test_right_not_ordered_pandas_asof_merge():
    init_store()

    right = quotes.sort_values(by="bid")

    left_set, left = prepare_feature_set(
        "left", "ticker", trades, timestamp_key="time"
    )
    right_set, right = prepare_feature_set(
        "right", "ticker", right, timestamp_key="time"
    )

    features = ["left.*", "right.*"]
    feature_vector = fs.FeatureVector("test_fv", features, "test FV")
    res = fs.get_offline_features(feature_vector, entity_timestamp_column="time")
    res = res.to_dataframe()
    assert res.shape[0] == left.shape[0]
def test_overwrite_specified_nosql_path(self):
    df1 = pd.DataFrame({"name": ["ABC", "DEF", "GHI"], "value": [1, 2, 3]})
    df2 = pd.DataFrame({"name": ["JKL", "MNO", "PQR"], "value": [4, 5, 6]})

    targets = [NoSqlTarget(path="v3io:///bigdata/overwrite-spec")]

    fset = fs.FeatureSet(name="overwrite-spec-path", entities=[fs.Entity("name")])
    features = ["overwrite-spec-path.*"]
    fvec = fs.FeatureVector("overwrite-spec-path-fvec", features=features)

    fs.ingest(fset, df1, targets=targets)

    fs.ingest(fset, df2, targets=targets)

    svc = fs.get_online_feature_service(fvec)
    resp = svc.get(entity_rows=[{"name": "PQR"}])
    assert resp[0]["value"] == 6
    resp = svc.get(entity_rows=[{"name": "ABC"}])
    assert resp[0] is None
    svc.close()
def test_offline_features_filter_non_partitioned(self):
    data = pd.DataFrame(
        {
            "time_stamp": [
                pd.Timestamp("2021-06-09 09:30:06.008"),
                pd.Timestamp("2021-06-09 10:29:07.009"),
                pd.Timestamp("2021-06-09 09:29:08.010"),
            ],
            "data": [10, 20, 30],
            "string": ["ab", "cd", "ef"],
        }
    )

    data_set1 = fs.FeatureSet("fs1", entities=[Entity("string")])
    fs.ingest(data_set1, data, infer_options=fs.InferOptions.default())
    features = ["fs1.*"]
    vector = fs.FeatureVector("vector", features)
    resp = fs.get_offline_features(
        vector,
        entity_timestamp_column="time_stamp",
        start_time=datetime(2021, 6, 9, 9, 30),
        end_time=datetime(2021, 6, 9, 10, 30),
    )
    assert len(resp.to_dataframe()) == 2
def test_realtime_query():
    init_store()

    features = [
        "stock-quotes.bid",
        "stock-quotes.asks_sum_5h",
        "stock-quotes.ask as mycol",
        "stocks.*",
    ]

    resp = fs.get_offline_features(
        features, entity_rows=trades, entity_timestamp_column="time"
    )
    vector = resp.vector
    assert len(vector.spec.features) == len(
        features
    ), "unexpected num of requested features"
    # stocks (*) returns 2 features
    assert (
        len(vector.status.features) == len(features) + 1
    ), "unexpected num of returned features"
    assert (
        len(vector.status.stats) == len(features) + 1
    ), "unexpected num of feature stats"

    df = resp.to_dataframe()
    columns = trades.shape[1] + len(features) + 1
    assert df.shape[1] == columns, "unexpected num of returned df columns"
    resp.to_parquet(results_dir + "query.parquet")

    # test real-time query
    vector = fs.FeatureVector("my-vec", features)
    svc = fs.get_online_feature_service(vector)

    resp = svc.get([{"ticker": "GOOG"}, {"ticker": "MSFT"}])
    print(resp)
    resp = svc.get([{"ticker": "AAPL"}])
    assert (
        resp[0]["ticker"] == "AAPL" and resp[0]["exchange"] == "NASDAQ"
    ), "unexpected online result"
    svc.close()
def test_schedule_on_filtered_by_time(self, partitioned):
    name = f"sched-time-{str(partitioned)}"
    now = datetime.now()
    path = "v3io:///bigdata/bla.parquet"
    fsys = fsspec.filesystem(v3iofs.fs.V3ioFS.protocol)
    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 10:00:00"),
                pd.Timestamp("2021-01-10 11:00:00"),
            ],
            "first_name": ["moshe", "yosi"],
            "data": [2000, 10],
        }
    ).to_parquet(path=path, filesystem=fsys)

    cron_trigger = "*/2 * * * *"

    source = ParquetSource(
        "myparquet", path=path, time_field="time", schedule=cron_trigger
    )

    feature_set = fs.FeatureSet(
        name=name,
        entities=[fs.Entity("first_name")],
        timestamp_key="time",
        engine="spark",
    )

    if partitioned:
        targets = [
            NoSqlTarget(),
            ParquetTarget(
                name="tar1",
                path="v3io:///bigdata/fs1/",
                partitioned=True,
                partition_cols=["time"],
            ),
        ]
    else:
        targets = [
            ParquetTarget(
                name="tar2", path="v3io:///bigdata/fs2/", partitioned=False
            ),
            NoSqlTarget(),
        ]

    fs.ingest(
        feature_set,
        source,
        run_config=fs.RunConfig(local=False),
        targets=targets,
        spark_context=self.spark_service,
    )

    # ingest starts every second minute, and it takes ~90 seconds to finish
    if now.minute % 2 == 0:
        sleep(60 - now.second + 60 + 90)
    else:
        sleep(60 - now.second + 90)

    features = [f"{name}.*"]
    vec = fs.FeatureVector("sched_test-vec", features)

    svc = fs.get_online_feature_service(vec)

    resp = svc.get([{"first_name": "yosi"}, {"first_name": "moshe"}])
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 2000

    pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2021-01-10 12:00:00"),
                pd.Timestamp("2021-01-10 13:00:00"),
                now + pd.Timedelta(minutes=10),
                pd.Timestamp("2021-01-09 13:00:00"),
            ],
            "first_name": ["moshe", "dina", "katya", "uri"],
            "data": [50, 10, 25, 30],
        }
    ).to_parquet(path=path)

    sleep(120)
    resp = svc.get(
        [
            {"first_name": "yosi"},
            {"first_name": "moshe"},
            {"first_name": "katya"},
            {"first_name": "dina"},
            {"first_name": "uri"},
        ]
    )
    assert resp[0]["data"] == 10
    assert resp[1]["data"] == 50
    assert resp[2] is None
    assert resp[3]["data"] == 10
    assert resp[4] is None

    svc.close()

    # check offline
    resp = fs.get_offline_features(vec)
    assert len(resp.to_dataframe()) == 4
    assert "uri" not in resp.to_dataframe() and "katya" not in resp.to_dataframe()
def feature_selection(context,
                      df_artifact,
                      k: int = 5,
                      min_votes: float = 0.5,
                      label_column: str = None,
                      stat_filters: list = ['f_classif', 'mutual_info_classif', 'chi2', 'f_regression'],
                      model_filters: dict = {'LinearSVC': 'LinearSVC',
                                             'LogisticRegression': 'LogisticRegression',
                                             'ExtraTreesClassifier': 'ExtraTreesClassifier'},
                      max_scaled_scores: bool = True,
                      sample_ratio: float = None,
                      output_vector_name: str = None,
                      ignore_type_errors: bool = False,
                      is_feature_vector: bool = False):
    """Applies the selected feature-selection statistical functions or models on 'df_artifact'.

    Each statistical function or model votes for its best K features.
    If a feature has >= 'min_votes' votes, it is selected.

    :param context:            the function context.
    :param df_artifact:        the data to run feature selection on (a dataset or feature vector artifact).
    :param k:                  number of top features to select from each statistical function or model.
    :param min_votes:          minimal number of votes (from a model or by statistical function) needed
                               for a feature to be selected. Can be specified as a percentage of votes
                               or an absolute number of votes.
    :param label_column:       ground-truth (y) labels.
    :param stat_filters:       statistical functions to apply to the features
                               (from sklearn.feature_selection).
    :param model_filters:      models to use for feature evaluation, can be specified by model name
                               (ex. LinearSVC), formalized json (contains 'CLASS', 'FIT', 'META')
                               or a path to such json file.
    :param max_scaled_scores:  produce a feature scores table scaled with max_scaler.
    :param sample_ratio:       percentage of the dataset the user wishes to compute the feature
                               selection process on.
    :param output_vector_name: creates a new feature vector containing only the identified features.
    :param ignore_type_errors: skips datatypes that are neither float nor int within the feature vector.
    :param is_feature_vector:  bool stating if the data is passed as a feature vector.
    """
    # Check if df_artifact.meta is valid; if it is, look for a feature vector
    if df_artifact.meta:
        if df_artifact.meta.kind == mlrun.api.schemas.ObjectKind.feature_vector:
            is_feature_vector = True

    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split('.')[1]
        else:
            raise ValueError('No label_column was given, please add a label_column.')

    # Use the feature vector as a dataframe
    df = df_artifact.as_df()

    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(f'K cannot be bigger than the total number of features '
                         f'({df.shape[1]}). Please choose a smaller K.')
    elif k < 1:
        raise ValueError('K cannot be smaller than 1. Please choose a bigger K.')

    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = df.groupby(label_column).apply(
            lambda x: x.sample(frac=sample_ratio)
        ).reset_index(drop=True)
        df = df.dropna()

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    if np.object in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(
            f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float nor int."
        )

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(create_class(f'sklearn.feature_selection.{stat_name}'), k)
        for stat_name in stat_filters
    }
    requires_abs = ['chi2']

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()

    for stat_name, stat_func in stat_functions_list.items():
        try:
            # chi2 requires non-negative feature values
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(index=X.columns, columns=[stat_name], data=stat.scores_)
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features
        except Exception as e:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if '.json' in model:
            current_model = json.load(open(model, 'r'))
            ClassifierClass = create_class(current_model["META"]["class"])
            selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            selected_models[model_name] = all_sklearn_estimators[model_name]()
        else:
            try:
                current_model = json.loads(model) if isinstance(model, str) else model
                ClassifierClass = create_class(current_model["META"]["class"])
                selected_models[model_name] = ClassifierClass(**current_model["CLASS"])
            except Exception:
                context.logger.info(f'unable to load {model}')

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():
        if model_name == 'LogisticRegression':
            model.set_params(solver='liblinear')

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        if hasattr(select_from_model.estimator_, 'coef_'):
            stat_df = select_from_model.estimator_.coef_
        elif hasattr(select_from_model.estimator_, 'feature_importances_'):
            stat_df = select_from_model.estimator_.feature_importances_

        stat_df = pd.DataFrame(index=X.columns, columns=[model_name], data=stat_df[0])
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(key='feature_scores',
                        df=result_matrix_df,
                        local_path='feature_scores.parquet',
                        format='parquet')

    if max_scaled_scores:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(data=normalized_df,
                                     columns=result_matrix_df.columns,
                                     index=result_matrix_df.index)
        context.log_dataset(key='max_scaled_scores_feature_scores',
                            df=normalized_df,
                            local_path='max_scaled_scores_feature_scores.parquet',
                            format='parquet')

    # Create feature count DataFrame
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, 'num_votes'] = result_matrix_df.sum(axis=1)
    context.log_dataset(key='selected_features_count',
                        df=result_matrix_df,
                        local_path='selected_features_count.parquet',
                        format='parquet')

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f'votes needed to be selected: {votes_needed}')

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(key='selected_features',
                        df=final_df,
                        local_path='selected_features.parquet',
                        format='parquet')

    # Create a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:
        # Select the top K features from our top feature dataframe
        selected_features = result_matrix_df.head(k).index

        # Match the selected feature names to the feature store Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Define our new feature vector
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description='feature vector composed strictly of our top features',
        )

        # Save it and materialize it to a parquet target
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Log our new feature vector URI
        context.log_result('top_features_vector', top_features_fv.uri)
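# A minimal usage sketch for feature_selection, not part of the original code: it shows
# how the function might be called from an MLRun handler. The input key "my_dataset" and
# the label column "label" are hypothetical placeholders, not values taken from this file.
def example_feature_selection_handler(context):
    # fetch the logged dataset / feature-vector artifact passed to the run
    df_artifact = context.get_input("my_dataset")  # hypothetical input key
    feature_selection(
        context,
        df_artifact,
        k=3,                   # each filter votes for its 3 best features
        min_votes=0.5,         # a feature must be picked by at least half of the filters
        label_column="label",  # hypothetical ground-truth column
    )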
def test_ingest_with_column_conversion(self):
    orig_df = source = pd.DataFrame(
        {
            "time_stamp": [
                pd.Timestamp("2002-04-01 04:32:34.000"),
                pd.Timestamp("2002-04-01 15:05:37.000"),
                pd.Timestamp("2002-03-31 23:46:07.000"),
            ],
            "ssrxbtok": [488441267876, 438975336749, 298802679370],
            "nkxuonfx": [0.241233, 0.160264, 0.045345],
            "xzvipbmo": [True, False, None],
            "bikyseca": ["ONE", "TWO", "THREE"],
            "napxsuhp": [True, False, True],
            "oegndrxe": [
                pd.Timestamp("2002-04-01 04:32:34.000"),
                pd.Timestamp("2002-04-01 05:06:34.000"),
                pd.Timestamp("2002-04-01 05:38:34.000"),
            ],
            "aatxnkgx": [-227504700006, -470002151801, -33193685176],
            "quupyoxi": ["FOUR", "FIVE", "SIX"],
            "temdojgz": [0.570031, 0.677182, 0.276053],
        },
        index=None,
    )

    fset = fs.FeatureSet(
        "rWQTKqbhje",
        timestamp_key="time_stamp",
        entities=[
            Entity("{}".format(k["name"]))
            for k in [
                {
                    "dtype": "float",
                    "null_values": False,
                    "name": "temdojgz",
                    "df_dtype": "float64",
                },
                {
                    "dtype": "str",
                    "null_values": False,
                    "name": "bikyseca",
                    "df_dtype": "object",
                },
                {
                    "dtype": "float",
                    "null_values": False,
                    "name": "nkxuonfx",
                    "df_dtype": "float64",
                },
            ]
        ],
    )

    fset.graph.to(name="s1", handler="my_func")
    ikjqkfcz = ParquetTarget(path="v3io:///bigdata/ifrlsjvxgv", partitioned=False)
    fs.ingest(fset, source, targets=[ikjqkfcz])

    features = ["rWQTKqbhje.*"]
    vector = fs.FeatureVector("WPAyrYux", features)
    vector.spec.with_indexes = False
    resp = fs.get_offline_features(vector)
    off_df = resp.to_dataframe()
    del orig_df["time_stamp"]
    if None in list(orig_df.index.names):
        orig_df.set_index(["temdojgz", "bikyseca", "nkxuonfx"], inplace=True)
    orig_df = orig_df.sort_values(
        by=["temdojgz", "bikyseca", "nkxuonfx"]
    ).sort_index(axis=1)
    off_df = off_df.sort_values(
        by=["temdojgz", "bikyseca", "nkxuonfx"]
    ).sort_index(axis=1)
    pd.testing.assert_frame_equal(
        off_df,
        orig_df,
        check_dtype=True,
        check_index_type=True,
        check_column_type=True,
        check_like=True,
        check_names=True,
    )
def test_ingest_partitioned_by_key_and_time(
    self, key_bucketing_number, partition_cols, time_partitioning_granularity
):
    key = "patient_id"
    name = f"measurements_{uuid.uuid4()}"
    measurements = fs.FeatureSet(name, entities=[Entity(key)])
    source = CSVSource(
        "mycsv",
        path=os.path.relpath(str(self.assets_path / "testdata.csv")),
        time_field="timestamp",
    )
    measurements.set_targets(
        targets=[
            ParquetTarget(
                partitioned=True,
                key_bucketing_number=key_bucketing_number,
                partition_cols=partition_cols,
                time_partitioning_granularity=time_partitioning_granularity,
            )
        ],
        with_defaults=False,
    )
    resp1 = fs.ingest(measurements, source)

    features = [
        f"{name}.*",
    ]
    vector = fs.FeatureVector("myvector", features)
    resp = fs.get_offline_features(vector)
    resp2 = resp.to_dataframe()

    assert resp1.to_dict() == resp2.to_dict()

    file_system = fsspec.filesystem("v3io")
    kind = TargetTypes.parquet
    path = f"{get_default_prefix_for_target(kind)}/sets/{name}-latest"
    path = path.format(name=name, kind=kind, project="system-test-project")
    dataset = pq.ParquetDataset(path, filesystem=file_system)
    partitions = [key for key, _ in dataset.pieces[0].partition_keys]

    if key_bucketing_number is None:
        expected_partitions = []
    elif key_bucketing_number == 0:
        expected_partitions = ["igzpart_key"]
    else:
        expected_partitions = [f"igzpart_hash{key_bucketing_number}_key"]
    expected_partitions += partition_cols or []
    if all(
        value is None
        for value in [
            key_bucketing_number,
            partition_cols,
            time_partitioning_granularity,
        ]
    ):
        time_partitioning_granularity = "hour"
    if time_partitioning_granularity:
        for unit in ["year", "month", "day", "hour"]:
            expected_partitions.append(f"igzpart_{unit}")
            if unit == time_partitioning_granularity:
                break

    assert partitions == expected_partitions

    resp = fs.get_offline_features(
        vector,
        start_time=datetime(2020, 12, 1, 17, 33, 15),
        end_time=datetime(2020, 12, 1, 17, 33, 16),
        entity_timestamp_column="timestamp",
    )
    resp2 = resp.to_dataframe()
    assert len(resp2) == 10