def get_feature_view_query_context(
    feature_refs: List[str], feature_views: List[FeatureView]
) -> List[FeatureViewQueryContext]:
    """Assemble one FeatureViewQueryContext per requested feature view.

    The contexts carry the values needed to template a BigQuery
    point-in-time SQL query.

    Args:
        feature_refs: Feature references selecting which features are wanted.
        feature_views: Candidate feature views the references are resolved
            against.

    Returns:
        One context per feature view returned by
        ``_get_requested_feature_views_to_features_dict``, in that mapping's
        iteration order.
    """
    requested = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views
    )

    contexts = []
    for view, view_features in requested.items():
        entity_names = list(view.entities)

        # Any TTL that is not a timedelta is passed on as 0 seconds.
        ttl_seconds = (
            int(view.ttl.total_seconds()) if isinstance(view.ttl, timedelta) else 0
        )

        assert isinstance(view.input, BigQuerySource)
        source = view.input

        contexts.append(
            FeatureViewQueryContext(
                name=view.name,
                ttl=ttl_seconds,
                entities=entity_names,
                features=view_features,
                table_ref=source.table_ref,
                event_timestamp_column=source.event_timestamp_column,
                # TODO: Make created column optional and not hardcoded
                created_timestamp_column=source.created_timestamp_column,
                field_mapping=source.field_mapping,
                query=source.query,
                table_subquery=source.get_table_query_string(),
            )
        )
    return contexts
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
) -> List[FeatureViewQueryContext]:
    """Assemble one FeatureViewQueryContext per requested feature view.

    The contexts carry the values needed to template a BigQuery
    point-in-time SQL query. Column names handed to the context are
    translated back through the source's ``field_mapping`` (value -> key)
    when an entry exists for them.

    Args:
        feature_refs: Feature references selecting which features are wanted.
        feature_views: Candidate feature views the references are resolved
            against.
        registry: Used to look up each entity of a feature view by name.
        project: Project name passed to ``registry.get_entity``.

    Returns:
        One context per feature view returned by
        ``_get_requested_feature_views_to_features_dict``.
    """
    requested = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views
    )

    contexts = []
    for view, view_features in requested.items():
        # Inverted field mapping: mapped name -> name of the source column.
        mapped_to_source = {
            mapped: original for original, mapped in view.input.field_mapping.items()
        }

        def source_column(column_name):
            """Return the pre-mapping column name, or the name unchanged."""
            return mapped_to_source.get(column_name, column_name)

        entities = [registry.get_entity(name, project) for name in view.entities]
        join_keys = [entity.join_key for entity in entities]
        entity_selections = [
            f"{source_column(entity.join_key)} AS {entity.join_key}"
            for entity in entities
        ]

        # Any TTL that is not a timedelta is passed on as 0 seconds.
        ttl_seconds = (
            int(view.ttl.total_seconds()) if isinstance(view.ttl, timedelta) else 0
        )

        assert isinstance(view.input, BigQuerySource)
        source = view.input

        contexts.append(
            FeatureViewQueryContext(
                name=view.name,
                ttl=ttl_seconds,
                entities=join_keys,
                features=view_features,
                table_ref=source.table_ref,
                event_timestamp_column=source_column(source.event_timestamp_column),
                # TODO: Make created column optional and not hardcoded
                created_timestamp_column=source_column(
                    source.created_timestamp_column
                ),
                query=source.query,
                table_subquery=source.get_table_query_string(),
                entity_selections=entity_selections,
            )
        )
    return contexts
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    """Build a lazy job that point-in-time joins file-based features onto entity_df.

    Nothing is read from the feature sources until the returned job invokes
    the evaluation function.

    Args:
        config: Repo configuration; only ``config.project`` is read here, to
            list on-demand feature views.
        feature_views: Candidate feature views the references are resolved
            against.
        feature_refs: Feature references selecting which features are wanted.
        entity_df: Entity rows to enrich. Must be a pandas DataFrame.
        registry: Used to look up entities and on-demand feature views.
        project: Project name passed to ``registry.get_entity``.
        full_feature_names: When true, output feature columns are named
            ``<feature view name>__<feature>``; otherwise the bare feature
            name is used.

    Returns:
        A FileRetrievalJob wrapping the lazy evaluation function.

    Raises:
        ValueError: If ``entity_df`` is not a pandas DataFrame, or if it has
            no column named DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL and does
            not have exactly one datetime column to fall back on.
    """
    if not isinstance(entity_df, pd.DataFrame):
        # Interpolate the class itself; type(pd.DataFrame) would print
        # "<class 'type'>", which tells the caller nothing.
        raise ValueError(
            f"Please provide an entity_df of type {pd.DataFrame} instead of type {type(entity_df)}"
        )

    # Local modifiable copy of the global default column name.
    entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            # Exactly one datetime column: use it as the event timestamp.
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    # The on-demand mapping (second element) is not needed in this function.
    feature_views_to_features, _ = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Copy first so that none of the normalization below is written back
        # into the caller's entity_df.
        entity_df_with_features = entity_df.copy()

        # Make sure all event timestamp fields are tz-aware. We default
        # tz-naive fields to UTC
        entity_df_with_features[entity_df_event_timestamp_col] = entity_df_with_features[
            entity_df_event_timestamp_col
        ].apply(lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

        # Convert event timestamp column to datetime and normalize time zone to UTC
        # This is necessary to avoid issues with pd.merge_asof
        entity_df_with_features[entity_df_event_timestamp_col] = pd.to_datetime(
            entity_df_with_features[entity_df_event_timestamp_col], utc=True
        )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.batch_source.event_timestamp_column
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Read offline parquet data in pyarrow format.
            filesystem, path = FileSource.create_filesystem_and_path(
                feature_view.batch_source.path,
                feature_view.batch_source.file_options.s3_endpoint_override,
            )
            table = pyarrow.parquet.read_table(path, filesystem=filesystem)

            # Rename columns by the field mapping dictionary if it exists
            if feature_view.batch_source.field_mapping is not None:
                table = _run_field_mapping(
                    table, feature_view.batch_source.field_mapping
                )

            # Rename entity columns by the join_key_map dictionary if it exists
            if feature_view.projection.join_key_map:
                table = _run_field_mapping(
                    table, feature_view.projection.join_key_map
                )

            # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
            # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean
            # If the dtype is 'object', then missing values are inferred as python `None`s.
            # More details at:
            # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
            df_to_join = table.to_pandas()

            # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
            df_to_join[event_timestamp_column] = df_to_join[
                event_timestamp_column
            ].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )
            if created_timestamp_column:
                df_to_join[created_timestamp_column] = df_to_join[
                    created_timestamp_column
                ].apply(
                    lambda x: x
                    if x.tzinfo is not None
                    else x.replace(tzinfo=pytz.utc)
                )

            # Sort dataframe by the event timestamp column
            df_to_join = df_to_join.sort_values(event_timestamp_column)

            # Build a list of all the features we should select from this source
            feature_names = []
            for feature in features:
                # Modify the separator for feature refs in column names to double underscore. We are using
                # double underscore as separator for consistency with other databases like BigQuery,
                # where there are very few characters available for use as separators
                if full_feature_names:
                    formatted_feature_name = (
                        f"{feature_view.projection.name_to_use()}__{feature}"
                    )
                else:
                    formatted_feature_name = feature
                # Add the feature name to the list of columns
                feature_names.append(formatted_feature_name)

                # Rename the source column to the output feature column name
                df_to_join.rename(
                    columns={feature: formatted_feature_name}, inplace=True,
                )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)
            right_entity_columns = join_keys
            right_entity_key_columns = [event_timestamp_column] + right_entity_columns

            # Remove all duplicate entity keys (using created timestamp)
            right_entity_key_sort_columns = right_entity_key_columns
            if created_timestamp_column:
                # If created_timestamp is available, use it to dedupe deterministically
                right_entity_key_sort_columns = right_entity_key_sort_columns + [
                    created_timestamp_column
                ]

            df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
            df_to_join.drop_duplicates(
                right_entity_key_sort_columns,
                keep="last",
                ignore_index=True,
                inplace=True,
            )

            # Select only the columns we need to join from the feature dataframe
            df_to_join = df_to_join[right_entity_key_columns + feature_names]

            # Do point in-time-join between entity_df and feature dataframe.
            # An empty join-key list is passed as by=None.
            entity_df_with_features = pd.merge_asof(
                entity_df_with_features,
                df_to_join,
                left_on=entity_df_event_timestamp_col,
                right_on=event_timestamp_column,
                by=right_entity_columns or None,
                tolerance=feature_view.ttl,
            )

            # Remove right (feature table/view) event_timestamp column.
            if event_timestamp_column != entity_df_event_timestamp_col:
                entity_df_with_features.drop(
                    columns=[event_timestamp_column], inplace=True
                )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # Move "event_timestamp" column to front
        current_cols = entity_df_with_features.columns.tolist()
        current_cols.remove(entity_df_event_timestamp_col)
        entity_df_with_features = entity_df_with_features[
            [entity_df_event_timestamp_col] + current_cols
        ]

        return entity_df_with_features

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
    )
    return job
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
) -> FileRetrievalJob:
    """Build a lazy job that point-in-time joins parquet features onto entity_df.

    Nothing is read from the feature sources until the returned job invokes
    the evaluation function.

    Args:
        config: Repo configuration (not read by this function).
        feature_views: Candidate feature views the references are resolved
            against.
        feature_refs: Feature references selecting which features are wanted.
        entity_df: Entity rows to enrich. Must be a pandas DataFrame; it is
            sorted by ENTITY_DF_EVENT_TIMESTAMP_COL when the job is evaluated.
        registry: Used to look up each entity of a feature view by name.
        project: Project name passed to ``registry.get_entity``.

    Returns:
        A FileRetrievalJob wrapping the lazy evaluation function.

    Raises:
        ValueError: If ``entity_df`` is not a pandas DataFrame.
    """
    if not isinstance(entity_df, pd.DataFrame):
        # Interpolate the class itself; type(pd.DataFrame) would print
        # "<class 'type'>", which tells the caller nothing.
        raise ValueError(
            f"Please provide an entity_df of type {pd.DataFrame} instead of type {type(entity_df)}"
        )

    feature_views_to_features = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Sort entity dataframe prior to join, and create a copy to prevent modifying the original
        entity_df_with_features = entity_df.sort_values(
            ENTITY_DF_EVENT_TIMESTAMP_COL
        ).copy()

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.input.event_timestamp_column
            created_timestamp_column = feature_view.input.created_timestamp_column

            # Read dataframe to join to entity dataframe
            df_to_join = pd.read_parquet(feature_view.input.path).sort_values(
                event_timestamp_column
            )

            # Build a list of all the features we should select from this source
            feature_names = []
            for feature in features:
                # Modify the separator for feature refs in column names to double underscore. We are using
                # double underscore as separator for consistency with other databases like BigQuery,
                # where there are very few characters available for use as separators
                prefixed_feature_name = f"{feature_view.name}__{feature}"

                # Add the feature name to the list of columns
                feature_names.append(prefixed_feature_name)

                # Ensure that the source dataframe feature column includes the feature view name as a prefix
                df_to_join.rename(
                    columns={feature: prefixed_feature_name}, inplace=True,
                )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_keys.append(entity.join_key)
            right_entity_columns = join_keys
            right_entity_key_columns = [event_timestamp_column] + right_entity_columns

            # Remove all duplicate entity keys (using created timestamp)
            right_entity_key_sort_columns = right_entity_key_columns
            if created_timestamp_column:
                # If created_timestamp is available, use it to dedupe deterministically
                right_entity_key_sort_columns = right_entity_key_sort_columns + [
                    created_timestamp_column
                ]

            df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
            df_to_join = df_to_join.groupby(by=right_entity_key_columns).last()
            df_to_join.reset_index(inplace=True)

            # Select only the columns we need to join from the feature dataframe
            df_to_join = df_to_join[right_entity_key_columns + feature_names]

            # Do point in-time-join between entity_df and feature dataframe
            entity_df_with_features = pd.merge_asof(
                entity_df_with_features,
                df_to_join,
                left_on=ENTITY_DF_EVENT_TIMESTAMP_COL,
                right_on=event_timestamp_column,
                by=right_entity_columns,
                tolerance=feature_view.ttl,
            )

            # Remove right (feature table/view) event_timestamp column.
            if event_timestamp_column != ENTITY_DF_EVENT_TIMESTAMP_COL:
                entity_df_with_features.drop(
                    columns=[event_timestamp_column], inplace=True
                )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # Move "datetime" column to front
        current_cols = entity_df_with_features.columns.tolist()
        current_cols.remove(ENTITY_DF_EVENT_TIMESTAMP_COL)
        entity_df_with_features = entity_df_with_features[
            [ENTITY_DF_EVENT_TIMESTAMP_COL] + current_cols
        ]
        return entity_df_with_features

    job = FileRetrievalJob(evaluation_function=evaluate_historical_retrieval)
    return job
def get_feature_view_query_context(
    feature_refs: List[str],
    feature_views: List[FeatureView],
    registry: Registry,
    project: str,
    entity_df_timestamp_range: Tuple[datetime, datetime],
) -> List[FeatureViewQueryContext]:
    """Assemble one FeatureViewQueryContext per requested feature view.

    The contexts carry the values needed to template a BigQuery or Redshift
    point-in-time SQL query, including the event-timestamp window derived
    from the entity dataframe's timestamp range.

    Args:
        feature_refs: Feature references selecting which features are wanted.
        feature_views: Candidate feature views the references are resolved
            against.
        registry: Used to look up entities and on-demand feature views.
        project: Project name passed to the registry lookups.
        entity_df_timestamp_range: ``(min, max)`` event timestamps of the
            entity dataframe.

    Returns:
        One context per regular feature view returned by
        ``_get_requested_feature_views_to_features_dict``.
    """
    # Only the regular feature views are used; the on-demand mapping is ignored.
    requested, _ = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views, registry.list_on_demand_feature_views(project)
    )
    range_start, range_end = entity_df_timestamp_range[0], entity_df_timestamp_range[1]

    contexts = []
    for view, view_features in requested.items():
        # Inverted field mapping: mapped name -> name of the source column.
        mapped_to_source = {
            mapped: original for original, mapped in view.input.field_mapping.items()
        }

        join_keys = []
        entity_selections = []
        for entity_name in view.entities:
            entity = registry.get_entity(entity_name, project)
            # The projection may alias the entity's join key.
            aliased_key = view.projection.join_key_map.get(
                entity.join_key, entity.join_key
            )
            join_keys.append(aliased_key)
            entity_selections.append(f"{entity.join_key} AS {aliased_key}")

        # Any TTL that is not a timedelta is passed on as 0 seconds.
        ttl_seconds = (
            int(view.ttl.total_seconds()) if isinstance(view.ttl, timedelta) else 0
        )

        event_ts_col = view.input.event_timestamp_column
        created_ts_col = view.input.created_timestamp_column

        # The lower bound is only set when the view has a truthy TTL.
        if view.ttl:
            min_event_timestamp = to_naive_utc(range_start - view.ttl).isoformat()
        else:
            min_event_timestamp = None
        max_event_timestamp = to_naive_utc(range_end).isoformat()

        contexts.append(
            FeatureViewQueryContext(
                name=view.projection.name_to_use(),
                ttl=ttl_seconds,
                entities=join_keys,
                features=view_features,
                event_timestamp_column=mapped_to_source.get(
                    event_ts_col, event_ts_col
                ),
                # TODO: Make created column optional and not hardcoded
                created_timestamp_column=mapped_to_source.get(
                    created_ts_col, created_ts_col
                ),
                table_subquery=view.input.get_table_query_string(),
                entity_selections=entity_selections,
                min_event_timestamp=min_event_timestamp,
                max_event_timestamp=max_event_timestamp,
            )
        )
    return contexts
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
) -> FileRetrievalJob:
    """Build a lazy job that point-in-time joins parquet features onto entity_df.

    Nothing is read from the feature sources until the returned job invokes
    the evaluation function.

    Args:
        config: Repo configuration (not read by this function).
        feature_views: Candidate feature views the references are resolved
            against.
        feature_refs: Feature references selecting which features are wanted.
        entity_df: Entity rows to enrich. Must be a pandas DataFrame with a
            column named ENTITY_DF_EVENT_TIMESTAMP_COL.
        registry: Used to look up each entity of a feature view by name.
        project: Project name passed to ``registry.get_entity``.

    Returns:
        A FileRetrievalJob wrapping the lazy evaluation function.

    Raises:
        ValueError: If ``entity_df`` is not a pandas DataFrame or lacks the
            ENTITY_DF_EVENT_TIMESTAMP_COL column.
    """
    if not isinstance(entity_df, pd.DataFrame):
        # Interpolate the class itself; type(pd.DataFrame) would print
        # "<class 'type'>", which tells the caller nothing.
        raise ValueError(
            f"Please provide an entity_df of type {pd.DataFrame} instead of type {type(entity_df)}"
        )
    if ENTITY_DF_EVENT_TIMESTAMP_COL not in entity_df.columns:
        raise ValueError(
            f"Please provide an entity_df with a column named {ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
        )

    feature_views_to_features = _get_requested_feature_views_to_features_dict(
        feature_refs, feature_views
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Copy first so that none of the normalization below is written back
        # into the caller's entity_df.
        entity_df_with_features = entity_df.copy()

        # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
        entity_df_with_features[ENTITY_DF_EVENT_TIMESTAMP_COL] = entity_df_with_features[
            ENTITY_DF_EVENT_TIMESTAMP_COL
        ].apply(lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc))

        # Convert event timestamp column to datetime and normalize time zone to UTC
        # This is necessary to avoid issues with pd.merge_asof
        entity_df_with_features[ENTITY_DF_EVENT_TIMESTAMP_COL] = pd.to_datetime(
            entity_df_with_features[ENTITY_DF_EVENT_TIMESTAMP_COL], utc=True
        )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            ENTITY_DF_EVENT_TIMESTAMP_COL
        )

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.input.event_timestamp_column
            created_timestamp_column = feature_view.input.created_timestamp_column

            # Read offline parquet data in pyarrow format
            table = pyarrow.parquet.read_table(feature_view.input.path)

            # Rename columns by the field mapping dictionary if it exists
            if feature_view.input.field_mapping is not None:
                table = _run_field_mapping(table, feature_view.input.field_mapping)

            # Convert pyarrow table to pandas dataframe
            df_to_join = table.to_pandas()

            # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
            df_to_join[event_timestamp_column] = df_to_join[
                event_timestamp_column
            ].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )
            if created_timestamp_column:
                df_to_join[created_timestamp_column] = df_to_join[
                    created_timestamp_column
                ].apply(
                    lambda x: x
                    if x.tzinfo is not None
                    else x.replace(tzinfo=pytz.utc)
                )

            # Sort dataframe by the event timestamp column
            df_to_join = df_to_join.sort_values(event_timestamp_column)

            # Build a list of all the features we should select from this source
            feature_names = []
            for feature in features:
                # Modify the separator for feature refs in column names to double underscore. We are using
                # double underscore as separator for consistency with other databases like BigQuery,
                # where there are very few characters available for use as separators
                prefixed_feature_name = f"{feature_view.name}__{feature}"

                # Add the feature name to the list of columns
                feature_names.append(prefixed_feature_name)

                # Ensure that the source dataframe feature column includes the feature view name as a prefix
                df_to_join.rename(
                    columns={feature: prefixed_feature_name}, inplace=True,
                )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_keys.append(entity.join_key)
            right_entity_columns = join_keys
            right_entity_key_columns = [event_timestamp_column] + right_entity_columns

            # Remove all duplicate entity keys (using created timestamp)
            right_entity_key_sort_columns = right_entity_key_columns
            if created_timestamp_column:
                # If created_timestamp is available, use it to dedupe deterministically
                right_entity_key_sort_columns = right_entity_key_sort_columns + [
                    created_timestamp_column
                ]

            df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
            df_to_join = df_to_join.groupby(by=right_entity_key_columns).last()
            df_to_join.reset_index(inplace=True)

            # Select only the columns we need to join from the feature dataframe
            df_to_join = df_to_join[right_entity_key_columns + feature_names]

            # Do point in-time-join between entity_df and feature dataframe
            entity_df_with_features = pd.merge_asof(
                entity_df_with_features,
                df_to_join,
                left_on=ENTITY_DF_EVENT_TIMESTAMP_COL,
                right_on=event_timestamp_column,
                by=right_entity_columns,
                tolerance=feature_view.ttl,
            )

            # Remove right (feature table/view) event_timestamp column.
            if event_timestamp_column != ENTITY_DF_EVENT_TIMESTAMP_COL:
                entity_df_with_features.drop(
                    columns=[event_timestamp_column], inplace=True
                )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # Move "datetime" column to front
        current_cols = entity_df_with_features.columns.tolist()
        current_cols.remove(ENTITY_DF_EVENT_TIMESTAMP_COL)
        entity_df_with_features = entity_df_with_features[
            [ENTITY_DF_EVENT_TIMESTAMP_COL] + current_cols
        ]
        return entity_df_with_features

    job = FileRetrievalJob(evaluation_function=evaluate_historical_retrieval)
    return job
def get_historical_features(
    config: RepoConfig,
    feature_views: List[FeatureView],
    feature_refs: List[str],
    entity_df: Union[pd.DataFrame, str],
    registry: Registry,
    project: str,
    full_feature_names: bool = False,
) -> RetrievalJob:
    """Build a lazy job that joins file-based features onto entity_df.

    Nothing is read from the feature sources until the returned job invokes
    the evaluation function. The per-feature-view work is delegated to the
    module helpers ``_read_datasource``, ``_field_mapping``, ``_merge``,
    ``_normalize_timestamp``, ``_filter_ttl``, ``_drop_duplicates`` and
    ``_drop_columns``, in that order.

    Args:
        config: Repo configuration; only ``config.project`` is read here, to
            list on-demand feature views.
        feature_views: Candidate feature views the references are resolved
            against.
        feature_refs: Feature references selecting which features are wanted.
        entity_df: Entity rows to enrich. Must be a pandas or dask DataFrame.
        registry: Used to look up entities and on-demand feature views.
        project: Project name passed to ``registry.get_entity``.
        full_feature_names: Forwarded to ``_field_mapping`` and to the job.

    Returns:
        A FileRetrievalJob wrapping the lazy evaluation function, with
        retrieval metadata (features, key columns, event timestamp range).

    Raises:
        ValueError: If ``entity_df`` is neither a pandas nor a dask DataFrame,
            or if it has no column named DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
            and does not have exactly one datetime column to fall back on.
    """
    if not isinstance(entity_df, (pd.DataFrame, dd.DataFrame)):
        # Interpolate the accepted classes themselves; type(pd.DataFrame)
        # would print "<class 'type'>", which tells the caller nothing.
        raise ValueError(
            f"Please provide an entity_df of type {pd.DataFrame} or {dd.DataFrame} instead of type {type(entity_df)}"
        )

    # Local modifiable copy of the global default column name.
    entity_df_event_timestamp_col = DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL
    if entity_df_event_timestamp_col not in entity_df.columns:
        datetime_columns = entity_df.select_dtypes(
            include=["datetime", "datetimetz"]
        ).columns
        if len(datetime_columns) == 1:
            # Exactly one datetime column: use it as the event timestamp.
            print(
                f"Using {datetime_columns[0]} as the event timestamp. To specify a column explicitly, please name it {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL}."
            )
            entity_df_event_timestamp_col = datetime_columns[0]
        else:
            raise ValueError(
                f"Please provide an entity_df with a column named {DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL} representing the time of events."
            )

    (
        feature_views_to_features,
        on_demand_feature_views_to_features,
    ) = _get_requested_feature_views_to_features_dict(
        feature_refs,
        feature_views,
        registry.list_on_demand_feature_views(config.project),
    )

    entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range(
        entity_df, entity_df_event_timestamp_col
    )

    # Create lazy function that is only called from the RetrievalJob object
    def evaluate_historical_retrieval():
        # Create a copy of entity_df to prevent modifying the original
        entity_df_with_features = entity_df.copy()

        entity_df_event_timestamp_col_type = entity_df_with_features.dtypes[
            entity_df_event_timestamp_col
        ]
        # Skip normalization when the column dtype already carries a UTC tz.
        if (
            not hasattr(entity_df_event_timestamp_col_type, "tz")
            or entity_df_event_timestamp_col_type.tz != pytz.UTC
        ):
            # Make sure all event timestamp fields are tz-aware. We default tz-naive fields to UTC
            entity_df_with_features[
                entity_df_event_timestamp_col
            ] = entity_df_with_features[entity_df_event_timestamp_col].apply(
                lambda x: x if x.tzinfo is not None else x.replace(tzinfo=pytz.utc)
            )

            # Convert event timestamp column to datetime and normalize time zone to UTC
            # This is necessary to avoid issues with pd.merge_asof
            if isinstance(entity_df_with_features, dd.DataFrame):
                entity_df_with_features[
                    entity_df_event_timestamp_col
                ] = dd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col], utc=True
                )
            else:
                entity_df_with_features[
                    entity_df_event_timestamp_col
                ] = pd.to_datetime(
                    entity_df_with_features[entity_df_event_timestamp_col], utc=True
                )

        # Sort event timestamp values
        entity_df_with_features = entity_df_with_features.sort_values(
            entity_df_event_timestamp_col
        )

        # Join keys accumulated over every feature view processed so far.
        all_join_keys = []

        # Load feature view data from sources and join them incrementally
        for feature_view, features in feature_views_to_features.items():
            event_timestamp_column = feature_view.batch_source.timestamp_field
            created_timestamp_column = (
                feature_view.batch_source.created_timestamp_column
            )

            # Build a list of entity columns to join on (from the right table)
            join_keys = []
            for entity_name in feature_view.entities:
                entity = registry.get_entity(entity_name, project)
                join_key = feature_view.projection.join_key_map.get(
                    entity.join_key, entity.join_key
                )
                join_keys.append(join_key)

            right_entity_key_columns = [
                event_timestamp_column,
                created_timestamp_column,
            ] + join_keys
            # Drop falsy entries, e.g. an unset created timestamp column.
            right_entity_key_columns = [c for c in right_entity_key_columns if c]

            all_join_keys = list(set(all_join_keys + join_keys))

            df_to_join = _read_datasource(feature_view.batch_source)

            df_to_join, event_timestamp_column = _field_mapping(
                df_to_join,
                feature_view,
                features,
                right_entity_key_columns,
                entity_df_event_timestamp_col,
                event_timestamp_column,
                full_feature_names,
            )

            df_to_join = _merge(entity_df_with_features, df_to_join, join_keys)

            df_to_join = _normalize_timestamp(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            df_to_join = _filter_ttl(
                df_to_join,
                feature_view,
                entity_df_event_timestamp_col,
                event_timestamp_column,
            )

            df_to_join = _drop_duplicates(
                df_to_join,
                all_join_keys,
                event_timestamp_column,
                created_timestamp_column,
                entity_df_event_timestamp_col,
            )

            entity_df_with_features = _drop_columns(
                df_to_join, event_timestamp_column, created_timestamp_column
            )

            # Ensure that we delete dataframes to free up memory
            del df_to_join

        # NOTE(review): persist() presumably requires a dask dataframe here;
        # confirm behaviour for a pandas entity_df with no feature views.
        return entity_df_with_features.persist()

    job = FileRetrievalJob(
        evaluation_function=evaluate_historical_retrieval,
        full_feature_names=full_feature_names,
        on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs(
            feature_refs, project, registry
        ),
        metadata=RetrievalMetadata(
            features=feature_refs,
            # Every entity_df column except the event timestamp is reported as a key.
            keys=list(set(entity_df.columns) - {entity_df_event_timestamp_col}),
            min_event_timestamp=entity_df_event_timestamp_range[0],
            max_event_timestamp=entity_df_event_timestamp_range[1],
        ),
    )
    return job