def test_check_timedelta(es):
    """Check ``_check_timedelta`` against every spelling of every readable unit.

    Each unit is tried as a singular name, a plural name and a short code
    (the short code both with and without a space after the number).
    """
    short_units = list(Timedelta._readable_units.keys())
    plural_units = list(Timedelta._readable_units.values())
    # Singular spellings are the plural names minus their last character.
    singular_units = [name[:-1] for name in plural_units]
    to_standard_unit = merge(dict(zip(plural_units, short_units)),
                             dict(zip(singular_units, short_units)))

    spaced_units = singular_units + plural_units + short_units
    cases = [("2 {}".format(u), u) for u in spaced_units]
    cases += [("2{}".format(u), u) for u in short_units]

    for text, unit in cases:
        standard_unit = to_standard_unit.get(unit, unit)
        # The 'o' unit is passed as a (string, 'logs') tuple.
        arg = (text, 'logs') if standard_unit == 'o' else text
        td = _check_timedelta(arg)
        if standard_unit == 'w':
            # A 2-week delta is expected to report a value of 14.
            assert td.value == 2 * 7
        else:
            assert td.value == 2
            assert td.unit == standard_unit

    td = _check_timedelta(2)
    assert td.value == 2
    assert td.unit == Timedelta._generic_unit

    td = _check_timedelta((2, 'logs'))
    assert td.value == 2
    assert td.unit == Timedelta._Observations
def test_check_timedelta(es):
    """Check ``_check_timedelta`` against every spelling of every readable unit.

    Each unit is tried as a singular name, a plural name and a short code
    (the short code both with and without a space after the number).
    """
    short_units = list(Timedelta._readable_units.keys())
    plural_units = list(Timedelta._readable_units.values())
    # Singular spellings are the plural names minus their last character.
    singular_units = [name[:-1] for name in plural_units]
    to_standard_unit = merge(dict(zip(plural_units, short_units)),
                             dict(zip(singular_units, short_units)))

    spaced_units = singular_units + plural_units + short_units
    cases = [("2 {}".format(u), u) for u in spaced_units]
    cases += [("2{}".format(u), u) for u in short_units]

    for text, unit in cases:
        standard_unit = to_standard_unit.get(unit, unit)
        # The 'o' unit is passed as a (string, 'logs') tuple.
        arg = (text, 'logs') if standard_unit == 'o' else text
        td = _check_timedelta(arg)
        if standard_unit == 'w':
            # A 2-week delta is expected to report a value of 14.
            assert td.value == 2 * 7
        else:
            assert td.value == 2
            assert td.unit == standard_unit

    td = _check_timedelta(2)
    assert td.value == 2
    assert td.unit == Timedelta._generic_unit

    td = _check_timedelta((2, 'logs'))
    assert td.value == 2
    assert td.unit == Timedelta._Observations
def test_has_multiple_units():
    """``has_multiple_units`` is False for a one-field offset, True for several."""
    one_unit_td = _check_timedelta(pd.DateOffset(months=3))
    many_units_td = _check_timedelta(pd.DateOffset(months=3, years=3, days=5))

    assert one_unit_td.has_multiple_units() is False
    assert many_units_td.has_multiple_units() is True
def test_pd_dateoffset_to_timedelta_math():
    """Timedeltas built from pandas offsets add to / subtract from timestamps."""
    jan_31 = pd.to_datetime("2020-01-31")

    two_months = _check_timedelta(pd.DateOffset(months=2))
    assert jan_31 + two_months == pd.to_datetime("2020-03-31")

    two_months_three_days = _check_timedelta(pd.DateOffset(months=2, days=3))
    assert jan_31 + two_months_three_days == pd.to_datetime("2020-04-03")

    # Business-day offsets are exercised through subtraction.
    ten_business_days = _check_timedelta(pd.offsets.BDay(10))
    sept_20 = pd.to_datetime("2019-09-20")
    assert sept_20 - ten_business_days == pd.to_datetime("2019-09-06")
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If None, all data
            before cutoff time is used.

    Returns:
        pd.DataFrame : instances that match constraints with ids in order
            of underlying dataframe
    """
    # A falsy variable_id means "query on the index".
    variable_id = variable_id or self.index
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        assert training_window.has_no_observations(), \
            "Training window cannot be in observations"

    if instance_vals is None:
        df = self.df.copy()
    elif instance_vals.shape[0] == 0:
        df = self.df.head(0)
    else:
        matches = self.df[variable_id].isin(instance_vals)
        df = self.df[matches].set_index(self.index, drop=False)

        # ensure filtered df has same categories as original
        # workaround for issue below
        # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
        if pdtypes.is_categorical_dtype(self.df[variable_id]):
            full_categories = self.df[variable_id].cat.categories
            df[variable_id] = df[variable_id].astype(
                pd.api.types.CategoricalDtype(categories=full_categories))

    df = self._handle_time(df=df,
                           time_last=time_last,
                           training_window=training_window)

    return df if columns is None else df[columns]
def test_relative_month():
    """Month strings parse to relativedelta-backed timedeltas."""
    start = pd.to_datetime('2020-01-31')
    # (input string, expected month count, expected start + delta)
    cases = [
        ("1 month", 1, '2020-02-29'),
        ("6 months", 6, '2020-07-31'),
    ]
    for text, n_months, expected in cases:
        td = _check_timedelta(text)
        assert td.get_value('mo') == n_months
        assert isinstance(td.delta_obj, relativedelta)
        assert start + td == pd.to_datetime(expected)
def __init__(self, base_features, parent_entity, primitive,
             use_previous=None, where=None):
    """Build an aggregation feature of ``base_features`` on ``parent_entity``.

    ``base_features`` may be a single feature or an iterable of features
    that all share one entity. ``where`` must be defined on that same child
    entity, and ``use_previous`` requires the child entity to have a time
    index.
    """
    if not hasattr(base_features, '__iter__'):
        base_features = [_check_feature(base_features)]
    else:
        base_features = [_check_feature(bf) for bf in base_features]
        msg = "all base features must share the same entity"
        assert len({bf.entity for bf in base_features}) == 1, msg

    first_feature = base_features[0]
    self.child_entity = first_feature.entity
    self.parent_entity = parent_entity.entityset.metadata[parent_entity.id]

    if where is not None:
        self.where = _check_feature(where)
        msg = "Where feature must be defined on child entity {}".format(
            self.child_entity.id)
        assert self.where.entity.id == self.child_entity.id, msg

    if use_previous:
        assert self.child_entity.time_index is not None, (
            "Applying function that requires time index to entity that "
            "doesn't have one")

        self.use_previous = _check_timedelta(use_previous)
        assert len(base_features) > 0
        time_index = first_feature.entity.time_index
        time_col = first_feature.entity[time_index]
        assert time_index is not None, ("Use previous can only be defined "
                                        "on entities with a time index")
        assert _check_time_against_column(self.use_previous, time_col)

    super(AggregationFeature, self).__init__(parent_entity,
                                             base_features,
                                             primitive=primitive)
def _handle_time(self, df, time_last=None, training_window=None):
    """
    Filter a dataframe for all instances before time_last.
    If this entity does not have a time index, return the original
    dataframe.

    When ``training_window`` is given, rows are additionally kept only if
    their time index or their last time index is at or after
    ``time_last - training_window``. Columns tied to a secondary time index
    are set to NaN on rows whose secondary time is at or after ``time_last``.
    """
    if self.time_index and time_last is not None and not df.empty:
        df = df[df[self.time_index] <= time_last]
        if training_window is not None:
            training_window = _check_timedelta(training_window)
            window_start = time_last - training_window
            keep = df[self.time_index] >= window_start
            if self.last_time_index is None:
                logger.warning(
                    "Using training_window but last_time_index is "
                    "not set on entity %s" % (self.id))
            else:
                lti_slice = self.last_time_index.reindex(df.index)
                keep = keep | (lti_slice >= window_start)
            df = df[keep]

    for secondary_time_index, columns in self.secondary_time_index.items():
        # should we use ignore time last here?
        if time_last is not None and not df.empty:
            too_late = df[secondary_time_index] >= time_last
            df.loc[too_late, columns] = np.nan

    return df
def __init__(self, entity, base_features, **kwargs):
    """Validate and store the base features of this primitive.

    Args:
        entity: Entity the feature is defined on; its ``id`` and
            ``entityset`` are stored on the instance.
        base_features: Features this primitive is computed from. Every item
            must be a ``PrimitiveBase`` and no two may share a ``hash()``.
        **kwargs: Stored as ``additional_attributes`` and forwarded to the
            next ``__init__`` in the MRO.

    Raises:
        ValueError: If ``base_features`` contains duplicates.
        AssertionError: If an item is not a feature, the input types do not
            match, or ``use_previous`` is set without a usable time index.
    """
    assert all(isinstance(f, PrimitiveBase) for f in base_features), \
        "All base features must be features"
    if len(set([bf.hash() for bf in base_features])) != len(base_features):
        raise ValueError(u"Duplicate base features ({}): {}".format(
            self.__class__, base_features))

    self.entity_id = entity.id
    self.entityset = entity.entityset

    # Store the inputs before the use_previous validation below, which
    # inspects self.base_features.
    self.base_features = base_features
    # variable type can be declared or inferred from first base feature
    self.additional_attributes = kwargs

    # P TODO: where should this logic go?
    # not all primitives support use previous so doesn't make sense to have
    # in base
    if self.use_previous:
        self.use_previous = _check_timedelta(self.use_previous)
        assert len(self.base_features) > 0
        time_index = self.base_features[0].entity.time_index
        # Check for a missing time index before using it as a lookup key,
        # so the descriptive message is what the caller sees.
        assert time_index is not None, ("Use previous can only be defined "
                                        "on entities with a time index")
        time_col = self.base_features[0].entity[time_index]
        assert _check_time_against_column(self.use_previous, time_col)

    assert self._check_input_types(), ("Provided inputs don't match input "
                                       "type requirements")
    super(PrimitiveBase, self).__init__(**kwargs)
def __init__(self, entity, base_features, **kwargs):
    """Validate and store the base features of this primitive.

    Args:
        entity: Entity the feature is defined on; its ``id`` and its
            entityset's ``metadata`` are stored on the instance.
        base_features: Features this primitive is computed from. Every item
            must be a ``PrimitiveBase`` and no two may share a ``hash()``.
        **kwargs: Stored as ``additional_attributes`` and forwarded to the
            next ``__init__`` in the MRO.

    Raises:
        ValueError: If ``base_features`` contains duplicates.
        AssertionError: If an item is not a feature, the input types do not
            match, or ``use_previous`` is set without a usable time index.
    """
    assert all(isinstance(f, PrimitiveBase) for f in base_features), \
        "All base features must be features"
    if len(set([bf.hash() for bf in base_features])) != len(base_features):
        raise ValueError(u"Duplicate base features ({}): {}".format(
            self.__class__, base_features))

    self.entity_id = entity.id
    self.entityset = entity.entityset.metadata

    # Store the inputs before the use_previous validation below, which
    # inspects self.base_features.
    self.base_features = base_features
    # variable type can be declared or inferred from first base feature
    self.additional_attributes = kwargs

    # P TODO: where should this logic go?
    # not all primitives support use previous so doesn't make sense to have
    # in base
    if self.use_previous:
        self.use_previous = _check_timedelta(self.use_previous)
        assert len(self.base_features) > 0
        time_index = self.base_features[0].entity.time_index
        # Check for a missing time index before using it as a lookup key,
        # so the descriptive message is what the caller sees.
        assert time_index is not None, ("Use previous can only be defined "
                                        "on entities with a time index")
        time_col = self.base_features[0].entity[time_index]
        assert _check_time_against_column(self.use_previous, time_col)

    assert self._check_input_types(), ("Provided inputs don't match input "
                                       "type requirements")
    super(PrimitiveBase, self).__init__(**kwargs)
def __init__(self, value, unit=None):
    """
    Args:
        value (float, str) : Value of timedelta, or string providing
            both unit and value.
        unit (str) : Unit of time delta.
    """
    # TODO: check if value is int or float
    if is_string(value):
        # Imported here rather than at module level.
        from featuretools.utils.wrangle import _check_timedelta
        parsed = _check_timedelta(value)
        value = parsed.value
        unit = parsed.unit

    self.value = value
    # to alert get_name that although we converted the unit to 'd' it was initially
    self._original_unit = None

    unit = self._check_unit_plural(unit)
    is_short_code = unit in self._readable_units
    is_readable_name = unit in self._readable_to_unit
    assert is_short_code or is_readable_name
    if is_readable_name:
        unit = self._readable_to_unit[unit]

    # weeks
    if unit in self._convert_to_days:
        days_per_unit = self._convert_to_days[unit]
        self._original_unit = unit
        self.value = self.value * days_per_unit
        unit = 'd'

    self.unit = unit
    self.delta_obj = self.get_unit_type()
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored

    Returns:
        pd.DataFrame : instances that match constraints with ids in order
            of underlying dataframe
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        window_ok = (isinstance(training_window, Timedelta) and
                     training_window.is_absolute())
        assert window_ok, "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df.copy()
    elif instance_vals.shape[0] == 0:
        df = self.df.head(0)
    elif variable_id is None or variable_id == self.index:
        # Look rows up by index label, then drop the all-NaN rows that
        # reindex produces for labels not present in the dataframe.
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)
    else:
        matches = self.df[variable_id].isin(instance_vals)
        df = self.df[matches].set_index(self.index, drop=False)

        # ensure filtered df has same categories as original
        # workaround for issue below
        # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
        if pdtypes.is_categorical_dtype(self.df[variable_id]):
            full_categories = self.df[variable_id].cat.categories
            df[variable_id] = df[variable_id].astype(
                pd.api.types.CategoricalDtype(categories=full_categories))

    df = self._handle_time(df=df,
                           time_last=time_last,
                           training_window=training_window)

    return df if columns is None else df[columns]
def bin_cutoff_times(cuttoff_time, bin_size):
    """Return a copy of ``cuttoff_time`` with its 'time' column binned.

    Args:
        cuttoff_time (pd.DataFrame): Frame with a 'time' column. It is not
            modified; a copy is returned.
        bin_size (int or timedelta-like): For an ``int``, each time is
            floored to a multiple of ``bin_size``. Anything else is parsed
            with ``_check_timedelta`` and passed to ``datetime_round``.

    Returns:
        pd.DataFrame: The copied frame with the binned 'time' column.
    """
    binned_cutoff_time = cuttoff_time.copy()
    if isinstance(bin_size, int):
        # Floor division is required here: on Python 3, `x / bin_size`
        # is true division, so `x / bin_size * bin_size` leaves every value
        # unchanged (as a float) instead of snapping it to its bin.
        binned_cutoff_time['time'] = binned_cutoff_time['time'].apply(
            lambda x: x // bin_size * bin_size)
    else:
        bin_size = _check_timedelta(bin_size).get_pandas_timedelta()
        binned_cutoff_time['time'] = datetime_round(
            binned_cutoff_time['time'], bin_size)
    return binned_cutoff_time
def test_relative_year():
    """A one-year delta is relativedelta-backed; Feb 29 + 1 year is Feb 28."""
    one_year = _check_timedelta("1 years")
    assert one_year.get_value("Y") == 1
    assert isinstance(one_year.delta_obj, relativedelta)

    leap_day = pd.to_datetime('2020-02-29')
    assert leap_day + one_year == pd.to_datetime('2021-02-28')
def bin_cutoff_times(cuttoff_time, bin_size):
    """Return a copy of ``cuttoff_time`` with its 'time' column binned.

    Args:
        cuttoff_time (pd.DataFrame): Frame with a 'time' column. It is not
            modified; a copy is returned.
        bin_size (int or timedelta-like): For an ``int``, each time is
            floored to a multiple of ``bin_size``. Anything else is parsed
            with ``_check_timedelta`` and passed to ``datetime_round``.

    Returns:
        pd.DataFrame: The copied frame with the binned 'time' column.
    """
    binned_cutoff_time = cuttoff_time.copy()
    if isinstance(bin_size, int):
        # Floor division is required here: on Python 3, `x / bin_size`
        # is true division, so `x / bin_size * bin_size` leaves every value
        # unchanged (as a float) instead of snapping it to its bin.
        binned_cutoff_time['time'] = binned_cutoff_time['time'].apply(
            lambda x: x // bin_size * bin_size)
    else:
        bin_size = _check_timedelta(bin_size).get_pandas_timedelta()
        binned_cutoff_time['time'] = datetime_round(
            binned_cutoff_time['time'], bin_size)
    return binned_cutoff_time
def check_value(self, value, unit):
    """Populate ``self.times`` from ``value``.

    A dict is stored as-is, a string is parsed with ``_check_timedelta``
    and that result's ``times`` is stored, and any other value is stored
    as ``{unit: value}``.
    """
    if isinstance(value, dict):
        self.times = value
        return
    if not isinstance(value, str):
        self.times = {unit: value}
        return

    # Imported here rather than at module level.
    from featuretools.utils.wrangle import _check_timedelta
    self.times = _check_timedelta(value).times
def test_pd_dateoffset_to_timedelta():
    """pandas offsets convert to timedeltas that keep their values and offset."""
    # Offset with a single field.
    months_offset = pd.DateOffset(months=3)
    months_td = _check_timedelta(months_offset)
    assert months_td.get_value('mo') == 3
    assert months_td.delta_obj == pd.DateOffset(months=3)

    # Offset with several fields.
    mixed_offset = pd.DateOffset(years=10, months=3, days=5)
    mixed_td = _check_timedelta(mixed_offset)
    assert mixed_td.get_value() == {'Y': 10, 'mo': 3, 'd': 5}
    assert mixed_td.delta_obj == mixed_offset
    # get_name() for multiple values is not deterministic
    assert len(mixed_td.get_name()) == len("10 Years 3 Months 5 Days")

    # Business-day offset.
    bday_offset = pd.offsets.BDay(100)
    bday_td = _check_timedelta(bday_offset)
    assert bday_td.get_value("businessdays") == 100
    assert bday_td.delta_obj == bday_offset
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored

    Returns:
        pd.DataFrame : instances that match constraints
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        window_ok = (isinstance(training_window, Timedelta) and
                     training_window.is_absolute())
        assert window_ok, "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df
    elif instance_vals.shape[0] == 0:
        df = self.df.head(0)
    elif variable_id is None or variable_id == self.index:
        # Look rows up by index label, then drop the all-NaN rows that
        # reindex produces for labels not present in the dataframe.
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)
    else:
        merged = self.df.merge(instance_vals.to_frame(),
                               how="inner",
                               left_on=variable_id,
                               right_on=variable_id)
        df = merged.set_index(self.index, drop=False)

        # ensure filtered df has same categories as original
        if pdtypes.is_categorical_dtype(self.df[variable_id]):
            full_categories = self.df[variable_id].cat.categories
            df[variable_id] = df[variable_id].astype(
                pd.api.types.CategoricalDtype(categories=full_categories))

    return self._filter_and_sort(df=df,
                                 time_last=time_last,
                                 training_window=training_window,
                                 columns=columns)
def bin_cutoff_times(cutoff_time, bin_size):
    """Return a copy of ``cutoff_time`` with its "time" column binned.

    Args:
        cutoff_time: Object whose ``.ww.copy()`` yields a frame with a
            "time" column. The input is not modified.
        bin_size (int or timedelta-like): For an ``int``, each time is
            floored to a multiple of ``bin_size``. Anything else is parsed
            with ``_check_timedelta`` and passed to ``datetime_round``.

    Returns:
        The copied frame with the binned "time" column.
    """
    binned_cutoff_time = cutoff_time.ww.copy()
    if isinstance(bin_size, int):
        # Floor division is required here: on Python 3, `x / bin_size`
        # is true division, so `x / bin_size * bin_size` leaves every value
        # unchanged (as a float) instead of snapping it to its bin.
        binned_cutoff_time["time"] = binned_cutoff_time["time"].apply(
            lambda x: x // bin_size * bin_size)
    else:
        bin_size = _check_timedelta(bin_size)
        binned_cutoff_time["time"] = datetime_round(
            binned_cutoff_time["time"], bin_size)
    return binned_cutoff_time
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored

    Returns:
        pd.DataFrame : instances that match constraints
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        window_ok = (isinstance(training_window, Timedelta) and
                     training_window.is_absolute())
        assert window_ok, "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df.copy()
    elif instance_vals.shape[0] == 0:
        df = self.df.head(0)
    elif variable_id is None or variable_id == self.index:
        # Look rows up by index label, then drop the all-NaN rows that
        # reindex produces for labels not present in the dataframe.
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)
    else:
        wanted = instance_vals.to_frame(variable_id)
        df = self.df.merge(wanted, how="inner", on=variable_id)
        df = df.set_index(self.index, drop=False)

        # ensure filtered df has same categories as original
        # workaround for issue below
        # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
        if pdtypes.is_categorical_dtype(self.df[variable_id]):
            full_categories = self.df[variable_id].cat.categories
            df[variable_id] = df[variable_id].astype(
                pd.api.types.CategoricalDtype(categories=full_categories))

    df = self._handle_time(df=df,
                           time_last=time_last,
                           training_window=training_window)

    return df if columns is None else df[columns]
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored

    Returns:
        pd.DataFrame : instances that match constraints
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        assert (isinstance(training_window, Timedelta) and
                training_window.is_absolute()),\
            "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df
    elif variable_id is None or variable_id == self.index:
        # Look rows up by index label, then drop the all-NaN rows that
        # reindex produces for labels not present in the dataframe.
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)
    elif variable_id in self.indexed_by:
        # some variables are indexed ahead of time
        index = self.indexed_by[variable_id]

        # generate pd.Series of all values from the index. Indexing
        # is much faster on this type.
        to_append = [pd.Series(index[v]) for v in instance_vals
                     if v in index]
        # pd.concat replaces Series.append, which pandas 2.0 removed. The
        # leading empty Series keeps the result valid when nothing matched.
        my_id_vals = pd.concat([pd.Series([])] + to_append)
        df = self.df.loc[my_id_vals]
    else:
        # filter by "row.variable_id IN instance_vals"
        mask = self.df[variable_id].isin(instance_vals)
        df = self.df[mask]

    return self._filter_and_sort(df=df,
                                 time_last=time_last,
                                 training_window=training_window,
                                 columns=columns)
def __init__(
    self,
    base_features,
    parent_dataframe_name,
    primitive,
    relationship_path=None,
    use_previous=None,
    where=None,
    name=None,
):
    """Build an aggregation feature on the dataframe ``parent_dataframe_name``.

    ``base_features`` are validated and must not include a multi-output
    feature. ``where`` must be defined on the child dataframe, and
    ``use_previous`` requires the child dataframe to have a time index.

    Raises:
        ValueError: If any base feature has more than one output feature.
    """
    base_features = _validate_base_features(base_features)
    if any(bf.number_output_features > 1 for bf in base_features):
        raise ValueError("Cannot stack on whole multi-output feature.")

    first_feature = base_features[0]
    self.child_dataframe_name = first_feature.dataframe_name
    entityset = first_feature.entityset

    path_info = self._handle_relationship_path(
        entityset, parent_dataframe_name, relationship_path
    )
    relationship_path, self._path_is_unique = path_info

    self.parent_dataframe_name = parent_dataframe_name

    if where is not None:
        self.where = _validate_base_features(where)[0]
        msg = "Where feature must be defined on child dataframe {}".format(
            self.child_dataframe_name
        )
        assert self.where.dataframe_name == self.child_dataframe_name, msg

    if use_previous:
        child_time_index = entityset[self.child_dataframe_name].ww.time_index
        assert child_time_index is not None, (
            "Applying function that requires time index to dataframe that "
            "doesn't have one"
        )

        self.use_previous = _check_timedelta(use_previous)
        assert len(base_features) > 0
        time_index = base_features[0].dataframe.ww.time_index
        time_col = base_features[0].dataframe.ww[time_index]
        assert time_index is not None, (
            "Use previous can only be defined "
            "on dataframes with a time index"
        )
        assert _check_time_against_column(self.use_previous, time_col)

    super(AggregationFeature, self).__init__(
        dataframe=entityset[parent_dataframe_name],
        base_features=base_features,
        relationship_path=relationship_path,
        primitive=primitive,
        name=name,
    )
def _handle_time(self, entity_id, df, time_last=None, training_window=None,
                 include_cutoff_time=True):
    """
    Filter a dataframe for all instances before time_last.
    If the DataTable does not have a time index, return the original
    dataframe.

    ``include_cutoff_time`` chooses whether rows exactly at ``time_last``
    are kept (``<=``) or dropped (``<``), and correspondingly whether the
    start of the training window is exclusive (``>``) or inclusive
    (``>=``). Columns tied to a secondary time index are blanked on rows
    whose secondary time is at or after ``time_last``.
    """
    dt = self[entity_id]
    if is_instance(df, ks, 'DataFrame') and isinstance(time_last, np.datetime64):
        time_last = pd.to_datetime(time_last)

    if dt.time_index:
        # Emptiness is only checked for pandas frames.
        df_empty = df.empty if isinstance(df, pd.DataFrame) else False
        if time_last is not None and not df_empty:
            if include_cutoff_time:
                df = df[df[dt.time_index] <= time_last]
            else:
                df = df[df[dt.time_index] < time_last]

            if training_window is not None:
                training_window = _check_timedelta(training_window)
                window_start = time_last - training_window
                if include_cutoff_time:
                    mask = df[dt.time_index] > window_start
                else:
                    mask = df[dt.time_index] >= window_start

                if dt.last_time_index is None:
                    warnings.warn(
                        "Using training_window but last_time_index is "
                        "not set on entity %s" % (dt.id)
                    )
                else:
                    lti_slice = dt.last_time_index.reindex(df.index)
                    if include_cutoff_time:
                        lti_mask = lti_slice > window_start
                    else:
                        lti_mask = lti_slice >= window_start
                    mask = mask | lti_mask

                df = df[mask]

    for secondary_time_index, columns in dt.secondary_time_index.items():
        # should we use ignore time last here?
        df_empty = df.empty if isinstance(df, pd.DataFrame) else False
        if time_last is None or df_empty:
            continue
        mask = df[secondary_time_index] >= time_last
        if isinstance(df, dd.DataFrame):
            # Dask frames are updated one column at a time via Series.mask.
            for col in columns:
                df[col] = df[col].mask(mask, np.nan)
        elif is_instance(df, ks, 'DataFrame'):
            df.loc[mask, columns] = None
        else:
            df.loc[mask, columns] = np.nan

    return df
def __init__(self, base_features, parent_entity, primitive,
             relationship_path=None, use_previous=None, where=None,
             name=None):
    """Build an aggregation feature of ``base_features`` on ``parent_entity``.

    ``base_features`` may be a single feature or an iterable of features
    that all share one entity, and none may be a multi-output feature.
    ``where`` must be defined on that same child entity, and
    ``use_previous`` requires the child entity to have a time index.

    Raises:
        ValueError: If any base feature has more than one output feature.
    """
    if not hasattr(base_features, '__iter__'):
        base_features = [_check_feature(base_features)]
    else:
        base_features = [_check_feature(bf) for bf in base_features]
        msg = "all base features must share the same entity"
        assert len({bf.entity for bf in base_features}) == 1, msg

    if any(bf.number_output_features > 1 for bf in base_features):
        raise ValueError("Cannot stack on whole multi-output feature.")

    first_feature = base_features[0]
    self.child_entity = first_feature.entity

    path_info = self._handle_relationship_path(parent_entity,
                                               relationship_path)
    relationship_path, self._path_is_unique = path_info

    self.parent_entity = parent_entity.entityset.metadata[parent_entity.id]

    if where is not None:
        self.where = _check_feature(where)
        msg = "Where feature must be defined on child entity {}".format(
            self.child_entity.id)
        assert self.where.entity.id == self.child_entity.id, msg

    if use_previous:
        assert self.child_entity.time_index is not None, (
            "Applying function that requires time index to entity that "
            "doesn't have one")

        self.use_previous = _check_timedelta(use_previous)
        assert len(base_features) > 0
        time_index = first_feature.entity.time_index
        time_col = first_feature.entity[time_index]
        assert time_index is not None, ("Use previous can only be defined "
                                        "on entities with a time index")
        assert _check_time_against_column(self.use_previous, time_col)

    super(AggregationFeature, self).__init__(
        entity=parent_entity,
        base_features=base_features,
        relationship_path=relationship_path,
        primitive=primitive,
        name=name)
def __init__(self, value, unit=None, entity=None, data=None,
             inclusive=False):
    """
    Args:
        value (float, str) : Value of timedelta, or string providing
            both unit and value.
        unit (str) : Unit of time delta.
        entity (str, optional) : Entity id to use if unit equals
            "observations".
        data (pd.Series, optional) : series of timestamps to use
            with observations. Can be calculated later.
        inclusive (bool, optional) : if True, include events that are
            exactly timedelta distance away from the original time/observation
    """
    # TODO: check if value is int or float
    if isinstance(value, basestring):
        # Imported here rather than at module level.
        from featuretools.utils.wrangle import _check_timedelta
        parsed = _check_timedelta(value)
        value = parsed.value
        unit = parsed.unit

    self.value = value
    # to alert get_name that although we converted the unit to 'd' it was initially
    self._original_unit = None

    unit = self._check_unit_plural(unit)
    is_short_code = unit in self._readable_units
    is_readable_name = unit in self._readable_to_unit
    assert is_short_code or is_readable_name
    if is_readable_name:
        unit = self._readable_to_unit[unit]

    # weeks
    if unit in self._convert_to_days:
        days_per_unit = self._convert_to_days[unit]
        self._original_unit = unit
        self.value = self.value * days_per_unit
        unit = 'd'

    self.unit = unit

    # An observation-based delta needs an entity to count observations in.
    if entity is None and unit == self._Observations:
        raise Exception("Must define entity to use %s as unit" % (unit))

    self.entity = entity
    self.data = data
    self.inclusive = inclusive
def __init__(self, base_feature, group_feature, time_index=None,
             where=None, use_previous=None):
    """Set up a cumulative feature over ``base_feature`` grouped by ``group_feature``.

    Args:
        base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
            or variable calculated on
        group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
            or variable used to group the rows before computation
        time_index (optional): Feature appended as the last base feature.
            Defaults to an ``IdentityFeature`` of the base feature entity's
            time index.
        where (optional[:class:`.PrimitiveBase`]):
        use_previous (optional[:class:`.Timedelta`):
    """
    self.return_type = self.agg_feature.return_type

    base_feature = self._check_feature(base_feature)

    # Only string windows get the base feature's entity id passed along.
    if isinstance(use_previous, basestring):
        td_entity_id = base_feature.entity.id
    else:
        td_entity_id = None
    self.use_previous = _check_timedelta(use_previous,
                                         entity_id=td_entity_id)

    self.group_feature = self._check_feature(group_feature)
    self.base_features = [base_feature, self.group_feature]

    if time_index is None:
        base_entity = base_feature.entity
        time_index = IdentityFeature(base_entity[base_entity.time_index])
    self.base_features += [time_index]

    if where is not None:
        self.where = where

    super(CumFeature, self).__init__(*self.base_features)
def __init__(self, value, unit=None, entity=None, data=None,
             inclusive=False):
    """
    Args:
        value (float, str) : Value of timedelta, or string providing
            both unit and value.
        unit (str) : Unit of time delta.
        entity (str, optional) : Entity id to use if unit equals
            "observations".
        data (pd.Series, optional) : series of timestamps to use
            with observations. Can be calculated later.
        inclusive (bool, optional) : if True, include events that are
            exactly timedelta distance away from the original time/observation
    """
    # TODO: check if value is int or float
    if isinstance(value, basestring):
        # Imported here rather than at module level.
        from featuretools.utils.wrangle import _check_timedelta
        parsed = _check_timedelta(value)
        value = parsed.value
        unit = parsed.unit

    self.value = value
    # to alert get_name that although we converted the unit to 'd' it was initially
    self._original_unit = None

    unit = self._check_unit_plural(unit)
    is_short_code = unit in self._readable_units
    is_readable_name = unit in self._readable_to_unit
    assert is_short_code or is_readable_name
    if is_readable_name:
        unit = self._readable_to_unit[unit]

    # weeks
    if unit in self._convert_to_days:
        days_per_unit = self._convert_to_days[unit]
        self._original_unit = unit
        self.value = self.value * days_per_unit
        unit = 'd'

    self.unit = unit

    # An observation-based delta needs an entity to count observations in.
    if entity is None and unit == self._Observations:
        raise Exception("Must define entity to use %s as unit" % (unit))

    self.entity = entity
    self.data = data
    self.inclusive = inclusive
def __init__(self, base_feature, group_feature, time_index=None,
             where=None, use_previous=None):
    """Set up a cumulative feature over ``base_feature`` grouped by ``group_feature``.

    Args:
        base_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
            or variable calculated on
        group_feature (:class:`.PrimitiveBase` or :class:`.Variable`): Feature
            or variable used to group the rows before computation
        time_index (optional): Feature appended as the last base feature.
            Defaults to an ``IdentityFeature`` of the base feature entity's
            time index.
        where (optional[:class:`.PrimitiveBase`]):
        use_previous (optional[:class:`.Timedelta`):
    """
    self.return_type = self.agg_feature.return_type

    base_feature = self._check_feature(base_feature)

    # Only string windows get the base feature's entity id passed along.
    if isinstance(use_previous, basestring):
        td_entity_id = base_feature.entity.id
    else:
        td_entity_id = None
    self.use_previous = _check_timedelta(use_previous,
                                         entity_id=td_entity_id)

    self.group_feature = self._check_feature(group_feature)
    self.base_features = [base_feature, self.group_feature]

    if time_index is None:
        base_entity = base_feature.entity
        time_index = IdentityFeature(base_entity[base_entity.time_index])
    self.base_features += [time_index]

    if where is not None:
        self.where = where

    super(CumFeature, self).__init__(*self.base_features)
def test_check_timedelta(es):
    """Check ``_check_timedelta`` against every spelling of every readable unit.

    Each unit is tried as a singular name, a plural name and a short code
    (the short code both with and without a space after the number). The
    parsed delta must report 2 for the matching standard unit.
    """
    short_units = list(Timedelta._readable_units.keys())
    plural_units = list(Timedelta._readable_units.values())
    # Singular spellings are the plural names minus their last character.
    singular_units = [name[:-1] for name in plural_units]

    to_standard_unit = dict(zip(plural_units, short_units))
    to_standard_unit.update(zip(singular_units, short_units))

    spaced_units = singular_units + plural_units + short_units
    cases = [("2 {}".format(u), u) for u in spaced_units]
    cases += [("2{}".format(u), u) for u in short_units]

    for text, unit in cases:
        standard_unit = to_standard_unit.get(unit, unit)
        td = _check_timedelta(text)
        assert td.get_value(standard_unit) == 2
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None,
                    return_sorted=False, start=None, end=None,
                    random_seed=None, shuffle=False):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (dict[str-> :class:`Timedelta`] or
            :class:`Timedelta`, optional): Data older than time_last by more
            than this will be ignored. After being passed through
            ``_check_timedelta`` it must be an absolute :class:`Timedelta`.
        return_sorted (bool) : Return instances in the same order as
            the instance_vals are passed.
        start (int) : If provided, only return instances equal to or after
            this index.
        end (int) : If provided, only return instances before this index.
        random_seed (int) : Provided to the shuffling procedure.
        shuffle (bool) : If True, values will be shuffled before returning.

    Returns:
        pd.DataFrame : instances that match constraints

    Raises:
        AssertionError: If the normalized training window is not an
            absolute :class:`Timedelta`.
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        assert (isinstance(training_window, Timedelta) and
                training_window.is_absolute()),\
            "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df
    elif variable_id is None or variable_id == self.index:
        # reindex() tolerates labels that are absent from the index (they
        # come back as all-NaN rows, removed by dropna below), whereas
        # .loc with a list raises KeyError for them on current pandas.
        df = self.df.reindex(instance_vals).dropna(subset=[self.index])
    elif variable_id in self.indexed_by:
        # some variables are indexed ahead of time
        index = self.indexed_by[variable_id]

        # generate pd.Series of all values from the index. Indexing
        # is much faster on this type.
        to_append = [pd.Series(index[v]) for v in instance_vals if v in index]
        # pd.concat is what Series.append delegated to; the leading empty
        # Series keeps the call valid when nothing matched.
        my_id_vals = pd.concat([pd.Series([])] + to_append)
        df = self.df.loc[my_id_vals]
    else:
        # filter by "row.variable_id IN instance_vals"
        mask = self.df[variable_id].isin(instance_vals)
        df = self.df[mask]

    # Sorting by the queried variable is skipped when shuffling is requested.
    sortby = variable_id if (return_sorted and not shuffle) else None
    return self._filter_and_sort(df=df,
                                 time_last=time_last,
                                 training_window=training_window,
                                 columns=columns,
                                 sortby=sortby,
                                 start=start,
                                 end=end,
                                 shuffle=shuffle,
                                 random_seed=random_seed)
def test_check_pd_timedelta(es):
    """A ``pd.Timedelta(5, 'm')`` reads back as 300 in unit ``'s'``."""
    five_minutes = pd.Timedelta(5, 'm')
    converted = _check_timedelta(five_minutes)
    assert converted.get_value('s') == 300
def test_check_pd_timedelta(es):
    """A ``pd.Timedelta(5, 'm')`` is converted to unit ``'s'`` with value 300."""
    converted = _check_timedelta(pd.Timedelta(5, 'm'))

    # The unit is checked before the value, so a unit mismatch is reported
    # first.
    assert converted.unit == 's'
    assert converted.value == 300
def query_by_values(self, entity_id, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None, include_cutoff_time=True):
    """Query instances that have variable with given value

    Looks up the entity by id, selects the rows whose ``variable_id`` value
    is in ``instance_vals``, hands the result to ``self._handle_time``
    together with the time arguments, and finally restricts the columns.

    Args:
        entity_id (str): The id of the entity to query
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match. If this is None after ``_vals_to_series``,
            a copy of the whole entity dataframe is used.
        variable_id (str) : Variable to query on. If None (or otherwise
            falsy), query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Window defining how much time before the cutoff time data
            can be used when calculating features. If None, all data
            before cutoff time is used.
        include_cutoff_time (bool):
            If True, data at cutoff time are included in calculating features

    Returns:
        pd.DataFrame : instances that match constraints with ids in order of underlying dataframe

    Raises:
        AssertionError: If the normalized training window reports
            observations (``has_no_observations()`` is false).
    """
    entity = self[entity_id]
    if not variable_id:
        variable_id = entity.index

    instance_vals = _vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)

    if training_window is not None:
        assert training_window.has_no_observations(), "Training window cannot be in observations"

    if instance_vals is None:
        # No filter requested: work on a copy of the full frame.
        df = entity.df.copy()

    elif isinstance(instance_vals, pd.Series) and instance_vals.empty:
        # Empty filter: zero rows, same columns as the entity frame.
        df = entity.df.head(0)

    else:
        # Distributed inputs are filtered with an inner merge instead of
        # isin(). NOTE(review): `dd` / `ks` are presumably the Dask and
        # Koalas modules -- confirm against the module imports.
        if is_instance(instance_vals, (dd, ks), 'Series'):
            df = entity.df.merge(instance_vals.to_frame(), how="inner", on=variable_id)
        elif isinstance(instance_vals, pd.Series) and is_instance(entity.df, ks, 'DataFrame'):
            # pandas values against a `ks` frame: wrap the values in a
            # `ks.DataFrame` so both sides of the merge share a type.
            df = entity.df.merge(ks.DataFrame({variable_id: instance_vals}), how="inner", on=variable_id)
        else:
            df = entity.df[entity.df[variable_id].isin(instance_vals)]

        if isinstance(entity.df, pd.DataFrame):
            # Index pandas results by the entity index while keeping it
            # available as a regular column (drop=False).
            df = df.set_index(entity.index, drop=False)

        # ensure filtered df has same categories as original
        # workaround for issue below
        # github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
        if pdtypes.is_categorical_dtype(entity.df[variable_id]):
            categories = pd.api.types.CategoricalDtype(categories=entity.df[variable_id].cat.categories)
            df[variable_id] = df[variable_id].astype(categories)

    df = self._handle_time(entity_id=entity_id,
                           df=df,
                           time_last=time_last,
                           training_window=training_window,
                           include_cutoff_time=include_cutoff_time)

    if columns is not None:
        df = df[columns]

    return df
def test_check_pd_timedelta(es):
    """Converting ``pd.Timedelta(5, "m")`` yields 300 when read in ``"s"``."""
    converted = _check_timedelta(pd.Timedelta(5, "m"))
    value_in_s = converted.get_value("s")
    assert value_in_s == 300
def query_by_values(self, instance_vals, variable_id=None, columns=None,
                    time_last=None, training_window=None,
                    return_sorted=False, start=None, end=None,
                    random_seed=None, shuffle=False):
    """Query instances that have variable with given value

    Args:
        instance_vals (pd.Dataframe, pd.Series, list[str] or str) :
            Instance(s) to match.
        variable_id (str) : Variable to query on. If None, query on index.
        columns (list[str]) : Columns to return. Return all columns if None.
        time_last (pd.TimeStamp) : Query data up to and including this
            time. Only applies if entity has a time index.
        training_window (Timedelta, optional):
            Data older than time_last by more than this will be ignored.
            After being passed through ``_check_timedelta`` it must be an
            absolute :class:`Timedelta`.
        return_sorted (bool) : Return instances in the same order as
            the instance_vals are passed.
        start (int) : If provided, only return instances equal to or after
            this index.
        end (int) : If provided, only return instances before this index.
        random_seed (int) : Provided to the shuffling procedure.
        shuffle (bool) : If True, values will be shuffled before returning.

    Returns:
        pd.DataFrame : instances that match constraints

    Raises:
        AssertionError: If the normalized training window is not an
            absolute :class:`Timedelta`.
    """
    instance_vals = self._vals_to_series(instance_vals, variable_id)

    training_window = _check_timedelta(training_window)
    if training_window is not None:
        assert (isinstance(training_window, Timedelta) and
                training_window.is_absolute()),\
            "training window must be an absolute Timedelta"

    if instance_vals is None:
        df = self.df
    elif variable_id is None or variable_id == self.index:
        # reindex() returns all-NaN rows for labels missing from the
        # index; dropna on the index column then discards them.
        df = self.df.reindex(instance_vals)
        df.dropna(subset=[self.index], inplace=True)
    elif variable_id in self.indexed_by:
        # some variables are indexed ahead of time
        index = self.indexed_by[variable_id]

        # generate pd.Series of all values from the index. Indexing
        # is much faster on this type.
        to_append = [pd.Series(index[v]) for v in instance_vals if v in index]
        # Series.append was removed in pandas 2.0; pd.concat is the call it
        # delegated to. The leading empty Series keeps the call valid when
        # nothing matched.
        my_id_vals = pd.concat([pd.Series([])] + to_append)
        df = self.df.loc[my_id_vals]
    else:
        # filter by "row.variable_id IN instance_vals"
        mask = self.df[variable_id].isin(instance_vals)
        df = self.df[mask]

    # Sorting by the queried variable is skipped when shuffling is requested.
    sortby = variable_id if (return_sorted and not shuffle) else None
    return self._filter_and_sort(df=df,
                                 time_last=time_last,
                                 training_window=training_window,
                                 columns=columns,
                                 sortby=sortby,
                                 start=start,
                                 end=end,
                                 shuffle=shuffle,
                                 random_seed=random_seed)