def test_inference_by_sample():
    """Check the type inferred for a period column in two scenarios.

    A column holding only two distinct periods spread over 50 rows must be
    reported as categorical, while a column of five distinct periods must be
    reported as numeric.
    """

    def infer_period_vtype(periods):
        # Build an (id, period) frame and return the type inferred for 'period'.
        frame = pd.DataFrame({'id': range(periods.size), 'period': periods})
        inferred = infer_variable_types(
            df=frame,
            link_vars=["link_var"],
            variable_types={},
            time_index=None,
            secondary_time_index={},
        )
        return inferred['period']

    # Two distinct periods, then the first one repeated 48 more times.
    base = pd.period_range(start=1970, periods=2, freq='T')
    repeats = pd.PeriodIndex([base[0]] * 48)
    mostly_repeated = base.append(repeats)
    assert infer_period_vtype(mostly_repeated) == vtypes.Categorical, \
        'inference by sample must return categorical'

    # Five periods, all different from each other.
    all_distinct = pd.period_range(start=1970, periods=5, freq='T')
    assert infer_period_vtype(all_distinct) == vtypes.Numeric, \
        'inference by sample must return numeric'
def test_infer_variable_types():
    """Check the type inferred for each column of a small mixed-type frame."""
    raw_columns = {
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'ints': ['1', '2', '1'],
        'boolean': [True, False, True],
        'date': ['3/11/2000', '3/12/2000', '3/13/2000'],
        'integers': [1, 2, 1],
        'integers_category': [1, 2, 1],
    }
    df = pd.DataFrame(raw_columns)
    # Integer values stored with pandas' category dtype.
    df['integers_category'] = df['integers_category'].astype('category')

    # 'id' is declared up front, so it is expected to be absent from the
    # inferred result (see the column-count check below).
    variable_types = ['id']
    inferred = infer_variable_types(
        df=df,
        link_vars=[],
        variable_types=variable_types,
        time_index=None,
        secondary_time_index={},
    )

    # Check columns' number
    n_declared = len(variable_types)
    n_inferred = len(inferred)
    assert n_declared + n_inferred == len(df.columns)

    # Check columns' types
    expected_types = [
        ('category', vtypes.Categorical),
        ('ints', vtypes.Categorical),
        ('boolean', vtypes.Boolean),
        ('date', vtypes.Datetime),
        ('integers', vtypes.Numeric),
        ('integers_category', vtypes.Categorical),
    ]
    for column, expected in expected_types:
        assert inferred[column] == expected
def _create_variables(self, variable_types, index, time_index, secondary_time_index):
    """Extracts the variables from a dataframe

    Resolves a type for every column of ``self.df``: entries given in
    ``variable_types`` win over inferred ones. One variable object is then
    instantiated per column, ``self.df`` is replaced by the result of
    ``convert_all_variable_data`` and the variables are stored on
    ``self.variables`` with the index variable first.

    Args:
        variable_types (dict[str -> types/str/dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or type_strings (str) or (type, kwargs) to pass keyword arguments to the Variable.
            May be None or empty. The dict passed in is not modified.
        index (str): Name of index column
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time
    """
    # Work on a copy so the caller's dict is left untouched. Guarding before
    # the copy lets None through; `variable_types.copy() or {}` would raise
    # AttributeError on None before the fallback could ever apply.
    variable_types = variable_types.copy() if variable_types else {}

    string_to_class_map = find_variable_types()
    # TODO: Remove once Text has been removed from variable types
    string_to_class_map[Text.type_string] = Text

    # Swap type strings for their classes. Iterate over a snapshot because
    # entries are reassigned inside the loop.
    for vid, vtype in list(variable_types.items()):
        if not isinstance(vtype, str):
            continue
        if vtype in string_to_class_map:
            variable_types[vid] = string_to_class_map[vtype]
        else:
            variable_types[vid] = string_to_class_map['unknown']
            warnings.warn("Variable type {} was unrecognized, Unknown variable type was used instead".format(vtype))

    if index not in variable_types:
        variable_types[index] = vtypes.Index

    link_vars = get_linked_vars(self)
    inferred_variable_types = infer_variable_types(self.df,
                                                   link_vars,
                                                   variable_types,
                                                   time_index,
                                                   secondary_time_index)
    # Explicitly supplied types override whatever was inferred.
    inferred_variable_types.update(variable_types)

    variables = []
    for vid, vtype in inferred_variable_types.items():
        # TODO document how vtype can be tuple
        if isinstance(vtype, tuple):
            # vtype is (ft.Variable, dict_of_kwargs)
            variables.append(vtype[0](vid, self, **vtype[1]))
        else:
            variables.append(vtype(vid, self))

    # convert data once we've inferred
    self.df = convert_all_variable_data(df=self.df,
                                        variable_types=inferred_variable_types)

    # make sure index is at the beginning
    index_variable = [v for v in variables if v.id == index][0]
    self.variables = [index_variable] + [v for v in variables if v.id != index]
def _create_variables(self, variable_types, index, time_index, secondary_time_index):
    """Extracts the variables from a dataframe

    Resolves a type for every column of ``self.df``: entries given in
    ``variable_types`` win over inferred ones. One variable object is then
    instantiated per column, ``self.df`` is replaced by the result of
    ``convert_all_variable_data`` and the variables are stored on
    ``self.variables`` with the index variable first.

    Args:
        variable_types (dict[str -> dict[str -> type]]) : An entity's
            variable_types dict maps string variable ids to types (:class:`.Variable`)
            or (type, kwargs) to pass keyword arguments to the Variable.
            May be None or empty. The dict passed in is not modified.
        index (str): Name of index column
        time_index (str or None): Name of time_index column
        secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
            that each map to a list of columns that depend on that secondary time
    """
    # Copy before adding the index entry below; `variable_types or {}` kept a
    # reference to the caller's dict, so the insertion leaked back to the caller.
    variable_types = dict(variable_types) if variable_types else {}

    if index not in variable_types:
        variable_types[index] = vtypes.Index

    link_vars = get_linked_vars(self)
    inferred_variable_types = infer_variable_types(self.df,
                                                   link_vars,
                                                   variable_types,
                                                   time_index,
                                                   secondary_time_index)
    # Explicitly supplied types override whatever was inferred.
    inferred_variable_types.update(variable_types)

    variables = []
    for vid, vtype in inferred_variable_types.items():
        # TODO document how vtype can be tuple
        if isinstance(vtype, tuple):
            # vtype is (ft.Variable, dict_of_kwargs)
            variables.append(vtype[0](vid, self, **vtype[1]))
        else:
            variables.append(vtype(vid, self))

    # convert data once we've inferred
    self.df = convert_all_variable_data(df=self.df,
                                        variable_types=inferred_variable_types)

    # make sure index is at the beginning
    index_variable = [v for v in variables if v.id == index][0]
    self.variables = [index_variable] + [v for v in variables if v.id != index]
def test_infer_variable_types_empty_df():
    """Check the types inferred for a frame that has dtypes but no rows."""
    # test empty dataframe
    column_names = [
        "id",
        "empty_int",
        "empty_category",
        "empty_object",
        "empty_date",
        "empty_boolean",
    ]
    empty_df = pd.DataFrame({name: [] for name in column_names})

    # Give each empty column an explicit dtype; 'id' keeps its default one.
    casts = [
        ('empty_int', 'int'),
        ('empty_category', 'category'),
        ('empty_object', 'object'),
        ('empty_date', 'datetime64[ns]'),
        ('empty_boolean', bool),
    ]
    for name, dtype in casts:
        empty_df[name] = empty_df[name].astype(dtype)

    variable_types = {'id': vtypes.Index}
    inferred = infer_variable_types(
        df=empty_df,
        variable_types=variable_types,
        link_vars=[],
        time_index=None,
        secondary_time_index={},
    )

    # Check columns' types
    assert "id" not in inferred
    expected_types = [
        ('empty_int', vtypes.Numeric),
        ('empty_category', vtypes.Categorical),
        ('empty_object', vtypes.Categorical),
        ('empty_boolean', vtypes.Boolean),
        ('empty_date', vtypes.Datetime),
    ]
    for name, expected in expected_types:
        assert inferred[name] == expected

    # Check columns' number
    total_typed = len(variable_types) + len(inferred)
    assert total_typed == len(empty_df.columns)