def test_inference_by_sample():
    """Check how a period column is typed for repetitive versus distinct values.

    The first frame holds 50 periods with only two distinct values and is
    expected to be inferred as ``vtypes.Categorical``.  The second frame holds
    five distinct periods and is expected to be inferred as ``vtypes.Numeric``.
    """

    def infer_period_type(periods):
        # Wrap the periods in a frame with a running 'id' column and return
        # whatever type was inferred for the 'period' column.
        frame = pd.DataFrame({'id': range(periods.size), 'period': periods})
        inferred = infer_variable_types(
            df=frame,
            link_vars=["link_var"],
            variable_types={},
            time_index=None,
            secondary_time_index={},
        )
        return inferred['period']

    # Two consecutive minute periods, then 48 repeats of the first one.
    base_periods = pd.period_range(start=1970, periods=2, freq='T')
    repeats = pd.PeriodIndex([base_periods[0]] * 48)
    mostly_repeated = base_periods.append(repeats)

    categorical_msg = 'inference by sample must return categorical'
    assert infer_period_type(mostly_repeated) == vtypes.Categorical, categorical_msg

    # Five periods, all different from each other.
    all_distinct = pd.period_range(start=1970, periods=5, freq='T')

    numeric_msg = 'inference by sample must return numeric'
    assert infer_period_type(all_distinct) == vtypes.Numeric, numeric_msg
Exemple #2
0
def test_infer_variable_types():
    """Infer types for a small mixed-content frame and verify every column."""
    raw_columns = {
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'ints': ['1', '2', '1'],
        'boolean': [True, False, True],
        'date': ['3/11/2000', '3/12/2000', '3/13/2000'],
        'integers': [1, 2, 1],
        'integers_category': [1, 2, 1]
    }
    df = pd.DataFrame(raw_columns)

    # Same values as 'integers', but stored with pandas' category dtype.
    df['integers_category'] = df['integers_category'].astype('category')

    # Columns whose type is already known to the caller.
    variable_types = ['id']

    inferred = infer_variable_types(
        df=df,
        link_vars=[],
        variable_types=variable_types,
        time_index=None,
        secondary_time_index={})

    # Pre-declared plus inferred columns must account for the whole frame.
    n_accounted_for = len(variable_types) + len(inferred)
    assert n_accounted_for == len(df.columns)

    # Expected inferred type per column.
    expected_types = [
        ('category', vtypes.Categorical),
        ('ints', vtypes.Categorical),
        ('boolean', vtypes.Boolean),
        ('date', vtypes.Datetime),
        ('integers', vtypes.Numeric),
        ('integers_category', vtypes.Categorical),
    ]
    for column, expected in expected_types:
        assert inferred[column] == expected
Exemple #3
0
    def _create_variables(self, variable_types, index, time_index, secondary_time_index):
        """Extracts the variables from a dataframe

        Resolves string type names to variable classes, infers the types of
        the remaining columns of ``self.df``, instantiates one variable per
        column, converts ``self.df`` to the resolved types, and stores the
        variables on ``self.variables`` with the index variable first.

        Args:
            variable_types (dict[str -> types/str/dict[str -> type]]) : An entity's
                variable_types dict maps string variable ids to types (:class:`.Variable`)
                or type_strings (str) or (type, kwargs) to pass keyword arguments to the Variable.
                May be ``None`` or empty. The passed dict is never modified.
            index (str): Name of index column
            time_index (str or None): Name of time_index column
            secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
                that each map to a list of columns that depend on that secondary time
        """
        # Apply the None/empty fallback *before* copying: calling ``.copy()``
        # first would raise AttributeError on None. The copy keeps the
        # caller's dict untouched by the assignments below.
        variable_types = variable_types.copy() if variable_types else {}
        string_to_class_map = find_variable_types()
        # TODO: Remove once Text has been removed from variable types
        string_to_class_map[Text.type_string] = Text

        # Swap type strings for their classes. Iterate over a snapshot so the
        # dict can be reassigned safely inside the loop.
        for vid, vtype in list(variable_types.items()):
            if not isinstance(vtype, str):
                continue
            if vtype in string_to_class_map:
                variable_types[vid] = string_to_class_map[vtype]
            else:
                variable_types[vid] = string_to_class_map['unknown']
                warnings.warn("Variable type {} was unrecognized, Unknown variable type was used instead".format(vtype))

        if index not in variable_types:
            variable_types[index] = vtypes.Index

        link_vars = get_linked_vars(self)
        inferred_variable_types = infer_variable_types(self.df,
                                                       link_vars,
                                                       variable_types,
                                                       time_index,
                                                       secondary_time_index)
        # Explicitly supplied types take precedence over inferred ones.
        inferred_variable_types.update(variable_types)

        variables = []
        for vid, vtype in inferred_variable_types.items():
            # TODO document how vtype can be tuple
            if isinstance(vtype, tuple):
                # vtype is (ft.Variable, dict_of_kwargs)
                variable_class, variable_kwargs = vtype[0], vtype[1]
                variables.append(variable_class(vid, self, **variable_kwargs))
            else:
                variables.append(vtype(vid, self))

        # convert data once we've inferred
        self.df = convert_all_variable_data(df=self.df,
                                            variable_types=inferred_variable_types)
        # make sure index is at the beginning
        index_variable = [v for v in variables
                          if v.id == index][0]
        self.variables = [index_variable] + [v for v in variables
                                             if v.id != index]
Exemple #4
0
    def _create_variables(self, variable_types, index, time_index, secondary_time_index):
        """Extracts the variables from a dataframe

        Infers the types of the columns of ``self.df`` that are not listed in
        ``variable_types``, instantiates one variable per column, converts
        ``self.df`` to the resolved types, and stores the variables on
        ``self.variables`` with the index variable first.

        Args:
            variable_types (dict[str -> dict[str -> type]]) : An entity's
                variable_types dict maps string variable ids to types (:class:`.Variable`)
                or (type, kwargs) to pass keyword arguments to the Variable.
                May be ``None`` or empty. The passed dict is never modified.
            index (str): Name of index column
            time_index (str or None): Name of time_index column
            secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns
                that each map to a list of columns that depend on that secondary time
        """
        # Work on a private copy: the index entry added below must not leak
        # into the dict object owned by the caller.
        variable_types = variable_types.copy() if variable_types else {}
        if index not in variable_types:
            variable_types[index] = vtypes.Index

        link_vars = get_linked_vars(self)
        inferred_variable_types = infer_variable_types(self.df,
                                                       link_vars,
                                                       variable_types,
                                                       time_index,
                                                       secondary_time_index)
        # Explicitly supplied types take precedence over inferred ones.
        inferred_variable_types.update(variable_types)

        variables = []
        for vid, vtype in inferred_variable_types.items():
            # TODO document how vtype can be tuple
            if isinstance(vtype, tuple):
                # vtype is (ft.Variable, dict_of_kwargs)
                variable_class, variable_kwargs = vtype[0], vtype[1]
                variables.append(variable_class(vid, self, **variable_kwargs))
            else:
                variables.append(vtype(vid, self))

        # convert data once we've inferred
        self.df = convert_all_variable_data(df=self.df,
                                            variable_types=inferred_variable_types)
        # make sure index is at the beginning
        index_variable = [v for v in variables
                          if v.id == index][0]
        self.variables = [index_variable] + [v for v in variables
                                             if v.id != index]
def test_infer_variable_types_empty_df():
    """Type inference on a frame with zero rows must follow each column's dtype."""
    column_names = [
        "id",
        "empty_int",
        "empty_category",
        "empty_object",
        "empty_date",
        "empty_boolean",
    ]
    empty_df = pd.DataFrame({name: [] for name in column_names})

    # Give every non-id column an explicit dtype, one column at a time.
    target_dtypes = [
        ('empty_int', 'int'),
        ('empty_category', 'category'),
        ('empty_object', 'object'),
        ('empty_date', 'datetime64[ns]'),
        ('empty_boolean', bool),
    ]
    for column, dtype in target_dtypes:
        empty_df[column] = empty_df[column].astype(dtype)

    variable_types = {'id': vtypes.Index}

    inferred = infer_variable_types(
        df=empty_df,
        variable_types=variable_types,
        link_vars=[],
        time_index=None,
        secondary_time_index={})

    # The pre-declared column must not show up among the inferred ones.
    assert "id" not in inferred

    # Expected inferred type per column.
    expected_types = [
        ('empty_int', vtypes.Numeric),
        ('empty_category', vtypes.Categorical),
        ('empty_object', vtypes.Categorical),
        ('empty_boolean', vtypes.Boolean),
        ('empty_date', vtypes.Datetime),
    ]
    for column, expected in expected_types:
        assert inferred[column] == expected

    # Pre-declared plus inferred columns must account for the whole frame.
    n_accounted_for = len(variable_types) + len(inferred)
    assert n_accounted_for == len(empty_df.columns)