Beispiel #1
0
def test_Q():
    """Check how verbs treat column names that are not plain identifiers.

    The frame has a dotted column (``var.name``) and a column named after
    a Python keyword (``class``). Unquoted use inside an expression must
    raise, while the ``Q("...")`` form and a bare ``class`` must not.
    """
    frame = pd.DataFrame({'var.name': [1, 2, 3], 'class': [1, 2, 3]})

    # The unquoted dotted name raises NameError in both verbs
    for verb in (define, create):
        with pytest.raises(NameError):
            frame >> verb(y='var.name')

    # A keyword used inside a larger expression raises SyntaxError
    for verb in (define, create):
        with pytest.raises(SyntaxError):
            frame >> verb(y='class+1')

    with pytest.raises(SyntaxError):
        frame >> arrange('class+1')

    # None of the following may raise
    for expr in ('Q("var.name")', 'Q("class")', 'class'):
        for verb in (define, create):
            frame >> verb(y=expr)

    frame >> arrange('class')
    frame >> arrange('Q("class")+1')
Beispiel #2
0
def test_data_as_first_argument():
    """Verify that ``verb(data, ...)`` agrees with ``data >> verb(...)``.

    Each verb is invoked both ways on the same frame and the two results
    are compared, either for equality or, for the sampling verbs, only
    for their length.
    """
    def equals(df1, df2):
        return df1.equals(df2)

    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    assert equals(define(df.copy(), 'x*2'), df.copy() >> define('x*2'))
    assert equals(create(df, 'x*2'), df >> create('x*2'))
    # Sampling verbs are only compared by the number of rows returned
    assert len(sample_n(df, 5)) == len(df >> sample_n(5))
    assert len(sample_frac(df, .3)) == len(df >> sample_frac(.3))
    assert equals(select(df, 'x'), df >> select('x'))
    assert equals(rename(df.copy(), z='x'), df.copy() >> rename(z='x'))
    assert equals(distinct(df), df >> distinct())
    assert equals(arrange(df, 'np.sin(x)'), df >> arrange('np.sin(x)'))
    assert equals(group_by(df, 'x'), df >> group_by('x'))
    assert equals(ungroup(group_by(df, 'x')), df >> group_by('x') >> ungroup())
    assert equals(summarize(df, 'sum(x)'), df >> summarize('sum(x)'))
    assert equals(query(df, 'x % 2'), df >> query('x % 2'))
    assert equals(tally(df, 'x'), df >> tally('x'))

    def xsum(gdf):
        return [gdf['x'].sum()]

    assert equals(do(group_by(df, 'y'), xsum=xsum),
                  df >> group_by('y') >> do(xsum=xsum))

    # The comparison belongs outside len(); len(frame == 4) is the length
    # of a boolean frame and would make the assertion pass unconditionally.
    assert len(head(df, 4)) == 4
    assert len(tail(df, 4)) == 4
Beispiel #3
0
def test_options_context():
    """Exercise the ``options`` context manager with ``modify_input_data``.

    The option must be active only inside the ``with`` block, must be
    restored when the block exits (including via an exception), and
    asking for an unknown option must raise ``ValueError``.
    """
    # Toggling only, no data involved
    set_option('modify_input_data', False)
    assert not get_option('modify_input_data')
    with options(modify_input_data=True):
        assert get_option('modify_input_data')
    assert not get_option('modify_input_data')

    # The effect of the option on a verb
    original = pd.DataFrame({'x': [0, 1, 2, 3]})

    with_y = original >> define(y='2*x')
    assert not original.equals(with_y)

    with options(modify_input_data=True):
        with_z = original >> define(z='3*x')
    assert original.equals(with_z)
    assert original is with_z

    with_w = original >> define(w='4*x')
    assert not original.equals(with_w)

    # An exception raised inside the context must propagate ...
    with pytest.raises(ValueError):
        with options(modify_input_data=True):
            raise ValueError()

    # ... and the option must be back to its previous value afterwards
    assert not get_option('modify_input_data')

    # Unknown option name
    with pytest.raises(ValueError):
        assert not get_option('time_travel')
Beispiel #4
0
def test_define():
    """Test the ``define`` verb on a pandas DataFrame.

    Covers calls with no arguments, positional ``(name, expression)``
    pairs, keyword expressions, array and scalar values, a bare
    expression, a value of the wrong length, grouped frames, results
    with a non-matching index, categorical results and Series values.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    df = pd.DataFrame({'x': x})

    # No args
    df2 = df >> define()
    assert len(df2.columns) == 1

    # All types of args
    df2 = df >> define(('x*2', 'x*2'), ('x*3', 'x*3'),
                       x_sq='x**2',
                       x_cumsum='np.cumsum(x)',
                       y=y,
                       w=9)

    assert len(df2.columns) == 7
    assert all(df2['x*2'] == x * 2)
    assert all(df2['x*3'] == x * 3)
    assert all(df2['x_sq'] == x**2)
    assert all(df2['x_cumsum'] == np.cumsum(x))
    assert all(df2['y'] == y)
    assert all(df2['w'] == 9)

    result = df >> define('x*4')
    assert len(result.columns) == 2

    # Branches: a value whose length does not match the frame
    with pytest.raises(ValueError):
        df >> define(z=[1, 2, 3, 4])

    # Works with group_by
    result = df >> group_by('x < 3') >> define(z='len(x)')
    assert all(result['z'] == [2, 2, 1])

    # Potentially problematic index. The name of this helper is
    # referenced from the expression string below, so it must not change.
    def non_range_index_func(s):
        return pd.Series([11, 12, 13], index=[21, 22, 23])

    result = df >> define(z='non_range_index_func(x)')
    assert all(result['z'] == [11, 12, 13])

    # Can create categorical column
    result = df >> define(xcat='pd.Categorical(x)')
    assert all(result['xcat'] == result['x'])
    # isinstance on the dtype replaces the deprecated
    # pandas.api.types.is_categorical_dtype check.
    assert isinstance(result['xcat'].dtype, pd.CategoricalDtype)

    # Messing with indices
    result = (df >> query('x >= 2') >> group_by('x') >> define(y='x'))
    assert all(result['x'] == result['y'])

    # Do not modify group column
    with pytest.raises(ValueError):
        df >> group_by('x') >> define(x='2*x')

    # Series-like iterables
    # https://github.com/has2k1/plydata/issues/21
    result = df >> define(y=pd.Series(y))
    assert all(result['y'] == y)
Beispiel #5
0
def test_DataOperator():
    """Check verb dispatch on an unsupported container and curried use.

    Piping a ``set`` into a verb must raise ``TypeError``. Calling the
    verb object directly with a frame must yield a result that contains
    the original columns plus the newly defined one.
    """
    unsupported = {1, 2, 3}  # not a recognised datastore
    frame = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]})

    with pytest.raises(TypeError):
        unsupported >> define(z='x')

    # Currying: build the verb first, apply it to the data afterwards
    add_z = define(z=[3, 2, 1])
    result = add_z(frame)
    for column in ('x', 'y', 'z'):
        assert column in result
Beispiel #6
0
def test_distinct():
    """Test ``distinct`` by checking which row labels survive.

    Rows 4 and 5 of the frame are identical in both columns, and ``x``
    alone is additionally duplicated in rows 0 and 1.
    """
    # Row label:               0, 1, 2, 3, 4, 5, 6
    df = pd.DataFrame({'x': [1, 1, 2, 3, 4, 4, 5], 'y': [1, 2, 3, 4, 5, 5, 6]})

    def has_index(frame, labels):
        """Return True if the index of `frame` equals `labels`."""
        return frame.index.equals(pd.Index(labels))

    # All columns; the first of the duplicated rows is retained
    assert has_index(df >> distinct(), [0, 1, 2, 3, 4, 6])

    # Explicit (name, expression) pair plus a new column
    assert has_index(df >> distinct(('x', 'y'), z='x+1'), [0, 1, 2, 3, 4, 6])

    # 'last' retains label 5 instead of label 4
    assert has_index(df >> distinct('last'), [0, 1, 2, 3, 5, 6])

    # False removes both duplicated rows
    assert has_index(df >> distinct(False), [0, 1, 2, 3, 6])

    # Only column x is considered
    assert has_index(df >> distinct(['x']), [0, 2, 3, 4, 6])
    assert has_index(df >> distinct(['x'], 'last'), [1, 2, 3, 5, 6])

    # Only the newly created column is considered
    assert has_index(df >> distinct(z='x%2'), [0, 2])

    # Defining the column beforehand or inside distinct gives the same frame
    via_define = df >> define(z='x%2') >> distinct(['x', 'z'])
    via_distinct = df >> distinct(['x'], z='x%2')
    assert via_define.equals(via_distinct)

    # Too many positional arguments
    with pytest.raises(Exception):
        df >> distinct(['x'], 'last', 'cause_exception')
Beispiel #7
0
    def _apply_transforms(df, definitions):
        """
        Compute derived columns from raw Census data and keep only those.

        Args:
            df (pandas.DataFrame): Raw data queried from the Census API.
            definitions (List[Tuple[str, str]]): (name, definition) pairs.
                Each definition is a string holding a valid Python
                expression that may reference columns of `df` by name,
                for example:

                [
                    (
                        "Column Name",
                        "(B02001_001E - B02001_002E) / B02001_001E"
                    )
                ]

                That expression uses the census variables B02001_001E
                (population, all races) and B02001_002E (population,
                white) to calculate the proportion of a geography's
                population identifying as a race other than White.

                See https://plydata.readthedocs.io/en/latest/generated/plydata.one_table_verbs.define.html
                for more on how these expressions are evaluated.

        Returns:
            pandas.DataFrame: The transformed columns; every column that
            was present in `df` on entry is dropped from the result.
        """
        # Remember the raw columns before any new ones are defined
        raw_columns = df.columns.values
        with_derived = plydata.define(df, *definitions)
        return with_derived.drop(raw_columns, axis=1)
Beispiel #8
0
def create_readme_image():
    """Plot sin(x) over [0, 2*pi] and save it as ``readme-image.png``.

    The line is coloured by a ``sign`` column built with ``define_where``
    from the condition ``y>=0`` (presumably "positive" where it holds
    and "negative" elsewhere; confirm against the plydata docs).
    """
    wave = pd.DataFrame({'x': np.linspace(0, 2 * np.pi, 500)})
    wave = wave >> define(y='np.sin(x)')
    wave = wave >> define_where('y>=0', sign=('"positive"', '"negative"'))

    layers = ggplot(aes('x', 'y')) + geom_line(aes(color='sign'), size=1.5)
    plot = wave >> layers
    plot.save('readme-image.png', width=6, height=4)
Beispiel #9
0
def test_define():
    """Test ``define`` on a ``custom_dict`` data store.

    Checks the call without arguments and a call mixing positional
    ``(name, expression)`` pairs, keyword expressions and an array value.
    """
    x = np.array([1, 2, 3])
    y = np.array([4, 5, 6])
    store = custom_dict({'x': x})

    # Without arguments the store still holds a single entry
    store >> define()
    assert len(store) == 1

    # Every supported kind of argument in one call
    result = store >> define(
        ('x*2', 'x*2'),
        ('x*3', 'x*3'),
        x_sq='x**2',
        x_cumsum='np.cumsum(x)',
        y=y)

    assert len(result) == 6

    expected = {
        'x*2': x * 2,
        'x*3': x * 3,
        'x_sq': x**2,
        'x_cumsum': np.cumsum(x),
        'y': y,
    }
    for key, values in expected.items():
        assert all(result[key] == values)
Beispiel #10
0
def test_call():
    """Test the ``call`` verb with a function and with method names.

    ``call`` is given an external function, the name of a dataframe
    method, and a method name together with keyword arguments.
    """
    def remove_column_a(df):
        # Returns a new frame; the input keeps its column 'a'
        return df.drop(columns=['a'])

    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, np.nan]})

    # External function
    without_a = frame >> call(remove_column_a)
    assert 'a' not in without_a
    assert 'b' in without_a

    # Dataframe method: the row holding the NaN is dropped
    complete_rows = frame >> call('.dropna')
    assert len(complete_rows) == 2

    # Dataframe method with arguments: the column holding the NaN is dropped
    complete_cols = frame >> define(c='a*2') >> call('.dropna', axis=1)
    assert 'a' in complete_cols
    assert 'b' not in complete_cols
    assert 'c' in complete_cols
Beispiel #11
0
def test_data_mutability():
    """Pin down which verbs modify their input data and which do not.

    The first section runs with the options as they are on entry and
    expects the input to be left alone. The second section switches on
    ``modify_input_data`` and checks each verb's behaviour under it. The
    option is reset to ``False`` on exit even when an assertion fails,
    so a failure here cannot change the behaviour of later tests.
    """
    # These tests affirm that we know the consequences of the verbs.
    # A test in the Mutable section should not fail without a change
    # in implementation. That change should be triggered when Pandas
    # implements a consistent copy-on-write policy.
    #
    # When a test in the mutable section fails, it is bad news. There
    # should be no memory usage gains by reusing the original data,
    # except for the case of `rename`.
    df = pd.DataFrame({'x': [0, 1, 2, 3, 4, 5], 'y': [0, 0, 1, 1, 2, 3]})

    # Default to not mutable
    df >> define(z='x**2')
    assert 'z' not in df

    df >> group_by(z='x**2')
    assert 'z' not in df

    arr = df >> pull('x')
    arr[0] = 99
    assert df.loc[0, 'x'] != 99

    df2 = df >> slice_rows(3)
    df2.loc[0, 'x'] = 999
    assert df.loc[0, 'x'] != 999

    set_option('modify_input_data', True)
    try:
        # Mutable
        df2 = df.copy()
        df2 >> define(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        df2 >> group_by(z='x**2')
        assert 'z' in df2

        df2 = df.copy()
        arr = df2 >> pull('x')
        arr[0] = 99
        assert df2.loc[0, 'x'] == 99

        # Not mutable
        df2 = df.copy()
        df2 >> create(z='x**2')
        assert 'z' not in df2

        df2 >> sample_n(3) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> sample_frac(.5) >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> select('x', 'y') >> define(z='x**2')
        assert 'z' not in df2

        # dataframe.rename has copy-on-write (if copy=False) that affects
        # only the new frame. This creates possibility for "action at a
        # distance" effects on the new frame when the original is modified
        result = df2 >> rename(x='z')
        df2['y'] = 3
        result['x'] = 4
        assert 'z' not in df2
        assert df2.loc[0, 'y'] != 4
        assert result.loc[0, 'x'] != 3
        assert result is df2

        df2 >> arrange('x') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> query('x%2') >> define(z='x**2')
        assert 'z' not in df2

        df2 >> group_indices(z='x%2')
        assert 'z' not in df2
    finally:
        # Always undo the global option, even if an assertion above failed
        set_option('modify_input_data', False)
Beispiel #12
0
 def test_arrange(self):
     """``define`` followed by ``arrange`` yields a GroupedDataFrame."""
     # self.df is set up outside this method (presumably a grouped frame)
     with_z = self.df >> define(z='np.sin(x)')
     ordered = with_z >> arrange('z')
     assert isinstance(ordered, GroupedDataFrame)
Beispiel #13
0
 def test_define(self):
     """``define`` on a copy of ``self.df`` yields a GroupedDataFrame."""
     # Work on a copy so that the shared fixture is never touched
     frame = self.df.copy()
     with_z = frame >> define(z='2*x')
     assert isinstance(with_z, GroupedDataFrame)
Beispiel #14
0
 def test_define(self):
     """Run the shared ``_test`` check on a ``define`` verb."""
     self._test(define(y='x*2'))