Example #1
    def test_time(self):
        t = datetime(1, 1, 1, 3, 30, 0)
        deltas = np.random.randint(1, 20, 3).cumsum()
        ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
        df = DataFrame({'a': np.random.randn(len(ts)),
                        'b': np.random.randn(len(ts))},
                       index=ts)
        ax = df.plot()

        # verify tick labels
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime('%H:%M:%S')
                self.assertEqual(xp, rs)

        # change xlim
        ax.set_xlim('1:30', '5:00')

        # check tick labels again
        ticks = ax.get_xticks()
        labels = ax.get_xticklabels()
        for t, l in zip(ticks, labels):
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            xp = l.get_text()
            if len(xp) > 0:
                rs = time(h, m, s).strftime('%H:%M:%S')
                self.assertEqual(xp, rs)
Example #2
    def components(self):
        """
        Return a dataframe of the components (days, hours, minutes,
        seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.

        Returns
        -------
        DataFrame
        """
        from pandas import DataFrame

        columns = ['days', 'hours', 'minutes', 'seconds',
                   'milliseconds', 'microseconds', 'nanoseconds']
        hasnans = self.hasnans
        if hasnans:
            def f(x):
                if isnull(x):
                    return [np.nan] * len(columns)
                return x.components
        else:
            def f(x):
                return x.components

        result = DataFrame([f(x) for x in self])
        result.columns = columns
        if not hasnans:
            result = result.astype('int64')
        return result
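
A quick usage sketch of the accessor above, via the public pandas equivalent on a TimedeltaIndex (illustrative only; the NaT row exercises the hasnans branch):

import pandas as pd

tdi = pd.TimedeltaIndex(['1 days 02:30:45', pd.NaT])
print(tdi.components)
# one column per unit (days ... nanoseconds); the NaT row is all-NaN,
# so the result stays float instead of being cast to int64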
Example #3
 def test_query_single_element_booleans(self, parser, engine):
     columns = 'bid', 'bidsize', 'ask', 'asksize'
     data = np.random.randint(2, size=(1, len(columns))).astype(bool)
     df = DataFrame(data, columns=columns)
     res = df.query('bid & ask', engine=engine, parser=parser)
     expected = df[df.bid & df.ask]
     assert_frame_equal(res, expected)
Example #4
    def test_append_concat(self):
        rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
        ts = Series(np.random.randn(len(rng)), rng)
        df = DataFrame(np.random.randn(len(rng), 4), index=rng)

        result = ts.append(ts)
        result_df = df.append(df)
        ex_index = DatetimeIndex(np.tile(rng.values, 2))
        tm.assert_index_equal(result.index, ex_index)
        tm.assert_index_equal(result_df.index, ex_index)

        appended = rng.append(rng)
        tm.assert_index_equal(appended, ex_index)

        appended = rng.append([rng, rng])
        ex_index = DatetimeIndex(np.tile(rng.values, 3))
        tm.assert_index_equal(appended, ex_index)

        # different index names
        rng1 = rng.copy()
        rng2 = rng.copy()
        rng1.name = 'foo'
        rng2.name = 'bar'
        assert rng1.append(rng1).name == 'foo'
        assert rng1.append(rng2).name is None
Example #5
def compute_confusion_matrix(target, predicted, normalize=True, sort=True):
    """ Returns a confusion matrix as a data frame with labels.
    Parameters:
        target (array): The true values being predicted.
        predicted (array): The predicted values.
        normalize (bool): If True, normalize each row to sum to 1.
        sort (bool): If True, sort rows and columns by their max value.
    Returns (DataFrame): df with the confusion matrix.
    """

    # Determine the unique values in the target list, sort them and assign as labels.
    labels = np.unique(list(target))
    labels.sort()

    # Compute the confusion matrix, place into a data frame and normalize if desired.
    confusion = metrics.confusion_matrix(target, predicted, labels=labels)
    confusion = DataFrame(confusion, index=labels, columns=labels)
    if normalize:
        confusion = confusion.apply(lambda x: x / np.sum(x), axis=1)

    # If sort is True: find the max value in each row, then reorder the confusion matrix.
    if sort:
        # Get the max values, order them, then apply that order to both axes.
        max_values = confusion.max(axis=1)
        max_values = max_values.sort_values(ascending=False)
        order = max_values.index
        confusion = confusion.loc[order, order]
    return confusion
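
A minimal usage sketch for compute_confusion_matrix above (the toy label lists are made up for illustration; assumes the same imports the function itself relies on):

import numpy as np
from pandas import DataFrame
from sklearn import metrics

target = ['cat', 'dog', 'cat', 'bird', 'dog', 'cat']
predicted = ['cat', 'dog', 'bird', 'bird', 'dog', 'dog']
cm = compute_confusion_matrix(target, predicted, normalize=True, sort=False)
print(cm)  # rows are true labels, columns predicted labels; each row sums to 1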
Example #6
    def test_nested_scope(self):
        from pandas.core.computation.ops import UndefinedVariableError
        engine = self.engine
        parser = self.parser
        # smoke test
        x = 1  # noqa
        result = pd.eval('x + 1', engine=engine, parser=parser)
        assert result == 2

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))

        # don't have the pandas parser
        with pytest.raises(SyntaxError):
            df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

        with pytest.raises(UndefinedVariableError):
            df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

        expected = df[(df > 0) & (df2 > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                         parser=parser)
        assert_frame_equal(expected, result)

        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                         engine=engine, parser=parser)
        assert_frame_equal(expected, result)
Example #7
 def test_query_with_nested_special_character(self, parser, engine):
     skip_if_no_pandas_parser(parser)
     df = DataFrame({'a': ['a', 'b', 'test & test'],
                     'b': [1, 2, 3]})
     res = df.query('a == "test & test"', parser=parser, engine=engine)
     expec = df[df.a == 'test & test']
     assert_frame_equal(res, expec)
Example #8
    def test_query_with_partially_named_multiindex(self, parser, engine):
        skip_if_no_pandas_parser(parser)
        a = np.random.choice(['red', 'green'], size=10)
        b = np.arange(10)
        index = MultiIndex.from_arrays([a, b])
        index.names = [None, 'rating']
        df = DataFrame(np.random.randn(10, 2), index=index)
        res = df.query('rating == 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind == 1]
        assert_frame_equal(res, exp)

        res = df.query('rating != 1', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values('rating').values, index=index,
                     name='rating')
        exp = df[ind != 1]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind == "red"]
        assert_frame_equal(res, exp)

        res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
        ind = Series(df.index.get_level_values(0).values, index=index)
        exp = df[ind != "red"]
        assert_frame_equal(res, exp)
Example #9
    def test_nested_scope(self):
        engine = self.engine
        parser = self.parser

        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))
        expected = df[(df > 0) & (df2 > 0)]

        result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
                          parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
                         parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                         engine=engine, parser=parser)
        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        assert_frame_equal(result, expected)

        result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
        expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
        assert_frame_equal(result, expected)
Example #10
 def test_stable_descending_sort(self):
     # GH #6399
     df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
                    columns=['sort_col', 'order'])
     sorted_df = df.sort_values(by='sort_col', kind='mergesort',
                                ascending=False)
     assert_frame_equal(df, sorted_df)
Example #11
class TestDataFrameEvalWithFrame(object):

    def setup_method(self, method):
        self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc'))

    def teardown_method(self, method):
        del self.frame

    def test_simple_expr(self, parser, engine):
        res = self.frame.eval('a + b', engine=engine, parser=parser)
        expect = self.frame.a + self.frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self, parser, engine):
        res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser)
        expect = self.frame.a[self.frame.a < 1] + self.frame.b
        assert_series_equal(res, expect)

    @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval('a {0} b'.format(op), engine=engine, parser=parser)
Example #12
 def test_frame_datetime64_handling_groupby(self):
     # it works!
     df = DataFrame([(3, np.datetime64('2012-07-03')),
                     (3, np.datetime64('2012-07-04'))],
                    columns=['a', 'date'])
     result = df.groupby('a').first()
     assert result['date'][3] == Timestamp('2012-07-03')
Example #13
 def test_groupby_max_datetime64(self):
     # GH 5869
     # datetimelike dtype conversion from int
     df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
     expected = df.groupby('A')['A'].apply(lambda x: x.max())
     result = df.groupby('A')['A'].max()
     assert_series_equal(result, expected)
Example #14
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        expected = (
            df.groupby('user_id')['whole_cost']
              .resample(freq)
              .sum(min_count=1)  # XXX
              .dropna()
              .reorder_levels(['date', 'user_id'])
              .sort_index()
              .astype('int64')
        )
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)
Example #15
    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)
Example #16
    def test_crosstab_margins(self):
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)

        df = DataFrame({'a': a, 'b': b, 'c': c})

        result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
                          margins=True)

        self.assertEqual(result.index.names, ('a',))
        self.assertEqual(result.columns.names, ['b', 'c'])

        all_cols = result['All', '']
        exp_cols = df.groupby(['a']).size().astype('i8')
        exp_cols = exp_cols.append(Series([len(df)], index=['All']))

        tm.assert_series_equal(all_cols, exp_cols)

        all_rows = result.loc['All']
        exp_rows = df.groupby(['b', 'c']).size().astype('i8')
        exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))

        exp_rows = exp_rows.reindex(all_rows.index)
        exp_rows = exp_rows.fillna(0).astype(np.int64)
        tm.assert_series_equal(all_rows, exp_rows)
Example #17
    def test_coercion_with_loc(self):
        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
            start_dataframe = DataFrame({'foo': start_data})
            start_dataframe.loc[0, ['foo']] = None

            expected_dataframe = DataFrame({'foo': expected_result})
            tm.assert_frame_equal(start_dataframe, expected_dataframe)
Example #18
    def test_parse_dates_noconvert_thousands(self):
        # see gh-14066
        data = 'a\n04.15.2016'

        expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
        result = self.read_csv(StringIO(data), parse_dates=['a'],
                               thousands='.')
        tm.assert_frame_equal(result, expected)

        exp_index = DatetimeIndex(['2016-04-15'], name='a')
        expected = DataFrame(index=exp_index)
        result = self.read_csv(StringIO(data), index_col=0,
                               parse_dates=True, thousands='.')
        tm.assert_frame_equal(result, expected)

        data = 'a,b\n04.15.2016,09.16.2013'

        expected = DataFrame([[datetime(2016, 4, 15),
                               datetime(2013, 9, 16)]],
                             columns=['a', 'b'])
        result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
                               thousands='.')
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[datetime(2016, 4, 15),
                               datetime(2013, 9, 16)]],
                             columns=['a', 'b'])
        expected = expected.set_index(['a', 'b'])
        result = self.read_csv(StringIO(data), index_col=[0, 1],
                               parse_dates=True, thousands='.')
        tm.assert_frame_equal(result, expected)
Example #19
    def test_groupby_function_tuple_1677(self):
        df = DataFrame(np.random.rand(100),
                       index=date_range("1/1/2000", periods=100))
        monthly_group = df.groupby(lambda x: (x.year, x.month))

        result = monthly_group.mean()
        assert isinstance(result.index[0], tuple)
Example #20
 def test_frame_reset_index(self):
     dr = date_range('2012-06-02', periods=10, tz='US/Eastern')
     df = DataFrame(np.random.randn(len(dr)), dr)
     roundtripped = df.reset_index().set_index('index')
     xp = df.index.tz
     rs = roundtripped.index.tz
     self.assertEqual(xp, rs)
Example #21
    def test_join_aware(self):
        rng = date_range('1/1/2011', periods=10, freq='H')
        ts = Series(np.random.randn(len(rng)), index=rng)

        ts_utc = ts.tz_localize('utc')

        self.assertRaises(Exception, ts.__add__, ts_utc)
        self.assertRaises(Exception, ts_utc.__add__, ts)

        test1 = DataFrame(np.zeros((6, 3)),
                          index=date_range("2012-11-15 00:00:00", periods=6,
                                           freq="100L", tz="US/Central"))
        test2 = DataFrame(np.zeros((3, 3)),
                          index=date_range("2012-11-15 00:00:00", periods=3,
                                           freq="250L", tz="US/Central"),
                          columns=range(3, 6))

        result = test1.join(test2, how='outer')
        ex_index = test1.index.union(test2.index)

        self.assertTrue(result.index.equals(ex_index))
        self.assertTrue(result.index.tz.zone == 'US/Central')

        # non-overlapping
        rng = date_range("2012-11-15 00:00:00", periods=6,
                         freq="H", tz="US/Central")

        rng2 = date_range("2012-11-15 12:00:00", periods=6,
                          freq="H", tz="US/Eastern")

        result = rng.union(rng2)
        self.assertTrue(result.tz.zone == 'UTC')
Example #22
    def test_loc_setitem_frame_multiples(self):
        # multiple setting
        df = DataFrame({'A': ['foo', 'bar', 'baz'],
                        'B': Series(
                            range(3), dtype=np.int64)})
        rhs = df.loc[1:2]
        rhs.index = df.index[0:2]
        df.loc[0:1] = rhs
        expected = DataFrame({'A': ['bar', 'baz', 'baz'],
                              'B': Series(
                                  [1, 2, 2], dtype=np.int64)})
        tm.assert_frame_equal(df, expected)

        # multiple setting with frame on rhs (with M8)
        df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'),
                        'val': Series(
                            range(5), dtype=np.int64)})
        expected = DataFrame({'date': [Timestamp('20000101'), Timestamp(
            '20000102'), Timestamp('20000101'), Timestamp('20000102'),
            Timestamp('20000103')],
            'val': Series(
            [0, 1, 0, 1, 2], dtype=np.int64)})
        rhs = df.loc[0:2]
        rhs.index = df.index[2:5]
        df.loc[2:4] = rhs
        tm.assert_frame_equal(df, expected)
Example #23
    def test_to_string_format_na(self):
        fmt.reset_printoptions()
        df = DataFrame({'A' : [np.nan, -1, -2.1234, 3, 4],
                        'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
        result = df.to_string()

        expected = ('        A       B\n'
                    '0     NaN     NaN\n'
                    '1 -1.0000     foo\n'
                    '2 -2.1234   foooo\n'
                    '3  3.0000  fooooo\n'
                    '4  4.0000     bar')
        self.assertEqual(result, expected)

        df = DataFrame({'A' : [np.nan, -1., -2., 3., 4.],
                        'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']})
        result = df.to_string()

        expected = ('    A       B\n'
                    '0 NaN     NaN\n'
                    '1  -1     foo\n'
                    '2  -2   foooo\n'
                    '3   3  fooooo\n'
                    '4   4     bar')
        self.assertEqual(result, expected)
Example #24
    def test_to_string_repr_unicode(self):
        buf = StringIO()

        unicode_values = [u'\u03c3'] * 10
        unicode_values = np.array(unicode_values, dtype=object)
        df = DataFrame({'unicode' : unicode_values})
        df.to_string(col_space=10, buf=buf)

        # it works!
        repr(df)

        idx = Index(['abc', u'\u03c3a', 'aegdvg'])
        ser = Series(np.random.randn(len(idx)), idx)
        rs = repr(ser).split('\n')
        line_len = len(rs[0])
        for line in rs[1:]:
            try:
                line = line.decode('utf-8')
            except:
                pass
            self.assert_(len(line) == line_len)

        # it works even if sys.stdin is None
        sys.stdin = None
        repr(df)
        sys.stdin = sys.__stdin__
Example #25
 def test_to_string_with_formatters_unicode(self):
     df = DataFrame({u'c/\u03c3':[1,2,3]})
     result = df.to_string(formatters={u'c/\u03c3': lambda x: '%s' % x})
     self.assertEqual(result, (u'  c/\u03c3\n'
                                '0   1\n'
                                '1   2\n'
                                '2   3'))
Example #26
    def test_resample_anchored_intraday(self):
        # #1471, #1458

        rng = date_range('1/1/2012', '4/1/2012', freq='100min')
        df = DataFrame(rng.month, index=rng)

        result = df.resample('M')
        expected = df.resample('M', kind='period').to_timestamp(how='end')
        tm.assert_frame_equal(result, expected)

        result = df.resample('M', closed='left')
        exp = df.tshift(1, freq='D').resample('M', kind='period')
        exp = exp.to_timestamp(how='end')

        tm.assert_frame_equal(result, exp)

        rng = date_range('1/1/2012', '4/1/2012', freq='100min')
        df = DataFrame(rng.month, index=rng)

        result = df.resample('Q')
        expected = df.resample('Q', kind='period').to_timestamp(how='end')
        tm.assert_frame_equal(result, expected)

        result = df.resample('Q', closed='left')
        expected = df.tshift(1, freq='D').resample('Q', kind='period',
                                                   closed='left')
        expected = expected.to_timestamp(how='end')
        tm.assert_frame_equal(result, expected)

        ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h')
        resampled = ts.resample('M')
        self.assert_(len(resampled) == 1)
Example #27
    def test_eng_float_formatter(self):
        df = DataFrame({'A' : [1.41, 141., 14100, 1410000.]})

        fmt.set_eng_float_format()
        result = df.to_string()
        expected = ('             A\n'
                    '0    1.410E+00\n'
                    '1  141.000E+00\n'
                    '2   14.100E+03\n'
                    '3    1.410E+06')
        self.assertEqual(result, expected)

        fmt.set_eng_float_format(use_eng_prefix=True)
        result = df.to_string()
        expected = ('         A\n'
                    '0    1.410\n'
                    '1  141.000\n'
                    '2  14.100k\n'
                    '3   1.410M')
        self.assertEqual(result, expected)

        fmt.set_eng_float_format(accuracy=0)
        result = df.to_string()
        expected = ('         A\n'
                    '0    1E+00\n'
                    '1  141E+00\n'
                    '2   14E+03\n'
                    '3    1E+06')
        self.assertEqual(result, expected)

        fmt.reset_printoptions()
Example #28
    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432
        columns = ['A', 'B', 'A', 'B']
        categories = ['B', 'A']
        data = np.ones((5, 4), int)
        cat_columns = CategoricalIndex(columns,
                                       categories=categories,
                                       ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = 2 * np.ones((5, 2), int)

        if observed:
            # if we are not-observed we undergo a reindex
            # so need to adjust the output as our expected sets us up
            # to be non-observed
            expected_columns = CategoricalIndex(['A', 'B'],
                                                categories=categories,
                                                ordered=True)
        else:
            expected_columns = CategoricalIndex(categories,
                                                categories=categories,
                                                ordered=True)
        expected = DataFrame(data=expected_data, columns=expected_columns)
        assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        assert_frame_equal(result, expected)
Example #29
def test_group_selection_cache():
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    expected = df.iloc[[0, 2]].set_index('A')

    g = df.groupby('A')
    result1 = g.head(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    assert_frame_equal(result1, df)
    assert_frame_equal(result2, expected)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.head(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)

    g = df.groupby('A')
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    assert_frame_equal(result1, expected)
    assert_frame_equal(result2, df)
Example #30
 def test_frame_reset_index(self):
     dr = date_range("2012-06-02", periods=10, tz=self.tzstr("US/Eastern"))
     df = DataFrame(np.random.randn(len(dr)), dr)
     roundtripped = df.reset_index().set_index("index")
     xp = df.index.tz
     rs = roundtripped.index.tz
     self.assertEqual(xp, rs)
Example #31
from pandas import DataFrame


print("HELLO!")

df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

print(df.head())

from test_folder.my_mod import enlarge

x = 11
print(enlarge(x))
Example #32
def _apply_query_metadata(df: pd.DataFrame,
                          query_metadata: _QueryMetadata) -> pd.DataFrame:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
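        # pandas warns when setting a non-standard attribute on a DataFrame; ignore it here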
        df.query_metadata = query_metadata.raw_payload
    return df
Example #33
def heston_calibration(df_option, ival=None):
    """
    calibrate heston model
    """

    # extract rates and div yields from the data set
    df_tmp = df_option.filter(items=['dtExpiry', 'iRate', 'iDiv'])
    grouped = df_tmp.groupby('dtExpiry')

    def aggregate(serie):
        return serie[serie.index[0]]

    df_rates = grouped.agg(aggregate)

    # Get first index:
    first_index = 0

    dtTrade = df_option['dtTrade'][first_index]
    # back out the spot from any forward
    iRate = df_option['iRate'][first_index]
    iDiv = df_option['iDiv'][first_index]
    TTM = df_option['TTM'][first_index]
    Fwd = df_option['Fwd'][first_index]
    spot = SimpleQuote(Fwd*np.exp(-(iRate-iDiv)*TTM))
    print('Spot: %f risk-free rate: %f div. yield: %f' % (spot.value, iRate, iDiv))

    # build array of option helpers
    hh = heston_helpers(spot, df_option, dtTrade, df_rates)
    options = hh['options']
    spot = hh['spot']

    risk_free_ts = dfToZeroCurve(df_rates['iRate'], dtTrade)
    dividend_ts = dfToZeroCurve(df_rates['iDiv'], dtTrade)

    # initial values for parameters
    if ival is None:
        ival = {'v0': 0.1, 'kappa': 1.0, 'theta': 0.1,
                'sigma': 0.5, 'rho': -0.5}

    process = HestonProcess(
        risk_free_ts, dividend_ts, spot, ival['v0'], ival['kappa'],
        ival['theta'], ival['sigma'], ival['rho'])

    model = HestonModel(process)
    engine = AnalyticHestonEngine(model, 64)

    for option in options:
        option.set_pricing_engine(engine)

    om = LevenbergMarquardt(1e-8, 1e-8, 1e-8)
    model.calibrate(
        options, om, EndCriteria(400, 40, 1.0e-8, 1.0e-8, 1.0e-8)
    )

    print('model calibration results:')
    print('v0: %f kappa: %f theta: %f sigma: %f rho: %f' %
          (model.v0, model.kappa, model.theta, model.sigma,
           model.rho))

    calib_error = (1.0/len(options)) * sum(
        [pow(o.calibration_error()*100.0,2) for o in options])

    print('SSE: %f' % calib_error)

    # merge the fitted volatility and the input data set
    return merge_df(df_option, options, 'Heston')
Example #34
 def test_plot_submethod_works(self):
     df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")})
     df.groupby("z").plot.scatter("x", "y")
     tm.close()
     df.groupby("z")["x"].plot.line()
     tm.close()
Example #35
def batch_map_cnot_circuits(source, modes, architectures, n_qubits=None, populations=30, iterations=15, crossover_probs=0.8,
                            mutation_probs=0.5, dest_folder=None, metrics_file=None, n_compile=1):
    modes = make_into_list(modes)
    architectures = make_into_list(architectures)
    populations = make_into_list(populations)
    iterations = make_into_list(iterations)
    crossover_probs = make_into_list(crossover_probs)
    mutation_probs = make_into_list(mutation_probs)

    if not os.path.exists(source):
        raise IOError("Folder does not exist: " + source)

    if os.path.isfile(source):
        source, file = os.path.split(source)
        files = [file]
    else:
        files = [f for f in os.listdir(source) if os.path.isfile(os.path.join(source, f))]

    if dest_folder is None:
        dest_folder = source
    else:
        os.makedirs(dest_folder, exist_ok=True)

    arch_iter = []
    circuits = {}
    metrics = []
    for architecture in architectures:
        if architecture in dynamic_size_architectures:
            if n_qubits is None:
                raise KeyError("Number of qubits not specified for architecture " + architecture)
            else:
                n_qubits = make_into_list(n_qubits)
                arch_iter.extend([create_architecture(architecture, n_qubits=q) for q in n_qubits])
        else:
            arch_iter.append(create_architecture(architecture))
    for architecture in arch_iter:
        circuits[architecture.name] = {}
        for mode in modes:
            if mode == QUIL_COMPILER:
                n_compile_list = range(n_compile)
            else:
                n_compile_list = [None]
            new_dest_folder = os.path.join(dest_folder, architecture.name, mode)
            os.makedirs(new_dest_folder, exist_ok=True)
            if mode in genetic_elim_modes:
                pop_iter = populations
                iter_iter = iterations
                crossover_iter = crossover_probs
                mutation_iter = mutation_probs
                circuits[architecture.name][mode] = {}
            else:
                if mode == QUIL_COMPILER:
                    circuits[architecture.name][mode] = []
                pop_iter = [None]
                iter_iter = [None]
                crossover_iter = [None]
                mutation_iter = [None]

            for population in pop_iter:
                for iteration in iter_iter:
                    for crossover_prob in crossover_iter:
                        for mutation_prob in mutation_iter:
                            for file in files:
                                if os.path.splitext(file)[1].lower() == ".qasm":
                                    origin_file = os.path.join(source, file)
                                    for i in n_compile_list:
                                        dest_filename = create_dest_filename(origin_file, population, iteration, crossover_prob, mutation_prob, i)
                                        dest_file = os.path.join(dest_folder, architecture.name, mode, dest_filename)
                                        try:
                                            start_time = time.time()
                                            circuit = map_cnot_circuit(origin_file, architecture, mode=mode, dest_file=dest_file,
                                                                       population=population, iterations=iteration,
                                                                       crossover_prob=crossover_prob, mutation_prob=mutation_prob)
                                            end_time = time.time()
                                            if metrics_file is not None:
                                                metrics.append(make_metrics(circuit, origin_file, architecture.name, mode, dest_file, population, iteration, crossover_prob, mutation_prob, end_time-start_time, i))
                                            if mode in genetic_elim_modes:
                                                circuits[architecture.name][mode][(population, iteration, crossover_prob, mutation_prob)] = circuit
                                            elif mode == QUIL_COMPILER:
                                                circuits[architecture.name][mode].append(circuit)
                                            else:
                                                circuits[architecture.name][mode] = circuit
                                        except KeyError as e: # Should only happen with quilc
                                            if mode == QUIL_COMPILER:
                                                print("\033[31mCould not compile", origin_file, "into", dest_file, end="\033[0m\n")
                                            else:
                                                raise e

    if len(metrics) > 0 and DataFrame is not None:
        df = DataFrame(metrics)
        if os.path.exists(metrics_file): # append to the file - do not overwrite!
            df.to_csv(metrics_file, columns=get_metric_header(), header=False, index=False, mode='a')
        else:
            df.to_csv(metrics_file, columns=get_metric_header(), index=False)
    return circuits
Example #36
def table(coords_src,
          coords_dest=None,
          ids_origin=None,
          ids_dest=None,
          output='np',
          minutes=False,
          url_config=RequestConfig,
          send_as_polyline=True):
    """
    Function wrapping OSRM 'table' function in order to get a matrix of
    time distance as a numpy array or as a DataFrame

    Parameters
    ----------

    coords_src : list
        A list of coord as (lat, long) , like :
             list_coords = [(21.3224, 45.2358),
                            (21.3856, 42.0094),
                            (20.9574, 41.5286)] (coords have to be float)
    coords_dest : list, optional
        A list of coord as (lat, long) , like :
             list_coords = [(21.3224, 45.2358),
                            (21.3856, 42.0094),
                            (20.9574, 41.5286)] (coords have to be float)
    ids_origin : list, optional
        A list of name/id to use to label the source axis of
        the result `DataFrame` (default: None).
    ids_dest : list, optional
        A list of name/id to use to label the destination axis of
        the result `DataFrame` (default: None).
    output : str, optional
            The type of durations matrix to return (DataFrame or numpy array)
                'raw' for the (parsed) json response from OSRM
                'pandas', 'df' or 'DataFrame' for a DataFrame
                'numpy', 'array' or 'np' for a numpy array (default is "np")
    url_config: osrm.RequestConfig, optional
        Parameters regarding the host, version and profile to use


    Returns
    -------
        - if output=='raw' : a dict, the parsed json response.
        - if output=='np' : a numpy.ndarray containing the time in minutes,
                            a list of snapped origin coordinates,
                            a list of snapped destination coordinates.
        - if output=='pandas' : a labeled DataFrame containing the time matrix in minutes,
                                a list of snapped origin coordinates,
                                a list of snapped destination coordinates.
    """
    if output.lower() in ('numpy', 'array', 'np'):
        output = 1
    elif output.lower() in ('pandas', 'dataframe', 'df'):
        output = 2
    else:
        output = 3

    host = check_host(url_config.host)
    url = ''.join(
        [host, '/table/', url_config.version, '/', url_config.profile, '/'])

    if not send_as_polyline:
        if not coords_dest:
            url = ''.join([
                url, ';'.join([
                    ','.join([str(coord[0]), str(coord[1])])
                    for coord in coords_src
                ])
            ])
        else:
            src_end = len(coords_src)
            dest_end = src_end + len(coords_dest)
            url = ''.join([
                url, ';'.join([
                    ','.join([str(coord[0]), str(coord[1])])
                    for coord in _chain(coords_src, coords_dest)
                ]), '?sources=', ';'.join([str(i) for i in range(src_end)]),
                '&destinations=',
                ';'.join([str(j) for j in range(src_end, dest_end)])
            ])
    else:
        if not coords_dest:
            url = ''.join([
                url, "polyline(",
                polyline_encode([(c[1], c[0]) for c in coords_src]), ")"
            ])
        else:
            src_end = len(coords_src)
            dest_end = src_end + len(coords_dest)
            url = ''.join([
                url, "polyline(",
                polyline_encode([
                    (c[1], c[0]) for c in _chain(coords_src, coords_dest)
                ]), ")", '?sources=',
                ';'.join([str(i) for i in range(src_end)]), '&destinations=',
                ';'.join([str(j) for j in range(src_end, dest_end)])
            ])

    rep = urlopen(url)
    parsed_json = json.loads(rep.read().decode('utf-8'))

    if "code" not in parsed_json or "Ok" not in parsed_json["code"]:
        raise ValueError('No distance table returned by the OSRM instance')

    elif output == 3:
        return parsed_json

    else:
        durations = np.array(parsed_json["durations"], dtype=float)
        new_src_coords = [ft["location"] for ft in parsed_json["sources"]]
        new_dest_coords = None if not coords_dest \
            else [ft["location"] for ft in parsed_json["destinations"]]

        if minutes:  # Conversion in minutes with 2 decimals:
            durations = np.around((durations / 60), 2)
        if output == 2:
            if not ids_origin:
                ids_origin = [i for i in range(len(coords_src))]
            if not ids_dest:
                ids_dest = ids_origin if not coords_dest \
                    else [i for i in range(len(coords_dest))]

            durations = DataFrame(durations,
                                  index=ids_origin,
                                  columns=ids_dest,
                                  dtype=float)

        return durations, new_src_coords, new_dest_coords
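
A hedged usage sketch of table() above (assumes a reachable OSRM server configured through the default RequestConfig; the coordinates are the docstring's own placeholders):

list_coords = [(21.3224, 45.2358),
               (21.3856, 42.0094),
               (20.9574, 41.5286)]
durations, snapped_src, snapped_dest = table(list_coords,
                                             output='pandas',
                                             minutes=True)
print(durations)  # labeled DataFrame of pairwise travel times, in minutes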
Example #37
    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({"value": value})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)

        s = df["value_group"]
        expected = s
        tm.assert_series_equal(s.astype("category"), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = r"could not convert string to float|invalid literal for float\(\)"
        with pytest.raises(ValueError, match=msg):
            s.astype("float64")

        cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        tm.assert_series_equal(cat.astype("str"), exp)
        s2 = Series(Categorical(["1", "2", "3", "4"]))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype("int"), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name="value_group")
        cmp(s.astype("object"), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        tm.assert_series_equal(s.astype("category"), s)
        tm.assert_series_equal(s.astype(CategoricalDtype()), s)

        roundtrip_expected = s.cat.set_categories(
            s.cat.categories.sort_values()).cat.remove_unused_categories()
        tm.assert_series_equal(
            s.astype("object").astype("category"), roundtrip_expected)
        tm.assert_series_equal(
            s.astype("object").astype(CategoricalDtype()), roundtrip_expected)

        # invalid conversion (these are NOT a dtype)
        msg = ("dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
               "not understood")

        for invalid in [
                lambda x: x.astype(Categorical),
                lambda x: x.astype("object").astype(Categorical),
        ]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)
Example #38
def test_cython_transform_frame(op, args, targop):
    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list("qwertyuiopasdfghjklz")
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame(
        {
            "float": s,
            "float_missing": s_missing,
            "int": [1, 1, 1, 1, 2] * 200,
            "datetime": pd.date_range("1990-1-1", periods=1000),
            "timedelta": pd.timedelta_range(1, freq="s", periods=1000),
            "string": strings * 50,
            "string_missing": strings_missing * 50,
        },
        columns=[
            "float",
            "float_missing",
            "int",
            "datetime",
            "timedelta",
            "string",
            "string_missing",
        ],
    )
    df["cat"] = df["string"].astype("category")

    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for df in [df, df2]:
        for gb_target in [
            dict(by=labels),
            dict(level=0),
            dict(by="string"),
        ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # bit a of hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == "shift":
                gb._set_group_selection()

            if op != "shift" and "int" not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                i = gb[["int"]].apply(targop)
                f = gb[["float", "float_missing"]].apply(targop)
                expected = pd.concat([f, i], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1))
            tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))
            # individual columns
            for c in df:
                if c not in ["float", "int", "float_missing"] and op != "shift":
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[c].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[c], op)()
                else:
                    expected = gb[c].apply(targop)
                    expected.name = c
                    tm.assert_series_equal(expected, gb[c].transform(op, *args))
                    tm.assert_series_equal(expected, getattr(gb[c], op)(*args))
Example #39
def test_transform():
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12

    # GH 8046
    # make sure that we preserve the input order

    df = DataFrame(
        np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1]
    )
    key = [0, 0, 1]
    expected = (
        df.sort_index()
        .groupby(key)
        .transform(lambda x: x - x.mean())
        .groupby(key)
        .mean()
    )
    result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(key).mean()
    assert_frame_equal(result, expected)

    def demean(arr):
        return arr - arr.mean()

    people = DataFrame(
        np.random.randn(5, 5),
        columns=["a", "b", "c", "d", "e"],
        index=["Joe", "Steve", "Wes", "Jim", "Travis"],
    )
    key = ["one", "two", "one", "two", "one"]
    result = people.groupby(key).transform(demean).groupby(key).mean()
    expected = people.groupby(key).apply(demean).groupby(key).mean()
    assert_frame_equal(result, expected)

    # GH 8430
    df = tm.makeTimeDataFrame()
    g = df.groupby(pd.Grouper(freq="M"))
    g.transform(lambda x: x - 1)

    # GH 9700
    df = DataFrame({"a": range(5, 10), "b": range(5)})
    result = df.groupby("a").transform(max)
    expected = DataFrame({"b": range(5)})
    tm.assert_frame_equal(result, expected)
Example #40
__author__ = 'farhan'

from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import gov_data_fetcher
import matplotlib.pyplot as plt

records = gov_data_fetcher.fetch_records()
frame = DataFrame(records)
results = Series([x.split()[0] for x in frame.a.dropna()])
cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows',
                            'Not Windows')
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
indexer = agg_counts.sum(1).argsort()
count_subset = agg_counts.take(indexer)[-10:]
#count_subset.plot(kind='barh', stacked=True)
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
plt.show()
#print(count_subset)
Example #41
def decompose(other_args: List[str], ticker: str, stock: pd.DataFrame):
    """Decompose time series as:
        - Additive Time Series = Level + CyclicTrend + Residual + Seasonality
        - Multiplicative Time Series = Level * CyclicTrend * Residual * Seasonality

    Parameters
    ----------
    other_args : List[str]
        Command line arguments to be processed with argparse
    ticker : str
        Ticker of the stock
    stock : pd.DataFrame
        Stock data
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="decompose",
        description="""
            Decompose time series as:
              - Additive Time Series = Level + CyclicTrend + Residual + Seasonality
              - Multiplicative Time Series = Level * CyclicTrend * Residual * Seasonality
        """,
    )
    parser.add_argument(
        "-m",
        "--multiplicative",
        action="store_true",
        default=False,
        dest="multiplicative",
        help="decompose using multiplicative model instead of additive",
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        stock = stock["5. adjusted close"]

        seasonal_periods = 5
        # Hodrick-Prescott filter
        # See Ravn and Uhlig: http://home.uchicago.edu/~huhlig/papers/uhlig.ravn.res.2002.pdf
        lamb = 107360000000

        fig = plt.figure(figsize=plot_autoscale(),
                         dpi=PLOT_DPI,
                         constrained_layout=True)
        spec = gridspec.GridSpec(ncols=4, nrows=5, figure=fig)

        fig.add_subplot(spec[0, :])
        plt.plot(stock)

        plt.title(ticker + " (Time-Series)")

        if ns_parser.multiplicative:
            resultMul = seasonal_decompose(stock,
                                           model="multiplicative",
                                           period=seasonal_periods)
            cycleMul, trendMul = sm.tsa.filters.hpfilter(
                resultMul.trend[resultMul.trend.notna().values], lamb=lamb)

            # Multiplicative model
            fig.add_subplot(spec[1, :4])
            plt.plot(resultMul.trend, lw=2, c="purple")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Multiplicative Cyclic-Trend")

            fig.add_subplot(spec[2, 0:2])
            plt.plot(trendMul, lw=2, c="tab:blue")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Multiplicative Trend component")

            fig.add_subplot(spec[2, 2:])
            plt.plot(cycleMul, lw=2, c="green")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Multiplicative Cycle component")

            fig.add_subplot(spec[3, :])
            plt.plot(resultMul.seasonal, lw=2, c="orange")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Multiplicative Seasonal effect")

            fig.add_subplot(spec[4, :])
            plt.plot(resultMul.resid, lw=2, c="red")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Multiplicative Residuals")

        else:
            resultAdd = seasonal_decompose(stock,
                                           model="additive",
                                           period=seasonal_periods)
            cycleAdd, trendAdd = sm.tsa.filters.hpfilter(
                resultAdd.trend[resultAdd.trend.notna().values], lamb=lamb)

            # Additive model
            fig.add_subplot(spec[1, :4])
            plt.plot(resultAdd.trend, lw=2, c="purple")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Additive Cyclic-Trend")

            fig.add_subplot(spec[2, 0:2])
            plt.plot(trendAdd, lw=2, c="tab:blue")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Additive Trend component")

            fig.add_subplot(spec[2, 2:])
            plt.plot(cycleAdd, lw=2, c="green")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Additive Cycle component")

            fig.add_subplot(spec[3, :])
            plt.plot(resultAdd.seasonal, lw=2, c="orange")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Additive Seasonal effect")

            fig.add_subplot(spec[4, :])
            plt.plot(resultAdd.resid, lw=2, c="red")
            plt.xlim([stock.index[0], stock.index[-1]])
            plt.title("Additive Residuals")

        if gtff.USE_ION:
            plt.ion()

        plt.show()
        print("")

        # From  # https://otexts.com/fpp2/seasonal-strength.html

        print("Time-Series Level is " + str(round(stock.mean(), 2)))

        if ns_parser.multiplicative:
            FtMul = max(
                0,
                1 - np.var(resultMul.resid) /
                np.var(resultMul.trend + resultMul.resid),
            )
            print("Strength of Trend: %.4f" % FtMul)
            FsMul = max(
                0,
                1 - np.var(resultMul.resid) /
                np.var(resultMul.seasonal + resultMul.resid),
            )
            print("Strength of Seasonality: %.4f" % FsMul)

        else:
            FtAdd = max(
                0,
                1 - np.var(resultAdd.resid) /
                np.var(resultAdd.trend + resultAdd.resid),
            )
            print("Strength of Trend: %.4f" % FtAdd)
            FsAdd = max(
                0,
                1 - np.var(resultAdd.resid) /
                np.var(resultAdd.seasonal + resultAdd.resid),
            )
            print("Strength of Seasonality: %.4f" % FsAdd)
        print("")

    except Exception as e:
        print(e, "\n")
        return
Example #42
def test_groupby_transform_with_int():

    # GH 3740, make sure that we might upcast on item-by-item transform

    # floats
    df = DataFrame(
        dict(
            A=[1, 1, 1, 2, 2, 2],
            B=Series(1, dtype="float64"),
            C=Series([1, 2, 3, 1, 2, 3], dtype="float64"),
            D="foo",
        )
    )
    with np.errstate(all="ignore"):
        result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(
        dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64"))
    )
    assert_frame_equal(result, expected)

    # int case
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo"))
    with np.errstate(all="ignore"):
        result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
    expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
    assert_frame_equal(result, expected)

    # int that needs float conversion
    s = Series([2, 3, 4, 10, 5, -1])
    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo"))
    with np.errstate(all="ignore"):
        result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())

    s1 = s.iloc[0:3]
    s1 = (s1 - s1.mean()) / s1.std()
    s2 = s.iloc[3:6]
    s2 = (s2 - s2.mean()) / s2.std()
    expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
    assert_frame_equal(result, expected)

    # int downcasting
    result = df.groupby("A").transform(lambda x: x * 2 / 2)
    expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
    assert_frame_equal(result, expected)
Example #43
def cdf(other_args: List[str], ticker: str, stock: pd.DataFrame,
        start: datetime):
    """Plot cumulative distribution function

    Parameters
    ----------
    other_args : List[str]
        Command line arguments to be processed with argparse
    ticker : str
        Ticker of the stock
    stock : pd.DataFrame
        Stock data
    start : datetime
        Start date of the stock data
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="cdf",
        description="""
            Cumulative distribution function
        """,
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        plt.figure(figsize=plot_autoscale(), dpi=PLOT_DPI)

        stock = stock["5. adjusted close"]

        cdf = stock.value_counts().sort_index().div(len(stock)).cumsum()
        cdf.plot(lw=2)
        plt.title(
            f"Cumulative Distribution Function of {ticker} from {start.strftime('%Y-%m-%d')}"
        )
        plt.ylabel("Probability")
        plt.xlabel("Share Price")
        minVal = stock.values.min()
        q25 = np.quantile(stock.values, 0.25)
        medianVal = np.quantile(stock.values, 0.5)
        q75 = np.quantile(stock.values, 0.75)
        data = [
            (minVal, q25),
            (0.25, 0.25),
            "r",
            (q25, q25),
            (0, 0.25),
            "r",
            (minVal, medianVal),
            (0.5, 0.5),
            "r",
            (medianVal, medianVal),
            (0, 0.5),
            "r",
            (minVal, q75),
            (0.75, 0.75),
            "r",
            (q75, q75),
            (0, 0.75),
            "r",
        ]
        plt.plot(*data, ls="--")
        plt.text(minVal + (q25 - minVal) / 2,
                 0.27,
                 "Q1",
                 color="r",
                 fontweight="bold")
        plt.text(
            minVal + (medianVal - minVal) / 2,
            0.52,
            "Median",
            color="r",
            fontweight="bold",
        )
        plt.text(minVal + (q75 - minVal) / 2,
                 0.77,
                 "Q3",
                 color="r",
                 fontweight="bold")
        plt.xlim(cdf.index[0], cdf.index[-1])
        plt.grid(True)

        if gtff.USE_ION:
            plt.ion()

        plt.show()
        print("")

    except Exception as e:
        print(e, "\n")
        return
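
The plot above is built on an empirical CDF; a standalone sketch of just that computation (synthetic prices, for illustration only):

import numpy as np
import pandas as pd

prices = pd.Series(np.random.lognormal(mean=3.0, sigma=0.1, size=500))
ecdf = prices.value_counts().sort_index().div(len(prices)).cumsum()
# ecdf maps each observed price to P(X <= price); the last value is 1.0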
Example #44
0
    def test_values_with_duplicate_columns(self):
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        assert (result == expected).all().all()
Example #45
0
def print_predictions(predictions: pd.DataFrame) -> None:
    print('Predictions:')
    print('============')
    for sample_id, pred in predictions.to_numpy():
        print('{}: {}'.format(sample_id, pred))
Example #46
0
def rolling(other_args: List[str], ticker: str, stock: pd.DataFrame):
    """Rolling mean and std deviation

    Parameters
    ----------
    other_args : List[str]
        Command line arguments to be processed with argparse
    ticker : str
        Ticker of the stock
    stock : pd.DataFrame
        Stock data
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="rolling",
        description="""
            Rolling mean and std deviation
        """,
    )
    parser.add_argument(
        "-w",
        "--window",
        dest="rolling_window",
        type=check_positive,
        default=100,
        help="rolling window",
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        stock = stock["5. adjusted close"]

        rolling_mean = stock.rolling(ns_parser.rolling_window,
                                     center=True,
                                     min_periods=1).mean()
        rolling_std = stock.rolling(ns_parser.rolling_window,
                                    center=True,
                                    min_periods=1).std()

        fig, axMean = plt.subplots(figsize=plot_autoscale(), dpi=PLOT_DPI)

        axMean.plot(stock.index,
                    stock.values,
                    label=ticker,
                    linewidth=2,
                    color="black")
        axMean.plot(rolling_mean, linestyle="--", linewidth=3, color="blue")
        axMean.set_xlabel("Time")
        axMean.set_ylabel("Share Price", color="blue")
        axMean.legend(["Real values", "Rolling Mean"], loc=2)
        axMean.tick_params(axis="y", labelcolor="blue")
        axStd = axMean.twinx()
        axStd.plot(rolling_std,
                   label="Rolling std",
                   linestyle="--",
                   color="green",
                   linewidth=3)
        axStd.set_ylabel("Std Deviation")
        axStd.legend(["Rolling std"], loc=1)
        axStd.set_ylabel("Share Price standard deviation", color="green")
        axStd.tick_params(axis="y", labelcolor="green")
        axMean.set_title("Rolling mean and std with window " +
                         str(ns_parser.rolling_window) + " applied to " +
                         ticker)
        plt.xlim([stock.index[0], stock.index[-1]])
        plt.grid(b=True, which="major", color="#666666", linestyle="-")
        plt.minorticks_on()
        plt.grid(b=True,
                 which="minor",
                 color="#999999",
                 linestyle="-",
                 alpha=0.2)

        if gtff.USE_ION:
            plt.ion()

        plt.show()
        print("")

    except Exception as e:
        print(e, "\n")
        return
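For reference, a minimal sketch of the centred rolling statistics used above, on toy data (the values are illustrative only):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
# center=True aligns each window on its midpoint; min_periods=1 keeps
# the edges populated instead of producing NaN there
mean = s.rolling(3, center=True, min_periods=1).mean()
std = s.rolling(3, center=True, min_periods=1).std()
print(mean.tolist())  # [1.5, 2.0, 3.0, 4.0, 4.5]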
Example #47
0
        # State pool
        states = ['GA', 'FL', 'fl', 'NY', 'NJ', 'TX']

        # Make a random list of states
        random_states = [states[np.random.randint(low=0, high=len(states))] for i in range(len(rng))]

        Output.extend(zip(random_states, random_status, data, rng))

    return Output


if __name__ == "__main__":
    ### Setting up Data
    np.random.seed(500) # Set seed so we can reproduce results
    dataset = CreateDataSet(4)
    df = DataFrame(data=dataset, columns=['State', 'Status', 'CustomerCount', 'StatusDate'])

    df.info()
    #<class 'pandas.core.frame.DataFrame'>
    #Int64Index: 836 entries, 0 to 835
    #Data columns (total 4 columns):
    #State            836 non-null object
    #Status           836 non-null int64
    #CustomerCount    836 non-null int64
    #StatusDate       836 non-null datetime64[ns]
    #dtypes: datetime64[ns](1), int64(2), object(1)

    print(df.head())

    # How to write the data to an Excel file
    #df.to_excel('Lesson3.xlsx', index=False)
Example #48
0
def store_predictions_as_csv(predictions: pd.DataFrame, file_path: str) -> None:
    print('\nWriting predictions to file "{}".'.format(file_path))
    predictions.to_csv(file_path, index=False)
Example #49
0
def featurize_site(df: pd.DataFrame, site_stats=("mean", "std_dev")) -> pd.DataFrame:
    """ Decorate input `pandas.DataFrame` of structures with site
    features from matminer.

    Currently creates the set of all matminer structure features with
    the `matminer.featurizers.structure.SiteStatsFingerprint`.

    Args:
        df (pandas.DataFrame): the input dataframe with `"structure"`
            column containing `pymatgen.Structure` objects.
        site_stats (Tuple[str]): the matminer site stats to use in the
            `SiteStatsFingerprint` for all features.

    Returns:
        pandas.DataFrame: the decorated DataFrame.

    """

    logging.info("Applying site featurizers...")

    df = df.copy()
    df.columns = ["Input data|" + x for x in df.columns]

    site_fingerprints = (
        AGNIFingerprints(),
        GeneralizedRadialDistributionFunction.from_preset("gaussian"),
        OPSiteFingerprint(),
        CrystalNNFingerprint.from_preset("ops"),
        VoronoiFingerprint(),
        GaussianSymmFunc(),
        ChemEnvSiteFingerprint.from_preset("simple"),
        CoordinationNumber(),
        LocalPropertyDifference(),
        BondOrientationalParameter(),
        AverageBondLength(VoronoiNN()),
        AverageBondAngle(VoronoiNN())
    )

    for fingerprint in site_fingerprints:
        site_stats_fingerprint = SiteStatsFingerprint(
            fingerprint,
            stats=site_stats
        )

        df = site_stats_fingerprint.featurize_dataframe(
            df,
            "Input data|structure",
            multiindex=False,
            ignore_errors=True
        )

        fingerprint_name = fingerprint.__class__.__name__

        # rename some features for backwards compatibility with pretrained models
        if fingerprint_name == "GeneralizedRadialDistributionFunction":
            fingerprint_name = "GeneralizedRDF"
        elif fingerprint_name == "AGNIFingerprints":
            fingerprint_name = "AGNIFingerPrint"
        elif fingerprint_name == "BondOrientationalParameter":
            fingerprint_name = "BondOrientationParameter"
        elif fingerprint_name == "GaussianSymmFunc":
            fingerprint_name = "ChemEnvSiteFingerprint|GaussianSymmFunc"

        if "|" not in fingerprint_name:
            fingerprint_name += "|"

        df.columns = [f"{fingerprint_name}{x}" if "|" not in x else x for x in df.columns]

    df = df.loc[:, (df != 0).any(axis=0)]

    return clean_df(df)
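A hedged usage sketch, assuming pymatgen is installed and that clean_df plus the matminer featurizers are imported as in the rest of this module; the structure below is purely illustrative:

import pandas as pd
from pymatgen.core import Lattice, Structure

# one toy rock-salt-like structure; a real dataset would hold many
structure = Structure(Lattice.cubic(4.2), ["Na", "Cl"],
                      [[0, 0, 0], [0.5, 0.5, 0.5]])
df = pd.DataFrame({"structure": [structure]})
featurized = featurize_site(df)  # columns come back prefixed per fingerprint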
Example #50
0
    def backtest(self, args: Dict) -> DataFrame:
        """
        Implements backtesting functionality

        NOTE: This method is used by Hyperopt at each iteration, so keep it
        optimized; some accessors can be slower than plain functions.
        Avoid logging in this method.

        :param args: a dict containing:
            stake_amount: btc amount to use for each trade
            processed: a processed dictionary with format {pair, data}
            max_open_trades: maximum number of concurrent trades (default: 0, disabled)
            realistic: do we try to simulate realistic trades? (default: False)
        :return: DataFrame
        """
        headers = ['date', 'buy', 'open', 'close', 'sell']
        processed = args['processed']
        max_open_trades = args.get('max_open_trades', 0)
        realistic = args.get('realistic', False)
        trades = []
        trade_count_lock: Dict = {}
        for pair, pair_data in processed.items():
            pair_data['buy'], pair_data[
                'sell'] = 0, 0  # cleanup from previous run

            ticker_data = self.populate_sell_trend(
                self.populate_buy_trend(pair_data))[headers].copy()

            # to avoid using data from the future, we buy/sell on the signal from the previous candle
            ticker_data.loc[:, 'buy'] = ticker_data['buy'].shift(1)
            ticker_data.loc[:, 'sell'] = ticker_data['sell'].shift(1)

            ticker_data.drop(ticker_data.head(1).index, inplace=True)

            # Convert from Pandas to list for performance reasons
            # (Looping Pandas is slow.)
            ticker = list(ticker_data.itertuples())

            lock_pair_until = None
            for index, row in enumerate(ticker):
                if row.buy == 0 or row.sell == 1:
                    continue  # skip rows with no buy signal or that would immediately sell

                if realistic:
                    if lock_pair_until is not None and row.date <= lock_pair_until:
                        continue
                if max_open_trades > 0:
                    # Check if max_open_trades has already been reached for the given date
                    if not trade_count_lock.get(row.date, 0) < max_open_trades:
                        continue

                    trade_count_lock[row.date] = trade_count_lock.get(
                        row.date, 0) + 1

                trade_entry = self._get_sell_trade_entry(
                    pair, row, ticker[index + 1:], trade_count_lock, args)

                if trade_entry:
                    lock_pair_until = trade_entry.close_time
                    trades.append(trade_entry)
                else:
                    # Set lock_pair_until to end of testing period if trade could not be closed
                    # This happens only if the buy-signal was with the last candle
                    lock_pair_until = ticker_data.iloc[-1].date

        return DataFrame.from_records(trades, columns=BacktestResult._fields)
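The shift(1) on the signal columns above is what prevents look-ahead bias; a minimal sketch of the effect on a toy frame (the values are illustrative):

import pandas as pd

ticker_data = pd.DataFrame({'close': [100, 101, 102], 'buy': [1, 0, 0]})
# act on the *previous* candle's signal so the current bar never
# trades on information from its own close
ticker_data['buy'] = ticker_data['buy'].shift(1)
print(ticker_data['buy'].tolist())  # [nan, 1.0, 0.0]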
Example #51
0
from pandas import DataFrame, concat


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	"""
	Frame a time series as a supervised learning dataset.

	Returns:
		Pandas DataFrame of series framed for supervised learning.
	"""
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg


raw = DataFrame()
raw['ob1'] = [x for x in range(10)]
raw['ob2'] = [x for x in range(50, 60)]
values = raw.values
data = series_to_supervised(values, 3, 3)
print(data)
Example #52
0
    def test_partial_slicing_dataframe(self):
        # GH14856
        # Test various combinations of string slicing resolution vs.
        # index resolution
        # - If string resolution is less precise than index resolution,
        # string is considered a slice
        # - If string resolution is equal to or more precise than index
        # resolution, string is considered an exact match
        formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H',
                   '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S']
        resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second']
        for rnum, resolution in enumerate(resolutions[2:], 2):
            # we check only 'day', 'hour', 'minute' and 'second'
            unit = Timedelta("1 " + resolution)
            middate = datetime(2012, 1, 1, 0, 0, 0)
            index = DatetimeIndex([middate - unit,
                                   middate, middate + unit])
            values = [1, 2, 3]
            df = DataFrame({'a': values}, index, dtype=np.int64)
            assert df.index.resolution == resolution

            # Timestamp with the same resolution as index
            # Should be exact match for Series (return scalar)
            # and raise KeyError for Frame
            for timestamp, expected in zip(index, values):
                ts_string = timestamp.strftime(formats[rnum])
                # make ts_string as precise as index
                result = df['a'][ts_string]
                assert isinstance(result, np.int64)
                assert result == expected
                pytest.raises(KeyError, df.__getitem__, ts_string)

            # Timestamp with resolution less precise than index
            for fmt in formats[:rnum]:
                for element, theslice in [[0, slice(None, 1)],
                                          [1, slice(1, None)]]:
                    ts_string = index[element].strftime(fmt)

                    # Series should return slice
                    result = df['a'][ts_string]
                    expected = df['a'][theslice]
                    tm.assert_series_equal(result, expected)

                    # Frame should return slice as well
                    result = df[ts_string]
                    expected = df[theslice]
                    tm.assert_frame_equal(result, expected)

            # Timestamp with resolution more precise than index
            # Compatible with existing key
            # Should return scalar for Series
            # and raise KeyError for Frame
            for fmt in formats[rnum + 1:]:
                ts_string = index[1].strftime(fmt)
                result = df['a'][ts_string]
                assert isinstance(result, np.int64)
                assert result == 2
                pytest.raises(KeyError, df.__getitem__, ts_string)

            # Not compatible with existing key
            # Should raise KeyError
            for fmt, res in list(zip(formats, resolutions))[rnum + 1:]:
                ts = index[1] + Timedelta("1 " + res)
                ts_string = ts.strftime(fmt)
                pytest.raises(KeyError, df['a'].__getitem__, ts_string)
                pytest.raises(KeyError, df.__getitem__, ts_string)
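The resolution rules the test exercises can be reproduced directly; a minimal sketch on a second-resolution index (the timestamps are made up):

import pandas as pd

s = pd.Series([1, 2, 3], index=pd.DatetimeIndex(['2012-01-01 00:00:00',
                                                 '2012-01-01 00:00:01',
                                                 '2012-01-01 00:00:02']))
print(s['2012-01-01 00:00'])     # coarser string -> slice of all three rows
print(s['2012-01-01 00:00:01'])  # equally precise string -> scalar 2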
Example #53
0
def find_and_set_winnings(
    wagers: pd.DataFrame,
    numbers_wagered: pd.DataFrame,
    drawings: pd.DataFrame,
    wagers_table_name: str,
    conn: sqla.engine.Connection,
) -> pd.DataFrame:
    """
    Function to find the prize amount of each item in the
    'wagers' DataFrame.

    @param wagers: DataFrame containing keno wagers data.
    @param numbers_wagered: DataFrame containing numbers_wagered data.
    @param drawings: DataFrame containing keno drawings data.

    @returns wagers: modified 'wagers' DataFrame.

    """
    metadata = sqla.MetaData(bind=conn)
    wagers_table = sqla.Table(wagers_table_name, metadata, autoload=True)

    def calculate_prize(row: pd.Series) -> pd.Series:
        """
        Function applied to each row of the 'wagers' DataFrame,
        normally via DataFrame.apply.

        Principally, this function performs the bit-wise AND'ing of two
        lottery-number bitmasks, allowing fast matching of the numbers
        wagered against the numbers drawn.

        @param row: a row of the 'wagers' DataFrame carrying its
            'numbers_wagered_id' and 'draw_number_id' elements.

        @returns row: the row augmented with the high and low halves of the
            match mask, the number of matched spots, and the prize amount.
        """
        try:
            numbers_wagered_id = row["numbers_wagered_id"]
            draw_number_id = row["draw_number_id"]

            high_bits1 = numbers_wagered.at[numbers_wagered_id, "high_bits"]
            low_bits1 = numbers_wagered.at[numbers_wagered_id, "low_bits"]
            number_played = numbers_wagered.at[numbers_wagered_id, "numbers_played"]

            high_bits2 = drawings.at[draw_number_id, "high_bits"]
            low_bits2 = drawings.at[draw_number_id, "low_bits"]

            match_mask = [low_bits1 & low_bits2, high_bits1 & high_bits2]
            numbers_matched = sum(map(popcount64d, match_mask))

            row["low_match_mask"] = match_mask[0]
            row["high_match_mask"] = match_mask[1]

            row["numbers_matched"] = numbers_matched
            row["prize"] = PRIZE_DICT.get(number_played, {}).get(numbers_matched, 0)

            conn.execute(wagers_table.insert(), **row)

        except Exception as e:
            print(e)

        return row

    return wagers.assign(
        low_match_mask=0, high_match_mask=0, numbers_matched=0, prize=0
    ).apply(calculate_prize, axis=1)
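The AND-and-popcount matching inside calculate_prize can be shown in isolation; here bin(x).count('1') stands in for the popcount64d helper referenced above:

# bitmask encoding: bit i set means number i was wagered/drawn
wagered = 0b10110  # numbers 1, 2 and 4
drawn = 0b00110    # numbers 1 and 2

matches = wagered & drawn                  # bits set in both masks
numbers_matched = bin(matches).count('1')  # population count of the match
print(numbers_matched)  # 2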
Example #54
0
def get_features_relevance_redundancy(
    target_nmi: pd.DataFrame,
    cross_nmi: pd.DataFrame,
    n_feat: Optional[int] = None,
    rr_parameters: Optional[Dict[str, Union[float, Callable[[int], float]]]] = None,
    return_pc: bool = False
) -> List:
    """
    Select features from the Relevance Redundancy (RR) score between the input
    features and the target output.

    The RR is defined following Equation 2 of De Breuck et al., arXiv:2004.14766,
    with default values,

    .. math:: p = \\max(0.1, 4.5 - n^{0.4}),

    and

    .. math:: c = 10^{-6} n^3,

    where :math:`n` is the number of features in the "chosen" subset for that iteration.
    These values can be overridden with the `rr_parameters` dictionary argument.

    Args:
        target_nmi (pandas.DataFrame): dataframe containing the Normalized
            Mutual Information (NMI) between a list of input features and a
            target variable, as computed from :py:func:`nmi_target`.
        cross_nmi (pandas.DataFrame): dataframe containing the NMI between the
            input features, as computed from :py:func:`get_cross_nmi`.
        n_feat (int): Number of features for which the RR score needs to be computed (default: all features).
        rr_parameters (dict): Allows tuning of p and c parameters. Currently
            allows fixing of p and c to constant values instead of using the
            dynamical evaluation. Expects to find keys `"p"` and `"c"`, containing
            either a callable that takes `n` as an argument and returns the
            desired `p` or `c`, or another dictionary with `"function"` set to
            `"constant"` and a key `"value"` storing the constant to use.
        return_pc: Whether to return p and c values in the output dictionaries.

    Returns:
        list: List of dictionaries containing the results of the relevance-redundancy selection algorithm.

    """
    # Initial checks
    if set(cross_nmi.index) != set(cross_nmi.columns):
        raise ValueError('The cross_nmi DataFrame should have its indices and columns identical.')
    if not set(target_nmi.index).issubset(set(cross_nmi.index)):
        raise ValueError('The indices of the target DataFrame should be included in the cross_nmi DataFrame indices.')

    # Define the functions for the parameters
    if rr_parameters is None:
        get_p = get_rr_p_parameter_default
        get_c = get_rr_c_parameter_default
    else:
        if 'p' not in rr_parameters or 'c' not in rr_parameters:
            raise ValueError('When tuning p and c with rr_parameters in get_features_relevance_redundancy, '
                             'both parameters should be tuned')
        # Set up p
        if callable(rr_parameters["p"]):
            get_p = rr_parameters["p"]
        elif rr_parameters['p'].get('function') == 'constant':
            def get_p(_):
                return rr_parameters['p']['value']
        else:
            raise ValueError(
                'If not passing a callable, "p" dict must contain keys "function" and "value".'
            )
        # Set up c
        if callable(rr_parameters["c"]):
            get_c = rr_parameters["c"]
        elif rr_parameters['c'].get('function') == 'constant':
            def get_c(_):
                return rr_parameters['c']['value']
        else:
            raise ValueError(
                'If not passing a callable, "c" dict must contain keys "function" and "value".'
            )

    # Set up the output list
    out = []

    # The first feature is the one with the largest target NMI
    target_column = target_nmi.columns[0]
    first_feature = target_nmi.nlargest(1, columns=target_column).index[0]
    feature_set = [first_feature]
    feat_out = {'feature': first_feature, 'RR_score': None, 'NMI_target': target_nmi[target_column][first_feature]}
    if return_pc:
        feat_out['RR_p'] = None
        feat_out['RR_c'] = None
    out.append(feat_out)

    # Default is to get the RR score for all features
    if n_feat is None:
        n_feat = len(target_nmi.index)

    missing = [x for x in cross_nmi.index if x not in target_nmi.index]
    cross_nmi = cross_nmi.drop(missing, axis=0).drop(missing, axis=1)
    # Loop on the number of features
    for n in range(1, n_feat):
        logging.debug("In selection of feature {}/{} features...".format(n+1, n_feat))
        if (n+1) % 50 == 0:
            logging.info("Selected {}/{} features...".format(n, n_feat))
        p = get_p(n)
        c = get_c(n)

        # Compute the RR score
        score = cross_nmi.copy()
        # Remove features already selected for the index
        score = score.drop(feature_set, axis=0)
        # Use features already selected to compute the maximum NMI between
        # the remaining features and those already selected
        score = score[feature_set]

        # Get the scores of the remaining features
        for i in score.index:
            row = score.loc[i, :]
            score.loc[i, :] = target_nmi.loc[i, target_column] / (row ** p + c)

        # Get the next feature (the one with the highest score)
        scores_remaining_features = score.min(axis=1)
        next_feature = scores_remaining_features.idxmax(axis=0)
        feature_set.append(next_feature)

        # Add the results for the next feature to the list
        feat_out = {'feature': next_feature, 'RR_score': scores_remaining_features[next_feature],
                    'NMI_target': target_nmi[target_column][next_feature]}
        if return_pc:
            feat_out['RR_p'] = p
            feat_out['RR_c'] = c

        out.append(feat_out)

    return out
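A toy illustration of one RR step with made-up NMI values: feature 'c' is less relevant than 'b' but far less redundant with the already-selected feature, so it wins (p and c are fixed constants here, not the dynamic defaults):

import pandas as pd

target_nmi = pd.Series({'b': 0.8, 'c': 0.7})  # relevance to the target
cross_nmi = pd.Series({'b': 0.9, 'c': 0.1})   # redundancy with the selected feature
p, c = 1.0, 1e-6                              # illustrative constants

rr_score = target_nmi / (cross_nmi ** p + c)
print(rr_score.idxmax())  # 'c'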
Example #55
0
    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '0.0,0,2',
                         '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # now with an index containing only NaNs
        df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '_,0,2',
                         '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # check that the na_rep parameter does not break anything when there are no NaNs
        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c',
                         '0,0,2',
                         '0,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
Example #56
0
def feature_pre_processor(path):
    prepare_feature_groundtruth = partial(prepare_feature, f'{path}/labels.csv')
    ftrs = return_from_path(prepare_feature_groundtruth,
                            f'{path}/img',
                            '.jpg')
    return DataFrame(ftrs)
Example #57
0
    def test_infer_output_shape_listlike_columns(self):
        # GH 16353

        df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C'])

        result = df.apply(lambda x: [1, 2, 3], axis=1)
        expected = Series([[1, 2, 3] for t in df.itertuples()])
        assert_series_equal(result, expected)

        result = df.apply(lambda x: [1, 2], axis=1)
        expected = Series([[1, 2] for t in df.itertuples()])
        assert_series_equal(result, expected)

        # GH 17970
        df = DataFrame({"a": [1, 2, 3]}, index=list('abc'))

        result = df.apply(lambda row: np.ones(1), axis=1)
        expected = Series([np.ones(1) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        result = df.apply(lambda row: np.ones(2), axis=1)
        expected = Series([np.ones(2) for t in df.itertuples()],
                          index=df.index)
        assert_series_equal(result, expected)

        # GH 17892
        df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'),
                                 pd.Timestamp('2010-02-04'),
                                 pd.Timestamp('2010-02-05'),
                                 pd.Timestamp('2010-02-06')],
                           'b': [9, 5, 4, 3],
                           'c': [5, 3, 4, 2],
                           'd': [1, 2, 3, 4]})

        def fun(x):
            return (1, 2)

        result = df.apply(fun, axis=1)
        expected = Series([(1, 2) for t in df.itertuples()])
        assert_series_equal(result, expected)
Example #58
0
    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',1',
                    ',2',
                    '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
                       index=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',,1', ',,2', '1,2,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

        exp_rows = [',foo', ',bar', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['foo', 'bar', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp
Example #59
0
    def test_applymap(self, float_frame):
        applied = float_frame.applymap(lambda x: x * 2)
        tm.assert_frame_equal(applied, float_frame * 2)
        float_frame.applymap(type)

        # GH 465: function returning tuples
        result = float_frame.applymap(lambda x: (x, x))
        assert isinstance(result['A'][0], tuple)

        # GH 2909: object conversion to float in constructor?
        df = DataFrame(data=[1, 'a'])
        result = df.applymap(lambda x: x)
        assert result.dtypes[0] == object

        df = DataFrame(data=[1., 'a'])
        result = df.applymap(lambda x: x)
        assert result.dtypes[0] == object

        # GH 2786
        df = DataFrame(np.random.random((3, 4)))
        df2 = df.copy()
        cols = ['a', 'a', 'a', 'a']
        df.columns = cols

        expected = df2.applymap(str)
        expected.columns = cols
        result = df.applymap(str)
        tm.assert_frame_equal(result, expected)

        # datetime/timedelta
        df['datetime'] = Timestamp('20130101')
        df['timedelta'] = pd.Timedelta('1 min')
        result = df.applymap(str)
        for f in ['datetime', 'timedelta']:
            assert result.loc[0, f] == str(df.loc[0, f])

        # GH 8222
        empty_frames = [pd.DataFrame(),
                        pd.DataFrame(columns=list('ABC')),
                        pd.DataFrame(index=list('ABC')),
                        pd.DataFrame({'A': [], 'B': [], 'C': []})]
        for frame in empty_frames:
            for func in [round, lambda x: x]:
                result = frame.applymap(func)
                tm.assert_frame_equal(result, frame)
Example #60
0
class TestDataFrameAggregate:

    def test_agg_transform(self, axis, float_frame):
        other_axis = 1 if axis in {0, 'index'} else 0

        with np.errstate(all='ignore'):

            f_abs = np.abs(float_frame)
            f_sqrt = np.sqrt(float_frame)

            # ufunc
            result = float_frame.transform(np.sqrt, axis=axis)
            expected = f_sqrt.copy()
            assert_frame_equal(result, expected)

            result = float_frame.apply(np.sqrt, axis=axis)
            assert_frame_equal(result, expected)

            result = float_frame.transform(np.sqrt, axis=axis)
            assert_frame_equal(result, expected)

            # list-like
            result = float_frame.apply([np.sqrt], axis=axis)
            expected = f_sqrt.copy()
            if axis in {0, 'index'}:
                expected.columns = pd.MultiIndex.from_product(
                    [float_frame.columns, ['sqrt']])
            else:
                expected.index = pd.MultiIndex.from_product(
                    [float_frame.index, ['sqrt']])
            assert_frame_equal(result, expected)

            result = float_frame.transform([np.sqrt], axis=axis)
            assert_frame_equal(result, expected)

            # multiple items in list
            # these are in the order as if we are applying both
            # functions per series and then concatting
            result = float_frame.apply([np.abs, np.sqrt], axis=axis)
            expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
            if axis in {0, 'index'}:
                expected.columns = pd.MultiIndex.from_product(
                    [float_frame.columns, ['absolute', 'sqrt']])
            else:
                expected.index = pd.MultiIndex.from_product(
                    [float_frame.index, ['absolute', 'sqrt']])
            assert_frame_equal(result, expected)

            result = float_frame.transform([np.abs, 'sqrt'], axis=axis)
            assert_frame_equal(result, expected)

    def test_transform_and_agg_err(self, axis, float_frame):
        # cannot both transform and agg
        with pytest.raises(ValueError):
            float_frame.transform(['max', 'min'], axis=axis)

        with pytest.raises(ValueError):
            with np.errstate(all='ignore'):
                float_frame.agg(['max', 'sqrt'], axis=axis)

        with pytest.raises(ValueError):
            with np.errstate(all='ignore'):
                float_frame.transform(['max', 'sqrt'], axis=axis)

        df = pd.DataFrame({'A': range(5), 'B': 5})

        with pytest.raises(ValueError):
            with np.errstate(all='ignore'):
                df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']}, axis=axis)

    @pytest.mark.parametrize('method', [
        'abs', 'shift', 'pct_change', 'cumsum', 'rank',
    ])
    def test_transform_method_name(self, method):
        # GH 19760
        df = pd.DataFrame({"A": [-1, 2]})
        result = df.transform(method)
        expected = operator.methodcaller(method)(df)
        tm.assert_frame_equal(result, expected)

    def test_demo(self):
        # demonstration tests
        df = pd.DataFrame({'A': range(5), 'B': 5})

        result = df.agg(['min', 'max'])
        expected = DataFrame({'A': [0, 4], 'B': [5, 5]},
                             columns=['A', 'B'],
                             index=['min', 'max'])
        tm.assert_frame_equal(result, expected)

        result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']})
        expected = DataFrame({'A': [4.0, 0.0, np.nan],
                              'B': [5.0, np.nan, 25.0]},
                             columns=['A', 'B'],
                             index=['max', 'min', 'sum'])
        tm.assert_frame_equal(result.reindex_like(expected), expected)

    def test_agg_multiple_mixed_no_warning(self):
        # GH 20909
        mdf = pd.DataFrame({'A': [1, 2, 3],
                            'B': [1., 2., 3.],
                            'C': ['foo', 'bar', 'baz'],
                            'D': pd.date_range('20130101', periods=3)})
        expected = pd.DataFrame({"A": [1, 6], 'B': [1.0, 6.0],
                                 "C": ['bar', 'foobarbaz'],
                                 "D": [pd.Timestamp('2013-01-01'), pd.NaT]},
                                index=['min', 'sum'])
        # sorted index
        with tm.assert_produces_warning(None):
            result = mdf.agg(['min', 'sum'])

        tm.assert_frame_equal(result, expected)

        with tm.assert_produces_warning(None):
            result = mdf[['D', 'C', 'B', 'A']].agg(['sum', 'min'])

        # For backwards compatibility, the result's index is
        # still sorted by function name, so it's ['min', 'sum']
        # not ['sum', 'min'].
        expected = expected[['D', 'C', 'B', 'A']]
        tm.assert_frame_equal(result, expected)

    def test_agg_dict_nested_renaming_depr(self):

        df = pd.DataFrame({'A': range(5), 'B': 5})

        # nested renaming
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            df.agg({'A': {'foo': 'min'},
                    'B': {'bar': 'max'}})

    def test_agg_reduce(self, axis, float_frame):
        other_axis = 1 if axis in {0, 'index'} else 0
        name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()

        # all reducers
        expected = pd.concat([float_frame.mean(axis=axis),
                              float_frame.max(axis=axis),
                              float_frame.sum(axis=axis),
                              ], axis=1)
        expected.columns = ['mean', 'max', 'sum']
        expected = expected.T if axis in {0, 'index'} else expected

        result = float_frame.agg(['mean', 'max', 'sum'], axis=axis)
        assert_frame_equal(result, expected)

        # dict input with scalars
        func = OrderedDict([(name1, 'mean'), (name2, 'sum')])
        result = float_frame.agg(func, axis=axis)
        expected = Series([float_frame.loc(other_axis)[name1].mean(),
                           float_frame.loc(other_axis)[name2].sum()],
                          index=[name1, name2])
        assert_series_equal(result, expected)

        # dict input with lists
        func = OrderedDict([(name1, ['mean']), (name2, ['sum'])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame({
            name1: Series([float_frame.loc(other_axis)[name1].mean()],
                          index=['mean']),
            name2: Series([float_frame.loc(other_axis)[name2].sum()],
                          index=['sum'])})
        expected = expected.T if axis in {1, 'columns'} else expected
        assert_frame_equal(result, expected)

        # dict input with lists with multiple
        func = OrderedDict([(name1, ['mean', 'sum']), (name2, ['sum', 'max'])])
        result = float_frame.agg(func, axis=axis)
        expected = DataFrame(OrderedDict([
            (name1, Series([float_frame.loc(other_axis)[name1].mean(),
                           float_frame.loc(other_axis)[name1].sum()],
                           index=['mean', 'sum'])),
            (name2, Series([float_frame.loc(other_axis)[name2].sum(),
                           float_frame.loc(other_axis)[name2].max()],
                           index=['sum', 'max'])),
        ]))
        expected = expected.T if axis in {1, 'columns'} else expected
        assert_frame_equal(result, expected)

    def test_nuiscance_columns(self):

        # GH 15015
        df = DataFrame({'A': [1, 2, 3],
                        'B': [1., 2., 3.],
                        'C': ['foo', 'bar', 'baz'],
                        'D': pd.date_range('20130101', periods=3)})

        result = df.agg('min')
        expected = Series([1, 1., 'bar', pd.Timestamp('20130101')],
                          index=df.columns)
        assert_series_equal(result, expected)

        result = df.agg(['min'])
        expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]],
                             index=['min'], columns=df.columns)
        assert_frame_equal(result, expected)

        result = df.agg('sum')
        expected = Series([6, 6., 'foobarbaz'],
                          index=['A', 'B', 'C'])
        assert_series_equal(result, expected)

        result = df.agg(['sum'])
        expected = DataFrame([[6, 6., 'foobarbaz']],
                             index=['sum'], columns=['A', 'B', 'C'])
        assert_frame_equal(result, expected)

    def test_non_callable_aggregates(self):

        # GH 16405
        # 'size' is a property of frame/series
        # validate that this is working
        df = DataFrame({'A': [None, 2, 3],
                        'B': [1.0, np.nan, 3.0],
                        'C': ['foo', None, 'bar']})

        # Function aggregate
        result = df.agg({'A': 'count'})
        expected = Series({'A': 2})

        assert_series_equal(result, expected)

        # Non-function aggregate
        result = df.agg({'A': 'size'})
        expected = Series({'A': 3})

        assert_series_equal(result, expected)

        # Mix function and non-function aggs
        result1 = df.agg(['count', 'size'])
        result2 = df.agg({'A': ['count', 'size'],
                          'B': ['count', 'size'],
                          'C': ['count', 'size']})
        expected = pd.DataFrame({'A': {'count': 2, 'size': 3},
                                 'B': {'count': 2, 'size': 3},
                                 'C': {'count': 2, 'size': 3}})

        assert_frame_equal(result1, result2, check_like=True)
        assert_frame_equal(result2, expected, check_like=True)

        # Just functional string arg is same as calling df.arg()
        result = df.agg('count')
        expected = df.count()

        assert_series_equal(result, expected)

        # Just a string attribute arg same as calling df.arg
        result = df.agg('size')
        expected = df.size

        assert result == expected

    @pytest.mark.parametrize("df, func, expected", chain(
        _get_cython_table_params(
            DataFrame(), [
                ('sum', Series()),
                ('max', Series()),
                ('min', Series()),
                ('all', Series(dtype=bool)),
                ('any', Series(dtype=bool)),
                ('mean', Series()),
                ('prod', Series()),
                ('std', Series()),
                ('var', Series()),
                ('median', Series()),
            ]),
        _get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]), [
                ('sum', Series([1., 3])),
                ('max', Series([1., 2])),
                ('min', Series([1., 1])),
                ('all', Series([True, True])),
                ('any', Series([True, True])),
                ('mean', Series([1, 1.5])),
                ('prod', Series([1., 2])),
                ('std', Series([np.nan, 0.707107])),
                ('var', Series([np.nan, 0.5])),
                ('median', Series([1, 1.5])),
            ]),
    ))
    def test_agg_cython_table(self, df, func, expected, axis):
        # GH 21224
        # test reducing functions in
        # pandas.core.base.SelectionMixin._cython_table
        result = df.agg(func, axis=axis)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("df, func, expected", chain(
        _get_cython_table_params(
            DataFrame(), [
                ('cumprod', DataFrame()),
                ('cumsum', DataFrame()),
            ]),
        _get_cython_table_params(
            DataFrame([[np.nan, 1], [1, 2]]), [
                ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])),
                ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])),
            ]),
    ))
    def test_agg_cython_table_transform(self, df, func, expected, axis):
        # GH 21224
        # test transforming functions in
        # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
        result = df.agg(func, axis=axis)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("df, func, expected", _get_cython_table_params(
        DataFrame([['a', 'b'], ['b', 'a']]), [
            ['cumprod', TypeError],
        ]),
    )
    def test_agg_cython_table_raises(self, df, func, expected, axis):
        # GH 21224
        with pytest.raises(expected):
            df.agg(func, axis=axis)

    @pytest.mark.parametrize("num_cols", [2, 3, 5])
    def test_frequency_is_original(self, num_cols):
        # GH 22150
        index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
        original = index.copy()
        df = DataFrame(1, index=index, columns=range(num_cols))
        df.apply(lambda x: x)
        assert index.freq == original.freq