Example #1
    def test_subset(self):
        N = 10
        rng = date_range('1/1/1990', periods=N, freq='53s')
        df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
                       index=rng)
        df.loc[df.index[4:8], 'A'] = np.nan
        dates = date_range('1/1/1990', periods=N * 3,
                           freq='25s')

        # with a subset of A should be the same
        result = df.asof(dates, subset='A')
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # same with A/B
        result = df.asof(dates, subset=['A', 'B'])
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # with subset='B' (no NaNs in B) every row is valid, so asof matches a plain ffill
        result = df.asof(dates, subset='B')
        expected = df.resample('25s', closed='right').ffill().reindex(dates)
        expected.iloc[20:] = 9

        tm.assert_frame_equal(result, expected)
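The subset argument only controls which columns must be non-NaN for a row to count as the "last valid" one; the returned frame still contains every column. A minimal, self-contained sketch of that behaviour (the data below is made up for illustration):

import numpy as np
import pandas as pd

idx = pd.date_range('2021-01-01', periods=5, freq='1min')
df = pd.DataFrame({'A': [0.0, np.nan, np.nan, 3.0, 4.0],
                   'B': np.arange(5.0)}, index=idx)
where = idx + pd.Timedelta(seconds=30)

# Rows where A is NaN are skipped when subset='A', so the last valid
# row is carried forward for both columns.
print(df.asof(where, subset='A'))
# With subset='B' (no NaNs in B) every preceding row counts as valid.
print(df.asof(where, subset='B'))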
Example #2
    def test_subset(self):

        N = 10
        rng = date_range('1/1/1990', periods=N, freq='53s')
        df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
                       index=rng)
        df.loc[df.index[4:8], 'A'] = np.nan
        dates = date_range('1/1/1990', periods=N * 3,
                           freq='25s')

        # with a subset of A should be the same
        result = df.asof(dates, subset='A')
        expected = df.asof(dates)
        assert_frame_equal(result, expected)

        # same with A/B
        result = df.asof(dates, subset=['A', 'B'])
        expected = df.asof(dates)
        assert_frame_equal(result, expected)

        # with subset='B' (no NaNs in B) every row is valid, so asof matches a plain ffill
        result = df.asof(dates, subset='B')
        expected = df.resample('25s', closed='right').ffill().reindex(dates)
        expected.iloc[20:] = 9

        assert_frame_equal(result, expected)
Example #3
    def test_asof_periodindex_mismatched_freq(self):
        N = 50
        rng = period_range("1/1/1990", periods=N, freq="H")
        df = DataFrame(np.random.randn(N), index=rng)

        # Mismatched freq
        msg = "Input has different freq"
        with pytest.raises(IncompatibleFrequency, match=msg):
            df.asof(rng.asfreq("D"))
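When the frame is indexed by a PeriodIndex, the where argument has to use the same frequency, otherwise IncompatibleFrequency is raised as above. A minimal sketch of the matching-frequency case (data made up for illustration):

import numpy as np
import pandas as pd

rng = pd.period_range('1/1/1990', periods=5, freq='H')
df = pd.DataFrame(np.random.randn(5), index=rng)

# Same-frequency Period queries are fine; passing rng.asfreq('D') here
# would raise IncompatibleFrequency instead.
print(df.asof(rng[2:]))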
Example #4
    def test_missing(self):
        # GH 15118
        # no match found - `where` value before earliest date in index
        N = 10
        rng = date_range('1/1/1990', periods=N, freq='53s')
        df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng)
        result = df.asof('1989-12-31')

        expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'),
                          dtype='float64')
        tm.assert_series_equal(result, expected)

        result = df.asof(to_datetime(['1989-12-31']))
        expected = DataFrame(index=to_datetime(['1989-12-31']),
                             columns=['A', 'B'],
                             dtype='float64')
        tm.assert_frame_equal(result, expected)
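The two asserts above hinge on asof's return shape: a scalar where gives a Series indexed by the columns and named after the timestamp, while a list-like where gives a DataFrame indexed by where; with where before the first index entry, every value is NaN. A small sketch (data made up for illustration):

import numpy as np
import pandas as pd

idx = pd.date_range('1/1/1990', periods=3, freq='53s')
df = pd.DataFrame({'A': np.arange(3), 'B': np.arange(3)}, index=idx)

print(df.asof('1989-12-31'))                    # all-NaN Series named by the timestamp
print(df.asof(pd.to_datetime(['1989-12-31'])))  # single all-NaN row, columns A and B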
Example #5
    def test_missing(self):
        # GH 15118
        # no match found - `where` value before earliest date in index
        N = 10
        rng = date_range('1/1/1990', periods=N, freq='53s')
        df = DataFrame({'A': np.arange(N), 'B': np.arange(N)},
                       index=rng)
        result = df.asof('1989-12-31')

        expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31'),
                          dtype='float64')
        tm.assert_series_equal(result, expected)

        result = df.asof(to_datetime(['1989-12-31']))
        expected = DataFrame(index=to_datetime(['1989-12-31']),
                             columns=['A', 'B'], dtype='float64')
        tm.assert_frame_equal(result, expected)
Example #6
    def test_time_zone_aware_index(self, stamp, expected):
        # GH21194
        # Test that DataFrame.asof handles a timezone-aware index when the
        # queried timestamp is given in UTC or another timezone
        df = DataFrame(data=[1, 2],
                       index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
                              Timestamp('2018-01-01 22:35:10.550+00:00')])
        result = df.asof(stamp)
        tm.assert_series_equal(result, expected)
Example #7
    def test_time_zone_aware_index(self, stamp, expected):
        # GH21194
        # Test that DataFrame.asof handles a timezone-aware index when the
        # queried timestamp is given in UTC or another timezone
        df = DataFrame(data=[1, 2],
                       index=[
                           Timestamp('2018-01-01 21:00:05.001+00:00'),
                           Timestamp('2018-01-01 22:35:10.550+00:00')
                       ])
        result = df.asof(stamp)
        tm.assert_series_equal(result, expected)
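The stamp/expected pair comes from a parametrization that is not shown in these snippets; a self-contained sketch of the same behaviour, with a UTC timestamp chosen purely for illustration:

from pandas import DataFrame, Timestamp

df = DataFrame(data=[1, 2],
               index=[Timestamp('2018-01-01 21:00:05.001+00:00'),
                      Timestamp('2018-01-01 22:35:10.550+00:00')])

# A tz-aware `where` is compared against the tz-aware index; the last
# row at or before 23:00 UTC is the second one (value 2).
print(df.asof(Timestamp('2018-01-01 23:00:00+00:00')))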
Example #8
from typing import List, Union

import pandas as pd


def _concat_executions(market_data: pd.DataFrame,
                       executions: Union[List, pd.DataFrame]):
    # Normalize the executions input to a DataFrame indexed by 'datetime'.
    if isinstance(executions, List):
        executions_df = pd.DataFrame(executions).set_index('datetime')
    elif isinstance(executions, pd.DataFrame):
        executions_df = executions.set_index('datetime')
    else:
        raise Exception(
            f'executions only supports these types: '
            f'{_concat_executions.__annotations__["executions"]}'
        )

    executions_df.index = pd.to_datetime(executions_df.index)
    executions_df = executions_df.sort_index()
    market_data = market_data.sort_index()
    # Re-key each execution to the last market-data bar at or before it,
    # using that bar's 'datetime' column value as the new index.
    executions_df.index = market_data.asof(executions_df.index)['datetime']
    # Collect all executions mapped to the same bar into a list of dicts.
    executions_df_grouped = executions_df.groupby('datetime').apply(
        lambda df: df.to_dict('records'))
    executions_df_grouped.name = 'trades'
    # Left-join the grouped executions onto the market data as a 'trades' column.
    market_data = market_data.merge(executions_df_grouped,
                                    'left',
                                    left_index=True,
                                    right_index=True)
    return market_data
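A hypothetical call (column names and values invented) showing the inputs the function expects: market_data indexed by time and also carrying a 'datetime' column, and executions given as a list of dicts keyed by 'datetime':

import pandas as pd

market_data = pd.DataFrame(
    {'datetime': pd.to_datetime(['2021-01-04 09:30:00',
                                 '2021-01-04 09:31:00']),
     'close': [100.0, 101.0]}).set_index('datetime', drop=False)

executions = [
    {'datetime': '2021-01-04 09:30:30', 'price': 100.5, 'volume': 1},
    {'datetime': '2021-01-04 09:31:15', 'price': 101.2, 'volume': 2},
]

# Each execution is attached to the last market-data bar at or before it.
merged = _concat_executions(market_data, executions)
print(merged['trades'])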
Example #9
class AsOfDataFrame(object):
    goal_time = 0.2

    def setup(self):
        self.N = 10000
        self.M = 100
        self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s')
        self.dates = date_range(start='1/1/1990',
                                periods=(self.N * 10),
                                freq='5s')
        self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng)
        self.ts2 = self.ts.copy()
        self.ts2.iloc[250:5000] = np.nan
        self.ts3 = self.ts.copy()
        self.ts3.iloc[-5000:] = np.nan

    # test speed of pre-computing NAs.
    def time_asof(self):
        self.ts.asof(self.dates)

    # should be roughly the same as above.
    def time_asof_nan(self):
        self.ts2.asof(self.dates)

    # test speed of the code path for a scalar index
    # with pre-computing all NAs.
    def time_asof_single(self):
        self.ts.asof(self.dates[0])

    # should be roughly the same as above.
    def time_asof_nan_single(self):
        self.ts3.asof(self.dates[-1])

    # test speed of the code path for a scalar index
    # before the start. should be without the cost of
    # pre-computing all the NAs.
    def time_asof_single_early(self):
        self.ts.asof(self.dates[0] - dt.timedelta(10))
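These timing methods look like asv-style benchmarks; assuming the numpy/pandas names the class references (plus datetime as dt) are imported, they can also be timed ad hoc:

import timeit

bench = AsOfDataFrame()
bench.setup()

# Time the vectorized asof lookup over the 100,000 query timestamps.
print(timeit.timeit(bench.time_asof, number=10))
print(timeit.timeit(bench.time_asof_nan, number=10))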
Example #10
def split_into_examples(df: pd.DataFrame, label: str, examples: [np.ndarray],
                        labels_of_examples: [str], time_series_length,
                        interval_in_seconds, config,
                        failure_times_of_examples: [str], failure_time,
                        window_times_of_examples: [str], y, i_dataset):
    thread_list = []

    # sample time_series_length values from each interval if its length is close to the configured value
    if not config.use_over_lapping_windows:

        # split case into single intervals with the configured length
        interval_list = [
            g for c, g in df.groupby(
                pd.Grouper(level='timestamp',
                           freq=str(interval_in_seconds) + 's'))
        ]

        for g in interval_list:
            g_len = (g.index[-1] - g.index[0]).total_seconds()

            # keep only intervals whose length is within 0.5 s of the configured value
            if interval_in_seconds - 0.5 <= g_len <= interval_in_seconds + 0.5:
                t = DFConverter(g, time_series_length, False)
                thread_list.append(t)
    else:
        # print("df.index[0]: ", df.index[0], "df.index[-1]: ", df.index[-1])
        start_time = df.index[0]
        end_time = df.index[-1]
        # slide over data frame and extract windows until the window would exceed the last time step
        while start_time + pd.to_timedelta(
                config.over_lapping_window_interval_in_seconds,
                unit='s') < end_time:
            # generate a list with indexes for window
            index = pd.date_range(start_time,
                                  periods=config.time_series_length,
                                  freq=config.resample_frequency)
            # print("from: ", index[0], "to: ", index[-1])

            # with use_over_lapping_windows this does no more than convert the
            # relevant part of the df into a numpy array; the converter thread
            # is reused so no further special handling is needed downstream
            t = DFConverter(df.asof(index), time_series_length, True)
            thread_list.append(t)

            # update next start time for next window
            start_time = start_time + pd.to_timedelta(
                config.over_lapping_window_interval_in_seconds, unit='s')

    # sampling is done multi-threaded with at most the configured number of cores
    thread_limit = min(len(thread_list), config.max_parallel_cores)
    threads_finished = 0

    while threads_finished < len(thread_list):
        if threads_finished + thread_limit > len(thread_list):
            thread_limit = len(thread_list) - threads_finished

        r = threads_finished + thread_limit
        for i in range(threads_finished, r):
            thread_list[i].start()

        for i in range(threads_finished, r):
            thread_list[i].join()

        for i in range(threads_finished, r):
            examples.append(thread_list[i].result)
            labels_of_examples.append(label)

            if failure_time == "":
                failure_times_of_examples.append("noFailure-" +
                                                 str(i_dataset) + "-" + str(y))
            else:
                failure_times_of_examples.append(str(failure_time))

            window_times_of_examples.append(thread_list[i].windowTimesAsString)

        threads_finished += thread_limit
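The overlapping-window branch above relies on DataFrame.asof to materialize each window on a regular time grid; a minimal illustration of that step in isolation (values and frequencies invented, no threading):

import numpy as np
import pandas as pd

ts = pd.date_range('2021-01-01', periods=20, freq='250ms')
df = pd.DataFrame({'sensor': np.arange(20.0)}, index=ts)

# One fixed-length window starting at the first timestamp; asof fills
# each grid slot with the last observation at or before it.
window_index = pd.date_range(df.index[0], periods=8, freq='500ms')
print(df.asof(window_index).to_numpy())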