Example #1
0
    def test_first_last_valid(self):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = nan
        mat[-5:] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        assert index == frame.index[5]

        index = frame.last_valid_index()
        assert index == frame.index[-6]

        # GH12800
        empty = DataFrame()
        assert empty.last_valid_index() is None
        assert empty.first_valid_index() is None

        # GH17400: no valid entries
        frame[:] = nan
        assert frame.last_valid_index() is None
        assert frame.first_valid_index() is None

        # GH20499: its preserves freq with holes
        frame.index = date_range("20110101", periods=N, freq="B")
        frame.iloc[1] = 1
        frame.iloc[-2] = 1
        assert frame.first_valid_index() == frame.index[1]
        assert frame.last_valid_index() == frame.index[-2]
        assert frame.first_valid_index().freq == frame.index.freq
        assert frame.last_valid_index().freq == frame.index.freq
Example #2
0
def _keep_name_new_row(df: pd.DataFrame) -> pd.DataFrame:
    """Function to insert row in the dataframe"""
    empty_row = pd.DataFrame(
        {
            'tid': '',
            'pid': '',
            'category': '',
            'best_round': '',
            'name': df.loc[df.first_valid_index(), 'name']
        },
        index=[-1])
    df.loc[df.first_valid_index(), 'name'] = ""

    return pd.concat([empty_row, df])
Example #3
0
 def _write_ticker_by_day(self, df: DataFrame, data_type):
     """
     Compose file name from ticker and
     :param df: Dataframe, all rows have the same day and ticker in index
     :param data_type: tag to use in file name
     """
     # Form file name
     date = df.first_valid_index()[0].date()
     ticker = df.first_valid_index()[1].replace('/', '_')
     file_name = '%s_%s_%s.csv' % (ticker, data_type,
                                   date.strftime('%Y-%m-%d'))
     file_path = os.path.join(self._data_dir, file_name)
     self._logger.debug("Writing %s to %s", data_type,
                        os.path.abspath(file_path))
     # Write to file
     df.to_csv(file_path, mode='a', header=False)
Example #4
0
def time_filter_data(dataframe: pd.DataFrame,
                     timestamp_start: int = None,
                     timestamp_end: int = None) -> pd.DataFrame:
    """Reduce a dataframe to the rows between a start and an end timestamp.

    The provided timestamps do not have to be present in the index; every row
    whose index value lies within ``[timestamp_start, timestamp_end]`` is kept.
    If start is not provided, the first valid index of the (sorted) frame is
    used. If end is not provided, the last valid index is used.

    Note: the index will be sorted in order to enable slicing

    Args:
        dataframe (pd.DataFrame): Data frame to be sliced
        timestamp_start (int): index of first data point (inclusive, unix timestamp).
        timestamp_end (int): index of last data point (inclusive, unix timestamp)

    Returns:
        dataframe (pd.DataFrame): sliced pd DataFrame. Empty when a missing
        bound cannot be inferred because the frame has no valid rows.

    """

    dataframe = dataframe.sort_index()
    if timestamp_start is None:
        print("start index was not provided")
        timestamp_start = dataframe.first_valid_index()

    if timestamp_end is None:
        print("end index is not provided")
        timestamp_end = dataframe.last_valid_index()

    if timestamp_start is None or timestamp_end is None:
        # first/last_valid_index return None for an empty or all-NaN frame;
        # there is nothing to anchor the range on, so nothing is selected.
        return dataframe.iloc[0:0]

    # Both bounds are inclusive, as documented; the strict comparisons used
    # previously silently dropped the boundary rows.
    in_range = ((dataframe.index >= timestamp_start)
                & (dataframe.index <= timestamp_end))

    return dataframe[in_range]
Example #5
0
def slice_by_index(dataframe: pd.DataFrame,
                   timestamp_start: int = None,
                   timestamp_end: int = None) -> pd.DataFrame:
    """Remove the rows inside a time range and keep what lies on either side of it.

    Rows whose index is strictly smaller than ``timestamp_start`` or strictly
    greater than ``timestamp_end`` are returned; everything in between
    (bounds included) is cut out. A missing start defaults to the first valid
    index of the frame, a missing end to the last valid index.

    Args:
        dataframe (pd.DataFrame): Data frame to be sliced
        timestamp_start (int): first index value of the range to cut (unix timestamp).
        timestamp_end (int): last index value of the range to cut (unix timestamp)

    Returns:
        dataframe (pd.DataFrame): the rows outside the cut range.

    """
    cut_from = dataframe.first_valid_index() if timestamp_start is None else timestamp_start
    cut_to = dataframe.last_valid_index() if timestamp_end is None else timestamp_end

    labels = dataframe.index
    before_cut = labels < cut_from
    after_cut = labels > cut_to

    return dataframe[before_cut | after_cut]
    def _augment_account(cls, account: pd.DataFrame) -> pd.DataFrame:
        """Expand ``account`` onto the price index and fill in missing ``value`` entries.

        The ``value`` and ``isins`` columns of ``account`` are reindexed onto the
        index of the prices returned by ``fund_cache.get_prices`` and truncated to
        the span between the first and last valid index of ``account``.  Missing
        values are then estimated from the nearest known value, scaled by the mean
        day-over-day price ratio of the row's ISINs (or copied unchanged when the
        row has no ISINs).

        Args:
            account: frame with ``value`` and ``isins`` columns; each ``isins``
                cell is iterated, so presumably a list of ISIN strings or an
                empty/None value — TODO confirm against caller.

        Returns:
            A frame with ``value`` and ``isins`` columns indexed by the
            (truncated) price index.
        """
        # Flatten every non-empty "isins" cell into one set of ISINs.
        account_isins = set(isin for isins in account["isins"].tolist() if isins for isin in isins)
        prices_df = fund_cache.get_prices(account_isins)
        # Ratio of each price to the previous row's price (pct_change + 1).
        prices_ratios_df = prices_df.pct_change() + 1

        # Values are placed on the price index as-is (gaps stay NaN), while the
        # ISIN lists are back-filled so every row knows which ISINs apply.
        values_series = account.reindex(index=prices_df.index)["value"]
        isins_series = account.reindex(index=prices_df.index, method="bfill")["isins"]
        augmented = pd.concat([values_series, isins_series], axis=1) \
            .truncate(before=account.first_valid_index(),
                      after=account.last_valid_index())
        # Positional location (within ``augmented``) of the last known value.
        last_valid_index_loc = augmented.index.get_loc(values_series.last_valid_index())
        # Bfill from last valid entry: walk backwards and derive each missing
        # previous value by dividing the current value by the mean price ratio at ``dt``.
        for i in range(last_valid_index_loc, 0, -1):
            dt = augmented.index[i]
            curr_value, prev_value = augmented.iloc[i]["value"], augmented.iloc[i - 1]["value"]
            isins = augmented.iloc[i]["isins"]
            if np.isnan(prev_value):
                if not isins:
                    # No ISINs to scale by: carry the value over unchanged.
                    augmented.at[augmented.index[i - 1], "value"] = curr_value
                else:
                    augmented.at[augmented.index[i - 1], "value"] = \
                        curr_value / prices_ratios_df.loc[dt, isins].mean()
        # Ffill from last valid entry to the end of ``augmented``.
        # NOTE(review): the next value is scaled by the ratio stored at the current
        # date ``dt`` rather than at the next date — confirm this is intended.
        for i in range(last_valid_index_loc, len(augmented.index) - 1):
            dt = augmented.index[i]
            curr_value, next_value = augmented.iloc[i]["value"], augmented.iloc[i + 1]["value"]
            next_isins = augmented.iloc[i + 1]["isins"]
            if np.isnan(next_value):
                if not next_isins:
                    augmented.at[augmented.index[i + 1], "value"] = curr_value
                else:
                    augmented.at[augmented.index[i + 1], "value"] = \
                        curr_value * prices_ratios_df.loc[dt, next_isins].mean()
        augmented["value"] = augmented["value"].bfill()  # fill any remaining NaNs with the next valid value
        return augmented
Example #7
0
    def test_first_last_valid_preserves_freq(self):
        # GH#20499: its preserves freq with holes
        index = date_range("20110101", periods=30, freq="B")
        frame = DataFrame(np.nan, columns=["foo"], index=index)

        frame.iloc[1] = 1
        frame.iloc[-2] = 1
        assert frame.first_valid_index() == frame.index[1]
        assert frame.last_valid_index() == frame.index[-2]
        assert frame.first_valid_index().freq == frame.index.freq
        assert frame.last_valid_index().freq == frame.index.freq

        ts = frame["foo"]
        assert ts.first_valid_index() == ts.index[1]
        assert ts.last_valid_index() == ts.index[-2]
        assert ts.first_valid_index().freq == ts.index.freq
        assert ts.last_valid_index().freq == ts.index.freq
Example #8
0
    def get_kickoffs_from_game(
            game: Game, proto_game: game_pb2, id_creation: Callable,
            player_map: Dict[str, Player], data_frame: pd.DataFrame,
            kickoff_frames: pd.DataFrame,
            first_touch_frames: pd.DataFrame) -> Dict[int, KickoffStats]:
        """Create one kickoff-stats entry per kickoff frame and fill it in.

        For every entry of ``kickoff_frames`` a new message is added to
        ``proto_game.game_stats.kickoff_stats``; its start/touch frames, touch
        time, goal data, per-player stats, first-touch player and kickoff type
        are populated from ``data_frame``.

        ``game`` and ``id_creation`` are not used in this function.

        NOTE(review): ``kickoff_frames`` and ``first_touch_frames`` are annotated
        as DataFrames but are enumerated / indexed positionally as sequences of
        frame numbers — confirm the actual type with the caller.

        Returns:
            Mapping from kickoff start frame to the kickoff message created for it.
        """
        kickoffs = dict()
        goals = proto_game.game_metadata.goals
        num_goals = len(goals)
        # Bounds used to clamp the per-kickoff slice to the frames that exist.
        last_frame = data_frame.last_valid_index()
        first_frame = data_frame.first_valid_index()
        for index, frame in enumerate(kickoff_frames):
            starting_kickoff_time = data_frame.game.time[frame]
            cur_kickoff = proto_game.game_stats.kickoff_stats.add()
            end_frame = first_touch_frames[index]
            # Slice from one frame before the kickoff to 20 frames after the
            # first touch, clamped to the valid frame range.
            smaller_data_frame = data_frame.loc[
                max(first_frame, frame - 1):min(end_frame + 20, last_frame)]
            cur_kickoff.start_frame = frame
            cur_kickoff.touch_frame = end_frame
            ending_time = smaller_data_frame['game']['time'][end_frame]
            time = cur_kickoff.touch_time = ending_time - starting_kickoff_time
            # NOTE(review): ``differs`` / ``summed_time_diff`` are computed but
            # not used afterwards in this function.
            differs = smaller_data_frame['game']['time'][frame:end_frame].diff(
            )
            summed_time_diff = differs.sum()
            summed_time = smaller_data_frame['game']['delta'][
                frame:end_frame].sum()
            # Prefer the summed per-frame deltas over the plain time difference
            # whenever the sum is positive.
            if summed_time > 0:
                cur_kickoff.touch_time = summed_time
            # NOTE(review): diagnostic values are logged at error level.
            logger.error("STRAIGHT TIME " + str(time))
            logger.error("SUM TIME" + str(summed_time))
            # NOTE(review): ``sum_vs_adding_diff`` is not used afterwards.
            sum_vs_adding_diff = time - summed_time

            # find who touched the ball first
            # (10000000 doubles as the "no player found" sentinel checked below)
            closest_player_distance = 10000000
            closest_player_id = 0

            # Kickoffs are paired with goals by position while goals remain.
            if index < num_goals:
                BaseKickoff.get_goal_data(cur_kickoff, goals[index],
                                          data_frame)

            # get player stats
            for player in player_map.values():
                # Skip players that have no column in the data frame.
                if player.name not in data_frame:
                    continue
                kickoff_player = BaseKickoff.get_player_stats(
                    cur_kickoff, player, smaller_data_frame, frame, end_frame)

                # Track the player with the smallest ball distance.
                if kickoff_player.ball_dist < closest_player_distance:
                    closest_player_distance = kickoff_player.ball_dist
                    closest_player_id = player.id.id

            if closest_player_distance != 10000000:
                # Todo use hit analysis
                cur_kickoff.touch.first_touch_player.id = closest_player_id
            cur_kickoff.type = BaseKickoff.get_kickoff_type(
                cur_kickoff.touch.players)
            kickoffs[frame] = cur_kickoff
        return kickoffs
Example #9
0
    def test_first_last_valid(self):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = nan
        mat[-5:] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        self.assertEqual(index, frame.index[5])

        index = frame.last_valid_index()
        self.assertEqual(index, frame.index[-6])

        # GH12800
        empty = DataFrame()
        self.assertIsNone(empty.last_valid_index())
        self.assertIsNone(empty.first_valid_index())
Example #10
0
    def test_first_last_valid(self):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = nan
        mat[-5:] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        self.assertEqual(index, frame.index[5])

        index = frame.last_valid_index()
        self.assertEqual(index, frame.index[-6])

        # GH12800
        empty = DataFrame()
        self.assertIsNone(empty.last_valid_index())
        self.assertIsNone(empty.first_valid_index())
Example #11
0
    def test_first_last_valid(self):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = nan
        mat[-5:] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        assert index == frame.index[5]

        index = frame.last_valid_index()
        assert index == frame.index[-6]

        # GH12800
        empty = DataFrame()
        assert empty.last_valid_index() is None
        assert empty.first_valid_index() is None
Example #12
0
    def test_first_last_valid(self):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = nan
        mat[-5:] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        assert index == frame.index[5]

        index = frame.last_valid_index()
        assert index == frame.index[-6]

        # GH12800
        empty = DataFrame()
        assert empty.last_valid_index() is None
        assert empty.first_valid_index() is None
Example #13
0
    def test_first_last_valid_all_nan(self, index_func):
        # GH#17400: no valid entries
        index = index_func(30)
        frame = DataFrame(np.nan, columns=["foo"], index=index)

        assert frame.last_valid_index() is None
        assert frame.first_valid_index() is None

        ser = frame["foo"]
        assert ser.first_valid_index() is None
        assert ser.last_valid_index() is None
Example #14
0
    def test_first_last_valid(self, index_func):
        N = 30
        index = index_func(N)
        mat = np.random.randn(N)
        mat[:5] = np.nan
        mat[-5:] = np.nan

        frame = DataFrame({"foo": mat}, index=index)
        assert frame.first_valid_index() == frame.index[5]
        assert frame.last_valid_index() == frame.index[-6]

        ser = frame["foo"]
        assert ser.first_valid_index() == frame.index[5]
        assert ser.last_valid_index() == frame.index[-6]
Example #15
0
    def get_dataframe_daterange(
            self, dataframe: DataFrame) -> Tuple[Timestamp, Timestamp]:
        """Return the first and last valid index of a DataFrame as timezone-aware Timestamps.

        Args:
            dataframe: DataFrame indexed by a DatetimeIndex
        Returns:
            tuple (Timestamp, Timestamp): Start and end Timestamps for data
        Raises:
            TypeError: if the index of ``dataframe`` is not a DatetimeIndex
        """
        from pandas import DatetimeIndex
        from openghg.util import timestamp_tzaware

        index_is_datetime = isinstance(dataframe.index, DatetimeIndex)
        if not index_is_datetime:
            raise TypeError(
                "Only DataFrames with a DatetimeIndex must be passed")

        # Both ends are passed through timestamp_tzaware before being returned.
        return (
            timestamp_tzaware(dataframe.first_valid_index()),
            timestamp_tzaware(dataframe.last_valid_index()),
        )
Example #16
0
def df_to_data(df: pd.DataFrame) -> Tuple[List]:
    """Converts GT dataframe to detections and their classes with confidences 1.0

    Args:
        df (pd.DataFrame): input dataframe for GT with ``xmin``, ``ymin``,
            ``xmax`` and ``ymax`` columns. Rows are read positionally, so the
            index does not have to be a contiguous integer range.

    Returns:
        Tuple[List]: tuple of detection rects, detection classes.
            ``rects`` has shape ``(len(df), 5)`` holding
            ``xmin, ymin, xmax, ymax, score`` with the score fixed to 1.0;
            ``classes`` has shape ``(len(df),)`` and is 1.0 for every row.
            Both are empty arrays when ``df`` has no rows.
    """
    # get rects (boxes+scores) and classes for this specific dataframe
    rects = np.zeros((len(df), 5))
    classes = np.zeros(len(df))

    if len(df) == 0:
        return rects, classes

    # Ground truth has no real confidence or class label: both are fixed to 1.0.
    rects[:, :4] = df[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy()
    rects[:, 4] = 1.0
    classes[:] = 1.0

    return rects, classes
Example #17
0
def ohlcv_resample(ohlcv: pd.DataFrame, **kwargs):
    """Resample OHLCV data into overlapping ``period``-long bars, one series per start offset.

    The sorted input is resampled ``period`` times, each time after dropping
    one more leading row, so that a bar ending on every row is produced.  Bars
    are left-closed and labelled with their right edge.

    Keyword Args:
        period (int): number of ``interval`` units per bar (default 7).
        interval (str): pandas frequency alias of one unit (default ``'D'``).
        process_fun (callable): applied to every resampled frame before the
            frames are concatenated (default: identity).
        rename_fun (callable): if given, receives the list of result column
            names and returns the new column names.
        trim (bool): if true (default), restrict the result to the span between
            the first and last valid index of ``ohlcv``.

    Returns:
        pd.DataFrame: the concatenated, index-sorted bars with ``open``,
        ``high``, ``low``, ``close`` and ``volume`` aggregated as
        first / max / min / last / sum.
    """
    period = int(kwargs.get('period', 7))
    interval = kwargs.get('interval', 'D')
    process_fun = kwargs.get('process_fun', lambda x: x)
    rename_fun = kwargs.get('rename_fun', None)

    rule = '{}{}'.format(period, interval)
    aggregation = {'open': 'first', 'high': 'max', 'low': 'min',
                   'close': 'last', 'volume': 'sum'}

    df = ohlcv.sort_index()
    if isinstance(df.index, pd.PeriodIndex):
        # Stands in for the former kind='timestamp', convention='end' resample
        # arguments, which newer pandas releases deprecate: period-indexed
        # input is converted to end-of-period timestamps up front.
        df = df.to_timestamp(how='end')

    result = [
        process_fun(
            df.iloc[offset:]
            .resample(rule, closed='left', label='right')
            .agg(aggregation)
        )
        for offset in range(period)
    ]
    _result = pd.concat(result, sort=True).sort_index()
    if rename_fun:
        _result.columns = rename_fun(list(_result.columns))
    if kwargs.get('trim', True):
        _result = _result.loc[ohlcv.first_valid_index():ohlcv.last_valid_index()]
    return _result
Example #18
0
def force_full_index(dataframe: pd.DataFrame,
                     resampling_step: int = None,
                     resampling_unit: str = "min",
                     timestamp_start: int = None,
                     timestamp_end: int = None) -> pd.DataFrame:
    """ forces a full index. Missing index will be replaced by Nan.

        The frame is reindexed onto a regular grid of unix timestamps (seconds)
        running from ``timestamp_start`` to ``timestamp_end`` in steps of
        ``resampling_step`` ``resampling_unit``.

        Note: resampling should be done before to benefit from sampling strategies.

        Args:
            dataframe(dataframe): data frame indexed by unix timestamps in seconds
            resampling_step (int): This is the desired time step of final dataframe.
                Required; there is no usable default.
            resampling_unit (str, 'min'): pandas frequency alias of the time step
            timestamp_start (int, None): unix timestamp (seconds) at which the
                dataframe starts; defaults to the first valid index. Date
                strings are accepted as well.
            timestamp_end (int, None): unix timestamp (seconds) at which the
                dataframe ends; defaults to the last valid index.
        Returns
            dataframe(pandas.Dataframe): dataframe with full index
        Raises
            ValueError: if ``resampling_step`` is missing, or if a bound is not
                given and cannot be inferred because the frame has no valid rows.
    """

    def _as_datetime(stamp):
        # Bare numbers are unix seconds; without unit="s" pandas would read
        # them as nanoseconds and collapse the whole range onto the epoch.
        if isinstance(stamp, (int, float, numpy.integer, numpy.floating)):
            return pd.to_datetime(stamp, unit="s")
        return pd.to_datetime(stamp)

    if resampling_step is None:
        raise ValueError("resampling_step must be provided to build the full index")

    if timestamp_start is None:
        print("start index was not provided")
        timestamp_start = dataframe.first_valid_index()

    if timestamp_end is None:
        print("end index is not provided")
        timestamp_end = dataframe.last_valid_index()

    if timestamp_start is None or timestamp_end is None:
        raise ValueError("cannot infer the index range of a dataframe without valid rows")

    freq = str(resampling_step) + resampling_unit

    new_index = pd.date_range(start=_as_datetime(timestamp_start),
                              end=_as_datetime(timestamp_end),
                              freq=freq)
    # Convert back to whole unix seconds without depending on the
    # datetime resolution used internally by the index.
    epoch = pd.Timestamp(0, tz=new_index.tz)
    new_index = (new_index - epoch) // pd.Timedelta(seconds=1)
    delta_time_tmp = dataframe.reindex(index=new_index, fill_value=numpy.nan)

    return delta_time_tmp
Example #19
0
    def test_first_last_valid(
        self, float_frame, data, idx, expected_first, expected_last
    ):
        N = len(float_frame.index)
        mat = np.random.randn(N)
        mat[:5] = np.nan
        mat[-5:] = np.nan

        frame = DataFrame({"foo": mat}, index=float_frame.index)
        index = frame.first_valid_index()

        assert index == frame.index[5]

        index = frame.last_valid_index()
        assert index == frame.index[-6]

        # GH12800
        empty = DataFrame()
        assert empty.last_valid_index() is None
        assert empty.first_valid_index() is None

        # GH17400: no valid entries
        frame[:] = np.nan
        assert frame.last_valid_index() is None
        assert frame.first_valid_index() is None

        # GH20499: its preserves freq with holes
        frame.index = date_range("20110101", periods=N, freq="B")
        frame.iloc[1] = 1
        frame.iloc[-2] = 1
        assert frame.first_valid_index() == frame.index[1]
        assert frame.last_valid_index() == frame.index[-2]
        assert frame.first_valid_index().freq == frame.index.freq
        assert frame.last_valid_index().freq == frame.index.freq

        # GH 21441
        df = DataFrame(data, index=idx)
        assert expected_first == df.first_valid_index()
        assert expected_last == df.last_valid_index()
Example #20
0
    def test_first_last_valid(self, data, idx,
                              expected_first, expected_last):
        N = len(self.frame.index)
        mat = randn(N)
        mat[:5] = np.nan
        mat[-5:] = np.nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        index = frame.first_valid_index()

        assert index == frame.index[5]

        index = frame.last_valid_index()
        assert index == frame.index[-6]

        # GH12800
        empty = DataFrame()
        assert empty.last_valid_index() is None
        assert empty.first_valid_index() is None

        # GH17400: no valid entries
        frame[:] = np.nan
        assert frame.last_valid_index() is None
        assert frame.first_valid_index() is None

        # GH20499: its preserves freq with holes
        frame.index = date_range("20110101", periods=N, freq="B")
        frame.iloc[1] = 1
        frame.iloc[-2] = 1
        assert frame.first_valid_index() == frame.index[1]
        assert frame.last_valid_index() == frame.index[-2]
        assert frame.first_valid_index().freq == frame.index.freq
        assert frame.last_valid_index().freq == frame.index.freq

        # GH 21441
        df = DataFrame(data, index=idx)
        assert expected_first == df.first_valid_index()
        assert expected_last == df.last_valid_index()
Example #21
0
 def test_first_last_valid_frame(self, data, idx, expected_first,
                                 expected_last):
     # GH#21441
     df = DataFrame(data, index=idx)
     assert expected_first == df.first_valid_index()
     assert expected_last == df.last_valid_index()
# In[232]:


# Demo frame: the first two rows are entirely missing; column A has its first
# value in row 2, columns B and C in row 3.
df=pd.DataFrame({"A":[None,None,2,4,5],"B":[None,None,None,44,2],"C":[None,None,None,1,5]})


# In[233]:


# Display the frame.
df


# In[234]:


# Label of the first row that holds a non-missing value (row 2 here, via column A).
df.first_valid_index()


# In[235]:


# Rebind ``df`` to a Series with missing values at both ends.
df=pd.Series([None,None,"sam","alex","sophia",None])


# In[236]:


# Display the series.
df


# In[237]: