def test_first_last_valid(self):
    """first/last_valid_index: leading/trailing NaNs, empty frames, all-NaN frames, freq preservation."""
    n_rows = len(self.frame.index)
    values = randn(n_rows)
    values[:5] = nan
    values[-5:] = nan
    frame = DataFrame({'foo': values}, index=self.frame.index)
    assert frame.first_valid_index() == frame.index[5]
    assert frame.last_valid_index() == frame.index[-6]
    # GH12800: an empty frame has no valid index on either end
    empty = DataFrame()
    assert empty.last_valid_index() is None
    assert empty.first_valid_index() is None
    # GH17400: a frame with no valid entries at all
    frame[:] = nan
    assert frame.last_valid_index() is None
    assert frame.first_valid_index() is None
    # GH20499: the index freq is preserved even with holes
    frame.index = date_range("20110101", periods=n_rows, freq="B")
    frame.iloc[1] = 1
    frame.iloc[-2] = 1
    assert frame.first_valid_index() == frame.index[1]
    assert frame.last_valid_index() == frame.index[-2]
    assert frame.first_valid_index().freq == frame.index.freq
    assert frame.last_valid_index().freq == frame.index.freq
def _keep_name_new_row(df: pd.DataFrame) -> pd.DataFrame: """Function to insert row in the dataframe""" empty_row = pd.DataFrame( { 'tid': '', 'pid': '', 'category': '', 'best_round': '', 'name': df.loc[df.first_valid_index(), 'name'] }, index=[-1]) df.loc[df.first_valid_index(), 'name'] = "" return pd.concat([empty_row, df])
def _write_ticker_by_day(self, df: DataFrame, data_type):
    """Append a single (day, ticker) frame to its per-ticker-per-day CSV.

    :param df: DataFrame; all rows share the same day and ticker in the index
               (assumes a MultiIndex of (timestamp, ticker) — confirm upstream)
    :param data_type: tag embedded in the output file name
    """
    first_idx = df.first_valid_index()
    date = first_idx[0].date()
    # '/' appears in some ticker symbols and is illegal in file names
    ticker = first_idx[1].replace('/', '_')
    file_name = '%s_%s_%s.csv' % (ticker, data_type, date.strftime('%Y-%m-%d'))
    file_path = os.path.join(self._data_dir, file_name)
    self._logger.debug("Writing %s to %s", data_type, os.path.abspath(file_path))
    # Append without header so repeated writes accumulate rows in one file
    df.to_csv(file_path, mode='a', header=False)
def time_filter_data(dataframe: pd.DataFrame,
                     timestamp_start: int = None,
                     timestamp_end: int = None) -> pd.DataFrame:
    """Reduce a dataframe to rows between two timestamps (both inclusive).

    The provided timestamps need not exist in the index; the slice keeps every
    row whose index falls in [timestamp_start, timestamp_end]. Missing bounds
    default to the first/last valid index of the (sorted) frame.

    Note: the index is sorted to enable slicing.

    Args:
        dataframe (pd.DataFrame): data frame to be sliced.
        timestamp_start (int): index of first data point (inclusive, unix timestamp).
        timestamp_end (int): index of last data point (inclusive, unix timestamp).

    Returns:
        pd.DataFrame: sliced data frame.
    """
    dataframe = dataframe.sort_index()
    if timestamp_start is None:
        print("start index was not provided")
        timestamp_start = dataframe.first_valid_index()
    if timestamp_end is None:
        print("end index is not provided")
        timestamp_end = dataframe.last_valid_index()
    # BUG FIX: the bounds are documented as inclusive, and the defaults are the
    # frame's own first/last valid index — strict '>'/'<' silently dropped the
    # endpoint rows. Use '>='/'<=' so the promised rows are kept.
    mask = (dataframe.index >= timestamp_start) & (dataframe.index <= timestamp_end)
    return dataframe[mask]
def slice_by_index(dataframe: pd.DataFrame,
                   timestamp_start: int = None,
                   timestamp_end: int = None) -> pd.DataFrame:
    """Remove the rows between two timestamps and return everything outside.

    The cut range [timestamp_start, timestamp_end] is inclusive on both ends.
    A missing start defaults to the frame's first valid index; a missing end
    defaults to its last valid index.

    Args:
        dataframe (pd.DataFrame): data frame to be sliced.
        timestamp_start (int): first index of the cut range (unix timestamp).
        timestamp_end (int): last index of the cut range (unix timestamp).

    Returns:
        pd.DataFrame: rows strictly before the start or strictly after the end.
    """
    start = dataframe.first_valid_index() if timestamp_start is None else timestamp_start
    end = dataframe.last_valid_index() if timestamp_end is None else timestamp_end
    outside_range = (dataframe.index < start) | (dataframe.index > end)
    return dataframe[outside_range]
def _augment_account(cls, account: pd.DataFrame) -> pd.DataFrame:
    """Fill gaps in an account's value series using fund price movements.

    Reindexes the account's "value" column onto the fund-price date index,
    then walks backward and forward from the last observed value, scaling
    by the mean day-over-day price ratio of the held funds to estimate
    missing values.

    NOTE(review): assumes fund_cache.get_prices returns a date-indexed price
    frame with one column per ISIN — confirm against fund_cache.
    """
    # Flatten the per-row ISIN lists into one set of all held fund ISINs.
    account_isins = set(isin for isins in account["isins"].tolist() if isins for isin in isins)
    prices_df = fund_cache.get_prices(account_isins)
    # Day-over-day price multiplier (1.0 == unchanged).
    prices_ratios_df = prices_df.pct_change() + 1
    # Values align only on exact dates; holdings are back-filled so each price
    # date knows which ISINs were held.
    values_series = account.reindex(index=prices_df.index)["value"]
    isins_series = account.reindex(index=prices_df.index, method="bfill")["isins"]
    augmented = pd.concat([values_series, isins_series], axis=1) \
        .truncate(before=account.first_valid_index(), after=account.last_valid_index())
    last_valid_index_loc = augmented.index.get_loc(values_series.last_valid_index())
    # Bfill from last valid entry: walk backward, dividing out each day's
    # mean price ratio to estimate the previous day's value.
    for i in range(last_valid_index_loc, 0, -1):
        dt = augmented.index[i]
        curr_value, prev_value = augmented.iloc[i]["value"], augmented.iloc[i - 1]["value"]
        isins = augmented.iloc[i]["isins"]
        if np.isnan(prev_value):
            if not isins:
                # No holdings recorded: carry the value through unchanged.
                augmented.at[augmented.index[i - 1], "value"] = curr_value
            else:
                augmented.at[augmented.index[i - 1], "value"] = \
                    curr_value / prices_ratios_df.loc[dt, isins].mean()
    # Ffill from last valid entry to today: walk forward, multiplying by the
    # next day's mean price ratio of the funds held on that day.
    for i in range(last_valid_index_loc, len(augmented.index) - 1):
        dt = augmented.index[i]
        curr_value, next_value = augmented.iloc[i]["value"], augmented.iloc[i + 1]["value"]
        next_isins = augmented.iloc[i + 1]["isins"]
        if np.isnan(next_value):
            if not next_isins:
                augmented.at[augmented.index[i + 1], "value"] = curr_value
            else:
                augmented.at[augmented.index[i + 1], "value"] = \
                    curr_value * prices_ratios_df.loc[dt, next_isins].mean()
    # NOTE(review): original comment said "fill initial NaNs with 100" but the
    # code back-fills with the first estimated value — confirm intent.
    augmented["value"] = augmented["value"].bfill()  # fill initial NaNs with 100
    return augmented
def test_first_last_valid_preserves_freq(self):
    # GH#20499: the index freq survives on the returned Timestamps even when
    # the frame has interior holes (NaN rows between the valid entries).
    idx = date_range("20110101", periods=30, freq="B")
    frame = DataFrame(np.nan, columns=["foo"], index=idx)
    frame.iloc[1] = 1
    frame.iloc[-2] = 1
    # Same contract must hold for the frame and for a column pulled from it.
    for obj in (frame, frame["foo"]):
        assert obj.first_valid_index() == obj.index[1]
        assert obj.last_valid_index() == obj.index[-2]
        assert obj.first_valid_index().freq == obj.index.freq
        assert obj.last_valid_index().freq == obj.index.freq
def get_kickoffs_from_game(
        game: Game, proto_game: game_pb2, id_creation: Callable,
        player_map: Dict[str, Player], data_frame: pd.DataFrame,
        kickoff_frames: pd.DataFrame,
        first_touch_frames: pd.DataFrame) -> Dict[int, KickoffStats]:
    """Build per-kickoff stats protos and return them keyed by start frame.

    For each kickoff frame, slices out a window of the replay data up to the
    first touch, records timing, per-player stats, the closest (first-touching)
    player, and — when a goal follows — the goal data.

    NOTE(review): `id_creation` is never used in this body; `proto_game` is
    annotated with the module `game_pb2` rather than a message type — confirm.
    """
    kickoffs = dict()
    goals = proto_game.game_metadata.goals
    num_goals = len(goals)
    last_frame = data_frame.last_valid_index()
    first_frame = data_frame.first_valid_index()
    for index, frame in enumerate(kickoff_frames):
        starting_kickoff_time = data_frame.game.time[frame]
        cur_kickoff = proto_game.game_stats.kickoff_stats.add()
        end_frame = first_touch_frames[index]
        # Window: one frame before the kickoff through 20 frames past the
        # first touch, clamped to the data's valid range.
        smaller_data_frame = data_frame.loc[
            max(first_frame, frame - 1):min(end_frame + 20, last_frame)]
        cur_kickoff.start_frame = frame
        cur_kickoff.touch_frame = end_frame
        ending_time = smaller_data_frame['game']['time'][end_frame]
        time = cur_kickoff.touch_time = ending_time - starting_kickoff_time
        differs = smaller_data_frame['game']['time'][frame:end_frame].diff(
        )
        summed_time_diff = differs.sum()
        # Prefer summed per-frame deltas over wall-clock difference when available.
        summed_time = smaller_data_frame['game']['delta'][
            frame:end_frame].sum()
        if summed_time > 0:
            cur_kickoff.touch_time = summed_time
        # NOTE(review): these look like debug traces but log at ERROR level —
        # consider logger.debug.
        logger.error("STRAIGHT TIME " + str(time))
        logger.error("SUM TIME" + str(summed_time))
        # NOTE(review): computed but never used.
        sum_vs_adding_diff = time - summed_time
        # find who touched the ball first
        closest_player_distance = 10000000
        closest_player_id = 0
        if index < num_goals:
            BaseKickoff.get_goal_data(cur_kickoff, goals[index], data_frame)
        # get player stats; track the player closest to the ball at the touch
        for player in player_map.values():
            if player.name not in data_frame:
                continue
            kickoff_player = BaseKickoff.get_player_stats(
                cur_kickoff, player, smaller_data_frame, frame, end_frame)
            if kickoff_player.ball_dist < closest_player_distance:
                closest_player_distance = kickoff_player.ball_dist
                closest_player_id = player.id.id
        # Sentinel unchanged means no player was found in the data.
        if closest_player_distance != 10000000:
            # Todo use hit analysis
            cur_kickoff.touch.first_touch_player.id = closest_player_id
        cur_kickoff.type = BaseKickoff.get_kickoff_type(
            cur_kickoff.touch.players)
        kickoffs[frame] = cur_kickoff
    return kickoffs
def test_first_last_valid(self): N = len(self.frame.index) mat = randn(N) mat[:5] = nan mat[-5:] = nan frame = DataFrame({'foo': mat}, index=self.frame.index) index = frame.first_valid_index() self.assertEqual(index, frame.index[5]) index = frame.last_valid_index() self.assertEqual(index, frame.index[-6]) # GH12800 empty = DataFrame() self.assertIsNone(empty.last_valid_index()) self.assertIsNone(empty.first_valid_index())
def test_first_last_valid(self): N = len(self.frame.index) mat = randn(N) mat[:5] = nan mat[-5:] = nan frame = DataFrame({'foo': mat}, index=self.frame.index) index = frame.first_valid_index() assert index == frame.index[5] index = frame.last_valid_index() assert index == frame.index[-6] # GH12800 empty = DataFrame() assert empty.last_valid_index() is None assert empty.first_valid_index() is None
def test_first_last_valid_all_nan(self, index_func): # GH#17400: no valid entries index = index_func(30) frame = DataFrame(np.nan, columns=["foo"], index=index) assert frame.last_valid_index() is None assert frame.first_valid_index() is None ser = frame["foo"] assert ser.first_valid_index() is None assert ser.last_valid_index() is None
def test_first_last_valid(self, index_func): N = 30 index = index_func(N) mat = np.random.randn(N) mat[:5] = np.nan mat[-5:] = np.nan frame = DataFrame({"foo": mat}, index=index) assert frame.first_valid_index() == frame.index[5] assert frame.last_valid_index() == frame.index[-6] ser = frame["foo"] assert ser.first_valid_index() == frame.index[5] assert ser.last_valid_index() == frame.index[-6]
def get_dataframe_daterange(
        self, dataframe: DataFrame) -> Tuple[Timestamp, Timestamp]:
    """Return the timezone-aware date range covered by the DataFrame.

    Args:
        dataframe: DataFrame to parse; must have a DatetimeIndex

    Returns:
        tuple (Timestamp, Timestamp): start and end Timestamps for the data

    Raises:
        TypeError: if the index is not a DatetimeIndex
    """
    from pandas import DatetimeIndex
    from openghg.util import timestamp_tzaware
    if not isinstance(dataframe.index, DatetimeIndex):
        raise TypeError(
            "Only DataFrames with a DatetimeIndex must be passed")
    # Bounds come from the first/last index entries holding valid data;
    # timestamp_tzaware makes the pandas Timestamps timezone aware.
    first = dataframe.first_valid_index()
    last = dataframe.last_valid_index()
    return timestamp_tzaware(first), timestamp_tzaware(last)
def df_to_data(df: pd.DataFrame) -> Tuple[List]:
    """Convert a GT dataframe to detection rects and classes with confidence 1.0.

    Args:
        df (pd.DataFrame): input ground-truth dataframe with columns
            'xmin', 'ymin', 'xmax', 'ymax'.

    Returns:
        Tuple[List]: (rects, classes) — rects is an (N, 5) array of
        [xmin, ymin, xmax, ymax, score] rows with score 1.0; classes is a
        length-N array of class confidences, all 1.0.
    """
    rects = np.zeros((len(df), 5))
    # BUG FIX: the original allocated `scores` but assigned/returned the
    # undefined name `classes` (NameError on any non-empty input), and the
    # empty-input branch returned `scores` instead — both paths now use
    # `classes` consistently. The unused w/h locals were dropped.
    classes = np.zeros(len(df))
    if len(df) == 0:
        return rects, classes
    fi = df.first_valid_index()
    # NOTE(review): assumes a contiguous integer index starting at fi — confirm upstream.
    for i in range(len(df)):
        rects[i] = np.array([df['xmin'][fi + i], df['ymin'][fi + i],
                             df['xmax'][fi + i], df['ymax'][fi + i], 1.0])
        classes[i] = 1.0  # df['class'][fi+i] — GT confidence fixed at 1.0
    return rects, classes
def ohlcv_resample(ohlcv: pd.DataFrame, **kwargs): period = int(kwargs.get('period', 7)) interval = kwargs.get('interval', 'D') process_fun = kwargs.get('process_fun', lambda x: x) rename_fun = kwargs.get('rename_fun', None) result = [] df = ohlcv.sort_index() for i in range(period): _df = df.iloc[i:] nth_day = _df.resample('{}{}'.format(period, interval), closed='left', label='right', convention='end', kind='timestamp' ).agg({'open': 'first', 'high': 'max', 'low': 'min', 'close': 'last', 'volume': 'sum'}).copy() result.append(process_fun(nth_day)) _result = pd.concat(result, sort=True).sort_index() if rename_fun: _result.columns = rename_fun([c for c in _result.columns]) if kwargs.get('trim', True): _result = _result.loc[ohlcv.first_valid_index():ohlcv.last_valid_index()] return _result
def force_full_index(dataframe: pd.DataFrame, resampling_step: int = None,
                     resampling_unit: str = "min", timestamp_start: int = None,
                     timestamp_end: int = None) -> pd.DataFrame:
    """Reindex onto a complete, evenly spaced unix-second index; gaps become NaN.

    Note: resampling should be done beforehand to benefit from sampling
    strategies.

    Args:
        dataframe (pd.DataFrame): data frame possibly missing index entries.
        resampling_step (int): desired time step of the final dataframe.
        resampling_unit (str): unit of the desired time step.
        timestamp_start: index at which the dataframe starts (defaults to the
            first valid index).
        timestamp_end: index at which the dataframe ends (defaults to the
            last valid index).

    Returns:
        pd.DataFrame: dataframe reindexed onto the full range, NaN-filled.
    """
    if timestamp_start is None:
        print("start index was not provided")
        timestamp_start = dataframe.first_valid_index()
    if timestamp_end is None:
        print("end index is not provided")
        timestamp_end = dataframe.last_valid_index()
    step = str(resampling_step) + resampling_unit
    full_range = pd.date_range(start=timestamp_start, end=timestamp_end, freq=step)
    # Convert the nanosecond timestamps to unix seconds so the new index
    # matches the data's integer-second index.
    seconds_index = full_range.astype(numpy.int64) // 10**9
    return dataframe.reindex(index=seconds_index, fill_value=numpy.nan)
def test_first_last_valid(
    self, float_frame, data, idx, expected_first, expected_last
):
    """Endpoint lookups: NaN edges, empty/all-NaN frames, freq, GH21441 cases."""
    size = len(float_frame.index)
    values = np.random.randn(size)
    values[:5] = np.nan
    values[-5:] = np.nan
    frame = DataFrame({"foo": values}, index=float_frame.index)
    assert frame.first_valid_index() == frame.index[5]
    assert frame.last_valid_index() == frame.index[-6]
    # GH12800: an empty frame has no valid index on either end
    empty = DataFrame()
    assert empty.last_valid_index() is None
    assert empty.first_valid_index() is None
    # GH17400: a frame with no valid entries at all
    frame[:] = np.nan
    assert frame.last_valid_index() is None
    assert frame.first_valid_index() is None
    # GH20499: the index freq is preserved even with holes
    frame.index = date_range("20110101", periods=size, freq="B")
    frame.iloc[1] = 1
    frame.iloc[-2] = 1
    assert frame.first_valid_index() == frame.index[1]
    assert frame.last_valid_index() == frame.index[-2]
    assert frame.first_valid_index().freq == frame.index.freq
    assert frame.last_valid_index().freq == frame.index.freq
    # GH 21441: parametrized frames report the expected endpoints
    df = DataFrame(data, index=idx)
    assert df.first_valid_index() == expected_first
    assert df.last_valid_index() == expected_last
def test_first_last_valid(self, data, idx, expected_first, expected_last):
    """Endpoint lookups: NaN edges, empty/all-NaN frames, freq, GH21441 cases."""
    size = len(self.frame.index)
    values = randn(size)
    values[:5] = np.nan
    values[-5:] = np.nan
    frame = DataFrame({'foo': values}, index=self.frame.index)
    assert frame.first_valid_index() == frame.index[5]
    assert frame.last_valid_index() == frame.index[-6]
    # GH12800: an empty frame has no valid index on either end
    empty = DataFrame()
    assert empty.last_valid_index() is None
    assert empty.first_valid_index() is None
    # GH17400: a frame with no valid entries at all
    frame[:] = np.nan
    assert frame.last_valid_index() is None
    assert frame.first_valid_index() is None
    # GH20499: the index freq is preserved even with holes
    frame.index = date_range("20110101", periods=size, freq="B")
    frame.iloc[1] = 1
    frame.iloc[-2] = 1
    assert frame.first_valid_index() == frame.index[1]
    assert frame.last_valid_index() == frame.index[-2]
    assert frame.first_valid_index().freq == frame.index.freq
    assert frame.last_valid_index().freq == frame.index.freq
    # GH 21441: parametrized frames report the expected endpoints
    df = DataFrame(data, index=idx)
    assert df.first_valid_index() == expected_first
    assert df.last_valid_index() == expected_last
def test_first_last_valid_frame(self, data, idx, expected_first, expected_last): # GH#21441 df = DataFrame(data, index=idx) assert expected_first == df.first_valid_index() assert expected_last == df.last_valid_index()
# Notebook-export residue: demonstrates first_valid_index on a frame whose
# leading rows are entirely null, then rebinds df to a Series.
df = pd.DataFrame({
    "A": [None, None, 2, 4, 5],
    "B": [None, None, None, 44, 2],
    "C": [None, None, None, 1, 5],
})

df  # cell display; no effect when run as a script

# First index holding any non-null entry (here the row where A == 2).
df.first_valid_index()

# Rebind df: a Series mixing None with strings.
df = pd.Series([None, None, "sam", "alex", "sophia", None])

df  # cell display; no effect when run as a script