def _init_reference(self) -> None:
    """Select the first usable person as the reference and cache its data.

    Walks candidates in ``self._maxtimes`` order. Any person whose cleaned
    dataframe ends up with fewer than two rows is dropped from both
    ``self._maxtimes`` and ``self._mintimes`` and the next candidate is
    tried. On success, stores the interpolated dataframe in
    ``self._reference_df`` and applies ``self._set_reference_values``.

    Raises:
        ValueError: when no candidates remain to work on.
    """
    while True:
        if len(self._maxtimes) < 1:
            raise ValueError("Not enough data to work on")
        self._reference_id = self._maxtimes.index[0]
        # Build the candidate dataframe and collapse duplicate dates.
        self._reference_df = self._remove_duplicate_dates(
            self._create_person_dataframe(self._reference_id))
        if len(self._reference_df.index) < 2:
            # Too small after de-duplication: discard and try the next one.
            self._maxtimes = self._maxtimes.drop(self._reference_id)
            self._mintimes = self._mintimes.drop(self._reference_id)
            continue
        self._reference_df = utils.remove_not_progressing_solves(
            self._reference_df)
        if len(self._reference_df.index) < 2:
            # Too small after filtering non-progressing solves: discard too.
            self._maxtimes = self._maxtimes.drop(self._reference_id)
            self._mintimes = self._mintimes.drop(self._reference_id)
            continue
        self._reference_df = utils.interpolate_dates(self._reference_df)
        self._set_reference_values(self._reference_df)
        return
def _get_date_for_new_time(self, dataframe: DataFrame, column_id: str,
                           time: float) -> Tuple[datetime, float]:
    """Extrapolate the date at which *column_id* would reach *time*.

    The slope is estimated from the person's own cleaned history (more
    widely spaced data, hence a more precise value), then anchored on the
    date *dataframe* reached the person's last value.

    Returns:
        A ``(date, value)`` pair: the date is rounded up to a whole day so
        it encloses *time*, and the value is recomputed to match that
        rounded date exactly.
    """
    # Cleaned per-person data gives a more precise progression slope.
    person_df = utils.remove_not_progressing_solves(
        self._remove_duplicate_dates(
            self._create_person_dataframe(column_id)))
    penultimate_date = person_df.index[-2]
    penultimate_value = person_df.iloc[-2, 0]
    final_date = person_df.index[-1]
    final_value = person_df.iloc[-1, 0]
    span_days = (final_date - penultimate_date).days
    # Linear extrapolation: days past the penultimate point to reach `time`,
    # then re-expressed relative to the final point.
    days_offset = ((penultimate_value - time) * span_days) / (
        penultimate_value - final_value) - span_days
    # Upper round so the produced date fully encloses the target time.
    days_offset = math.ceil(days_offset)
    anchor_date = self._find_date_for_value(dataframe, column_id,
                                            final_value)
    new_date = anchor_date + timedelta(days=days_offset)
    # Recompute the time so it matches the ceiled date.
    new_time = final_value - (
        (penultimate_value - final_value) * days_offset) / span_days
    return new_date, new_time
def test_remove_not_progressing_solves_not_default_column() -> None:
    """A non-default column index is honoured when filtering regressions."""
    source = pd.DataFrame(
        {
            'event': ['333', '333', '333'],
            'best': [50, 60, 30]
        },
        index=[0, 1, 2])
    expected = pd.DataFrame(
        {
            'event': ['333', '333'],
            'best': [50, 30]
        }, index=[0, 2])
    result = utils.remove_not_progressing_solves(source, column_number=1)
    assert expected.equals(result)
def _launch_main_process(self,
                         log_progression: bool = False,
                         log_debug: bool = False) -> None:
    """Align, interpolate and concatenate every person after the first.

    For each person in ``self._maxtimes`` (skipping the first row, the
    reference), builds and cleans the person's dataframe, shifts its dates
    so its first row lands on the closest matching reference date,
    interpolates it and queues it for concatenation. The concatenated
    result is stored in ``self._processed_results`` and becomes the sole
    entry of ``self._df_to_concat``.

    Args:
        log_progression: print periodic progress/ETA lines to stdout.
        log_debug: forwarded to ``self._find_closest_date``.
    """
    if log_progression:
        # prepare process progression indication
        total_loops = len(self._maxtimes[1:len(self._maxtimes)])
        print_every_percent = 0.05
        # Fix: the constant was previously duplicated as a literal below,
        # so tweaking `print_every_percent` had no effect.
        loops_percent = round(total_loops * print_every_percent, 0)
        if loops_percent == 0:
            loops_percent = 1
        start_time = time.time()
    for i, row in enumerate(
            self._maxtimes[1:len(self._maxtimes)].itertuples()):
        if log_progression:
            current_time = time.time()
            total_running_time = current_time - start_time
            # linear ETA from the average loop time so far
            estimated_running_time = (total_loops *
                                      total_running_time) / (i + 1)
            # don't print every iteration
            if i == 0 or i == total_loops - 1 or (i + 1) % loops_percent == 0:
                print(
                    f'{(i + 1)}/{total_loops} loops, total elapsed/remaining/estimated: {round(total_running_time, 0)}/{round(estimated_running_time - total_running_time, 0)}/{round(estimated_running_time, 0)} seconds'
                )
        person_df = self._create_person_dataframe(row.Index)
        person_df = self._remove_duplicate_dates(person_df)
        # ignore too small dataframes
        if len(person_df.index) < 2:
            continue
        person_df = utils.remove_not_progressing_solves(person_df)
        # ignore too small dataframes
        if len(person_df.index) < 2:
            continue
        # search matching date
        matching_date = self._find_closest_date(row[1], log_debug)
        # align dates
        delta = matching_date - person_df.index[0]
        person_df = self._shift_date(person_df, delta)
        # interpolate
        person_df = utils.interpolate_dates(person_df)
        # add current df to final df
        self._df_to_concat.append(person_df)
    if log_progression:
        print('Final concatenation...')
    self._processed_results = pd.concat(self._df_to_concat,
                                        axis=1,
                                        sort=False)
    self._df_to_concat = [self._processed_results]
    if log_progression:
        print('Done')
def test_remove_not_progressing_solves_nothing_to_remove() -> None:
    """A strictly decreasing series is returned unchanged."""
    source = pd.DataFrame({'best': [50, 40, 30]}, index=[0, 1, 2])
    result = utils.remove_not_progressing_solves(source)
    assert source.equals(result)
def test_remove_not_progressing_solves_superior_in_the_middle() -> None:
    """A single regressing solve in the middle of the series is dropped."""
    source = pd.DataFrame({'best': [50, 40, 45, 35]}, index=[0, 1, 2, 3])
    expected = pd.DataFrame({'best': [50, 40, 35]}, index=[0, 1, 3])
    result = utils.remove_not_progressing_solves(source)
    assert expected.equals(result)
def test_remove_not_progressing_solves_mixed() -> None:
    """Regressions and repeats interleaved: only strict improvements stay."""
    source = pd.DataFrame({'best': [50, 60, 60, 50, 45, 45, 70, 45]},
                          index=[0, 1, 2, 3, 4, 5, 6, 7])
    expected = pd.DataFrame({'best': [50, 45]}, index=[0, 4])
    result = utils.remove_not_progressing_solves(source)
    assert expected.equals(result)
def test_remove_not_progressing_solves_equals_starting() -> None:
    """An equal value right after the first solve is dropped."""
    source = pd.DataFrame({'best': [50, 50, 40, 35]}, index=[0, 1, 2, 3])
    expected = pd.DataFrame({'best': [50, 40, 35]}, index=[0, 2, 3])
    result = utils.remove_not_progressing_solves(source)
    assert expected.equals(result)