class DimensionTable(PostgresTable):
    """
    Copy a slowly-changing dimension table (not a transaction table) to
    PostGreSQL.

    Rows whose dimension attributes changed since the last load are
    deleted from the target table and re-yielded for insertion.

    :param table: The table to write to.
    :param columns: The columns, in order they show up in the PostGreSQL
                    table (excluding the ``id`` column).
    :param fn: The Task that provides the data to load.
    :param id_cols: The columns used to identify a logical dimension row.
    :param merge_cols: The columns representing dimension tables, keyed by
                       the `left` column with a (`table`, `right`) value.
    """
    table = Parameter(default='')
    fn = TaskParameter(default=Task)
    columns = ListParameter(default=[])
    id_cols = ListParameter(default=[])
    merge_cols = DictParameter(default={})
    column_separator = '\t'

    def requires(self):
        return self.clone(self.fn)

    def rows(self):
        from pandas import read_pickle, DataFrame, merge, concat

        connection = self.output().connect()
        cursor = connection.cursor()
        sql = f"""
              SELECT {', '.join(['id'] + list(self.columns))}
              FROM {self.table};
              """
        cursor.execute(sql)
        results = cursor.fetchall()
        current_df = DataFrame(results, columns=['id'] + list(self.columns))

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if not df.empty:
            # get list of dim values that are already in the database, but
            # have changed their attributes
            # NOTE: self.columns is a tuple (ListParameter), convert to
            # list so pandas treats it as multiple keys, not one label
            merged = merge(current_df, df, on=list(self.columns),
                           how='inner')
            current_df = concat([current_df, merged], axis=0)
            is_duplicate = current_df.duplicated(keep=False)
            new = current_df[~is_duplicate]
            to_delete = new['id'].tolist()
            # keep only incoming rows whose id_cols match a changed row;
            # .isin with orient='list' compares columns only, not the index
            to_copy = df[df[list(self.id_cols)].isin(new[list(
                self.id_cols)].to_dict(orient='list')).all(axis=1)]
            to_copy = to_copy[list(self.columns)]

            if to_delete:
                # ids may be ints — stringify before joining; also skip the
                # statement entirely when empty ("IN ()" is invalid SQL)
                delete_sql = f"""
                    DELETE FROM {self.table}
                    WHERE id IN ({', '.join(str(id_) for id_ in to_delete)});
                    """
                cursor.execute(delete_sql)

            for index, line in to_copy.iterrows():  # (index, Series) tuple
                yield line.values.tolist()
class TransactionFactTable(PostgresTable):
    """
    Copy a pandas DataFrame in a transaction fact table fashion to
    PostGreSQL. Only rows not already present (per ``id_cols``) are loaded.

    :param table: The table to write to.
    :param columns: The columns, in order they show up in the PostGreSQL
                    table.
    :param fn: The Task that provides the data to load.
    :param id_cols: The columns to be used to identify whether a row is
                    already in the table.
    :param merge_cols: The columns representing dimension tables. In a
                       dictionary format, with the key being the `left` to
                       merge with, and the value being a tuple of
                       (`table`, `right`).
    """
    table = Parameter(default='')
    fn = TaskParameter(default=Task)
    columns = ListParameter(default=[])
    id_cols = ListParameter(default=[])
    merge_cols = DictParameter(default={})
    column_separator = '\t'
    null_values = (None, nan)

    def requires(self):
        return self.clone(self.fn)

    def rows(self):
        from pandas import read_pickle, DataFrame

        conn = self.output().connect()
        cur = conn.cursor()
        sql = """SELECT %s FROM %s;""" % (', '.join(self.id_cols),
                                          self.table)
        cur.execute(sql)
        existing = DataFrame(cur.fetchall(), columns=self.id_cols)

        with self.input().open('r') as f:
            incoming = read_pickle(f, compression=None)

        if incoming.empty:
            return

        # self.id_cols arrives as a tuple, so materialize a list first.
        # .isin() would also compare the index if handed a DataFrame;
        # passing a dict (orient='list') restricts it to the columns.
        key_cols = list(self.id_cols)
        already_loaded = (incoming[key_cols]
                          .isin(existing.to_dict(orient='list'))
                          .all(axis=1))
        fresh = incoming.loc[~already_loaded, list(self.columns)]

        # iterrows yields (index, Series) pairs; emit one list per row
        for _, record in fresh.iterrows():
            yield record.values.tolist()
class ClassifyWhiteDwarfMixin(BaseTask):
    """
    Mix-in class for classifying white dwarfs.
    """
    # Path to the classification model file; how it is loaded is defined
    # by the consuming task (not visible here).
    model_path = Parameter()

    # Spectral windows containing the absorption features used for
    # classification. Values are [start, stop] wavelength pairs —
    # presumably in Angstroms, matching the optical lines named below
    # (TODO confirm units against the consuming task).
    wavelength_regions = ListParameter(
        default=[
            [3860, 3900],  # Balmer line
            [3950, 4000],  # Balmer line
            [4085, 4120],  # Balmer line
            [4320, 4360],  # Balmer line
            [4840, 4880],  # Balmer line
            [6540, 6580],  # Balmer line
            [3880, 3905],  # He I/II line
            [3955, 3975],  # He I/II line
            [3990, 4056],  # He I/II line
            [4110, 4140],  # He I/II line
            [4370, 4410],  # He I/II line
            [4450, 4485],  # He I/II line
            [4705, 4725],  # He I/II line
            [4900, 4950],  # He I/II line
            [5000, 5030],  # He I/II line
            [5860, 5890],  # He I/II line
            [6670, 6700],  # He I/II line
            [7050, 7090],  # He I/II line
            [7265, 7300],  # He I/II line
            [4600, 4750],  # Molecular C absorption band
            [5000, 5160],  # Molecular C absorption band
            [3925, 3940],  # Ca H/K line
            [3960, 3975],  # Ca H/K line
        ]
    )

    # Degree of the polynomial used for continuum fitting.
    polyfit_order = IntParameter(default=5)

    # Feature-free wavelength windows used to fit the continuum
    # polynomial (same [start, stop] convention as above).
    polyfit_regions = ListParameter(
        default=[
            [3850, 3870],
            [4220, 4245],
            [5250, 5400],
            [6100, 6470],
            [7100, 9000]
        ]
    )
class ExplodeMaterials(Task):
    """
    Explode the per-game ``material_by_move`` list into one row per
    half-move, with one integer column per piece type and color.

    :param columns: The columns, in order, to keep in the output pickle.
    """
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-materials-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, concat, Series

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        df = df[['game_link', 'material_by_move']]
        df = df.explode('material_by_move')
        # each exploded cell is a dict of piece -> count; expand to columns
        df = concat([df['game_link'],
                     df['material_by_move'].apply(Series)
                                           .fillna(0)
                                           .astype(int)],
                    axis=1)
        # lowercase = black pieces, uppercase = white pieces (FEN style)
        df.rename(columns={'r': 'rooks_black',
                           'n': 'knights_black',
                           'b': 'bishops_black',
                           'q': 'queens_black',
                           'p': 'pawns_black',
                           'P': 'pawns_white',
                           'R': 'rooks_white',
                           'N': 'knights_white',
                           'B': 'bishops_white',
                           'Q': 'queens_white',
                           },
                  inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class GetData(Task):
    """
    Pull the weekly game data for ``player`` from Postgres and pickle it.

    :param player: Player whose games are fetched.
    :param columns: Unused here; declared for interface parity with the
                    downstream tasks.
    """
    player = Parameter()
    columns = ListParameter(default=[])

    def output(self):
        import os
        file_location = f'~/Temp/luigi/week-data-{self.player}.pckl'
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        cfg = postgres_cfg()
        weekly_df = get_weekly_data(cfg, self.player)

        with self.output().temporary_path() as path:
            weekly_df.to_pickle(path, compression=None)
class ExplodeClocks(Task):
    """
    Explode the per-game ``clocks`` list into one row per half-move with
    the remaining clock time in whole seconds (-1 when unparseable).

    :param columns: The columns, in order, to keep in the output pickle.
    """
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-clocks-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_timedelta

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        df = df[['game_link', 'clocks']]
        df = df.explode('clocks')
        df.rename(columns={'clocks': 'clock'}, inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1
        df['clock'] = to_timedelta(df['clock'], errors='coerce')
        df['clock'] = df['clock'].dt.total_seconds()
        # reassign instead of chained inplace fillna — chained assignment
        # is deprecated and a silent no-op under pandas copy-on-write
        df['clock'] = df['clock'].fillna(-1.0)
        df['clock'] = df['clock'].astype(int)
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class ExplodePositions(Task):
    """
    Explode the per-game ``positions`` list into one row per half-move
    and derive the FEN string (position text minus its last token).

    :param columns: The columns, in order, to keep in the output pickle.
    """
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-positions-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        df = df[['game_link', 'positions']]
        df = df.explode('positions')
        df.rename(columns={'positions': 'position'}, inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1
        # split, get all but last element of resulting list, then re-join
        df['fen'] = df['position'].str.split().str[:-1].str.join(' ')
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class ExplodeMoves(Task):
    """
    Explode the per-game ``moves`` list into one row per half-move.

    :param columns: The columns, in order, to keep in the output pickle.
    """
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-moves-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        df = df[['game_link', 'moves']]
        df = df.explode('moves')
        df.rename(columns={'moves': 'move'}, inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class GetGameInfos(Task):
    """
    Derive player-centric game metadata (colors, ratings, results,
    castling sides, time control, timestamps) from the raw game frame.

    :param columns: The columns, in order, to keep in the output pickle.
    """
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-infos-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_datetime, to_numeric
        from pandas import concat, Series, merge

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to process — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        df['player'] = self.player

        # rating-diff columns are absent for some game sources
        if 'black_rating_diff' not in df.columns:
            df['black_rating_diff'] = 0
        if 'white_rating_diff' not in df.columns:
            df['white_rating_diff'] = 0

        # add two strings and remove the player name so that we don't
        # have to use pd.DataFrame.apply
        # regex=False: the player name is a literal, not a pattern —
        # names containing regex metacharacters must not be interpreted
        df['opponent'] = df['white'] + df['black']
        df['opponent'] = df['opponent'].str.replace(self.player, '',
                                                    regex=False)

        series_player_black = df['black'] == self.player

        df['player_color'] = series_player_black.map({True: 'black',
                                                      False: 'white',
                                                      })
        df['opponent_color'] = series_player_black.map({False: 'black',
                                                        True: 'white',
                                                        })

        # boolean * value picks the column matching the player's color
        df['player_elo'] = ((series_player_black * df['black_elo'])
                            + (~series_player_black * df['white_elo']))
        df['opponent_elo'] = ((series_player_black * df['white_elo'])
                              + (~series_player_black * df['black_elo']))
        df['player_rating_diff'] = ((series_player_black
                                     * df['black_rating_diff'])
                                    + (~series_player_black
                                       * df['white_rating_diff']))
        df['opponent_rating_diff'] = ((series_player_black
                                       * df['white_rating_diff'])
                                      + (~series_player_black
                                         * df['black_rating_diff']))

        # another helper series: e.g. '0-1' + 'True' == black won as player
        series_result = df['result'] + series_player_black.astype(str)
        df['player_result'] = series_result.map({'0-1True': 'Win',
                                                 '1-0False': 'Win',
                                                 '1/2-1/2True': 'Draw',
                                                 '1/2-1/2False': 'Draw',
                                                 '1-0True': 'Loss',
                                                 '0-1False': 'Loss',
                                                 })
        df['opponent_result'] = series_result.map({'0-1True': 'Loss',
                                                   '1-0False': 'Loss',
                                                   '1/2-1/2True': 'Draw',
                                                   '1/2-1/2False': 'Draw',
                                                   '1-0True': 'Win',
                                                   '0-1False': 'Win',
                                                   })

        df.rename(columns={'speed': 'time_control_category'}, inplace=True)
        df['datetime_played'] = to_datetime(df['utc_date_played'].astype(str)
                                            + ' '
                                            + df['time_played'].astype(str))
        # time_control is 'minutes+increment', e.g. '300+5'
        df['starting_time'] = df['time_control'].str.extract(r'(\d+)\+')
        df['increment'] = df['time_control'].str.extract(r'\+(\d+)')

        df['in_arena'] = df['event_type'].str.contains(r'Arena')
        df['in_arena'] = df['in_arena'].map({True: 'In arena',
                                             False: 'Not in arena'})
        df['rated_casual'] = df['event_type'].str.contains('Casual')
        df['rated_casual'] = df['rated_casual'].map({True: 'Casual',
                                                     False: 'Rated'})

        mapping_dict = {True: 'Queen exchange',
                        False: 'No queen exchange',
                        }
        df['queen_exchange'] = df['queen_exchange'].map(mapping_dict)

        # figure out castling sides
        castling_df = df[['game_link',
                          'player_color',
                          'opponent_color',
                          'castling_sides']]
        # i thought the following would be easier with pandas 0.25.0's
        # pd.DataFrame.explode() but because we use dicts, it isn't
        # convert dict to dataframe cells
        castling_df = concat([castling_df.drop('castling_sides', axis=1),
                              castling_df['castling_sides'].apply(Series)],
                             axis=1)
        castling_df.fillna('No castling', inplace=True)
        castle_helper_srs = castling_df['player_color'] == 'black'
        castling_df['player_castling_side'] = ((~castle_helper_srs)
                                               * castling_df['white']
                                               + castle_helper_srs
                                               * castling_df['black'])
        castling_df['opponent_castling_side'] = ((~castle_helper_srs)
                                                 * castling_df['black']
                                                 + castle_helper_srs
                                                 * castling_df['white'])
        castling_df = castling_df[['game_link',
                                   'player_castling_side',
                                   'opponent_castling_side',
                                   ]]
        df = merge(df, castling_df, on='game_link')

        # type handling
        df['date_played'] = to_datetime(df['date_played'])
        df['utc_date_played'] = to_datetime(df['utc_date_played'])

        rating_columns = ['player_elo',
                          'player_rating_diff',
                          'opponent_elo',
                          'opponent_rating_diff',
                          ]
        for column in rating_columns:
            # '?' ratings are anonymous players
            df[column] = df[column].replace('?', '1500')
            df[column] = to_numeric(df[column])

        # filter unnecessary columns out
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
class GetEvals(Task):
    """
    Collect engine evaluations per position: reuse evaluations embedded
    in the games, optionally fill the gaps with a local Stockfish, and
    drop positions already stored in ``position_evals``.

    :param local_stockfish: When true, evaluate missing positions with a
                            locally installed Stockfish binary.
    :param columns: The columns, in order, to keep in the output pickle.
    """
    local_stockfish = BoolParameter()
    columns = ListParameter()

    def output(self):
        import os
        # NOTE(review): self.since / self.player are not declared on this
        # class — presumably inherited from a shared base/mixin; confirm.
        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to evaluate — still write the empty frame so the
            # target exists and downstream tasks can run
            # (removed a dead nested `def complete` that was never called)
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)
            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        # positions already stored in the DB — evaluated previously
        positions_evaluated = query_for_column('position_evals', 'fen')

        # explode the two different list-likes separately, then concat
        no_evals = df[~df['evaluations'].astype(bool)]
        df = df[df['evaluations'].astype(bool)]

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        # position text minus its last token gives the FEN
        positions = positions.str.split().str[:-1].str.join(' ')
        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:
            no_evals = DataFrame(no_evals['positions'].explode())
            no_evals['positions'] = (no_evals['positions'].str.split()
                                                          .str[:-1]
                                                          .str.join(' '))

            local_evals = []
            # initialize before the loop: the `or evaluation` fallback
            # below would otherwise hit an unbound local on the first
            # iteration when the engine returns a falsy result
            evaluation = None

            counter = 0
            position_count = len(no_evals['positions'])

            for position in no_evals['positions'].tolist():
                if position in positions_evaluated.values:
                    # already in the DB — leave as NaN, dropped below
                    evaluation = None
                else:
                    # fall back to the previous evaluation when the
                    # engine yields nothing
                    evaluation = (get_sf_evaluation(position + ' 0',
                                                    stockfish_params.location,
                                                    stockfish_params.depth)
                                  or evaluation)
                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1
                current_progress = counter / position_count
                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100,
                                                   2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals.dropna(inplace=True)

            df = concat([df, no_evals], axis=0, ignore_index=True)

        # drop positions whose evaluations are already stored
        df = df[~df['positions'].isin(positions_evaluated)]

        df.rename(columns={'evaluations': 'evaluation',
                           'eval_depths': 'eval_depth',
                           'positions': 'fen'},
                  inplace=True)
        df['evaluation'] = to_numeric(df['evaluation'], errors='coerce')
        df.dropna(inplace=True)
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)