class DimensionTable(PostgresTable):
    """
    Copy a slowly changing dimension table (*not* a transaction fact table)
    to PostgreSQL.
    """

    table = Parameter(default='')
    fn = TaskParameter(default=Task)
    columns = ListParameter(default=[])
    id_cols = ListParameter(default=[])
    merge_cols = DictParameter(default={})

    column_separator = '\t'

    def requires(self):
        return self.clone(self.fn)

    def rows(self):
        from pandas import read_pickle, DataFrame, merge, concat

        connection = self.output().connect()
        cursor = connection.cursor()

        sql = f"""
               SELECT {', '.join(['id'] + list(self.columns))}
               FROM {self.table};
               """

        cursor.execute(sql)
        results = cursor.fetchall()

        current_df = DataFrame(results, columns=['id'] + list(self.columns))

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            return

        # get the ids of dimension rows that are already in the database but
        # whose attributes have changed in the incoming data
        merged = merge(current_df, df, on=self.columns, how='inner')
        current_df = concat([current_df, merged], axis=0)
        is_duplicate = current_df.duplicated(keep=False)

        # rows that appear only once did not survive the inner merge, so
        # their attributes no longer match the incoming data
        new = current_df[~is_duplicate]

        to_delete = new['id'].tolist()
        to_copy = df[df[list(self.id_cols)].isin(new[list(
            self.id_cols)].to_dict(orient='list')).all(axis=1)]

        to_copy = to_copy[list(self.columns)]

        if to_delete:
            # ids may be integers, so cast them to str before joining
            delete_sql = f"""
                          DELETE FROM {self.table}
                          WHERE id IN ({', '.join(str(id_) for id_ in to_delete)});
                          """

            cursor.execute(delete_sql)

        for index, line in to_copy.iterrows():  # returns (index, Series) tuple
            yield line.values.tolist()
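# The change detection above leans on a merge / concat / duplicated(keep=False)
# trick: database rows that do not reappear in the inner merge against the
# incoming frame are the ones whose attributes changed. A minimal standalone
# sketch of the same idea, using made-up data rather than the real schema:
from pandas import DataFrame, concat, merge

in_db = DataFrame({'id': [1, 2], 'name': ['alice', 'bob'], 'rank': [3, 5]})
incoming = DataFrame({'name': ['alice', 'bob'], 'rank': [3, 7]})  # bob changed

matched = merge(in_db, incoming, on=['name', 'rank'], how='inner')
stacked = concat([in_db, matched], axis=0)

# rows appearing only once were not matched, i.e. their attributes changed
changed = stacked[~stacked.duplicated(keep=False)]  # only bob's old row (id 2)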
class TransactionFactTable(PostgresTable):
    """
    Copy a pandas DataFrame into a PostgreSQL transaction fact table.

    :param table: The table to write to.
    :param columns: The columns, in the order they appear in the PostgreSQL
                    table.
    :param fn: The Task that provides the data to load.
    :param id_cols: The columns to be used to identify whether a row is already
                    in the table.
    :param merge_cols: The columns representing dimension tables, given as a
                       dictionary whose keys are the `left` columns to merge
                       on and whose values are (`table`, `right`) tuples.
    """

    table = Parameter(default='')
    fn = TaskParameter(default=Task)
    columns = ListParameter(default=[])
    id_cols = ListParameter(default=[])
    merge_cols = DictParameter(default={})

    column_separator = '\t'
    null_values = (None, nan)

    def requires(self):
        return self.clone(self.fn)

    def rows(self):
        from pandas import read_pickle, DataFrame

        connection = self.output().connect()
        cursor = connection.cursor()

        sql = """SELECT %s FROM %s;""" % (', '.join(self.id_cols),
                                          self.table)

        cursor.execute(sql)
        results = cursor.fetchall()

        current_df = DataFrame(results, columns=self.id_cols)

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if not df.empty:
            # luigi's ListParameter hands us a tuple, so convert id_cols to a
            # list before using it to index the frame
            # also, .isin() compares by index as well as by column if passed a
            # DataFrame; passing a dict built with orient='list' makes it
            # compare just the column values
            df = df[~(df[list(self.id_cols)]
                      .isin(current_df.to_dict(orient='list'))
                      .all(axis=1))]
            df = df[list(self.columns)]

        for index, line in df.iterrows():  # returns (index, Series) tuple
            yield line.values.tolist()
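# The incremental-load filter above (skip rows whose id_cols already exist in
# the target table) uses .isin() with a dict built via orient='list'. A
# minimal standalone sketch with made-up data and illustrative column names:
from pandas import DataFrame

in_table = DataFrame({'game_link': ['a', 'b'], 'half_move': [1, 1]})
incoming = DataFrame({'game_link': ['a', 'c'],
                      'half_move': [1, 2],
                      'move': ['e4', 'Nf3']})

id_cols = ['game_link', 'half_move']
already_loaded = (incoming[id_cols]
                  .isin(in_table.to_dict(orient='list'))
                  .all(axis=1))
new_rows = incoming[~already_loaded]  # only the ('c', 2, 'Nf3') row survives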
Example #3
class ClassifyWhiteDwarfMixin(BaseTask):

    """
    Mix-in class for classifying white dwarfs.
    """

    model_path = Parameter()

    wavelength_regions = ListParameter(
        default=[
            [3860, 3900], # Balmer line
            [3950, 4000], # Balmer line
            [4085, 4120], # Balmer line
            [4320, 4360], # Balmer line
            [4840, 4880], # Balmer line
            [6540, 6580], # Balmer line
            [3880, 3905], # He I/II line
            [3955, 3975], # He I/II line
            [3990, 4056], # He I/II line
            [4110, 4140], # He I/II line
            [4370, 4410], # He I/II line
            [4450, 4485], # He I/II line
            [4705, 4725], # He I/II line
            [4900, 4950], # He I/II line
            [5000, 5030], # He I/II line
            [5860, 5890], # He I/II line
            [6670, 6700], # He I/II line
            [7050, 7090], # He I/II line
            [7265, 7300], # He I/II line
            [4600, 4750], # Molecular C absorption band
            [5000, 5160], # Molecular C absorption band
            [3925, 3940], # Ca H/K line
            [3960, 3975], # Ca H/K line
        ]
    )

    polyfit_order = IntParameter(default=5)
    polyfit_regions = ListParameter(
        default=[
            [3850, 3870],
            [4220, 4245],
            [5250, 5400],
            [6100, 6470],
            [7100, 9000]
        ]
    )
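# ClassifyWhiteDwarfMixin only declares parameters. As a rough, numpy-based
# illustration (not taken from the source task), a list of [start, end]
# wavelength regions like `wavelength_regions` can be turned into a boolean
# mask over a wavelength grid:
import numpy as np

wavelengths = np.arange(3800.0, 7500.0, 0.5)   # toy wavelength grid, Angstrom
regions = [[3860, 3900], [4840, 4880], [6540, 6580]]

mask = np.zeros_like(wavelengths, dtype=bool)
for start, end in regions:
    mask |= (wavelengths >= start) & (wavelengths <= end)

line_pixels = wavelengths[mask]  # wavelengths falling inside any listed region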
Example #4
class ExplodeMaterials(Task):

    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-materials-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, concat, Series

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        df = df[['game_link', 'material_by_move']]

        df = df.explode('material_by_move')

        df = concat([df['game_link'],
                     df['material_by_move'].apply(Series)
                                           .fillna(0)
                                           .astype(int)],
                    axis=1)
        df.rename(columns={'r': 'rooks_black',
                           'n': 'knights_black',
                           'b': 'bishops_black',
                           'q': 'queens_black',
                           'p': 'pawns_black',
                           'P': 'pawns_white',
                           'R': 'rooks_white',
                           'N': 'knights_white',
                           'B': 'bishops_white',
                           'Q': 'queens_white',
                           },
                  inplace=True)

        df['half_move'] = df.groupby('game_link').cumcount() + 1

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
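# The core transform in ExplodeMaterials (explode a column holding one dict
# per half-move, then expand each dict into integer piece-count columns) in a
# standalone sketch with made-up data:
from pandas import DataFrame, Series, concat

games = DataFrame({'game_link': ['g1'],
                   'material_by_move': [[{'p': 8, 'P': 8}, {'p': 7, 'P': 8}]]})

# reset the duplicated index produced by explode to keep the concat simple
games = games.explode('material_by_move').reset_index(drop=True)
games = concat([games['game_link'],
                games['material_by_move'].apply(Series).fillna(0).astype(int)],
               axis=1)
# result: one row per half-move with columns game_link, p, P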
class GetData(Task):

    player = Parameter()
    columns = ListParameter(default=[])

    def run(self):
        pg_cfg = postgres_cfg()
        df = get_weekly_data(pg_cfg, self.player)

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)

    def output(self):
        import os

        file_location = f'~/Temp/luigi/week-data-{self.player}.pckl'
        return LocalTarget(os.path.expanduser(file_location), format=Nop)
Example #6
class ExplodeClocks(Task):

    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-clocks-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_timedelta

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        df = df[['game_link', 'clocks']]

        df = df.explode('clocks')
        df.rename(columns={'clocks': 'clock'},
                  inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1
        df['clock'] = to_timedelta(df['clock'],
                                   errors='coerce')
        df['clock'] = df['clock'].dt.total_seconds()
        df['clock'].fillna(-1.0, inplace=True)
        df['clock'] = df['clock'].astype(int)

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
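# The clock handling above (parse clock strings with to_timedelta, convert to
# whole seconds, and mark unparseable values as -1) in a standalone sketch;
# the clock strings here are made up and may differ from the real data:
from pandas import DataFrame, to_timedelta

clocks = DataFrame({'clock': ['0:03:00', '0:02:57', 'garbage']})

clocks['clock'] = to_timedelta(clocks['clock'], errors='coerce')
clocks['clock'] = clocks['clock'].dt.total_seconds()
clocks['clock'] = clocks['clock'].fillna(-1.0).astype(int)
# -> 180, 177, -1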
Example #7
class ExplodePositions(Task):

    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-positions-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        df = df[['game_link', 'positions']]

        df = df.explode('positions')
        df.rename(columns={'positions': 'position'},
                  inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1

        # split, get all but last element of resulting list, then re-join
        df['fen'] = df['position'].str.split().str[:-1].str.join(' ')

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
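# The FEN handling above keeps everything but the last space-separated field
# of each position string. A standalone sketch of the split / slice / re-join
# idiom (the position string here is illustrative):
from pandas import Series

positions = Series(['rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1'])
fens = positions.str.split().str[:-1].str.join(' ')
# -> 'rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0'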
Example #8
class ExplodeMoves(Task):

    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-moves-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # nothing to explode, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        df = df[['game_link', 'moves']]

        df = df.explode('moves')
        df.rename(columns={'moves': 'move'},
                  inplace=True)
        df['half_move'] = df.groupby('game_link').cumcount() + 1

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
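# ExplodeMoves numbers half-moves with groupby().cumcount(); a standalone
# sketch of that idiom with made-up data:
from pandas import DataFrame

moves = DataFrame({'game_link': ['g1', 'g1', 'g1', 'g2', 'g2'],
                   'move': ['e4', 'e5', 'Nf3', 'd4', 'd5']})

# cumcount() is zero-based within each game, so add 1 to get half-move numbers
moves['half_move'] = moves.groupby('game_link').cumcount() + 1
# -> 1, 2, 3 for g1 and 1, 2 for g2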
Example #9
class GetGameInfos(Task):

    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-infos-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_datetime, to_numeric
        from pandas import concat, Series, merge

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # no games to process, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        df['player'] = self.player

        if 'black_rating_diff' not in df.columns:
            df['black_rating_diff'] = 0

        if 'white_rating_diff' not in df.columns:
            df['white_rating_diff'] = 0

        # concatenate the two name columns and strip out the player's own
        # name; this yields the opponent without using pd.DataFrame.apply
        df['opponent'] = df['white'] + df['black']
        df['opponent'] = df['opponent'].str.replace(self.player, '')

        series_player_black = df['black'] == self.player
        df['player_color'] = series_player_black.map({True: 'black',
                                                      False: 'white',
                                                      })
        df['opponent_color'] = series_player_black.map({False: 'black',
                                                        True: 'white',
                                                        })

        df['player_elo'] = ((series_player_black
                             * df['black_elo'])
                            + (~series_player_black
                                * df['white_elo']))
        df['opponent_elo'] = ((series_player_black
                               * df['white_elo'])
                              + (~series_player_black
                                  * df['black_elo']))

        df['player_rating_diff'] = ((series_player_black
                                     * df['black_rating_diff'])
                                    + (~series_player_black
                                        * df['white_rating_diff']))

        df['opponent_rating_diff'] = ((series_player_black
                                       * df['white_rating_diff'])
                                      + (~series_player_black
                                          * df['black_rating_diff']))

        # another helper series
        series_result = df['result'] + series_player_black.astype(str)
        df['player_result'] = series_result.map({'0-1True': 'Win',
                                                 '1-0False': 'Win',
                                                 '1/2-1/2True': 'Draw',
                                                 '1/2-1/2False': 'Draw',
                                                 '1-0True': 'Loss',
                                                 '0-1False': 'Loss',
                                                 })

        df['opponent_result'] = series_result.map({'0-1True': 'Loss',
                                                   '1-0False': 'Loss',
                                                   '1/2-1/2True': 'Draw',
                                                   '1/2-1/2False': 'Draw',
                                                   '1-0True': 'Win',
                                                   '0-1False': 'Win',
                                                   })

        df.rename(columns={'speed': 'time_control_category'},
                  inplace=True)

        df['datetime_played'] = to_datetime(df['utc_date_played'].astype(str)
                                            + ' '
                                            + df['time_played'].astype(str))
        df['starting_time'] = df['time_control'].str.extract(r'(\d+)\+')
        df['increment'] = df['time_control'].str.extract(r'\+(\d+)')

        df['in_arena'] = df['event_type'].str.contains(r'Arena')
        df['in_arena'] = df['in_arena'].map({True: 'In arena',
                                             False: 'Not in arena'})

        df['rated_casual'] = df['event_type'].str.contains('Casual')
        df['rated_casual'] = df['rated_casual'].map({True: 'Casual',
                                                     False: 'Rated'})

        mapping_dict = {True: 'Queen exchange',
                        False: 'No queen exchange',
                        }
        df['queen_exchange'] = df['queen_exchange'].map(mapping_dict)

        # figure out castling sides
        castling_df = df[['game_link',
                          'player_color',
                          'opponent_color',
                          'castling_sides']]
        # this looks like a job for pandas 0.25.0's pd.DataFrame.explode(),
        # but explode() works on list-likes, not dicts, so it doesn't help here

        # convert dict to dataframe cells
        castling_df = concat([castling_df.drop('castling_sides', axis=1),
                              castling_df['castling_sides'].apply(Series)],
                             axis=1)
        castling_df.fillna('No castling', inplace=True)
        castle_helper_srs = castling_df['player_color'] == 'black'
        castling_df['player_castling_side'] = ((~castle_helper_srs)
                                               * castling_df['white']
                                               + castle_helper_srs
                                               * castling_df['black'])
        castling_df['opponent_castling_side'] = ((~castle_helper_srs)
                                                 * castling_df['black']
                                                 + castle_helper_srs
                                                 * castling_df['white'])

        castling_df = castling_df[['game_link',
                                   'player_castling_side',
                                   'opponent_castling_side',
                                   ]]

        df = merge(df,
                   castling_df,
                   on='game_link')

        # type handling
        df['date_played'] = to_datetime(df['date_played'])
        df['utc_date_played'] = to_datetime(df['utc_date_played'])

        rating_columns = ['player_elo',
                          'player_rating_diff',
                          'opponent_elo',
                          'opponent_rating_diff'
                          ]

        for column in rating_columns:
            # a rating of '?' marks an anonymous player, so fall back to 1500
            df[column] = df[column].replace('?', '1500')
            df[column] = to_numeric(df[column])

        # filter unnecessary columns out
        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
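# GetGameInfos repeatedly uses boolean-series arithmetic to pick the player's
# vs. the opponent's value without DataFrame.apply: multiplying by a boolean
# series zeroes out (or empties, for strings) the rows where it is False. A
# standalone sketch with made-up data:
from pandas import DataFrame

games = DataFrame({'black': ['me', 'them'],
                   'white': ['them', 'me'],
                   'black_elo': [1500, 1600],
                   'white_elo': [1550, 1650]})

player_is_black = games['black'] == 'me'
games['player_elo'] = (player_is_black * games['black_elo']
                       + ~player_is_black * games['white_elo'])
# -> 1500 (played black in game 1), 1650 (played white in game 2)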
Example #10
class GetEvals(Task):

    local_stockfish = BoolParameter()
    columns = ListParameter()

    def output(self):
        import os

        file_location = (f'~/Temp/luigi/{self.since}-game-evals-'
                         f'{self.player}.pckl')
        return LocalTarget(os.path.expanduser(file_location), format=Nop)

    def run(self):
        from pandas import read_pickle, to_numeric, concat, DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            df = read_pickle(f, compression=None)

        if df.empty:
            # no games to evaluate, but still write the empty frame so the
            # output target exists for downstream tasks
            with self.output().temporary_path() as temp_output_path:
                df.to_pickle(temp_output_path, compression=None)

            return

        stockfish_params = stockfish_cfg()

        df = df[['evaluations', 'eval_depths', 'positions']]

        positions_evaluated = query_for_column('position_evals', 'fen')

        # explode each of the list-like columns separately, then concat them
        # back together
        no_evals = df[~df['evaluations'].astype(bool)]
        df = df[df['evaluations'].astype(bool)]

        evals = df['evaluations'].explode().reset_index(drop=True)
        depths = df['eval_depths'].explode().reset_index(drop=True)
        positions = df['positions'].explode().reset_index(drop=True)
        positions = positions.str.split().str[:-1].str.join(' ')

        df = concat([positions, evals, depths], axis=1)

        if self.local_stockfish:
            no_evals = DataFrame(no_evals['positions'].explode())
            no_evals['positions'] = (no_evals['positions'].str.split()
                                                          .str[:-1]
                                                          .str.join(' '))

            local_evals = []

            # `evaluation` carries the previous value forward as a fallback
            # when get_sf_evaluation() returns nothing; initialise it so the
            # first iteration cannot hit an undefined name
            evaluation = None

            counter = 0
            position_count = len(no_evals['positions'])

            for position in no_evals['positions'].tolist():
                if position in positions_evaluated.values:
                    evaluation = None
                else:
                    evaluation = (get_sf_evaluation(position + ' 0',
                                                    stockfish_params.location,
                                                    stockfish_params.depth)
                                  or evaluation)
                local_evals.append(evaluation)

                # progress bar stuff
                counter += 1

                current_progress = counter / position_count
                self.set_status_message(f'Analyzed :: '
                                        f'{counter} / {position_count}')
                self.set_progress_percentage(round(current_progress * 100, 2))

            self.set_status_message(f'Analyzed all {position_count} positions')
            self.set_progress_percentage(100)

            no_evals['evaluations'] = local_evals
            no_evals['eval_depths'] = stockfish_params.depth
            no_evals.dropna(inplace=True)

            df = concat([df, no_evals], axis=0, ignore_index=True)

        df = df[~df['positions'].isin(positions_evaluated)]

        df.rename(columns={'evaluations': 'evaluation',
                           'eval_depths': 'eval_depth',
                           'positions': 'fen'},
                  inplace=True)
        df['evaluation'] = to_numeric(df['evaluation'],
                                      errors='coerce')

        df.dropna(inplace=True)

        df = df[list(self.columns)]

        with self.output().temporary_path() as temp_output_path:
            df.to_pickle(temp_output_path, compression=None)
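# GetEvals explodes several parallel list columns separately and then concats
# them back side by side; dropping the index with reset_index(drop=True) keeps
# the exploded rows aligned by position. A standalone sketch with made-up data:
from pandas import DataFrame, concat

games = DataFrame({'evaluations': [[0.3, -0.1]],
                   'eval_depths': [[20, 20]],
                   'positions': [['pos1 w - 0 1', 'pos2 b - 0 1']]})

evals = games['evaluations'].explode().reset_index(drop=True)
depths = games['eval_depths'].explode().reset_index(drop=True)
positions = games['positions'].explode().reset_index(drop=True)

evals_df = concat([positions, evals, depths], axis=1)
# -> two rows, columns positions / evaluations / eval_depths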