Example #1

import numpy
import pandas

from scipy.stats import gaussian_kde, norm
from statsmodels.stats.weightstats import DescrStatsW

# TimeSeriesOps, UtilFunc and gaussian_weighted_kde are assumed to be provided
# by the surrounding project's utility modules (import paths are project-specific)

class ResultsSummary(object):
    """Calculates summarised statistics for a group of trades. This includes calculating average slippage for each
    group of trades and also creating histograms of these observations (and fitting it alongside a PDF/probability
    distribution function).

    """
    def __init__(self):
        self.time_series_ops = TimeSeriesOps()
        self.util_func = UtilFunc()

    def _create_histogram_distribution(self,
                                       df,
                                       min_x=None,
                                       max_x=None,
                                       extend_x_proportion_percentage=20,
                                       postfix_label=None,
                                       obs_weights=None,
                                       denormalised=True):

        # get min/max values for our histogram
        min_hist_x = df.min()
        max_hist_x = df.max()

        extend_x_proportion_percentage = 1.0 + (
            float(extend_x_proportion_percentage) / 100.0)

        # extend axes for PDF, so just outside histogram
        if min_x is not None:
            min_x = min(min_x, min_hist_x) * extend_x_proportion_percentage
        else:
            min_x = min_hist_x

        if max_x is not None:
            max_x = max(max_x, max_hist_x) * extend_x_proportion_percentage
        else:
            max_x = max_hist_x

        # Raw counts if denormalised, otherwise a normalised density
        density = not denormalised

        vals = df.T.values.astype(numpy.float64)

        # Create a histogram with 10 buckets
        hist, bins = numpy.histogram(
            vals,
            bins=10,
            range=[float(min_hist_x), float(max_hist_x)],
            density=density,
            weights=obs_weights)
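        # Mid-points of each histogram bucket (used as the index when plotting)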
        bin_cent = (bins[1:] + bins[:-1]) * 0.5

        number_of_elements = len(df.values)

        # Evenly-spaced support over which to evaluate the fitted PDFs
        dist_space = numpy.linspace(min_x, max_x, 100)

        if postfix_label is None:
            postfix_label = ''
        else:
            postfix_label = ": " + postfix_label

        if number_of_elements > 1:

            # Create a best fit PDF using Gaussian KDE model (forcibly cast to float64)
            if obs_weights is None:
                kde = gaussian_kde(vals)
            else:
                kde = gaussian_weighted_kde(vals,
                                            weights=obs_weights.values.astype(
                                                numpy.float64))

            # Sometimes need to transpose so the dimensions are consistent
            try:
                pdf_fit = kde(dist_space)
            except Exception:
                pdf_fit = kde(dist_space.T)

            if obs_weights is None:
                # Calculate the mean/std used to fit a normal PDF
                weighted_stats = DescrStatsW(df.values, ddof=0)
            else:
                weighted_stats = DescrStatsW(df.values,
                                             weights=obs_weights.T.values,
                                             ddof=0)

            mu = weighted_stats.mean
            std = weighted_stats.std

            normal_pdf_fit = norm.pdf(dist_space, mu, std)

            # Scale pdf_fit (and normal PDF) by total/bin size
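            # (a KDE or normal PDF integrates to 1, so multiplying by
            # bin_width * N puts it in the same units as a count histogram,
            # eg. with N = 100 observations and bin_width = 0.5, a density of
            # 0.4 corresponds to an expected count of 0.4 * 0.5 * 100 = 20)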
            if denormalised:
                bin_width = abs(bins[1] - bins[0])
                N = numpy.sum(hist)
                pdf_fit = pdf_fit * (bin_width * N)
                normal_pdf_fit = normal_pdf_fit * (bin_width * N)

            df_hist = pandas.DataFrame(index=bin_cent,
                                       data=hist,
                                       columns=['Histogram' + postfix_label])
            df_pdf = pandas.DataFrame(index=dist_space,
                                      data=pdf_fit,
                                      columns=['KDE-PDF' + postfix_label])
            df_pdf['Norm-PDF' + postfix_label] = normal_pdf_fit
        else:
            return pandas.DataFrame(), pandas.DataFrame()

        return df_hist, df_pdf

    def field_distribution(self,
                           metric_df,
                           market_df=None,
                           bid_benchmark='bid',
                           mid_benchmark='mid',
                           ask_benchmark='ask',
                           bid_avg=None,
                           ask_avg=None,
                           aggregate_by_field=None,
                           pdf_only=False,
                           postfix_label=None,
                           metric_name='slippage',
                           weighting_field=None,
                           scalar=10000.0):
        """Fits a PDF across the slippage across a group of trades and also calculates a histogram, through bucketing
        the slippage of a group of trades.

        Parameters
        ----------
        metric_df : DataFrame
            Contains trade data with the calculated slippage metrics

        market_df : DataFrame (optional), default None
            Contains market data, which is required if we want to calculate average bid-to-mid and ask-to-mid spreads

        bid_benchmark : str (optional), default 'bid'
            Field to use for bid data

        mid_benchmark : str (optional), default 'mid'
            Field to use for mid data

        ask_benchmark : str (optional), default 'ask'
            Field to use for ask data

        bid_avg : float (optional), default None
            Average spread for bid to mid (in bp)

        ask_avg : float (optional), default None
            Average spread for ask to mid (in bp)

        aggregate_by_field : str (optional), default None
            Aggregate slippage by particular fields, such as the 'venue'

        pdf_only : bool (optional), default False
            Should we only display the fitted PDF (and not the histogram)

        postfix_label : str (optional), default None
            Label to be added to the end of each column of the DataFrame

        metric_name : str, default 'slippage'
            The field to use for defining slippage

        weighting_field : str (optional), default None
            Should observations be weighted by a particular field (eg. notional)

        scalar : float, default 10000.0
            Should we multiply all numbers by this scalar (typically to convert into basis points)

        Returns
        -------
        DataFrame
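
        Examples
        --------
        A hypothetical sketch (assumes metric_df holds trades with a
        'slippage' column and market_df has bid/mid/ask columns):

        >>> rs = ResultsSummary()
        >>> dist_df = rs.field_distribution(
        ...     metric_df, market_df=market_df,
        ...     aggregate_by_field='venue', metric_name='slippage')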
        """

        # calculate the average bid/ask values from market data if they haven't already been specified
        # convert to basis points
        if bid_avg is None and ask_avg is None and market_df is not None:
            # will fail for time series which only contain mid values
            try:
                bid_avg = (
                    (market_df[bid_benchmark] / market_df[mid_benchmark]) -
                    1.0).mean() * float(scalar)
                ask_avg = (
                    (market_df[ask_benchmark] / market_df[mid_benchmark]) -
                    1.0).mean() * float(scalar)
            except Exception:
                pass

        # If the metric field doesn't exist, we cannot calculate anything!
        if metric_name not in metric_df.columns:
            raise Exception(metric_name +
                            " field not found, cannot calculate distribution!")

        obs_weights = None

        if weighting_field is not None and weighting_field in metric_df.columns:
            obs_weights = metric_df[weighting_field]

            # Check that obs_weights don't sum to zero or NaN... in which case just use equal weighting
            obs_weights_total = obs_weights.abs().sum()

            if numpy.isnan(obs_weights_total) or obs_weights_total == 0:
                obs_weights = None

        # if postfix_label is None: postfix_label = ''

        # If we don't want to aggregate by any specific field
        if aggregate_by_field is None:

            # Convert slippage into basis points
            metric_sub_df = metric_df[metric_name] * float(scalar)

            df_hist, df_pdf = self._create_histogram_distribution(
                metric_sub_df,
                min_x=bid_avg,
                max_x=ask_avg,
                postfix_label=postfix_label,
                obs_weights=obs_weights)

            if pdf_only:
                df = df_pdf
            else:
                df = df_pdf.join(df_hist, how='outer')
        else:
            # Do we want it to aggregate results by a specific field? (eg. get distribution by the venue?)
            df_list = []

            for field_val, df_g in metric_df.groupby(aggregate_by_field):
                metric_sub_df = df_g[metric_name] * float(scalar)

                obs_weights = None

                if weighting_field is not None:
                    obs_weights = df_g[weighting_field]

                if postfix_label is None:
                    lab = str(field_val)
                else:
                    lab = postfix_label + ' ' + str(field_val)

                df_hist, df_pdf = self._create_histogram_distribution(
                    metric_sub_df,
                    min_x=bid_avg,
                    max_x=ask_avg,
                    postfix_label=lab,
                    obs_weights=obs_weights)

                if not df_hist.empty and not df_pdf.empty:
                    if not pdf_only:
                        df_list.append(df_hist)

                    df_list.append(df_pdf)

            if not df_list:
                df = pandas.DataFrame()
            else:
                df = self.time_series_ops.outer_join(df_list)

        # TODO add bid/ask columns and mid

        if market_df is not None:
            if bid_avg is not None:
                df['Bid'] = bid_avg
            if ask_avg is not None:
                df['Ask'] = ask_avg

        return df

    def field_bucketing(self,
                        trade_df,
                        metric_name='slippage',
                        aggregation_metric='mean',
                        aggregate_by_field='venue',
                        by_date=None,
                        weighting_field=None):
        """Calculates the "average" for a particular field and aggregates it by venue/asset etc. The average can be specified
        as the mean, or other metrics such as totals, number of trades etc.

        Parameters
        ----------
        trade_df : DataFrame
            Contains trade data from the client, with fields such as trade time, notional, side, price, etc.

        metric_name : str, default 'slippage'
            Which field to run statistics on, such as the absolute slippage or signed slippage

        aggregation_metric : str {'mean', 'sum', 'count'}, default 'mean'
            How should the data be aggregated

        aggregate_by_field : str, default 'venue'
            How should we aggregate our calculations (eg. by 'venue')

        by_date : str (optional), default None
            Should we aggregate our results by date, to create a timeline
            'date' - aggregate by date
            'datehour' - aggregate by date/hour
            'month' - aggregate by month
            'day' - aggregate by day of the month
            'hour' - aggregate by hour
            'time' - aggregate by time

        weighting_field : str (optional), default None
            Field to weight observations by (eg. notional) when calculating a weighted mean

        Returns
        -------
        DataFrame
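
        Examples
        --------
        A hypothetical sketch (assumes trade_df is indexed by trade time, with
        'slippage', 'venue' and 'notional' columns):

        >>> rs = ResultsSummary()
        >>> avg_by_venue = rs.field_bucketing(
        ...     trade_df, metric_name='slippage', aggregation_metric='mean',
        ...     aggregate_by_field='venue', weighting_field='notional')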
        """

        # TODO weighting field

        # eg. aggregate output by 'venue' and calculate the average slippage per 'venue'

        group = [aggregate_by_field]

        if by_date is not None:
            if by_date == 'date':
                group = [trade_df.index.date]
            elif by_date == 'datehour':
                trade_df.index = trade_df.index.floor('H')
                group = [trade_df.index]

            elif by_date == 'month':
                group = [trade_df.index.month]

            elif by_date == 'day':
                group = [trade_df.index.day]

            elif by_date == 'hour':
                group = [trade_df.index.hour]

            elif by_date == 'time':
                group = [trade_df.index.time]

            if aggregate_by_field is not None:
                group.append(aggregate_by_field)

        if weighting_field is None:
            displayed_fields = [metric_name]
        else:
            displayed_fields = [metric_name, weighting_field]

        # remove duplicated list elements
        displayed_fields = self.util_func.remove_duplicated_str(
            displayed_fields)

        group = [x for x in group if x is not None]

        agg = trade_df.groupby(group)[displayed_fields]

        # def weighted_avg(group, avg_name, weight_name):
        #     """ http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
        #     In rare instance, we may not have weights, so just return the mean. Customize this if your business case
        #     should return otherwise.
        #     """
        #
        #     d = group[avg_name]
        #     w = group[weight_name]
        #     try:
        #         return (d * w).sum() / w.sum()
        #     except ZeroDivisionError:
        #         return d.mean()

        if aggregation_metric == 'mean':
            if weighting_field is None:
                agg = agg.mean()

            else:
                # calculate a weighted average of the metric for each group
                agg = agg.apply(self.time_series_ops.weighted_average_lambda,
                                metric_name, weighting_field)

        elif aggregation_metric == 'sum':
            agg = agg.sum()

        elif aggregation_metric == 'count':
            agg = agg.count()

        else:
            return None

        df = pandas.DataFrame(agg).transpose()

        if by_date is not None:
            df = df.melt()
            df = df.set_index(df[df.columns[0]])
            df.index.name = 'Date'
            df = df.drop([df.columns[0]], axis=1)

            df = pandas.pivot_table(df,
                                    index='Date',
                                    columns=aggregate_by_field,
                                    values=df.columns[-1])
        else:
            df = pandas.pivot_table(df,
                                    index=aggregate_by_field,
                                    values=df.columns).transpose()

        return pandas.DataFrame(df)

    def query_trade_order_population(self,
                                     trade_df,
                                     query_fields=['ticker', 'broker_id']):
        """Finds the unique values for particular fields, such as 'ticker'. Can be useful for working out which assets
        to add to our available universe list (same for brokers etc.)

        Parameters
        ----------
        trade_df : DataFrame
            Trade/order data

        query_fields : str or list of str, default ['ticker', 'broker_id']
            Fields to search for

        Returns
        -------
        dict
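
        Examples
        --------
        A hypothetical sketch (field values illustrative):

        >>> rs = ResultsSummary()
        >>> rs.query_trade_order_population(trade_df, query_fields='ticker')
        {'ticker': ['EURUSD', 'GBPUSD']}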
        """

        if not isinstance(query_fields, list):
            query_fields = [query_fields]

        query_dict = {}

        for q in query_fields:
            if q in trade_df.columns:
                x = trade_df[q].unique().tolist()
                x.sort()

                query_dict[q] = x

        return query_dict
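
The histogram/KDE fitting inside _create_histogram_distribution can be illustrated standalone. Below is a minimal sketch with synthetic slippage observations, using only numpy/scipy/pandas (the synthetic numbers and column labels are illustrative, not taken from the class above):

import numpy
import pandas
from scipy.stats import gaussian_kde, norm

# Synthetic slippage observations (in basis points)
vals = numpy.random.normal(loc=-0.5, scale=2.0, size=500)

# Bucket into a 10-bin histogram of raw counts (density=False, ie. "denormalised")
hist, bins = numpy.histogram(vals, bins=10, density=False)
bin_cent = (bins[1:] + bins[:-1]) * 0.5

# Fit a Gaussian KDE and a normal PDF over the same support
dist_space = numpy.linspace(vals.min(), vals.max(), 100)
pdf_fit = gaussian_kde(vals)(dist_space)
normal_pdf_fit = norm.pdf(dist_space, vals.mean(), vals.std())

# Rescale both PDFs by bin_width * N so they overlay the count histogram
bin_width = abs(bins[1] - bins[0])
pdf_fit = pdf_fit * (bin_width * hist.sum())
normal_pdf_fit = normal_pdf_fit * (bin_width * hist.sum())

df_hist = pandas.DataFrame(index=bin_cent, data=hist, columns=['Histogram'])
df_pdf = pandas.DataFrame(index=dist_space, data=pdf_fit, columns=['KDE-PDF'])
df_pdf['Norm-PDF'] = normal_pdf_fit

print(df_pdf.join(df_hist, how='outer').head())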
Example #2

# TimeSeriesOps and UtilFunc are assumed to be provided by the surrounding
# project's utility modules (import paths are project-specific)

class JoinTables(object):
    """Takes in DataFrames which are joined together according to user preferences

    """
    def __init__(self, tables_dict=None, scalar=1, round_figures_by=None):
        self._tables_dict = tables_dict if tables_dict is not None else {}
        self._scalar = scalar
        self._round_figures_by = round_figures_by

        self._time_series_ops = TimeSeriesOps()
        self._util_func = UtilFunc()

    def aggregate_tables(self,
                         df_dict=None,
                         tables_dict=None,
                         round_figures_by=None,
                         scalar=None):
        if df_dict is None: df_dict = {}
        if tables_dict is None or tables_dict == {}: tables_dict = self._tables_dict
        if round_figures_by is None: round_figures_by = self._round_figures_by
        if scalar is None: scalar = self._scalar

        joined_results = []

        table_name = tables_dict['table_name']
        table_list = tables_dict['table_list']

        column_list = None
        replace_text = None

        if 'column_list' in tables_dict:
            column_list = tables_dict['column_list']

        if 'replace_text' in tables_dict:
            replace_text = tables_dict['replace_text']

        agg_results = []

        for i, table in enumerate(table_list):
            if table in df_dict:
                df = df_dict[table].copy()

                if column_list is not None and column_list != []:
                    df.columns = [x + ' ' + column_list[i] for x in df.columns]

                df = self._util_func.replace_text_in_cols(df, replace_text)

                if df is not None:
                    df = self._time_series_ops.multiply_scalar_dataframe(
                        df, scalar=scalar)
                    df = self._time_series_ops.round_dataframe(
                        df, round_figures_by)

                    agg_results.append(df)

        if agg_results:
            if len(agg_results) > 1:
                df_joined = self._time_series_ops.outer_join(agg_results)
            else:
                df_joined = agg_results[0]

            joined_results.append((df_joined, table_name))

        return joined_results
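
A hedged usage sketch for aggregate_tables (table names, columns and values below are illustrative; TimeSeriesOps/UtilFunc are assumed to be importable from the surrounding project, as in the class above):

import pandas

df_dict = {
    'exec_table': pandas.DataFrame({'slippage': [1.2, -0.4]}),
    'benchmark_table': pandas.DataFrame({'slippage': [0.9, -0.1]}),
}

tables_dict = {
    'table_name': 'joined_slippage',
    'table_list': ['exec_table', 'benchmark_table'],

    # Suffix to distinguish the columns of each source table
    'column_list': ['(exec)', '(benchmark)'],
}

join_tables = JoinTables(tables_dict=tables_dict, scalar=1, round_figures_by=2)

# Returns a list of (DataFrame, table_name) pairs
for df_joined, table_name in join_tables.aggregate_tables(df_dict=df_dict):
    print(table_name)
    print(df_joined)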