-
Notifications
You must be signed in to change notification settings - Fork 2
/
calc_zscore.py
259 lines (219 loc) · 9.46 KB
/
calc_zscore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 28 17:36:40 2016
@author: e612727
"""
# Copyright (C) 2015 State Street Global Advisors
import pandas as pd
import numpy as np
def calc_zscore(df,
                mean_halflife=21,
                mean_seed_period=21,
                std_halflife=21,
                std_seed_period=21,
                smth_halflife=0,
                ewm=True,
                subtract_mean=True,
                cap=3.0,
                lag=0):
    """
    Calculate timeseries z-score (assuming normal distribution of input data)

    Parameters
    ----------
    df : DataFrame or Series
        DataFrame or Series object containing timeseries data
    mean_halflife : int, optional
        Half-life period (periodicity determined by index of df) for computing
        mean. When ``ewm`` is False this value is used as the rolling window
        length instead.
    mean_seed_period : int, optional
        Seeding period (periodicity determined by index of df) for computing
        mean; passed as ``min_periods``
    std_halflife : int, optional
        Half-life period (periodicity determined by index of df) for computing
        standard deviation. When ``ewm`` is False this value is used as the
        rolling window length instead.
    std_seed_period : int, optional
        Seeding period (periodicity determined by index of df) for computing
        standard deviation; passed as ``min_periods``
    smth_halflife : int, optional
        Smoothing half-life period (periodicity determined by index of df) for
        smoothing input data before computing z-score. 0 disables smoothing.
    ewm : bool, optional
        If True, compute z-score based on ewm mean and standard deviation.
        If False, compute z-score based on simple (rolling) mean and standard
        deviation.
    subtract_mean : bool, optional
        If True, subtract mean while computing z-score. If False, normalize
        the value by dividing by standard deviation.
    cap : float, optional
        Absolute cap for z-score
    lag : int, optional
        Periods (periodicity determined by index of df) by which to lag the
        z-score

    Returns
    -------
    score_df : DataFrame or Series
        DataFrame or Series object (matching the type of the input)
        containing z-score

    Raises
    ------
    ValueError
        If ``df`` is neither a DataFrame nor a Series, or if any of the
        numeric/boolean arguments fails the range or type checks below.
    """
    is_series = isinstance(df, pd.Series)
    if is_series:
        # work on a one-column frame; converted back to a Series on return
        df = pd.DataFrame(df)
    elif not isinstance(df, pd.DataFrame):
        raise ValueError('df should be either a DataFrame or Series object')

    n_periods = df.shape[0]

    if mean_halflife < 0:
        raise ValueError('%d is not a valid mean half-life' % mean_halflife)
    if mean_halflife > n_periods:
        raise ValueError('mean_halflife can not be larger than length of index of df')
    if mean_seed_period < 0:
        raise ValueError('%d is not a valid mean seed period' % mean_seed_period)
    if mean_seed_period > n_periods:
        raise ValueError('mean_seed_period can not be larger than length of index of df')
    if std_halflife < 0:
        raise ValueError('%d is not a valid standard deviation half-life' % std_halflife)
    if std_halflife > n_periods:
        raise ValueError('std_halflife can not be larger than length of index of df')
    if std_seed_period < 0:
        raise ValueError('%d is not a valid standard deviation seed period' % std_seed_period)
    if std_seed_period > n_periods:
        raise ValueError('std_seed_period can not be larger than length of index of df')
    if smth_halflife < 0:
        raise ValueError('%d is not a valid smoothing half-life' % smth_halflife)
    if smth_halflife > n_periods:
        raise ValueError('smth_halflife can not be larger than length of index of df')
    if not isinstance(ewm, bool):
        raise ValueError('ewm should be either True or False')
    if not isinstance(subtract_mean, bool):
        raise ValueError('subtract_mean should be either True or False')
    if cap <= 0:
        raise ValueError('%f is not a valid score cap' % cap)
    if lag < 0:
        raise ValueError('%d is not a valid lag period' % lag)
    if lag > n_periods:
        raise ValueError('lag can not be larger than length of index of df')

    # apply smoothing
    # (.ewm()/.rolling() replace the module-level pd.ewma / pd.ewmstd /
    # pd.rolling_* helpers, which no longer exist in current pandas)
    if smth_halflife > 0:
        df = df.ewm(halflife=smth_halflife,
                    min_periods=smth_halflife,
                    adjust=False).mean()

    # compute mean and standard deviation
    if ewm:
        mean_df = df.ewm(halflife=mean_halflife,
                         min_periods=mean_seed_period,
                         adjust=False).mean()
        std_df = df.ewm(halflife=std_halflife,
                        min_periods=std_seed_period,
                        adjust=False).std()
    else:
        mean_df = df.rolling(window=mean_halflife,
                             min_periods=mean_seed_period).mean()
        std_df = df.rolling(window=std_halflife,
                            min_periods=std_seed_period).std()

    # compute score
    if subtract_mean:
        score_df = (df - mean_df) / std_df
    else:
        score_df = df / std_df

    # cap score
    score_df = score_df.clip(-cap, cap)

    # lag score
    if lag > 0:
        score_df = score_df.shift(lag)

    if is_series:
        # Select the single column explicitly: squeeze() would collapse a
        # one-row frame to a scalar and lose the original index.
        return score_df.iloc[:, 0]
    return score_df
def score_to_alpha(score_df,
                   vol_df,
                   IC=0.1):
    """
    Compute signal alphas, given scores and IC

    The alpha is the element-wise product ``score_df * vol_df * IC``.

    Parameters
    ----------
    score_df : DataFrame
        DataFrame containing signal scores for assets
    vol_df : DataFrame
        DataFrame containing asset volatilities; must have the same index
        and columns as score_df
    IC : float, optional
        Information Co-efficient (IC) of the signal; must be positive

    Returns
    -------
    alpha_df : DataFrame
        DataFrame containing signal alphas for assets

    Raises
    ------
    ValueError
        If either input is not a DataFrame, if IC is not positive, or if the
        index or columns of the two frames differ.
    """
    if not isinstance(score_df, pd.DataFrame):
        raise ValueError('score_df should be a DataFrame object')
    if not isinstance(vol_df, pd.DataFrame):
        raise ValueError('vol_df should be a DataFrame object')
    if IC <= 0:
        # %f rather than %d: IC is a float, and %d would truncate e.g. -0.05
        # to "0" in the message
        raise ValueError('%f is not a valid IC' % IC)
    if not score_df.index.equals(vol_df.index):
        raise ValueError('score_df and vol_df should have the same index')
    if not score_df.columns.equals(vol_df.columns):
        raise ValueError('score_df and vol_df should have the same columns')
    return score_df * vol_df * IC
def calc_xscore(df,
                smth_halflife=0,
                min_observations=2,
                subtract_mean=True,
                cap=3.0,
                lag=0):
    """
    Calculate cross-sectional score (x-score)

    The input DataFrame is not modified; all filtering and smoothing is done
    on a new object.

    Parameters
    ----------
    df : DataFrame
        DataFrame object containing timeseries data
    smth_halflife : int, optional
        Smoothing half-life period (periodicity determined by index of df) for
        smoothing input data before computing x-score. 0 disables smoothing.
    min_observations : int, optional
        Minimum number of cross-sectional data points required for computing
        score; rows with fewer non-missing values are set to NaN
    subtract_mean : bool, optional
        If True, subtract cross-sectional mean while computing x-score. If
        False, normalize the value by dividing by standard deviation of
        cross-section.
    cap : float, optional
        Absolute cap for x-score
    lag : int, optional
        Periods (periodicity determined by index of df) by which to lag the
        x-score

    Returns
    -------
    score_df : DataFrame
        DataFrame object containing x-score

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame or any argument fails the range or type
        checks below.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError('df should be a DataFrame object')
    if smth_halflife < 0:
        raise ValueError('%d is not a valid smoothing half-life' % smth_halflife)
    if smth_halflife > df.shape[0]:
        raise ValueError('smth_halflife can not be larger than length of index of df')
    if min_observations < 2:
        raise ValueError('%d is not a valid number of minimum observations' % min_observations)
    if min_observations > df.shape[1]:
        raise ValueError('min_observations can not be greater than the number of columns of df')
    if not isinstance(subtract_mean, bool):
        raise ValueError('subtract_mean should be either True or False')
    if cap <= 0:
        raise ValueError('%f is not a valid score cap' % cap)
    if lag < 0:
        raise ValueError('%d is not a valid lag period' % lag)
    if lag > df.shape[0]:
        raise ValueError('lag can not be larger than length of index of df')

    # apply min observations filter
    # `where` returns a new frame, so the caller's DataFrame is never
    # overwritten (an in-place assignment here would blank rows of the input).
    enough_obs = (df.count(axis=1) >= min_observations).to_numpy()
    keep = np.repeat(enough_obs[:, np.newaxis], df.shape[1], axis=1)
    df = df.where(keep)

    # apply smoothing
    # (.ewm() replaces pd.ewma, which no longer exists in current pandas)
    if smth_halflife > 0:
        df = df.ewm(halflife=smth_halflife,
                    min_periods=smth_halflife,
                    adjust=False).mean()

    # compute score (row-wise statistics, broadcast back along the index)
    row_std = df.std(axis=1)
    if subtract_mean:
        score_df = df.sub(df.mean(axis=1), axis=0).div(row_std, axis=0)
    else:
        score_df = df.div(row_std, axis=0)

    # cap score
    score_df = score_df.clip(-cap, cap)

    # lag score
    if lag > 0:
        score_df = score_df.shift(lag)
    return score_df
def _calc_ewma_variance(ret_df,
vol_halflife = 252,
vol_seed_period = 252,
var_annualization_factor = 1,
lag = 1):
'''
Calculate EWMA (exponentially weighted moving average) variance
'''
if lag > 0:
ret_df = ret_df.shift(lag)
return pd.ewmvar(ret_df, halflife=vol_halflife, min_periods=vol_seed_period) * var_annualization_factor
def _calc_ewma_volatility(ret_df,
vol_halflife = 252,
vol_seed_period = 252,
var_annualization_factor = 1,
lag = 1):
'''
Calculate EWMA (exponentially weighted moving average) volatility
'''
if lag > 0:
ret_df = ret_df.shift(lag)
return pd.ewmvol(ret_df, halflife=vol_halflife, min_periods=vol_seed_period) * np.sqrt(var_annualization_factor)