def GetEndDateList(data, freq, trim_end=False):
    """Map each index date of ``data`` to its period end and return the sorted unique list.

    ``trim_end`` is mainly used when resampling daily data down to a lower
    frequency: the last period is usually incomplete, so it can be dropped.

    :param data: DataFrame/Series with a DatetimeIndex.
    :param freq: 'M' (month end), 'W' (Saturday week end) or 'Y' (year end).
    :param trim_end: if True, drop the last (possibly incomplete) period end.
    :return: sorted list of unique period-end Timestamps.
    :raises ValueError: if ``freq`` is not one of 'M', 'W', 'Y'.
    """
    if freq == 'M':
        # A date d is already a month end iff (d + MonthEnd()) - MonthEnd() == d.
        date_list = data.index.where(
            data.index == ((data.index + MonthEnd()) - MonthEnd()),
            data.index + MonthEnd())
    elif freq == 'W':
        week_day = 5  # 0-6 map to Monday..Sunday, so 5 = Saturday
        date_list = data.index.where(
            data.index == ((data.index + Week(weekday=week_day)) - Week()),
            data.index + Week(weekday=week_day))
    elif freq == 'Y':
        date_list = data.index.where(
            data.index == ((data.index + YearEnd()) - YearEnd()),
            data.index + YearEnd())
    else:
        # Previously an unrecognised freq fell through and raised
        # UnboundLocalError on date_list; fail loudly and clearly instead.
        raise ValueError("freq must be one of 'M', 'W', 'Y', got %r" % (freq,))
    if trim_end:
        return sorted(set(date_list))[:-1]
    return sorted(set(date_list))
def GetEndDateList(data, freq, trim_end=False):
    """Map each index date of ``data`` to its period end and return the sorted unique list.

    ``trim_end`` is mainly used for daily data: the last period is usually
    incomplete, so it can be dropped.

    :param data: DataFrame/Series with a DatetimeIndex.
    :param freq: 'M' (month end), 'W' (Saturday week end) or 'Y' (year end).
    :param trim_end: if True, drop the last (possibly incomplete) period end.
    :return: sorted list of unique period-end Timestamps.
    :raises ValueError: if ``freq`` is not one of 'M', 'W', 'Y'.
    """
    # NOTE: the old string-formatting based conversions
    # (pd.to_datetime(...astype(str)...)) were dead, commented-out code and
    # have been removed.
    if freq == 'M':
        # A date d is already a month end iff (d + MonthEnd()) - MonthEnd() == d.
        date_list = data.index.where(
            data.index == ((data.index + MonthEnd()) - MonthEnd()),
            data.index + MonthEnd())
    elif freq == 'W':
        week_day = 5  # 0-6 map to Monday..Sunday, so 5 = Saturday
        date_list = data.index.where(
            data.index == ((data.index + Week(weekday=week_day)) - Week()),
            data.index + Week(weekday=week_day))
    elif freq == 'Y':
        date_list = data.index.where(
            data.index == ((data.index + YearEnd()) - YearEnd()),
            data.index + YearEnd())
    else:
        # Previously an unrecognised freq fell through and raised
        # UnboundLocalError on date_list; raise a clear error instead.
        raise ValueError("freq must be one of 'M', 'W', 'Y', got %r" % (freq,))
    if trim_end:
        return sorted(set(date_list))[:-1]
    return sorted(set(date_list))
def resample_index(dat, to_freq):
    """Return a copy of ``dat`` whose DatetimeIndex is pushed to period ends.

    Example: 20180808 -> 20180831, 20180809 -> 20180831.

    Notes:
    1. The index is rewritten, but only on the returned copy, so the
       caller's object is left untouched.
    2. Real trading dates are masked — every date becomes the calendar
       month/week/year end value.

    :param dat: DataFrame/Series with a DatetimeIndex.
    :param to_freq: 'M', 'W' or 'Y'; anything else returns the copy unchanged.
    :return: copy of ``dat`` with the adjusted index.
    """
    out = dat.copy()
    idx = out.index
    if to_freq == 'M':
        already_end = idx == ((idx + MonthEnd()) - MonthEnd())
        out.index = idx.where(already_end, idx + MonthEnd())
    elif to_freq == 'W':
        # weekday 0-6 maps to Monday..Sunday; 5 picks Saturday as week end.
        # (Grouping by x.year*100 + x.week was tried before and produced
        # wrong week-end dates.)
        sat = Week(weekday=5)
        already_end = idx == ((idx + sat) - Week())
        out.index = idx.where(already_end, idx + sat)
    elif to_freq == 'Y':
        already_end = idx == ((idx + YearEnd()) - YearEnd())
        out.index = idx.where(already_end, idx + YearEnd())
    return out
def time_for_next_update(last_time, period='D', hour=9):
    """Compute the next update time after ``last_time``.

    'D': move to the next business day
    'W': move to next Monday
    'M': move to the first day of the next month
    'Q': move to the first day of the next quarter
    The result's hour is then forced to ``hour``.

    :param last_time: last update Timestamp; NaT/None returns MARKET_START.
    :param period: one of 'D', 'W', 'M', 'Q' (case-insensitive).
    :param hour: hour of day to set on the result.
    :raises TypeError: if ``period`` is not recognised.
    """
    if pd.isnull(last_time):
        return MARKET_START
    period = period.upper()
    # DateOffset.apply() was deprecated in pandas 1.1 and removed in 2.0;
    # ``timestamp + offset`` is the supported equivalent.
    if period == 'D':
        offset = BDay(normalize=True)
    elif period == 'W':
        offset = Week(normalize=True, weekday=0)
    elif period == 'M':
        offset = MonthBegin(normalize=True)
    elif period == 'Q':
        offset = QuarterBegin(normalize=True)
    else:
        raise TypeError('不能识别的周期类型,仅接受{}'.format(('D', 'W', 'M', 'Q')))
    return (last_time + offset).replace(hour=hour)
def test_get_offset_name(self):
    """Each offset should expose the expected ``freqstr`` alias."""
    expectations = [
        (BDay(), "B"),
        (BDay(2), "2B"),
        (BMonthEnd(), "BM"),
        (Week(weekday=0), "W-MON"),
        (Week(weekday=1), "W-TUE"),
        (Week(weekday=2), "W-WED"),
        (Week(weekday=3), "W-THU"),
        (Week(weekday=4), "W-FRI"),
        (LastWeekOfMonth(weekday=WeekDay.SUN), "LWOM-SUN"),
    ]
    for offset, alias in expectations:
        assert offset.freqstr == alias
def process_setup(config, process_config, item, hist_end_date, group=P_BASELINE_PARAMS): ''' Complete build of configuration objects and load the parameters 2016-06-03 -- Adeded logic to respect the hist_start_date on the item. ''' #get baseline parameters and update config with all item parameters and attributes. config, params = get_item_params(config, item, as_of=hist_end_date, group=In(group)) # Establish the window of hist data we will be processing in thia run hist_lookback = get_lookback(config) # We can have a hist_start_date controlled by the planner to exlude some days # We want whole weeks so our hist strt date may be upp to 6 days after # the planner set hist_start_date. max_hist_weeks = (hist_end_date - config.hist_start_date).days // 7 if hist_lookback: max_hist_weeks = min(max_hist_weeks, hist_lookback) hist_start_date = hist_end_date - Week(max_hist_weeks) + Day() config[ 'hist_start_date'] = hist_start_date # this overwrites the original value from the item table??? config['hist_end_date'] = hist_end_date config['hist_lookback'] = max_hist_weeks config['publish_start_date'] = hist_end_date - Week( config.publish_window) + Day() config['forecast_end_date'] = hist_end_date + Week( process_config.forecast_weeks) if config.publish_start_date < config.hist_start_date: # we do not construct a baseline as we do not have enough history # everything is going to be satisifed by a proxy # we trigger the correct result by setting the # hist _start_date to the day afteer the last hist date # this will result inan empty hist_df, no geogs_with_hist and no arrays to smooth config['hist_start_date'] = config.hist_end_date + Day() config['run_baseline_processes'] = False else: config['run_baseline_processes'] = True return config, params
def next_update_time(last_updated, freq='D', hour=18, minute=0):
    """Return the next update time after ``last_updated``.

    'D': next business day
    'W': next Monday
    'M': first day of next month
    'Q': first day of next quarter
    The result's time-of-day is set to ``hour``:``minute``.
    """
    if pd.isnull(last_updated):
        return MARKET_START
    offsets = {
        'D': BDay(n=1, normalize=True),
        'W': Week(normalize=True, weekday=0),
        'M': MonthBegin(n=1, normalize=True),
        'Q': QuarterBegin(normalize=True, startingMonth=1),
    }
    freq = freq.upper()
    if freq not in offsets:
        raise TypeError('不能识别的周期类型,仅接受{}'.format(('D', 'W', 'M', 'Q')))
    moved = last_updated + offsets[freq]
    return moved.replace(hour=hour, minute=minute)
def _monthly_bd_gen(start_date, end_date): date_series = pd.date_range(start_date, end_date, freq='M').to_series() date_series = date_series.apply( lambda x: x + Week(2, weekday=2)) # 每月第二个周三 date_str = ','.join( date_series.apply(lambda x: x.strftime('%Y/%m/%d'))) return date_str
def prep_FF_weekly(path):
    """Load and merge the weekly Fama-French factor files under ``path``\\Data.

    :param path: base directory containing the Data folder (Windows layout).
    :return: (FF, RF) — FF holds Mkt-RF/SMB/HML plus the momentum factor,
        RF holds the risk-free rate keyed by DATE.
    """
    # Fama-French three factors, weekly
    FF3 = pd.read_csv(path + '\\Data\\F-F_Research_Data_Factors_weekly.csv')
    # Dates are yyyymmdd integers; anchor each week on its Friday.
    FF3['DATE'] = pd.to_datetime(FF3['DATE'].astype(str),
                                 format='%Y%m%d') + Week(weekday=4)
    # Factor columns are quoted in percent.
    factor_cols = ['Mkt-RF', 'SMB', 'HML', 'RF']
    FF3[factor_cols] = FF3[factor_cols].div(100)
    # Faux Fama-French weekly momentum
    FF_mom = pd.read_csv(path + '\\Data\\F-F_Momentum_Factor_weekly.csv')
    FF_mom['DATE'] = pd.to_datetime(FF_mom['DATE'].astype(str),
                                    format='%m/%d/%Y')
    # Merge factors and momentum on the (Friday-anchored) date.
    FF = FF3.merge(FF_mom, on='DATE')
    # Keep the risk-free series separately, then drop it from the factor frame.
    RF = FF[['DATE', 'RF']]
    FF.drop('RF', axis=1, inplace=True)
    return FF, RF
def test_week_add_invalid():
    """Adding an incompatible offset to an anchored Week must raise TypeError.

    A Week with an explicit weekday used to surface an AttributeError
    instead of the expected TypeError when adding an invalid offset.
    """
    anchored_week = Week(weekday=1)
    with pytest.raises(TypeError, match="Cannot add"):
        anchored_week + Day()
def vx_expiry(year, month):
    """Return the VX (VIX futures) expiry for the given contract year/month.

    The expiry falls 30 days before the third Friday of the following month.
    """
    # First calendar day of the following month (equivalent to
    # datetime(year, month, 1) + relativedelta(months=1)).
    first_of_next = datetime(year + month // 12, month % 12 + 1, 1)
    friday = Week(weekday=4)
    # Week(weekday=4) rolls forward, so a start that is already a Friday
    # needs one application fewer to land on the third Friday.
    steps = 2 if first_of_next.weekday() == 4 else 3
    third_friday = first_of_next + steps * friday
    return third_friday - timedelta(days=30)
def get_visits(config):
    """Load cleaned Chrome visit history from the sqlite history database.

    Joins each visit to its URL and (when present) the originating visit's
    URL/title, converts the visit time from the Windows epoch, tags each
    visit with its Sunday-anchored week, maps Chrome transition codes to
    names, drops navigation-noise transitions, and counts the distinct
    weeks each URL was observed.

    :param config: mapping with a 'data_directory' key pointing at the
        directory that contains the ``chrome_history`` sqlite file.
    :return: DataFrame of visits indexed by visit id.
    """
    q = """select v.id as id
    ,u.url
    ,u.title
    ,v.visit_time
    ,v.transition
    ,v.visit_duration
    ,v.from_visit
    ,uf.url as from_url
    ,uf.title as from_title
    from visits v
    join urls u on u.id = v.url
    left join visits vf on vf.id=v.from_visit
    left join urls uf on uf.id=vf.url
    order by v.visit_time desc
    """
    visits = pd.read_sql(sql=q,
                         con=sqlite3.connect(
                             os.path.join(config['data_directory'],
                                          'chrome_history')),
                         index_col='id')
    # Chrome stores visit_time relative to the Windows epoch; convert via helper.
    visits['visit_time'] = windows_epoch_to_datetime(visits['visit_time'])
    # Week bucket: roll back to the most recent Sunday (weekday=6), midnight.
    visits['visit_week'] = visits['visit_time'].apply(
        lambda x: Week(weekday=6, normalize=True).rollback(x))
    # https://developer.chrome.com/extensions/history#type-TransitionType
    transition_types = {
        0: 'link',
        1: 'typed',  # Typed into search bar (also auto-suggest from search bar)
        2: 'auto_bookmark',
        3: 'auto_subframe',  # Automatic navigation within a subframe
        4: 'manual_subframe',  # Navigation within a subframe
        5: 'generated',  # Did a search from the search bar, and chose an option)
        6: 'auto_toplevel',
        7: 'form_submit',
        8: 'reload',
        9: 'keyword',
        10: 'keyword_generated'
    }
    # The low byte of the transition field carries the core transition type.
    visits['transition'] = visits['transition'].apply(
        lambda x: transition_types[x & 0x000000FF])
    # Drop navigation noise that does not reflect deliberate browsing.
    idx = visits.transition.isin(
        ['reload', 'form_submit', 'auto_subframe', 'manual_subframe'])
    visits.drop(visits.index[idx], inplace=True)
    # Add a count of unique weeks visited by URL (for some reason to_numeric
    # is needed to convert from a datetime)
    visits['weeks_observed'] = pd.to_numeric(
        visits.groupby('url')['visit_week'].transform('nunique'))
    return visits
def resample_index(dat, to_freq):
    '''
    Push every index date of a copy of ``dat`` to its period end.

    Note: the index is rewritten, but only on the returned copy, so the
    caller's input object is left intact.

    :param dat: DataFrame/Series with a DatetimeIndex.
    :param to_freq: 'M', 'W' or 'Y'; anything else returns the copy unchanged.
    :return: copy of ``dat`` with a period-end index.
    '''
    data = dat.copy()
    if to_freq == 'M':
        end, probe = MonthEnd(), MonthEnd()
    elif to_freq == 'W':
        # 0-6 map to Monday..Sunday; Saturday (5) is the week end.
        # (Grouping by year*100 + week number produced wrong week ends.)
        end, probe = Week(weekday=5), Week()
    elif to_freq == 'Y':
        end, probe = YearEnd(), YearEnd()
    else:
        return data
    is_period_end = data.index == ((data.index + end) - probe)
    data.index = data.index.where(is_period_end, data.index + end)
    return data
def digest_stocks_features(stock_lst, start_index, finish_index):
    """Build per-signal feature tables for stocks stock_lst[start_index:finish_index+1].

    For every false-positive signal of each stock, the adjusted close is
    resampled to monthly/weekly/daily bars (restricted to the 100 periods
    before the signal date), features are extracted via do_features(), and
    the results are written into the shared timestamps_features_* dicts and
    all_stocks_signals_features.

    NOTE(review): relies on module-level globals (stocks_data_dict,
    dict_of_stocks_fp, timestamps_features_month/week/day,
    all_stocks_signals_features, lock) — confirm they are initialised
    before this runs.
    """
    for stock in stock_lst[start_index:finish_index + 1]:
        print("working on {}".format(stock))
        stock_data_df = stocks_data_dict[stock]
        all_signals = dict_of_stocks_fp[dict_of_stocks_fp.symbol == stock]
        # TODO add counter and progress bar or percentage ( curr / len of signals)
        # TODO per stock per all stocks
        curr_stock_signals_features = pd.DataFrame()
        for signal_index, row in enumerate(all_signals.iterrows()):
            curr_signal_start_date = pd.to_datetime(row[1].start_date)
            signal_label = row[1].false_positive
            # Monthly bars: last 100 business-month ends up to the signal date.
            curr_signal_month_data = stock_data_df["adj_close"][stock_data_df.index <= curr_signal_start_date].resample(
                "BM").last().interpolate()
            curr_signal_month_data = curr_signal_month_data[
                curr_signal_month_data.index >= curr_signal_start_date - BMonthEnd(100)]
            # Weekly bars: last 100 weeks up to the signal date.
            curr_signal_week_data = stock_data_df["adj_close"][stock_data_df.index <= curr_signal_start_date].resample(
                "W").last().interpolate()
            curr_signal_week_data = curr_signal_week_data[
                curr_signal_week_data.index >= curr_signal_start_date - Week(100)]
            # Daily bars: last 100 business days up to the signal date.
            curr_signal_day_data = stock_data_df["adj_close"][stock_data_df.index <= curr_signal_start_date].resample(
                "B").last().interpolate()
            curr_signal_day_data = curr_signal_day_data[
                curr_signal_day_data.index >= curr_signal_start_date - BDay(100)]
            # Extract features at each granularity; curr_signal_only is the
            # per-signal row (overwritten each call — the last one is kept).
            curr_signal_only, curr_signal_features_month = do_features(curr_signal_month_data,
                                                                       curr_signal_start_date,
                                                                       all_signals, stock,
                                                                       signal_index, signal_label
                                                                       )
            curr_signal_only, curr_signal_features_week = do_features(curr_signal_week_data,
                                                                      curr_signal_start_date,
                                                                      all_signals, stock,
                                                                      signal_index, signal_label
                                                                      )
            curr_signal_only, curr_signal_features_day = do_features(curr_signal_day_data,
                                                                     curr_signal_start_date,
                                                                     all_signals, stock,
                                                                     signal_index, signal_label
                                                                     )
            timestamps_features_month[str(stock) + "_" + str(signal_index)] = curr_signal_features_month
            timestamps_features_week[str(stock) + "_" + str(signal_index)] = curr_signal_features_week
            timestamps_features_day[str(stock) + "_" + str(signal_index)] = curr_signal_features_day
            # adding to df of current stock
            curr_stock_signals_features = pd.concat([curr_stock_signals_features, curr_signal_only])
        # disregard the date index
        # still saving it for train/test separations (e.g. learn until 2010 and test afterwards)
        curr_stock_signals_features = curr_stock_signals_features.reset_index()
        with lock:
            # add to the list of all signals of all stocks
            print("finished {}".format(stock))
            all_stocks_signals_features[stock] = curr_stock_signals_features
    print("finished stocks features")
def __init__(
    self,
    num_timeseries: int = 10,
    num_steps: int = 30,
    freq: str = "1H",
    start: str = "2000-01-01 00:00:00",
    # Generates constant dataset of 0s with explicit NaN missing values
    is_nan: bool = False,
    # Inserts random constant value for each time series
    is_random_constant: bool = False,
    # Generates constants on various scales
    is_different_scales: bool = False,
    # Determines whether the time series in the test
    # and train set should have different constant values
    is_piecewise: bool = False,
    # Determines whether to add Gaussian noise to the constant dataset
    is_noise: bool = False,
    # Determines whether some time series will have very long lengths
    is_long: bool = False,
    # Determines whether some time series will have very short lengths
    is_short: bool = False,
    # Determines whether to add linear trends
    is_trend: bool = False,
    # Number of missing values in the middle of the time series
    num_missing_middle: int = 0,
    # Determines whether to add promotions to the target time series
    # and to store in metadata
    is_promotions: bool = False,
    # Determines whether to add holidays to the target time series
    # and to store in metadata
    holidays: Optional[List[pd.Timestamp]] = None,
) -> None:
    """Configure a synthetic constant-dataset generator.

    An 80/20 train/prediction split is derived from ``num_steps``, and
    weekly frequencies are re-anchored on the weekday of ``start``.
    """
    super().__init__(freq)
    self.num_timeseries = num_timeseries
    self.num_steps = num_steps
    # 80% of the steps (rounded down in blocks of 10) are training history;
    # the remainder is the prediction horizon.
    self.num_training_steps = self.num_steps // 10 * 8
    self.prediction_length = self.num_steps - self.num_training_steps
    self.is_nan = is_nan
    self.is_random_constant = is_random_constant
    self.is_different_scales = is_different_scales
    self.is_piecewise = is_piecewise
    self.is_noise = is_noise
    self.is_long = is_long
    self.is_short = is_short
    self.is_trend = is_trend
    self.num_missing_middle = num_missing_middle
    self.is_promotions = is_promotions
    self.holidays = holidays
    # assumes super().__init__ turned ``freq`` into a DateOffset — TODO confirm
    if isinstance(self.freq, Week):
        # Anchor weekly offsets on the start date's weekday so generated
        # periods line up with ``start``.
        self.freq = Week(
            self.freq.n, weekday=pd.Timestamp(start).weekday()
        )
    self.start = cast(pd.Period, pd.Period(start, self.freq))
def info_process(dic, contract_type):
    """Get option chains for the next three available Friday expirations.

    :param dic: mapping with "Target Date" (a '%Y-%m-%d' string) and "Ticker".
    :param contract_type: passed through to get_options (e.g. calls/puts).
    :return: DataFrame with columns Strike/Price/Volatility/Expiration Date.
    """
    target_date = datetime.strptime(
        dic["Target Date"], "%Y-%m-%d")  # Target date to datetime object
    if target_date.isoweekday() != 5:
        # The target date is moved to the next Friday if it is not one already.
        target_date = target_date + Week(weekday=4)
    ls = []
    # Collect chains for the next 3 Fridays that actually have options.
    # NOTE: like the original, this loops forever if no chain is ever found.
    while len(ls) < 3:
        try:
            ls.append(get_options(dic["Ticker"], target_date, contract_type))
        except Exception:
            # Was a bare ``except:`` which also swallowed KeyboardInterrupt
            # and SystemExit; keep the best-effort skip but only for real
            # errors.
            pass
        # Move to the next Friday whether or not this one had options.
        target_date = target_date + Week(weekday=4)
    df_options = pd.concat(ls)  # Concatenate the dataframes into one.
    df_options.columns = ["Strike", "Price", "Volatility",
                          "Expiration Date"]  # Change column names
    return df_options
def to_offset(self) -> DateOffset:
    """Translate this frequency value into its pandas DateOffset."""
    table = {
        "H": Hour(1),
        "D": Day(1),
        "W-MON": Week(1, weekday=0),
        "MS": MonthBegin(1),
        "QS-DEC": QuarterBegin(startingMonth=10),
        "AS": YearBegin(1),
    }
    offset = table.get(self.value)
    if offset is None:
        raise NotImplementedError(self.value)
    return offset
def test_get_offset():
    """_get_offset rejects junk aliases and maps business/weekly aliases."""
    for bad_alias in ("gibberish", "QS-JAN-B"):
        with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
            _get_offset(bad_alias)

    expected_by_name = {
        "B": BDay(),
        "b": BDay(),
        "bm": BMonthEnd(),
        "Bm": BMonthEnd(),
        "W-MON": Week(weekday=0),
        "W-TUE": Week(weekday=1),
        "W-WED": Week(weekday=2),
        "W-THU": Week(weekday=3),
        "W-FRI": Week(weekday=4),
    }
    for name, expected in expected_by_name.items():
        offset = _get_offset(name)
        assert offset == expected, (
            f"Expected {repr(name)} to yield {repr(expected)} "
            f"(actual: {repr(offset)})")
def next_update_time(last_updated, freq='D', hour=18, minute=0, second=0):
    """Return the next update time after ``last_updated``.

    'S': next second          'm': next minute
    'H': next hour            'D': next business day
    'W': next Monday          'M': first day of next month
    'Q': first day of next quarter
    For D/W/M/Q the time-of-day is reset to hour:minute:second.
    Note: ``freq`` is case-sensitive ('m' = minute, 'M' = month).
    """
    if pd.isnull(last_updated):
        return MARKET_START
    # Sub-day frequencies leave the original time fields untouched.
    intraday = {'S': Second, 'm': Minute, 'H': Hour}
    if freq in intraday:
        return last_updated + intraday[freq]()
    daily_or_longer = {
        'D': BDay(n=1, normalize=True),
        'W': Week(normalize=True, weekday=0),
        'M': MonthBegin(n=1, normalize=True),
        'Q': QuarterBegin(normalize=True, startingMonth=1),
    }
    if freq in daily_or_longer:
        moved = last_updated + daily_or_longer[freq]
        return moved.replace(hour=hour, minute=minute, second=second)
    raise TypeError('不能识别的周期类型,仅接受{}'.format(
        ('S', 'm', 'H', 'D', 'W', 'M', 'Q')))
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
             holidays=None, calendar=None, **kwds):
    """Custom offset built on a CustomBusinessDay plus a Friday-anchored Week.

    :param n: number of offset steps.
    :param normalize: whether to normalize result timestamps to midnight.
    :param weekmask: business-day mask forwarded to CustomBusinessDay.
    :param holidays: holiday list forwarded to CustomBusinessDay.
    :param calendar: holiday calendar forwarded to CustomBusinessDay.
    :param kwds: extra offset kwargs; 'offset' (a timedelta) is honoured.
    """
    self.n = n
    # NOTE(review): stored as 'normalized' while the parameter is
    # 'normalize' — confirm downstream code reads this exact name.
    self.normalized = normalize
    # assumes self.kwds already exists (set by the parent class) — TODO confirm
    self.kwds.update(kwds)
    self.offset = kwds.get('offset', timedelta(0))
    # Single custom business day used for calendar/holiday resolution.
    self.cbday = CustomBusinessDay(n=1, normalize=normalize, weekmask=weekmask, holidays=holidays, calendar=calendar, offset=self.offset)
    self.calendar = self.cbday.calendar
    self.holidays = holidays
    # Friday-anchored weekly offset used by the subclass's shift logic.
    self.w_offset = Week(weekday=4)
def test_weeks_onoffset():
    # GH#18510 Week with weekday = None, normalize = False should always
    # be is_on_offset — check the fast path against the round-trip check
    # for both a positive-offset and a negative-offset timezone anchor.
    offset = Week(n=2, weekday=None)
    stamps = [
        Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka"),
        # negative n
        Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter"),
    ]
    for ts in stamps:
        fast = offset.is_on_offset(ts)
        slow = (ts + offset) - offset == ts
        assert fast == slow
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
             holidays=None, calendar=None, **kwds):
    """Custom offset built on a CustomBusinessDay plus a Friday-anchored Week.

    ``object.__setattr__`` is used because pandas DateOffset subclasses are
    immutable and block plain attribute assignment in ``__init__``.

    :param n: number of offset steps.
    :param normalize: whether to normalize result timestamps to midnight.
    :param weekmask: business-day mask forwarded to CustomBusinessDay.
    :param holidays: holiday list forwarded to CustomBusinessDay.
    :param calendar: holiday calendar forwarded to CustomBusinessDay.
    :param kwds: extra offset kwargs; 'offset' (a timedelta) is honoured.
    """
    self.n = n
    # NOTE(review): stored as 'normalized' while the parameter is
    # 'normalize' — confirm downstream readers use this exact name.
    object.__setattr__(self, "normalized", normalize)
    # assumes self.kwds is already populated by the parent class — TODO confirm
    self.kwds.update(kwds)
    object.__setattr__(self, "offset", kwds.get('offset', timedelta(0)))
    # Single custom business day used for calendar/holiday resolution.
    object.__setattr__(
        self, "cbday",
        CustomBusinessDay(n=1,
                          normalize=normalize,
                          weekmask=weekmask,
                          holidays=holidays,
                          calendar=calendar,
                          offset=self.offset))
    object.__setattr__(self, "calendar", self.cbday.calendar)
    object.__setattr__(self, "holidays", holidays)
    # Friday-anchored weekly offset used by the subclass's shift logic.
    object.__setattr__(self, "w_offset", Week(weekday=4))
'BQS-JAN' : BQuarterBegin(startingMonth=1), 'BQS' : BQuarterBegin(startingMonth=1), 'BQS-FEB' : BQuarterBegin(startingMonth=2), 'BQS-MAR' : BQuarterBegin(startingMonth=3), 'BQS-APR' : BQuarterBegin(startingMonth=4), 'BQS-MAY' : BQuarterBegin(startingMonth=5), 'BQS-JUN' : BQuarterBegin(startingMonth=6), 'BQS-JUL' : BQuarterBegin(startingMonth=7), 'BQS-AUG' : BQuarterBegin(startingMonth=8), 'BQS-SEP' : BQuarterBegin(startingMonth=9), 'BQS-OCT' : BQuarterBegin(startingMonth=10), 'BQS-NOV' : BQuarterBegin(startingMonth=11), 'BQS-DEC' : BQuarterBegin(startingMonth=12), # Weekly 'W-MON' : Week(weekday=0), 'W-TUE' : Week(weekday=1), 'W-WED' : Week(weekday=2), 'W-THU' : Week(weekday=3), 'W-FRI' : Week(weekday=4), 'W-SAT' : Week(weekday=5), 'W-SUN' : Week(weekday=6), } _offset_to_period_map = { 'WEEKDAY' : 'D', 'EOM' : 'M', 'B' : 'D', 'BM' : 'M', 'BQS' : 'Q',
def create_data():
    """ create the pickle/msgpack data

    Builds a dictionary of representative pandas objects (scalars, indexes,
    series, frames, categoricals, timestamps and offsets) used as fixtures
    for pickle/msgpack round-trip compatibility tests. Some entries are
    gated on the installed pandas version via ``_loose_version``.
    """
    # Base table reused by several fixtures below.
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    # RangeIndex and interval_range only exist on newer pandas versions.
    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
        names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=Series(np.arange(10).astype(np.int64),
                            index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(
                                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  dt=Series(date_range('20130101', periods=5)),
                  dt_tz=Series(
                      date_range('20130101', periods=5, tz='US/Eastern')),
                  period=Series([Period('2000Q1')] * 5))

    # Frame with duplicate column labels ('A' appears twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = dict(float=DataFrame({
        'A': series['float'],
        'B': series['float'] + 1
    }),
        int=DataFrame({
            'A': series['int'],
            'B': series['int'] + 1
        }),
        mixed=DataFrame({k: data[k] for k in ['A', 'B', 'C', 'D']}),
        mi=DataFrame(
            {
                'A': np.arange(5).astype(np.float64),
                'B': np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(tuple(
                zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                      ['one', 'two', 'one', 'two', 'three']])),
                names=['first', 'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=['A', 'B', 'A']),
        cat_onecol=DataFrame({'A': Categorical(['foo', 'bar'])}),
        cat_and_float=DataFrame({
            'A': Categorical(['foo', 'bar', 'baz']),
            'B': np.arange(3).astype(np.int64)
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                'A': Timestamp('20130102', tz='US/Eastern'),
                'B': Timestamp('20130603', tz='CET')
            }, index=range(5)),
        dt_mixed2_tzs=DataFrame(
            {
                'A': Timestamp('20130102', tz='US/Eastern'),
                'B': Timestamp('20130603', tz='CET'),
                'C': Timestamp('20130603', tz='UTC')
            }, index=range(5)))

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    # Timestamp's 'offset' keyword was renamed to 'freq' around 0.19.2.
    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    # One instance of every DateOffset flavour worth round-tripping.
    off = {
        'DateOffset': DateOffset(years=1),
        'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
        'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
        'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
        'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
        'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
        'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
        'MonthBegin': MonthBegin(1),
        'MonthEnd': MonthEnd(1),
        'QuarterBegin': QuarterBegin(1),
        'QuarterEnd': QuarterEnd(1),
        'Day': Day(1),
        'YearBegin': YearBegin(1),
        'YearEnd': YearEnd(1),
        'Week': Week(1),
        'Week_Tues': Week(2, normalize=False, weekday=1),
        'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
        'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
        'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        'Easter': Easter(),
        'Hour': Hour(1),
        'Minute': Minute(1)
    }

    return dict(series=series,
                frame=frame,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
def create_data():
    """ create the pickle/msgpack data

    Newer (black-formatted) variant of the fixture builder: returns a
    dictionary of representative pandas objects (scalars, indexes, series,
    frames, categoricals, timestamps and offsets) for pickle/msgpack
    round-trip compatibility tests.
    """
    # Base table reused by several fixtures below.
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M"))

    index = dict(
        int=Index(np.arange(10)),
        date=date_range("20130101", periods=10),
        period=period_range("2013-01-01", freq="M", periods=10),
        float=Index(np.arange(10, dtype=np.float64)),
        uint=Index(np.arange(10, dtype=np.uint64)),
        timedelta=timedelta_range("00:00:00", freq="30T", periods=10),
    )

    index["range"] = RangeIndex(10)

    # interval_range only exists from pandas 0.21 onwards.
    if _loose_version >= LooseVersion("0.21"):
        from pandas import interval_range

        index["interval"] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(
            zip(*[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ])),
        names=["first", "second"],
    ))

    series = dict(
        float=Series(data["A"]),
        int=Series(data["B"]),
        mixed=Series(data["E"]),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range("20130101", periods=10)),
        mi=Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(tuple(
                zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                names=["one", "two"]),
        ),
        dup=Series(np.arange(5).astype(np.float64),
                   index=["A", "B", "C", "D", "A"]),
        cat=Series(Categorical(["foo", "bar", "baz"])),
        dt=Series(date_range("20130101", periods=5)),
        dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")),
        period=Series([Period("2000Q1")] * 5),
    )

    # Frame with duplicate column labels ("A" appears twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = dict(
        float=DataFrame({
            "A": series["float"],
            "B": series["float"] + 1
        }),
        int=DataFrame({
            "A": series["int"],
            "B": series["int"] + 1
        }),
        mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        mi=DataFrame(
            {
                "A": np.arange(5).astype(np.float64),
                "B": np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(
                tuple(
                    zip(*[
                        ["bar", "bar", "baz", "baz", "baz"],
                        ["one", "two", "one", "two", "three"],
                    ])),
                names=["first", "second"],
            ),
        ),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=["A", "B", "A"]),
        cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}),
        cat_and_float=DataFrame({
            "A": Categorical(["foo", "bar", "baz"]),
            "B": np.arange(3).astype(np.int64),
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        dt_mixed2_tzs=DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    )

    cat = dict(
        int8=Categorical(list("abcdefg")),
        int16=Categorical(np.arange(1000)),
        int32=Categorical(np.arange(10000)),
    )

    timestamp = dict(
        normal=Timestamp("2011-01-01"),
        nat=NaT,
        tz=Timestamp("2011-01-01", tz="US/Eastern"),
    )

    timestamp["freq"] = Timestamp("2011-01-01", freq="D")
    timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M")

    # One instance of every DateOffset flavour worth round-tripping.
    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return dict(
        series=series,
        frame=frame,
        index=index,
        scalars=scalars,
        mi=mi,
        sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()),
        sp_frame=dict(float=_create_sp_frame()),
        cat=cat,
        timestamp=timestamp,
        offsets=off,
    )
'Zweiter Weihnachtstag': Holiday('Zweiter Weihnachtstag', month=12, day=26), 'Heilige Drei Könige': Holiday('Heilige Drei Könige', month=1, day=6), 'Mariä Himmelfahrt': Holiday('Mariä Himmelfahrt', month=8, day=15), 'Tag der Deutschen Einheit': Holiday('Tag der Deutschen Einheit', month=10, day=3), 'Reformationstag': Holiday('Reformationstag', month=10, day=31), '500. Reformationstag': Holiday('Reformationstag', year=2017, month=10, day=31), 'Allerheiligen': Holiday('Allerheiligen', month=11, day=1), 'Buß- und Bettag': Holiday('Buß- und Bettag', month=11, day=15, offset=[Week(weekday=2)]), } HOLIDAY_EXCLUDE_MAP = { 'BW': {'Mariä Himmelfahrt', 'Reformationstag', 'Buß- und Bettag'}, 'BY': {'Reformationstag', 'Buß- und Bettag'}, 'BE': { 'Heilige Drei Könige', 'Fronleichnam', 'Mariä Himmelfahrt', 'Reformationstag', 'Allerheiligen', 'Buß- und Bettag' }, 'BB': { 'Heilige Drei Könige', 'Fronleichnam', 'Mariä Himmelfahrt', 'Allerheiligen', 'Buß- und Bettag' }, 'HB': { 'Heilige Drei Könige', 'Fronleichnam', 'Mariä Himmelfahrt',
def returns(ts, calc_type='D', force=False):
    """
    Calculate time series of returns for various time windows.

    :param ts: :py:obj:`TimeSeries`, :py:obj:`pandas.Series`, :py:obj:`pandas.DataFrame`
        Time series whose returns will be calculated.
    :param calc_type: {'D', 'W', 'M', '6M', 'Y', '3Y', 'WTD', 'MTD', 'YTD', 'SI'}, optional
        The time window for return calculation. Default is 'D' (daily returns).
    :param force: bool, optional
        Backward-fill missing data. Default is False.
    :return: :py:obj:`pandas.Series`, :py:obj:`pandas.DataFrame`
        Series or DataFrame of returns (None for an unknown ``calc_type``).
    """
    df = ts.ts_values if isinstance(ts, TimeSeries) else ts
    if df.empty:
        return df
    first_index = df.first_valid_index()
    last_index = df.last_valid_index()

    def array_return(x):
        # Simple return over the window: last / first - 1.
        return x[-1] / x[0] - 1

    def one_month_ago(x):
        return to_datetime(to_ql_date(x) - ql.Period(1, ql.Months))

    def six_months_ago(x):
        return to_datetime(to_ql_date(x) - ql.Period(6, ql.Months))

    calc_type = calc_type.upper()
    if calc_type == 'D':
        return df.pct_change()
    elif calc_type == 'W':
        # Rolling 6-business-day window => weekly return at each date.
        # (The previous no-op ``df = df.reindex()`` was dropped.)
        return df.resample(BDay()).fillna(method='pad').rolling(
            6, min_periods=2).apply(array_return)
    elif calc_type == 'M':
        # Renamed from ``one_month_ago`` to avoid shadowing the helper above.
        idx_one_month_ago = df.index.map(one_month_ago)
        df_one_month_ago = df.reindex(idx_one_month_ago, method='pad')
        if force:
            df_one_month_ago = df_one_month_ago.fillna(df.loc[first_index])
        return pd.Series(index=df.index,
                         data=df.values / df_one_month_ago.values) - 1
    elif calc_type == '6M':
        idx_six_months_ago = df.index.map(six_months_ago)
        df_six_months_ago = df.reindex(idx_six_months_ago, method='pad')
        if force:
            df_six_months_ago = df_six_months_ago.fillna(df.loc[first_index])
        return pd.Series(index=df.index,
                         data=df.values / df_six_months_ago.values) - 1
    elif calc_type == 'Y':
        one_year_ago = df.index - pd.DateOffset(years=1)
        df_one_year_ago = df.reindex(one_year_ago, method='pad')
        if force:
            df_one_year_ago = df_one_year_ago.fillna(df.loc[first_index])
        return pd.Series(index=df.index,
                         data=df.values / df_one_year_ago.values) - 1
    elif calc_type == '3Y':
        # BUG FIX: was ``pd.dateOffset`` (AttributeError at runtime);
        # pandas exposes ``pd.DateOffset``.
        three_years_ago = df.index - pd.DateOffset(years=3)
        df_three_years_ago = df.reindex(three_years_ago, method='pad')
        if force:
            df_three_years_ago = df_three_years_ago.fillna(df.loc[first_index])
        return pd.Series(index=df.index,
                         data=df.values / df_three_years_ago.values) - 1
    elif calc_type == 'WTD':
        # Ratio to the most recent Friday close.
        index = pd.date_range(first_index, last_index, freq=Week(weekday=4))
        df_week_end = df.reindex(index, method='pad').reindex(df.index,
                                                             method='pad')
        return df / df_week_end - 1
    elif calc_type == 'MTD':
        index = pd.date_range(first_index, last_index, freq=BMonthEnd())
        df_month_end = df.reindex(index, method='pad').reindex(df.index,
                                                              method='pad')
        return df / df_month_end - 1
    elif calc_type == 'YTD':
        index = pd.date_range(first_index, last_index, freq=BYearEnd())
        df_year_end = df.reindex(index, method='pad').reindex(df.index,
                                                             method='pad')
        return df / df_year_end - 1
    elif calc_type == 'SI':
        # Since inception: ratio to the first valid observation.
        return df / df.loc[first_index] - 1
def test_get_offset_legacy():
    """Legacy 'w@DAY' aliases must be rejected by _get_offset."""
    legacy_pairs = [("w@Sat", Week(weekday=5))]
    for alias, _unused in legacy_pairs:
        with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG):
            _get_offset(alias)
# Calculate ME_lag; not lagged yet however columns['inv_ME_lag'] = 1/(columns['TOTAL_SHARES'] * columns['PRICE_UNADJUSTED']) # Create ME dataframe ME = columns[['DATE', 'DW_INSTRUMENT_ID', 'MAIN_KEY', 'inv_ME_lag']] # Drop unneeded columns columns.drop(['TOTAL_SHARES', 'PRICE_UNADJUSTED', 'inv_ME_lag'], axis = 1, inplace = True) # Obtain year fundamentals['year'] = fundamentals['DATE'].dt.year fundamentals['month'] = fundamentals['DATE'].dt.month # For the monthly ones we need to shift the dates a month minus one week forward faux_date = columns['DATE'] + dt.timedelta(days = 7) + MonthEnd(0) - dt.timedelta(days = 6) + Week(weekday = 4) columns['year'] = faux_date.dt.year # Create year and month columns for ME ME['year'] = faux_date.dt.year ME['month'] = faux_date.dt.month # Drop columns columns.drop('DATE', axis = 1, inplace = True) ME.drop('DATE', axis = 1, inplace = True) # Drop values with missing instrument ID, gvkey, or return ME.dropna(subset = ['inv_ME_lag'], axis = 0, how = 'any', inplace = True) # Merge columns with fundamentals fundamentals = fundamentals.merge(columns, on = ['year', 'DW_INSTRUMENT_ID'])
stocks_month = pd.read_csv(path + '\\Data\\monthly_ret.csv') # Convert date column to datetime object stocks_week['DATE'] = pd.to_datetime(stocks_week['DATE'].astype(str), format="%Y-%m-%d") stocks_month['DATE'] = pd.to_datetime(stocks_month['DATE'].astype(str), format="%Y-%m-%d") # Create year and month columns stocks_week['year'] = stocks_week['DATE'].dt.year stocks_week['month'] = stocks_week['DATE'].dt.month # For the monthly ones we need to shift the dates a month minus one week forward faux_date = stocks_month['DATE'] + dt.timedelta( days=7) + MonthEnd(0) - dt.timedelta(days=6) + Week(weekday=4) stocks_month['year'] = faux_date.dt.year stocks_month['month'] = faux_date.dt.month del faux_date # Only consider observations before 2010 stocks_week = stocks_week.loc[stocks_week['year'] < 2010, :] # Save the good columns of stocks_month columns = stocks_month[[ 'year', 'month', 'DW_INSTRUMENT_ID', 'MAIN_KEY', 'INDUSTRY', 'PRICE_UNADJUSTED', 'TOT_EQUITY' ]] # Merge the columns with stocks_week