def _align_bivariate(self, df1, df2, force_align=False): if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame): len1 = len(df1.index) len2 = len(df2.index) if (self.ann_dts is not None) and (self.trade_dts is not None): if len1 > len2: df2 = align(df2, self.ann_dts, self.trade_dts) elif len1 < len2: df1 = align(df1, self.ann_dts, self.trade_dts) elif force_align: df1 = align(df1, self.ann_dts, self.trade_dts) df2 = align(df2, self.ann_dts, self.trade_dts) return (df1, df2)
def test_remote_data_service_industry():
    """
    Smoke-test industry data retrieval and alignment via RemoteDataService.

    Fetches the components of index 000300.SH, downloads their raw 'ZZ'
    industry records, and runs `align` twice: once on the wide frames that
    hold every symbol, and once per symbol for the first ten symbols.
    The test makes no assertions; it passes if nothing raises.
    """
    from quantos.data.align import align
    from quantos.data.dataview import DataView
    import pandas as pd

    ds = RemoteDataService()
    arr = ds.get_index_comp(index='000300.SH', start_date=20130101, end_date=20170505)
    df = ds.get_industry_raw(symbol=','.join(arr), type_='ZZ')
    df = df.astype(dtype={'in_date': int})

    # NOTE: unstack()/pivot() based reshaping was tried here before; the
    # per-symbol split followed by a column-wise concat below is what is used.
    dic_sec = DataView._group_df_to_dict(df, by='symbol')
    # The loop variable is named df_sec so it does not shadow (or, in
    # Python 2 list comprehensions, overwrite) the raw frame `df`.
    dic_sec = {sec: df_sec.reset_index() for sec, df_sec in dic_sec.items()}

    df_ann = pd.concat(
        [df_sec.loc[:, 'in_date'].rename(sec) for sec, df_sec in dic_sec.items()],
        axis=1)
    df_value = pd.concat(
        [df_sec.loc[:, 'industry1_code'].rename(sec) for sec, df_sec in dic_sec.items()],
        axis=1)

    dates_arr = ds.get_trade_date(20140101, 20170505)
    # Align all symbols at once; the result is only computed, not inspected.
    res = align(df_value, df_ann, dates_arr)
    # print('') emits one empty line on both Python 2 and Python 3.
    print('')

    def align_single_df(df_one_sec):
        """Align the industry codes of one symbol onto dates_arr."""
        df_value = df_one_sec.loc[:, ['industry1_code']]
        df_ann = df_one_sec.loc[:, ['in_date']]
        res = align(df_value, df_ann, dates_arr)
        return res

    # Only the first ten symbols are aligned one by one.
    # list(...) is required because dict views cannot be sliced in Python 3.
    first_ten = list(dic_sec.values())[:10]
    res_list = [align_single_df(df_sec) for df_sec in first_ten]
    res = pd.concat(res_list, axis=1)
    print(res)
def _align_univariate(self, df1): if isinstance(df1, pd.DataFrame): if (self.ann_dts is not None) and (self.trade_dts is not None): len1 = len(df1.index) len2 = len(self.trade_dts) if len1 != len2: return align(df1, self.ann_dts, self.trade_dts) return df1
def get_industry_daily(self, symbol, start_date, end_date, type_='SW'):
    """
    Get the industry classification of each symbol on each trade date
    between start_date and end_date.

    Parameters
    ----------
    symbol : str
        separated by ','
    start_date : int
    end_date : int
    type_ : {'SW', 'ZZ'}

    Returns
    -------
    res : pd.DataFrame
        index dates, columns symbols
        values are industry code (converted to str)

    """
    df_raw = self.get_industry_raw(symbol, type_=type_)

    # One frame per symbol, ordered by in_date and re-indexed from 0 so that
    # the column-wise concats below line up row by row.
    dic_sec = self._group_df_to_dict(df_raw, by='symbol')
    dic_sec = {
        sec: df.sort_values(by='in_date', axis=0).reset_index()
        for sec, df in dic_sec.items()
    }

    # Two wide frames with one column per symbol: in_date values and the
    # matching industry1_code values.
    df_ann = pd.concat(
        [df.loc[:, 'in_date'].rename(sec) for sec, df in dic_sec.items()],
        axis=1)
    df_value = pd.concat(
        [df.loc[:, 'industry1_code'].rename(sec) for sec, df in dic_sec.items()],
        axis=1)

    dates_arr = self.get_trade_date(start_date, end_date)
    df_industry = align.align(df_value, df_ann, dates_arr)

    # TODO before industry classification is available, we assume they belong to their first group.
    # bfill() is the non-deprecated equivalent of fillna(method='bfill').
    df_industry = df_industry.bfill()
    df_industry = df_industry.astype(str)

    return df_industry
def align_single_df(df_one_sec):
    """
    Align the industry codes of a single symbol.

    The 'industry1_code' and 'in_date' columns are taken as one-column
    DataFrames and handed to `align` together with `dates_arr`, which must
    be available in the enclosing scope.

    Parameters
    ----------
    df_one_sec : pd.DataFrame
        Must contain the columns 'industry1_code' and 'in_date'.

    Returns
    -------
    Whatever `align` returns for this symbol.

    """
    industry_codes = df_one_sec.loc[:, ['industry1_code']]
    entry_dates = df_one_sec.loc[:, ['in_date']]
    return align(industry_codes, entry_dates, dates_arr)