Esempio n. 1
0
def feature_models(start_date='20181101',
                   end_date='20181130',
                   data_source=0,
                   feature_types=[],
                   saved_feature=True,
                   bc=None,
                   top_ratio=0.25,
                   bottom_ratio=0.2):
    '''
    Estimate average factor returns by cross-sectional regression: for each
    in/out date period, pull daily features, stock/index returns and market
    values, preprocess the factor exposures and regress each day's excess
    returns on them.

    :param start_date: inclusive start date, '%Y%m%d'
    :param end_date: inclusive end date, '%Y%m%d' (default fixed from the
        invalid '20181131' — November has 30 days)
    :param data_source: source id forwarded to the feature query
    :param feature_types: unused here; kept for interface compatibility
    :param saved_feature: unused here; kept for interface compatibility
    :param bc: benchmark index code; falsy selects the whole universe
    :param top_ratio: unused here; kept for interface compatibility
    :param bottom_ratio: unused here; kept for interface compatibility
    :return: factor returns averaged over all processed days (np.ndarray)
    '''
    root = get_source_root()
    feature_mapping = _get_source_features()

    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []

    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        # Look-ahead window so return/market-value lookups for the last
        # in-period day succeed.
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        # Bug fix: industry_exposure_factors was referenced in the regression
        # below but never assigned in this function (NameError at runtime).
        # Compute it the same way get_factor_returns does.
        industry_exposure_factors = get_indust_exposures_time_series_regression(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                all_feature_names = list(f_lst.keys())
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    # Bug fix: mv was left unbound (or stale from the previous
                    # security) when any lookup above failed.
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # Country factor: constant unit exposure per security (TBD).
            courtry_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                courtry_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
Esempio n. 2
0
def get_factor_returns(start_date='20181101',
                       end_date='20181130',
                       data_source=0,
                       feature_types=None,
                       saved_feature=True,
                       bc=None,
                       top_ratio=0.25,
                       bottom_ratio=0.2):
    '''
    Multi factor framework: cross-sectional regression to calculate the
    factor returns per trading day, averaged over the whole period.

    :param start_date: inclusive start date, '%Y%m%d'
    :param end_date: inclusive end date, '%Y%m%d' (default fixed from the
        invalid '20181131' — November has 30 days)
    :param data_source: source id forwarded to the feature query
    :param feature_types: list of feature type names; None means all
        (default changed from a mutable [] — behavior is unchanged)
    :param saved_feature: unused here; kept for interface compatibility
    :param bc: benchmark index code; falsy selects the whole universe
    :param top_ratio: forwarded to get_source_feature_mappings
    :param bottom_ratio: forwarded to get_source_feature_mappings
    :return: factor returns averaged over all processed days (np.ndarray)
    '''
    feature_types = feature_types or []
    root = get_source_root()
    # NOTE(review): 'souce' looks like a typo for 'source' — left unchanged
    # because the keyword must match the callee's signature; confirm against
    # get_source_feature_mappings.
    feature_mapping = _get_source_features() or get_source_feature_mappings(
        souce=True,
        feature_types=feature_types,
        top_ratio=top_ratio,
        bottom_ratio=bottom_ratio)

    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id='000300.XSHG') or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []

    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        # Look-ahead window so return/market-value lookups for the last
        # in-period day succeed.
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        # FIXME use the future industry return, should be updated to trace back the history data by windows
        industry_exposure_factors = get_indust_exposures_time_series_regression(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns)
        # Dump the industry exposures for offline inspection.
        _industry_exposure_df = pd.DataFrame(industry_exposure_factors)
        _industry_exposure_df.to_csv('{0}_{1}_{2}.csv'.format(
            _start_date, _end_date, bc))
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                all_feature_names = list(f_lst.keys())
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    # Bug fix: mv was left unbound (or stale from the previous
                    # security) when any lookup above failed.
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # Country factor: constant unit exposure per security (TBD).
            courtry_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                courtry_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
Esempio n. 3
0
def feature_models(start_date='20181101',
                   end_date='20181130',
                   data_source=0,
                   feature_types=[],
                   saved_feature=True,
                   bc=None,
                   top_ratio=0.25,
                   bottom_ratio=0.2):
    '''
    Train a TFMultiFactor model on daily factor exposures, industry
    exposures and index-relative returns over each in/out date period.

    :param start_date: inclusive start date, '%Y%m%d'
    :param end_date: inclusive end date, '%Y%m%d' (default fixed from the
        invalid '20181131' — November has 30 days)
    :param data_source: source id forwarded to the feature query
    :param feature_types: unused here; kept for interface compatibility
    :param saved_feature: unused here; kept for interface compatibility
    :param bc: benchmark index code; falsy selects the whole universe
    :param top_ratio: unused here; kept for interface compatibility
    :param bottom_ratio: unused here; kept for interface compatibility
    :return: None
    '''
    root = get_source_root()
    feature_mapping = _get_source_features()
    # One (1, n_features) input head per feature group.
    feature_shape = [(1, len(item)) for item in list(feature_mapping.values())]
    # FIXME remove hardcode
    indust_shape = (1, 17)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []
    # FIXME change to load existing model
    m = TFMultiFactor()
    m.build_model(feature_shape=feature_shape, indust_shape=indust_shape)
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        # Per-period look-ahead window so lookups for the last in-period
        # day succeed.
        _next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:50]

        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        # Bug fix: was end_date=next_date (the global look-ahead computed
        # from end_date), which over-fetched for early periods; use the
        # per-period _next_date, consistent with ret_mv below.
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=_next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=_next_date)

        # FIXME industry vector updates
        industry_exposure_factors, industry_name = get_indust_exposures_corr(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns,
            period=120,
            bc_returns=idx_labels)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                daily_factors.append(list(f_lst.values()))
                all_feature_names = list(f_lst.keys())
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    # Excess return over the benchmark, in percent.
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    # Bug fix: mv was left unbound (or stale from the previous
                    # security) when any lookup above failed.
                    mv = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))

            # TODO country factors TBD — the previously computed constant
            # country exposure was unused here and has been dropped.
            m.train_model(
                daily_factors,
                np.array(daily_return).reshape(len(daily_return), 1),
                industry_exposure_factors,
                2,
                5,
            )
    return
Esempio n. 4
0
def retrieve_features(start_date='20181101', end_date='20181130', data_source=0,
                      feature_types=None, bc='000300.XSHG'):
    '''
    Build a labeled daily-feature DataFrame for the constituents of *bc*:
    one row per (security, trade date) with preprocessed features and the
    next trading day's index-relative return as LABEL.

    :param start_date: inclusive start date, '%Y%m%d'
    :param end_date: inclusive end date, '%Y%m%d' (default fixed from the
        invalid '20181131' — November has 30 days)
    :param data_source: source id forwarded to the feature query
    :param feature_types: list of feature type names; None means all
        (default changed from a mutable [] — behavior is unchanged)
    :param bc: benchmark index whose constituents form the universe
    :return: pd.DataFrame with feature columns, SECURITY_ID, TRADE_DATE
        and LABEL
    '''
    feature_types = feature_types or []
    feature_mapping = get_source_feature_mappings(feature_types=feature_types)
    date_periods = _get_in_out_dates(start_date=start_date, end_date=end_date, security_id='000300.XSHG') or [
        [start_date, end_date]]
    all_labels = []
    all_features = []
    all_feature_names = []
    # Bug fix: df was unbound at the return statement when the final frame
    # build failed.
    df = pd.DataFrame()
    g_next_date = w.tdaysoffset(1, end_date).Data[0][0].strftime('%Y%m%d')
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date, end_date=g_next_date,
                                 source=0).get(bc)
    _w_ret = w.tdays(start_date, g_next_date)
    tdays = [item.strftime(config['constants']['standard_date_format']) for item in _w_ret.Data[0]]
    tdays = sorted(tdays)
    for _start_date, _end_date in date_periods:
        next_date = _next_trading_date(tdays, _end_date)
        if not next_date:
            logger.error("Fail to get the next trading date for :{0}".format(_end_date))
        security_ids = get_idx_cons_dy(bc, _start_date)
        # FIXME add some filter,e.g. halt sec
        logger.info("Start query the features from :{0} to {1}....".format(_start_date, _end_date))
        ret_features = get_equity_daily_features(security_ids=security_ids, features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date, source=data_source)
        logger.info("Complete query the features from :{0} to {1}".format(_start_date, _end_date))
        logger.info("Start query the market returns from :{0} to {1}....".format(_start_date, next_date))
        ret_labels = get_equity_returns(security_ids=security_ids, start_date=_start_date, end_date=next_date)
        logger.info("Complete query the market returns from :{0} to {1}".format(_start_date, next_date))
        logger.info("Start calculate the features from :{0} to {1}....".format(_start_date, _end_date))
        for date, val in ret_features.items():
            logger.info("Starting processing for date:{0}....".format(date))
            date_features = []
            date_labels = []
            for sec_id, f_lst in val.items():
                date_features.append(list(f_lst.values()))
                all_feature_names = list(f_lst.keys())
                try:
                    # Label = next trading day's return relative to the index.
                    _next_date = _next_trading_date(tdays, str(date))
                    s_label = ret_labels.get(sec_id).get(str(_next_date))
                    i_label = idx_labels.get(str(_next_date))
                    label = s_label * 100 - i_label
                except Exception as ex:
                    label = np.nan
                    logger.error('Fail to calculate the label with error:{0}'.format(ex))
                date_labels.append(label)

            try:
                date_features = feature_preprocessing(arr=date_features, fill_none=True, trade_date=date,
                                                      sec_ids=list(val.keys()), neutralized=False)
            except Exception as ex:
                logger.error('Fail in feature preprocessing with error:{0}'.format(ex))
            logger.info('Adding sec_id and trade_dates and reshape...')
            try:
                df_shape = date_features.shape
                date_features = np.column_stack((date_features, list(val.keys()), [date] * df_shape[0]))
                # Bug fix: removed two leftover debug probes here; one
                # subscripted an int (df_shape[0][0]) raising TypeError every
                # iteration, so the extends below were silently skipped and
                # ALL rows were dropped.
                all_features.extend(date_features)
                all_labels.extend(date_labels)
            except Exception as ex:
                logger.error('fail to reshape features features with error:{0}'.format(ex))
            logger.info("Complete processing for date:{0}".format(date))
        logger.info("Complete calculate the features from :{0} to {1}".format(_start_date, _end_date))
    try:
        if 'SECURITY_ID' not in all_feature_names:
            all_feature_names.append('SECURITY_ID')
        if 'TRADE_DATE' not in all_feature_names:
            all_feature_names.append('TRADE_DATE')
        df = pd.DataFrame(all_features, columns=all_feature_names)
        df['LABEL'] = all_labels
    except Exception as ex:
        logger.error(ex)
    del ret_features
    del ret_labels
    gc.collect()
    return df
Esempio n. 5
0
def retrieve_features(start_date='20181101', end_date='20181130', data_source=0,
                      feature_types=[], bc='000300.XSHG'):
    '''
    Build a labeled high-frequency (minute-derived) feature DataFrame for the
    constituents of *bc*, dump it to CSV and report correlations between the
    HF features and the next-day absolute return label.

    :param start_date: inclusive start date, '%Y%m%d'
    :param end_date: inclusive end date, '%Y%m%d' (default fixed from the
        invalid '20181131' — November has 30 days)
    :param data_source: unused in this variant; kept for interface compatibility
    :param feature_types: unused in this variant; kept for interface compatibility
    :param bc: benchmark index whose constituents form the universe
    :return: (df, corr_var, corr_skew, corr_kurtosis)
    '''
    date_periods = _get_in_out_dates(start_date=start_date, end_date=end_date, security_id='000300.XSHG') or [
        [start_date, end_date]]
    all_labels = []
    all_features = []
    all_feature_names = []
    # Bug fix: df was unbound below (CSV dump / correlations) when the final
    # frame build failed.
    df = pd.DataFrame()
    g_next_date = w.tdaysoffset(1, end_date).Data[0][0].strftime('%Y%m%d')
    idx_labels = get_idx_returns(security_ids=[bc], start_date=start_date, end_date=g_next_date,
                                 source=0).get(bc)
    _w_ret = w.tdays(start_date, g_next_date)
    tdays = [item.strftime(config['constants']['standard_date_format']) for item in _w_ret.Data[0]]
    tdays = sorted(tdays)
    for _start_date, _end_date in date_periods:
        next_date = _next_trading_date(tdays, _end_date)
        if not next_date:
            logger.error("Fail to get the next trading date for :{0}".format(_end_date))
        security_ids = get_idx_cons_dy(bc, _start_date)
        # FIXME add some filter,e.g. halt sec
        logger.info("Start query the features from :{0} to {1}....".format(_start_date, _end_date))
        # Minute-level market cache feeding the HF feature computation.
        mkt = Market(security_ids)
        mkt.initialize(start_date=_start_date, end_date=next_date, source=0, tick_cache='')
        ret_features = get_min_features(security_ids=security_ids, start_date=_start_date, end_date=_end_date,
                                        start_time=None,
                                        end_time=None, mkt_cache=mkt, features=hf_features)
        logger.info("Complete query the features from :{0} to {1}".format(_start_date, _end_date))
        logger.info("Start query the market returns from :{0} to {1}....".format(_start_date, next_date))
        ret_labels = get_equity_returns(security_ids=security_ids, start_date=_start_date, end_date=next_date)
        logger.info("Complete query the market returns from :{0} to {1}".format(_start_date, next_date))
        logger.info("Start calculate the features from :{0} to {1}....".format(_start_date, _end_date))
        for date, val in ret_features.items():
            logger.info("Starting processing for date:{0}....".format(date))
            date_features = []
            date_labels = []
            for sec_id, f_lst in val.items():
                date_features.append(list(f_lst.values()))
                all_feature_names = list(f_lst.keys())
                try:
                    _next_date = _next_trading_date(tdays, str(date))
                    s_label = ret_labels.get(sec_id).get(str(_next_date))
                    # NOTE(review): i_label is unused (label is the absolute
                    # next-day return here), but the lookup is kept on purpose:
                    # if the index data is missing it raises and forces
                    # label = np.nan, matching the original behavior.
                    i_label = idx_labels.get(str(_next_date))
                    label = s_label * 100
                except Exception as ex:
                    label = np.nan
                    logger.error('Fail to calculate the label with error:{0}'.format(ex))
                date_labels.append(label)

            try:
                date_features = feature_preprocessing(arr=date_features, fill_none=True, trade_date=date,
                                                      sec_ids=list(val.keys()), neutralized=False)
            except Exception as ex:
                logger.error('Fail in feature preprocessing with error:{0}'.format(ex))
            logger.info('Adding sec_id and trade_dates and reshape...')
            try:
                df_shape = date_features.shape
                date_features = np.column_stack((date_features, list(val.keys()), [date] * df_shape[0]))
                all_features.extend(date_features)
                all_labels.extend(date_labels)
            except Exception as ex:
                logger.error('fail to reshape features features with error:{0}'.format(ex))
            logger.info("Complete processing for date:{0}".format(date))
        logger.info("Complete calculate the features from :{0} to {1}".format(_start_date, _end_date))
    try:
        if 'SECURITY_ID' not in all_feature_names:
            all_feature_names.append('SECURITY_ID')
        if 'TRADE_DATE' not in all_feature_names:
            all_feature_names.append('TRADE_DATE')
        df = pd.DataFrame(all_features, columns=all_feature_names)
        df['LABEL'] = all_labels
    except Exception as ex:
        logger.error(ex)
    del ret_features
    del ret_labels
    gc.collect()
    df.to_csv('hf_feature_6m_abs.csv')
    # Correlation of each HF moment feature against the label; values are
    # stored as strings in the frame, hence the float() conversion.
    corr_var = np.corrcoef([float(item) for item in list(df['ret_var'])], df['LABEL'])
    corr_skew = np.corrcoef([float(item) for item in list(df['ret_skew'])], df['LABEL'])
    corr_kurtosis = np.corrcoef([float(item) for item in list(df['ret_kurtosis'])], df['LABEL'])
    print(df.corr())
    return df, corr_var, corr_skew, corr_kurtosis