Code example #1
def get_market_prediction(start_date='20050103',
                          end_date='20181231',
                          input_period=12,
                          predict_period=6):
    root = get_source_root()
    path = os.path.join(root, 'data', 'macro_indicators.xlsx')
    df = pd.read_excel(path)
    econ_ids, cat = list(df['指标ID']), list(df['分项'])
    factor_cats = dict(zip(econ_ids, cat))
    pmi_path = os.path.join(root, 'data', 'features', 'PMI.xls')
    # pmi_rows = pd.read_excel(pmi_path).values
    dates, factors, ids = get_econ_data(start_date=start_date,
                                        end_date=end_date)
    # use a distinct name so the DataFrame `df` above is not shadowed
    fetcher = DataFetcher(source=0)
    rows, desc = fetcher.get_data_fetcher_obj().get_mkt_equd(
        security_ids=['000300.XSHG'],
        start_date=start_date,
        end_date=end_date,
        asset_type='idx',
        # TRADE_DATE is needed below when locating column indices
        fields=['CLOSE_INDEX', 'PRE_CLOSE_INDEX', 'TRADE_DATE'])

    train_y = []
    ma_lst = []
    # econ_month_lst = [item.strftime('%Y%m') for item in dates]
    # n_pmi = len(pmi_lst)
    # train_x = [pmi_lst[idx: idx + period] for idx in range(n_pmi - period)]
    for item in factors:
        ma_lst.append(
            ta.SMA(np.array(item, dtype=float),
                   input_period)[input_period - 1:-1])
    ma_lst = np.array(ma_lst).transpose()
    close_idx = desc.index('CLOSE_INDEX')
    pre_close_idx = desc.index('PRE_CLOSE_INDEX')
    trade_date_idx = desc.index('TRADE_DATE')
    monthly_return = {}
    tmp_rows = {}
    mon_dates = get_all_month_start_end_dates(start_date, end_date)

    for item in rows:
        tmp_rows.update(
            {item[trade_date_idx].strftime('%Y%m%d'): item[close_idx]})
    for som, eom in mon_dates:
        k = som[:6]
        som_close, eom_close = tmp_rows.get(som), tmp_rows.get(eom)
        # skip months whose start or end date is missing from the price rows
        if som_close is None or eom_close is None:
            continue
        monthly_return.update({k: (eom_close - som_close) / som_close})
    for m in dates[input_period:]:
        # train_y.append(0 if monthly_return.get(m) < 0 else 1)
        tmp = monthly_return.get(m[:6]) or 0.0
        train_y.append(0.0 if tmp < 0 else 1.0)

    # ma_lst = [[sum(item) / len(item)] for item in train_x]
    factor_names = [factor_cats.get(idx) for idx in ids]
    scores = feature_selection_sort(x=ma_lst,
                                    y=train_y,
                                    feature_names=factor_names,
                                    sort_type='pearson')
    df = pd.DataFrame(scores, columns=['相关性', '指标名'])
    df.to_csv('scores_{0}_months.csv'.format(input_period))
    return scores
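feature_selection_sort above is project code; as a rough, self-contained sketch of what a 'pearson' sort can look like (the function name and return shape here are assumptions, not the project's actual API), matching the (score, name) column order the result is given when wrapped in a DataFrame above:

import numpy as np

def pearson_feature_sort(x, y, feature_names):
    """Rank features by absolute Pearson correlation with the label."""
    x = np.asarray(x, dtype=float)  # shape (n_samples, n_features)
    y = np.asarray(y, dtype=float)  # shape (n_samples,)
    scores = []
    for col, name in zip(x.T, feature_names):
        # np.corrcoef returns the 2x2 correlation matrix of (col, y)
        scores.append((np.corrcoef(col, y)[0, 1], name))
    return sorted(scores, key=lambda s: abs(s[0]), reverse=True)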
Code example #2
File: model_selection.py  Project: xiaoqiyu/quant_lab
def get_selected_features():
    root = get_source_root()
    feature_source = os.path.join(os.path.realpath(root), 'data', 'features')
    # files = os.listdir(feature_source)
    # files = [item for item in files if item.startswith('score')]
    # get the score with the corresponding start and end date, otherwise return the latest one
    # TODO confirm whether the listdir function's return is sorted by time
    # _path = 'score_{0}_{1}'.format(start_date, end_date) if 'score_{0}_{1}'.format(start_date, end_date) in files else \
    #     files[-1]
    # _path = files[0]
    _score_path = os.path.join(
        feature_source, 'score{0}_{1}_{2}.csv'.format(
            config['feature_mining_strategy']['benchmark'].split('.')[0],
            config['feature_mining_strategy']['start_date'],
            config['feature_mining_strategy']['end_date']))
    # logger.info("Reading score from path:{0}".format(_score_path))
    score_df = pd.read_csv(_score_path)
    score_df = score_df.sort_values(by='score', ascending=False)
    _feature_names = list(score_df['feature'])
    _score_bound = int(
        len(_feature_names) *
        float(config['feature_mining_strategy']['best_feature_ratio']) / 2)
    feature_names = list(
        set(_feature_names[:_score_bound + 1]).union(
            set(_feature_names[-_score_bound:])))
    return feature_names
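A compact equivalent of the top/bottom selection, with one pitfall of the original made explicit: when the computed bound is 0, `names[-0:]` is the whole list rather than an empty slice. A sketch (the helper name is assumed):

import pandas as pd

def select_extreme_features(score_df, ratio):
    """Union of the top and bottom `ratio / 2` slices of the score ranking."""
    names = list(score_df.sort_values(by='score', ascending=False)['feature'])
    bound = int(len(names) * ratio / 2)
    bottom = set(names[-bound:]) if bound > 0 else set()
    return sorted(set(names[:bound + 1]) | bottom)

# select_extreme_features(pd.DataFrame({'feature': ['a', 'b', 'c', 'd'],
#                                       'score': [0.9, 0.2, -0.1, -0.8]}), 0.5)
# -> ['a', 'b', 'd']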
Code example #3
def get_tick_l2_features(security_id='',
                         trade_date='',
                         start_time='',
                         end_time=''):
    root = get_source_root()
    l2_path = os.path.join(os.path.realpath(root), 'data', 'features',
                           'level2', trade_date)
    tick_path = os.path.join(
        l2_path, 'l2tick',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_df = pd.read_csv(tick_path)
    # Time looks like HHMMSSmmm, so HHMMSS inputs are scaled by 1000
    _start_time_int = int(start_time) * 1000
    _end_time_int = int(end_time) * 1000
    _tick_df = tick_df[(tick_df.Time >= _start_time_int)
                       & (tick_df.Time < _end_time_int)]
    _tick_df.drop_duplicates(keep='last', inplace=True)
    # sum the five best bid/offer levels column-wise
    bid_cols = ['BidVolume{0}'.format(i) for i in range(1, 6)]
    offer_cols = ['OfferVolume{0}'.format(i) for i in range(1, 6)]
    _tick_df['acc_bid_volume5'] = _tick_df[bid_cols].sum(axis=1)
    _tick_df['acc_offer_volume5'] = _tick_df[offer_cols].sum(axis=1)
    _tick_df['quote_imbalance5'] = (_tick_df['acc_bid_volume5'] -
                                    _tick_df['acc_offer_volume5'])
    _tick_df['amplitude1'] = (_tick_df['High'] -
                              _tick_df['Low']) / _tick_df['LastPrice']
    ret = cal_tick_l2_features(df=_tick_df,
                               features=[
                                   'quote_imbalance5', 'quote_imbalance5_std',
                                   'ampl', 'ampl_std'
                               ])
    return ret, _tick_df
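cal_tick_l2_features is not shown, but the *_std feature names suggest rolling statistics over the tick stream. A minimal sketch of that idea (the window length and output layout are assumptions):

import pandas as pd

def rolling_l2_stats(tick_df, window=20):
    """Rolling mean/std of the level-2 quote imbalance (illustrative only)."""
    out = pd.DataFrame(index=tick_df.index)
    out['quote_imbalance5'] = tick_df['quote_imbalance5'].rolling(window).mean()
    out['quote_imbalance5_std'] = tick_df['quote_imbalance5'].rolling(window).std()
    return out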
Code example #4
File: sql_lite_helper.py  Project: xiaoqiyu/quant_lab
def __init__(self):
    root = get_source_root()
    # path of the SQLite file that caches computed features
    feature_source = os.path.join(os.path.realpath(root), 'data', 'features',
                                  'cache_features')
    try:
        self._conn = sqlite.connect(feature_source)
    except Exception:
        traceback.print_exc()
Code example #5
def get_significant_features(top_ratio=0.5, bottom_ratio=0.2):
    root = get_source_root()
    tops = []
    bottoms = []
    ret = defaultdict(dict)
    score_lst = []
    f_types = set()

    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    # for start_date, end_date in [('20180103', '20181230'), ('20140603', '20160103'), ('20160103', '20171230')]:
    for start_date, end_date in [('20150103', '20181231')]:
        corr_path = os.path.join(
            os.path.realpath(root), 'conf',
            'score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        score_ret = defaultdict(list)
        for idx, k, s, ft in df.values:
            _tmp = score_ret.get(ft) or list()
            _tmp.append([k, s])
            score_ret.update({ft: _tmp})
            f_types.add(ft)
        for ft, val in score_ret.items():
            logger.debug('{0}: {1}'.format(ft, len(val)))
            val.sort(key=lambda x: x[1], reverse=True)
            t_dict = ret.get(ft) or dict()
            top_dict = t_dict.get('top_features') or list()
            bottom_dict = t_dict.get('bottom_features') or list()
            top_idx = int(len(val) *
                          top_ratio) if int(len(val) * top_ratio) > 5 else 5
            bottom_idx = int(
                len(val) *
                bottom_ratio) if int(len(val) * bottom_ratio) > 1 else 1
            top_dict.extend(item[0] for item in val[:top_idx])
            bottom_dict.extend(item[0] for item in val[-bottom_idx:])
            t_dict.update({'top_features': top_dict})
            t_dict.update({'bottom_features': bottom_dict})
            ret.update({ft: t_dict})
    for ft, val in ret.items():
        top_lst = list(set(val['top_features']))
        bottom_lst = list(set(val['bottom_features']))
        top_lst.extend(bottom_lst)
        ret.update({ft: top_lst})
    return ret
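The defaultdict bookkeeping above amounts to a per-type group-by followed by taking both tails of the score ranking. A condensed pandas sketch, assuming a frame with feature/score/feature_type columns like the CSV the loop unpacks:

import pandas as pd

def extremes_by_type(df, top_ratio=0.5, bottom_ratio=0.2):
    """Per feature_type, keep the best top_ratio and worst bottom_ratio features."""
    out = {}
    for f_type, grp in df.groupby('feature_type'):
        grp = grp.sort_values('score', ascending=False)
        top_n = max(int(len(grp) * top_ratio), 5)        # same floor of 5 as above
        bottom_n = max(int(len(grp) * bottom_ratio), 1)  # same floor of 1 as above
        out[f_type] = sorted(set(grp['feature'].head(top_n)) |
                             set(grp['feature'].tail(bottom_n)))
    return out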
Code example #6
def result_analysis():
    result = pd.read_pickle('output_result.pkl')
    pprint.pprint(result)
    # margin_lst = list(result['future_positions']['margin'])
    r = get_source_root()
    # skip the first item, then persist each remaining DataFrame
    for k, v in list(result.items())[1:]:
        result_path = os.path.join(r, 'data', 'results', '{0}.csv'.format(k))
        print('save the result to:{0}'.format(result_path))
        v.to_csv(result_path)
Code example #7
def get_source_feature_mappings(feature_types=None):
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    if not feature_types:
        return feature_mapping
    _tmp = copy.deepcopy(feature_mapping)
    for k in _tmp:
        if k not in feature_types:
            feature_mapping.pop(k)
    return feature_mapping
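The deepcopy-and-pop round trip exists only because the dict is mutated while being filtered; a dict comprehension does the same in one step (toy values here are for illustration only):

feature_mapping = {'ma': ['ACD6', 'ACD20'], 'vol': ['VOL20']}
feature_types = ['ma']
filtered = {k: v for k, v in feature_mapping.items() if k in feature_types}
print(filtered)  # {'ma': ['ACD6', 'ACD20']}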
Code example #8
File: data_fetcher_wd.py  Project: xiaoqiyu/quant_lab
def get_econ_data(start_date='', end_date='', fields=None):
    path = os.path.join(get_source_root(), 'data', 'macro_indicators.xlsx')
    df = pd.read_excel(path)
    ids = fields or list(df['指标ID'])
    print(len(ids))
    # the original call left `codes` empty; pass the indicator ids as intended
    ret = w.edb(codes=','.join(ids),
                beginTime=start_date,
                endTime=end_date,
                options="Fill=Previous")
    dates = [item.strftime('%Y%m%d') for item in ret.Times]
    return dates, ret.Data, ids
Code example #9
def train_features(start_date='', end_date='', bc='000300.XSHG'):
    # rows, desc = read_features()
    # _df = pd.DataFrame(rows, columns=desc)
    root = get_source_root()
    df = load_cache_features(start_date, end_date, bc)
    cols = list(df.columns)[:-4]
    cols.append('LABEL')
    df_corr = df[cols].corr()
    # the last column of the correlation matrix holds corr(feature, LABEL)
    score_df = pd.DataFrame(
        {'feature': list(df_corr.iloc[:, -1].index)[:-1],
         'score': list(df_corr.iloc[:, -1].values)[:-1]})
    score_path = os.path.join(os.path.realpath(root), 'data', 'features',
                              'score{0}_{1}_{2}.csv'.format(bc.split('.')[0], start_date, end_date))
    score_df.to_csv(score_path, index=None)
    return df, score_df
Code example #10
def get_sw1_indust_code(sec_ids=[], trade_date=''):
    root = get_source_root()
    indust_path = os.path.join(root, 'data', 'features', 'sw1_indust.csv')
    df = pd.read_csv(indust_path)
    ret = {}
    for sec_id in sec_ids:
        _df = df[df.secID == sec_id]
        _date = str(trade_date)
        # name the row's security column differently so it does not shadow the loop variable
        for idx, row_sec_id, in_code, into_date, out_date, is_new in list(_df.values):
            if not is_new:
                if into_date.replace('-', '') <= _date <= out_date.replace('-', ''):
                    ret.update({row_sec_id: in_code})
            else:
                if into_date.replace('-', '') <= _date:
                    ret.update({row_sec_id: in_code})
    return ret
Code example #11
def feature_refine():
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    score_file = 'testing_train_features_score_20160103_20171230.csv'
    score_file = os.path.join(os.path.realpath(root), 'conf', score_file)
    df = pd.read_csv(score_file)
    values = df.values
    type_lst = []
    for item in values:
        f, s = item[1:]
        # default to None so type_lst stays aligned with the rows even when no mapping matches
        f_type = None
        for k, v in feature_mapping.items():
            if f in v:
                f_type = k
                break
        type_lst.append(f_type)
    df['feature_type'] = type_lst
    save_file = os.path.join(os.path.realpath(root), 'conf',
                             'score_20160103_20171230.csv')
    df.to_csv(save_file)
Code example #12
    def initialize(self,
                   start_date='',
                   end_date='',
                   source=0,
                   tick_cache='qto'):
        _df = g_db_fetcher.get_data_fetcher_obj(source)
        _min_mkt = _df.get_mkt_mins(startdate=start_date,
                                    enddate=end_date,
                                    sec_codes=self.security_ids,
                                    table_name='CUST.EQUITY_PRICEMIN')
        df = pd.DataFrame(_min_mkt[0], columns=_min_mkt[1])
        # close-to-close simple returns; the first bar gets 0.0
        df['RETURN'] = df['CLOSEPRICE'].pct_change().fillna(0.0)
        df['RET2VOL'] = df['RETURN'] / df['VOLUME']
        self.min_cache = df

        # retrieve the trading days in the window from the Wind API
        _w_ret = w.tdays(start_date, end_date)
        t_dates = list(
            set([item.strftime('%Y%m%d') for item in _w_ret.Data[0]]))
        if 'q' in tick_cache and self.security_ids and t_dates:
            df_lst = []
            # security_id = self.security_ids[0]
            # trade_date = t_dates[0]
            for security_id in self.security_ids:
                for trade_date in t_dates:
                    root = get_source_root()
                    l2_path = os.path.join(os.path.realpath(root), 'data',
                                           'features', 'level2', trade_date)
                    tick_path = os.path.join(
                        l2_path, 'l2tick', '{0}_{1}.csv'.format(
                            security_id.split('.')[0], trade_date))
                    df_lst.append(pd.read_csv(tick_path))
            if df_lst:
                # keep the list of per-day DataFrames; the original also appended
                # the list's items back onto itself, which never terminates
                self.q_cache = df_lst
Code example #13
def get_l2_features(security_id='', trade_date='', start_time='', end_time=''):
    root = get_source_root()
    l2_path = os.path.join(os.path.realpath(root), 'data', 'features',
                           'level2', trade_date)
    order_path = os.path.join(
        l2_path, 'l2order',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_path = os.path.join(
        l2_path, 'l2tick',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    trade_path = os.path.join(
        l2_path, 'l2trade',
        '{0}_{1}.csv'.format(security_id.split('.')[0], trade_date))
    tick_df = pd.read_csv(tick_path)
    order_df = pd.read_csv(order_path)
    trade_df = pd.read_csv(trade_path)
    print(order_df.shape)
    # time-window bounds for subsequent filtering (cf. get_tick_l2_features above)
    _start_time_int = int(start_time)
    _end_time_int = int(end_time)
Code example #14
def load_cache_features(start_date='', end_date='', bc='000300.XSHG'):
    print('Loading features from :{0} to {1}'.format(start_date, end_date))
    _w_ret = w.tdays(start_date, end_date)
    # t_months = list(set([item.strftime('%Y%m') for item in _w_ret.Data[0]]))
    # sort so the yearly files are read in a deterministic order
    t_years = sorted(set([item.strftime('%y') for item in _w_ret.Data[0]]))
    root = get_source_root()
    feature_paths = [os.path.join(os.path.realpath(root), 'data', 'features',
                                  'features{0}_{1}.csv'.format(bc.split('.')[0], m_date)) for m_date in t_years]
    if not feature_paths:
        return pd.DataFrame()
    # logger.info('Reading features:{0}'.format(feature_paths[0]))
    df = pd.read_csv(feature_paths[0])
    for p in feature_paths[1:]:
        # logger.info('Reading features:{0}'.format(p))
        _df = pd.read_csv(p)
        df = df.append(_df)
    df = df[df.TRADE_DATE >= int(start_date.replace('-', ''))]
    df = df[df.TRADE_DATE < int(end_date.replace('-', ''))]
    return df
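DataFrame.append used above was removed in pandas 2.0; on current pandas the concatenation step can be written as follows (a sketch over the same feature_paths list):

import pandas as pd

def read_feature_files(feature_paths):
    """Concatenate per-period feature CSVs into one DataFrame."""
    frames = [pd.read_csv(p) for p in feature_paths]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()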
Code example #15
File: sql_lite_helper.py  Project: xiaoqiyu/quant_lab
def create_features_table():
    db = SQLiteHelper()
    root = get_source_root()
    # get the file name of the features
    feature_mapping_source = os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json')
    feature_mapping = load_json_file(feature_mapping_source)
    _vals = list(feature_mapping.values())
    fields = []
    for item in _vals:
        fields.extend(item)
    table_name = 'FEATURE_CACHE'
    s1 = "CREATE TABLE {0} (TICKER_SYMBOL INT, TRADE_DATE TEXT,SECURITY_ID TEXT,D_LABEL REAL,M_LABEL REAL, ".format(
        table_name)
    for f in fields:
        s1 += "{0} REAL,".format(f)
    s1 = s1[:-1] + ')'
    print(s1)
    try:
        db.execute_sql(s1)
    except Exception as ex:
        print(ex)
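create_features_table goes through the project's SQLiteHelper; a self-contained equivalent using the standard sqlite3 module (table layout kept from above, the helper name is assumed) could look like this:

import sqlite3

def create_feature_cache_table(db_path, feature_fields):
    """CREATE TABLE with the fixed key columns plus one REAL column per feature."""
    cols = ['TICKER_SYMBOL INT', 'TRADE_DATE TEXT', 'SECURITY_ID TEXT',
            'D_LABEL REAL', 'M_LABEL REAL']
    cols += ['{0} REAL'.format(f) for f in feature_fields]
    sql = 'CREATE TABLE FEATURE_CACHE ({0})'.format(', '.join(cols))
    with sqlite3.connect(db_path) as conn:
        conn.execute(sql)

# create_feature_cache_table('features.db', ['ACD6', 'ACD20'])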
Code example #16
def show_feature_correlate():
    root = get_source_root()
    tops = []
    bottoms = []
    for start_date, end_date in [('20180103', '20181231'), ('20140603', '20160202'), ('20160103', '20171231')]:
        corr_path = os.path.join(os.path.realpath(root), 'conf',
                                 'feature_score_{0}_{1}.csv'.format(start_date, end_date))
        df = pd.read_csv(corr_path)
        df = df.sort_values(by='score', ascending=False)
        df_top = df[df.score > 0.1]
        tops.append(list(df_top['feature']))
        df_bottom = df[df.score < -0.1]
        bottoms.append(list(df_bottom['feature']))
    # union the per-period top and bottom feature sets
    t = set().union(*tops)
    b = set().union(*bottoms)
    t = t.union(b)
    return list(t)
Code example #17
def get_feature_heatmap(dates=[], bc='000300.XSHG'):
    root = get_source_root()
    source_path = os.path.join(os.path.realpath(root), 'data', 'features')
    _bc = bc.split('.')[0]
    _files = os.listdir(source_path)
    files = [item for item in _files if item.startswith('score') and _bc in item]
    y_values = [item.split('.')[-2].split('_')[1] for item in files]
    f_mapping = get_source_feature_mappings()

    ret_f_score = defaultdict(list)
    for file in files:
        f_type_score = defaultdict(list)
        df = pd.read_csv(os.path.join(source_path, file))
        for item in list(df.values):
            for _type, _lst in f_mapping.items():
                if item[0] in _lst:
                    f_type_score[_type].append(item[1])
        for k, v in f_type_score.items():
            ret_f_score[k].append(sum(v) / len(v))
    pprint.pprint(ret_f_score)
    x_values = list(ret_f_score.keys())
    x = [ret_f_score[k] for k in x_values]

    ax = sns.heatmap(pd.DataFrame(x, index=x_values, columns=y_values))
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

    # generate the score
    # scores = []
    # for start_date, end_date in dates:
    #     df, score_df = train_features(start_date=start_date, end_date=end_date, bc=bc)
    #     scores.append(score_df['score'])
    # scores = np.array(scores).transpose()
    # ax = sns.heatmap(scores)
    plt.savefig('feature_heatmap_{0}.jpg'.format(_bc))
Code example #18
def cache_features(start_date='20180101', end_date='20181231', data_source=0,
                   feature_types=[], bc='000300.XSHG', suffix=None):
    '''
    Cache the retrieved features to CSV files.
    :param start_date:
    :param end_date:
    :param data_source:
    :param feature_types:
    :param bc: benchmark index code
    :param suffix: if given, write a single file named with this suffix
        instead of splitting by month
    :return:
    '''
    logger.info("Start retrieve features from {0} to {1}....".format(start_date, end_date))
    df = retrieve_features(start_date=start_date, end_date=end_date, data_source=data_source,
                           feature_types=feature_types, bc=bc)
    logger.info("Complete retrieve features from {0} to {1}".format(start_date, end_date))
    # save_features(tuple(df.columns), tuple(df.values))
    root = get_source_root()

    # df.to_pickle(feature_source)
    logger.info("Start saving the features....")
    if suffix:
        feature_source = os.path.join(os.path.realpath(root), 'data', 'features',
                                      'features{0}_{1}.csv'.format(bc.split('.')[0], suffix))
        df.to_csv(feature_source)
        del df
        return
    df['MONTH'] = [item[:6] for item in df['TRADE_DATE']]
    m_dates = set(df['MONTH'])
    n_mdates = len(m_dates)
    for idx, m_date in enumerate(m_dates):
        _df = df[df.MONTH == m_date]
        feature_source = os.path.join(os.path.realpath(root), 'data', 'features',
                                      'features{0}_{1}.csv'.format(bc.split('.')[0], m_date))
        logger.info('Saving features file {0} ({1} of {2})'.format(feature_source, idx + 1, n_mdates))
        _df.to_csv(feature_source, index=None)
    del df
    gc.collect()
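The month-by-month split above is a group-by in disguise; an equivalent sketch (the output file pattern is assumed):

import pandas as pd

def save_features_by_month(df, out_pattern='features_{0}.csv'):
    """Write one CSV per MONTH value, like the manual loop above."""
    for m_date, _df in df.groupby('MONTH'):
        _df.to_csv(out_pattern.format(m_date), index=False)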
Code example #19
import os

from collections import defaultdict
from quant_models.utils.helper import get_source_root
from quant_models.utils.helper import get_config
from quant_models.utils.helper import get_parent_dir
from quant_models.applications.feature_mining.model_selection import get_selected_features
from quant_models.applications.feature_mining.model_selection import train_models
from quant_models.applications.feature_mining.feature_selection import load_cache_features
from quant_models.data_processing.features_calculation import get_sw_indust
from sklearn import decomposition
from sklearn.externals import joblib  # removed in sklearn >= 0.23; use "import joblib" there

model_name = 'linear'
config = get_config()
strategy_config = config['feature_mining_strategy']
# TODO change the path of the backtesting results
root = get_source_root()

# get the file name of the features
# _feature_path = os.path.join(os.path.realpath(root), 'data', 'features', 'feature_mining_strategy')
# w.start()


def init(context):
    # model_path = os.path.join(get_parent_dir(), 'data', 'models', 'stock_selection_{0}'.format(model_name))
    model_path = os.path.join(get_parent_dir(), 'data', 'models',
                              'ridge_20150103_20181231_0.9_000905.ZICN')
    feature_names = get_selected_features()
    context.features = load_cache_features(__config__['base']['start_date'],
                                           __config__['base']['end_date'],
                                           __config__['base']['benchmark'])
    context.model = joblib.load(model_path)
Code example #20
def get_equity_daily_features(security_ids=[],
                              features={'ma': ['ACD6', 'ACD20']},
                              start_date=20181101,
                              end_date=20181102,
                              trade_date=None,
                              source=0):
    logger.info(
        'Start calculating features from {0} to {1} for {2} sec_ids and {3} feature types'
        .format(start_date, end_date, len(security_ids), len(features)))

    ret_features = defaultdict(dict)
    # query on one date
    if trade_date:
        start_date = end_date = trade_date
    if isinstance(start_date, str):
        start_date = int(start_date)
    if isinstance(end_date, str):
        end_date = int(end_date)
    retrieve_feature_names = list()
    for f_type, f_val in features.items():
        retrieve_feature_names.extend(f_val)
    retrieve_feature_names = list(set(retrieve_feature_names))
    root = get_source_root()
    feature_mapping = load_json_file(
        os.path.join(os.path.realpath(root), 'conf', 'feature_mapping.json'))
    source_features = []
    for item in list(feature_mapping.values()):
        source_features.extend(item)
    cal_features = list(set(retrieve_feature_names) - set(source_features))
    _df = g_db_fetcher.get_data_fetcher_obj(source)
    excluded = [
        'CREATE_TIME', 'UPDATE_TIME', 'TMSTAMP', 'ID', 'SECURITY_ID_INT',
        'SECURITY_ID', 'TRADE_DATE', 'TICKER_SYMBOL'
    ]
    retrieve_feature_names = [item.upper() for item in retrieve_feature_names]
    for f_type, f_fields in features.items():
        rows, desc = _df.get_equ_factor(fields=f_fields,
                                        factor_type=f_type,
                                        security_ids=security_ids,
                                        start_date=start_date,
                                        end_date=end_date)
        # logger.info('Complete querying')
        id_idx = desc.index('SECURITY_ID')
        date_idx = desc.index('TRADE_DATE')
        # logger.info('start processing rows for factor type:{0}'.format(f_type))

        if not f_fields:
            continue
        for item in rows:
            sec_id, date = item[id_idx], item[date_idx]
            # ret_features is a defaultdict(dict); index it directly so the
            # per-date dict we mutate is the one that is actually stored
            date_dict = ret_features[date]
            if sec_id in date_dict:
                curr_dict = date_dict[sec_id]
            else:
                curr_dict = {}
                date_dict[sec_id] = curr_dict
            idx_lst = [idx for idx, val in enumerate(desc)
                       if val.upper() in retrieve_feature_names]
            tmp_lst = [item[idx] for idx in idx_lst]
            tmp_dict = dict(zip([desc[idx] for idx in idx_lst], tmp_lst))
            tmp_dict1 = copy.deepcopy(tmp_dict)
            for k1, v1 in tmp_dict1.items():
                if k1 in excluded:
                    tmp_dict.pop(k1)
            # add the pre-defined calculated features
            tmp_dict = get_cal_features(tmp_dict, cal_features)
            if tmp_dict:
                curr_dict.update(tmp_dict)
            if curr_dict:
                ret_features[date][sec_id] = curr_dict
        # logger.info('complete processing rows for factor type:{0}'.format(f_type))
        del rows
        gc.collect()
        time.sleep(3)
    for date, val in ret_features.items():
        for sec_id, _val in val.items():
            _keys = set(_val.keys())
            _add_keys = set(retrieve_feature_names) - _keys
            _remove_keys = _keys - set(retrieve_feature_names)
            for _k in _add_keys:
                _val.update({_k: None})
            for _k in _remove_keys:
                _val.pop(_k)
    # FIXME check whether the length of the features are the same now
    return ret_features
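The final loop pads missing features with None and drops extras so every security carries exactly the requested key set; a compact equivalent for a single security's dict:

def align_feature_keys(feature_dict, wanted_keys):
    """Restrict a dict to wanted_keys, padding absent ones with None."""
    return {k: feature_dict.get(k) for k in wanted_keys}

# align_feature_keys({'ACD6': 1.2, 'EXTRA': 9.0}, ['ACD6', 'ACD20'])
# -> {'ACD6': 1.2, 'ACD20': None}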
Code example #21
def feature_models(start_date='20181101',
                   end_date='20181131',
                   data_source=0,
                   feature_types=[],
                   saved_feature=True,
                   bc=None,
                   top_ratio=0.25,
                   bottom_ratio=0.2):
    root = get_source_root()
    feature_mapping = _get_source_features()

    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []

    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        # industry exposures were never assigned in this variant; compute them
        # per period the way get_factor_returns does below
        industry_exposure_factors = get_indust_exposures_time_series_regression(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # indus_factors = get_indust_vectors(security_ids)
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                country_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
Code example #22
def get_factor_returns(start_date='20181101',
                       end_date='20181131',
                       data_source=0,
                       feature_types=[],
                       saved_feature=True,
                       bc=None,
                       top_ratio=0.25,
                       bottom_ratio=0.2):
    '''
    multi factor framework, regression to calculate the factor returns
    :param start_date:
    :param end_date:
    :param data_source:
    :param feature_types:
    :param saved_feature:
    :param bc:
    :param top_ratio:
    :param bottom_ratio:
    :return:
    '''
    root = get_source_root()
    feature_mapping = _get_source_features() or get_source_feature_mappings(
        source=True,
        feature_types=feature_types,
        top_ratio=top_ratio,
        bottom_ratio=bottom_ratio)

    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id='000300.XSHG') or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []

    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:5]
        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=next_date)
        none_factor_dict = defaultdict()
        # FIXME uses future industry returns; should instead be rebuilt from trailing windows of history
        industry_exposure_factors = get_indust_exposures_time_series_regression(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns)
        _industry_exposure_df = pd.DataFrame(industry_exposure_factors)
        _industry_exposure_df.to_csv('{0}_{1}_{2}.csv'.format(
            _start_date, _end_date, bc))
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))
            # indus_factors = get_indust_vectors(security_ids)
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            factor_returns = factor_return_regression(
                country_factors, industry_exposure_factors, daily_factors,
                daily_return)
            all_factor_returns.append(factor_returns)
    mean_returns = np.array(all_factor_returns).mean(axis=0)
    return mean_returns
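factor_return_regression is project code; in the usual multi-factor setup the daily factor returns are the coefficients of a cross-sectional regression of stock returns on the stacked exposures. A generic numpy sketch under that assumption:

import numpy as np

def cross_sectional_factor_returns(country, industry, style, returns):
    """Regress one day's stock returns on [country | industry | style] exposures."""
    X = np.hstack([country, industry, style])  # (n_stocks, n_factors)
    y = np.asarray(returns, dtype=float)       # (n_stocks,)
    coefs, _, _, _ = np.linalg.lstsq(X, y, rcond=None)
    return coefs                               # one return per factor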
Code example #23
File: model_selection.py  Project: xiaoqiyu/quant_lab
def train_models(model_name='',
                 start_date='20140603',
                 end_date='20181231',
                 feature_ratio=1.0,
                 bc='000300.XSHG',
                 feature_df=None,
                 score_df=None,
                 cache_df=False):
    '''
    :param model_name:
    :param start_date:
    :param end_date:
    :param feature_ratio: picks the features whose scores rank in the top or
        bottom feature_ratio / 2 of the sorted score list
    :return:
    '''
    m = Ml_Reg_Model(model_name)
    model_full_name = '{0}_{1}_{2}_{3}_{4}'.format(model_name, start_date,
                                                   end_date, feature_ratio, bc)
    if not m.load_model(model_full_name):
        m.build_model()
    root = get_source_root()
    if not cache_df:
        df, score_df = train_features(start_date=start_date,
                                      end_date=end_date,
                                      bc=bc)
    else:
        df = feature_df
    score_df = score_df.sort_values(by='score', ascending=False)
    _feature_names = list(score_df['feature'])
    _score_bound = int(len(_feature_names) * feature_ratio / 2)
    feature_names = list(
        set(_feature_names[:_score_bound + 1]).union(
            set(_feature_names[-_score_bound:])))

    # select the features by the ic values
    # feature_names = get_selected_features(start_date=start_date, end_date=end_date, up_ratio=score_bound[0],
    #                                       down_ratio=score_bound[1])

    train_X = df[feature_names].values
    dates = list(df['TRADE_DATE'])
    sec_ids = list(df['SECURITY_ID'])
    train_Y = df.iloc[:, -1]
    decom_ratio = float(config['feature_mining_strategy']['component_ratio'])
    # PCA processing
    n_component = min(int(len(feature_names) * decom_ratio),
                      int(config['feature_mining_strategy']['n_component']))
    pca = decomposition.PCA(n_components=n_component)
    # train_X = pca.fit_transform(train_X)
    train_X = pca.fit_transform(pd.DataFrame(train_X).fillna(method='ffill'))
    train_Y = train_Y.fillna(0.0)
    st = time.time()
    logger.info('start training the models')
    # mse_scores, r2_scores = m.train_model(train_X[:1000], train_Y[:1000])
    mse_scores, r2_scores = m.train_model(train_X, train_Y)
    et = time.time()
    logger.info('complete training model with time:{0}'.format(et - st))

    m.save_model(model_full_name)
    logger.info("Mean squared error:{0}: %0.5f -  %0.5f" %
                (mse_scores.mean() - mse_scores.std() * 3,
                 mse_scores.mean() + mse_scores.std() * 3))
    logger.info("Mean squared error:{0}: %0.5f -  %0.5f" %
                (r2_scores.mean() - r2_scores.std() * 3,
                 r2_scores.mean() + r2_scores.std() * 3))
    result_path = os.path.join(os.path.realpath(root), 'data', 'results',
                               'feature_model_selection.txt')

    with open(result_path, 'a+') as fout:
        # fout.write('{0}\n'.format(datetime.datetime.now().strftime(config['constants']['no_dash_datetime_format'])))
        fout.write('{0}\n'.format(model_full_name))
        fout.write('mse: {0}\n r2_score:{1}\n'.format(str(list(mse_scores)),
                                                      str(list(r2_scores))))
    return mse_scores, r2_scores
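Note that save_model persists only the regression model while the fitted PCA stays local, so a later scoring run cannot reproduce the transform. A sketch of persisting both (file naming is assumed):

from sklearn.externals import joblib  # on sklearn >= 0.23, use "import joblib"

def save_pipeline(model, pca, name):
    """Persist the model and the fitted PCA side by side."""
    joblib.dump(model, '{0}_model.pkl'.format(name))
    joblib.dump(pca, '{0}_pca.pkl'.format(name))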
Code example #24
def feature_models(start_date='20181101',
                   end_date='20181131',
                   data_source=0,
                   feature_types=[],
                   saved_feature=True,
                   bc=None,
                   top_ratio=0.25,
                   bottom_ratio=0.2):
    root = get_source_root()
    feature_mapping = _get_source_features()
    feature_shape = [(1, len(item)) for item in list(feature_mapping.values())]
    # FIXME remove hardcode
    indust_shape = (1, 17)
    date_periods = _get_in_out_dates(
        start_date=start_date, end_date=end_date,
        security_id=bc) or [[start_date, end_date]]
    next_date = datetime_delta(dt=end_date, format='%Y%m%d', days=1)
    idx_labels = get_idx_returns(security_ids=[bc],
                                 start_date=start_date,
                                 end_date=next_date,
                                 source=0).get(bc)
    all_factor_returns = []
    # FIXME change to load existing model
    m = TFMultiFactor()
    m.build_model(feature_shape=feature_shape, indust_shape=indust_shape)
    for _start_date, _end_date in date_periods:
        logger.info('get factor return: processing from {0} to {1}'.format(
            _start_date, _end_date))
        _next_date = datetime_delta(dt=_end_date, format='%Y%m%d', days=2)
        if bc:
            security_ids = get_idx_cons_jy(bc, _start_date, _end_date)
        else:
            security_ids = get_security_codes()
        # FIXME HACK FOR TESTING
        security_ids = security_ids[:50]

        ret_features = get_equity_daily_features(security_ids=security_ids,
                                                 features=feature_mapping,
                                                 start_date=_start_date,
                                                 end_date=_end_date,
                                                 source=data_source)
        ret_returns = get_equity_returns(security_ids=security_ids,
                                         start_date=_start_date,
                                         end_date=_next_date)
        ret_mv = get_market_value(security_ids=security_ids,
                                  start_date=_start_date,
                                  end_date=_next_date)

        # FIXME industry vector updates
        industry_exposure_factors, industry_name = get_indust_exposures_corr(
            start_date=_start_date,
            end_date=_end_date,
            stock_returns=ret_returns,
            period=120,
            bc_returns=idx_labels)
        for date, val in ret_features.items():
            daily_factors = []
            daily_return = []
            daily_mv = []
            for sec_id, f_lst in val.items():
                _val_lst = list(f_lst.values())
                all_feature_names = list(f_lst.keys())
                daily_factors.append(_val_lst)
                try:
                    s_label = ret_returns.get(sec_id).get(str(date))
                    i_label = idx_labels.get(str(date))
                    label = (s_label - i_label) * 100
                    mv = ret_mv.get(sec_id).get(str(date))
                except Exception as ex:
                    label = np.nan
                    logger.error(
                        'fail to calculate the label with error:{0}'.format(
                            ex))
                daily_return.append(label)
                daily_mv.append(mv)
            try:
                daily_factors = feature_preprocessing(arr=daily_factors,
                                                      fill_none=True,
                                                      weights=daily_mv)
            except Exception as ex:
                logger.error(
                    'fail in feature preprocessing with error:{0}'.format(ex))

            # TODO country factors TBD (currently unused by m.train_model)
            country_factors = np.ones(len(security_ids)).reshape(
                len(security_ids), 1)
            m.train_model(
                daily_factors,
                np.array(daily_return).reshape(len(daily_return), 1),
                industry_exposure_factors,
                2,
                5,
            )
    return