Exemple #1
0
    def update_financial_data(self):
        """Refresh fundamental (financial-statement) data if a newer report
        period should be available.

        Uses the last column date of the stored ROA-TTM table as a proxy for
        the most recent report period on disk, then compares it with today's
        month under A-share disclosure deadlines (Q1 by end of April, H1 by
        end of August, Q3 by end of October, annual by end of April of the
        following year) to decide whether an update run is needed.
        """
        data = Data()
        roattm = data.roattm
        # Latest report period already on disk (columns are report dates).
        latest_date = roattm.columns[-1]
        needed = 1  # 1 -> update needed, 0 -> data considered current
        if latest_date.month == 3:
            # Q1 on disk; H1 reports are not due until end of August.
            if datetime.today().month <= 7:
                needed = 0

        elif latest_date.month == 6:
            # H1 on disk; Q3 reports are not due until end of October.
            if datetime.today().month <= 10:
                needed = 0

        elif latest_date.month == 9:
            # Q3 on disk; annual reports are due by end of April of the NEXT
            # year, so calendar months Jan-Apr need no update.
            if datetime.today().month <= 4:
                needed = 0

        elif latest_date.month == 12:
            # Annual data as the latest stored period is unexpected here.
            raise ValueError

        if needed == 1:
            try:
                update_all_basic()
            except Exception as e:
                # Best-effort: the Juyuan data link is flaky; just report it.
                print('暂时链接不上聚源,再试')

        else:
            print('无需更新财务数据')
Exemple #2
0
def month_return_compare_to_market_index(stock_list, his_month):
    """Compare monthly returns of *stock_list* for the month of *his_month*
    with the market indices' returns for the same month.

    Returns ``(res1, res2)``: ``res2`` holds the per-stock monthly change
    percentages, ``res1`` the index returns plus the equal-weighted
    portfolio mean as row '组合'.
    """
    data = Data()
    changePCT = data.changepct_monthly

    # Locate the column whose (year, month) matches his_month.
    target = (his_month.year, his_month.month)
    ff = next((c for c in changePCT.columns
               if (c.year, c.month) == target), None)
    res2 = changePCT.loc[stock_list, ff]

    # Monthly index prices -> simple monthly returns.
    index_path = r'D:\pythoncode\IndexEnhancement\指数相关'
    index_price = pd.read_csv(os.path.join(index_path, 'index_price_monthly.csv'), engine='python')
    index_price = index_price.set_index(index_price.columns[0])
    index_price.index = pd.to_datetime(index_price.index)
    index_r = (index_price - index_price.shift(1)) / index_price.shift(1)

    # Same (year, month) lookup on the index-return table.
    fff = next((c for c in index_r.index
                if (c.year, c.month) == target), None)

    res1 = pd.DataFrame({fff: index_r.loc[fff, :].drop_duplicates()})

    # Append the portfolio's equal-weighted mean return as row '组合'.
    portfolio_row = pd.DataFrame(data=res2.mean(), index=['组合'], columns=[ff])
    res1 = pd.concat([res1, portfolio_row], axis=0)

    return res1, res2
Exemple #3
0
def name_to_code(names_list):
    """Map stock display names to their codes via the basic-info table."""
    data = Data()
    mapping = data.stock_basic_inform
    # Keep only the name column, expose the index (codes) as a column,
    # then flip the mapping so names index the codes.
    mapping = mapping[['SEC_NAME']]
    mapping['CODE'] = mapping.index
    mapping = mapping.set_index('SEC_NAME')
    return list(mapping.loc[names_list, 'CODE'])
 def load_benchmark(self, benchmark_str):
     """Load monthly benchmark prices and returns for *benchmark_str*.

     Broad-market names come from the index price table; anything else is
     treated as a Shenwan industry name (suffix '(申万)').
     """
     data = Data()
     if self.freq == 'M':
         if benchmark_str in ['WindA', 'HS300', 'SH50', 'ZZ500']:
             source = data.index_price_monthly
             key = benchmark_str
         else:
             source = data.industry_price_monthly
             key = benchmark_str + '(申万)'
         self.benchmark_p = source.loc[key, :]
         # Simple returns from the monthly price series.
         self.benchmark_r = self.benchmark_p / self.benchmark_p.shift(1) - 1
Exemple #5
0
def del_industry(stock_pool, to_del_indus):
    """Set every stock belonging to an industry in *to_del_indus* to False
    in each column of the boolean *stock_pool* frame (edited in place)."""
    data = Data()
    industry_tbl = data.stock_basic_inform[['申万一级行业']]
    first_col = industry_tbl.columns[0]

    for _, column in stock_pool.iteritems():
        for industry in to_del_indus:
            members = industry_tbl[industry_tbl[first_col] == industry].index
            in_pool = [code for code in members if code in stock_pool.index]
            column[in_pool] = False

    return stock_pool
def deal_marco_data():
    """Assemble and smooth the macro dataset from the raw daily and monthly
    macro tables ('marco' is a historical typo kept for callers)."""
    data = Data()
    month_raw = data.month_macro_data_raw
    daily_raw = data.daily_macro_data_raw

    # Process each frequency, then align natural dates to trading dates.
    daily_aligned = naturedate_to_tradeddate(
        deal_daily_macro_data(daily_raw.T), tar='index')
    monthly_aligned = naturedate_to_tradeddate(
        deal_month_macro_data(month_raw.T), tar='index')

    combined = pd.concat([daily_aligned, monthly_aligned], axis=1)
    return macro_data_smooth_process(combined)
def form_stock2_second_indus(panel_path, save_path):
    '''
    Convert monthly per-stock weight panels into second-level industry
    weights.

    :param panel_path: directory holding the monthly panel csv files
    :param save_path:  directory where the result file is written
    :return: None (writes '二级行业权重.csv' under save_path)
    '''
    data = Data()
    indus_infor = data.reindex(data.secondindustryname)

    total = pd.DataFrame()
    for fname in os.listdir(panel_path):
        stem = fname.split('.')[0]
        panel = pd.read_csv(os.path.join(panel_path, fname),
                            encoding='gbk',
                            engine='python').set_index('wind_code')

        # Use this month's industry snapshot when available, otherwise
        # fall back to the most recent snapshot (last column).
        if stem in indus_infor.columns:
            panel['second_indus'] = indus_infor[stem]
        else:
            panel['second_indus'] = indus_infor[indus_infor.columns[-1]]

        panel = panel.dropna(axis=0, how='any')
        # Re-normalise weights to sum to 100 after the dropna.
        panel['i_weight'] = 100 * panel['i_weight'] / np.sum(panel['i_weight'])

        indus_wei = (panel[['i_weight', 'second_indus']]
                     .groupby('second_indus')
                     .sum()
                     .T)
        indus_wei.index = [stem]

        total = pd.concat([total, indus_wei], axis=0)
        total = total.fillna(0)

    total.to_csv(os.path.join(save_path, '二级行业权重.csv'),
                 encoding='gbk')
def update_f_data_from_wind():
    """Incrementally download quarterly financial fields from Wind and
    append them to the locally stored tables.

    For each field in ``iterms``: load the existing table (if any) to find
    the last stored report date; if it is more than ~one quarter (110 days)
    old, fetch the missing quarters from Wind and save the extended table.
    """
    path = r'D:\pythoncode\IndexEnhancement\barra_cne6\download_from_juyuan'
    w.start()

    data = Data()
    stock_basic_inform = data.stock_basic_inform

    iterms = [
        'rd_exp',  # R&D expense
    ]
    # Wind expects a single comma-separated code string.
    codes_str = ','.join(stock_basic_inform.index)

    eds = datetime.today().strftime("%Y-%m-%d")

    for it in iterms:
        try:
            # getattr instead of eval('data.' + it): same attribute lookup,
            # no arbitrary code execution.
            tmp_df = getattr(data, it)
            tds = tmp_df.columns[-1]
        except Exception:
            # No local table yet -- start from scratch.
            tmp_df = pd.DataFrame()
            tds = datetime(2009, 1, 1)

        # Only refresh when the stored data is older than ~a quarter.
        if (datetime.today() - tds).days > 110:
            res_tmp = w.wsd(codes_str,
                            it,
                            tds.strftime("%Y-%m-%d"),
                            eds,
                            "unit=1;rptType=1;Period=Q;Days=Alldays",
                            usedf=True)
            # w.wsd returns (error_code, DataFrame); keep the frame, with
            # stocks as rows after the transpose.
            new_cols = res_tmp[1].T
            tmp_df = pd.concat([tmp_df, new_cols], axis=1)
            data.save(tmp_df, it, save_path=path)
Exemple #9
0
def rps_factor(rps_min=50, rps_max=100):
    """Build a boolean stock pool from the RPS factor.

    A stock qualifies when its RPS value lies within [rps_min, rps_max].
    The result is shifted one period to the right so it can be used in
    backtests (a signal acts on the following period).

    :param rps_min: lower RPS bound
    :param rps_max: upper RPS bound
    :return: boolean DataFrame (stocks x periods)
    """
    data = Data()
    rps = data.RPS
    rps.fillna(0, inplace=True)

    rps_cond = float_2_bool_df(rps, min_para=rps_min, max_para=rps_max)
    # Shift one period for backtesting; drop the now-empty first column.
    rps_cond = rps_cond.shift(1, axis=1)
    rps_cond.dropna(axis=1, how='all', inplace=True)

    # NOTE(review): the original also shifted/trimmed `rps` itself, but
    # that result was never returned -- dead code removed.
    return rps_cond
def get_indus_wt_in_index(index, indus_level='second'):
    """Aggregate the stock weights of *index* into industry weights.

    :param index: identifier understood by get_stock_wt_in_index
    :param indus_level: 'first' or 'second' Shenwan industry level
    :return: DataFrame of industry weights (industries x dates)
    :raises ValueError: if *indus_level* is unknown (previously this fell
        through to a NameError on the unbound `industry` variable)
    """
    index_wt = get_stock_wt_in_index(index)
    data = Data()
    if indus_level == 'first':
        industry = data.firstindustryname
    elif indus_level == 'second':
        industry = data.secondindustryname
    else:
        raise ValueError("indus_level must be 'first' or 'second', got "
                         + repr(indus_level))

    industry = data.reindex(industry)
    industry = industry.loc[index_wt.index, index_wt.columns]

    indus_wt = pd.DataFrame()
    for d in index_wt.columns:
        # Pair each stock's weight with its industry on date d, then sum
        # the weights per industry.
        tmp_df = pd.concat([index_wt.loc[:, d], industry.loc[:, d]], axis=1)
        tmp_df = tmp_df.dropna()
        tmp_df.columns = ['wt', 'industry']
        indus_wt_tmp = tmp_df['wt'].groupby(tmp_df['industry']).sum()
        indus_wt_df = pd.DataFrame(indus_wt_tmp.values,
                                   index=indus_wt_tmp.index, columns=[d])

        indus_wt = pd.concat([indus_wt, indus_wt_df], axis=1)

    indus_wt = indus_wt.fillna(0)
    return indus_wt
Exemple #11
0
def from_stock_wei_2_industry_wei(wei_df):
    """Convert per-stock weights into Shenwan first-level industry weights.

    For each date column, stock weights are grouped and summed by the
    stock's industry.  Stocks without an industry mapping fall into a
    group labelled 0 (via fillna), which is dropped.

    :param wei_df: DataFrame of stock weights (stocks x dates)
    :return: DataFrame of industry weights (industries x dates)
    """
    data = Data()
    all_stocks_code = data.stock_basic_inform
    all_stocks_code = all_stocks_code[['申万一级行业']]
    wei_df = wei_df.fillna(0)
    res = pd.DataFrame()
    for col, se in wei_df.iteritems():
        tmp_df = pd.DataFrame({col: se})
        to_group = pd.concat([tmp_df, all_stocks_code], axis=1)
        to_group.fillna(0, inplace=True)
        grouped = to_group.groupby('申万一级行业').sum()
        if 0 in grouped.index:
            grouped.drop(0, axis=0, inplace=True)
        # BUGFIX: this concat used to sit inside the `if` above, so dates
        # without an unmapped (0) group were silently dropped from res.
        res = pd.concat([res, grouped], axis=1)

    return res
Exemple #12
0
def add_industry_infor():
    """Append the Shenwan second-level industry name to every monthly
    factor panel under the factor directory."""
    # Temporary data directory (created on demand).
    tmp_save_path = r'D:\pythoncode\IndexEnhancement\barra_cne6\tmp'
    if not os.path.exists(tmp_save_path):
        os.makedirs(tmp_save_path)

    data = Data()
    second_ind = data.industry_sw_2

    # add_to_panels expects a Series: take the first column.
    industry_series = second_ind[second_ind.columns[0]]
    target_date_path = r'D:\pythoncode\IndexEnhancement\因子预处理模块\因子'
    add_to_panels(industry_series, target_date_path, 'second_industry')
Exemple #13
0
def easy_bt(wei_stocks, basic_return_infor):
    """Simple backtest of a stock-weight panel against next-day open
    change percentages.

    :param wei_stocks: DataFrame of stock weights (stocks x dates)
    :param basic_return_infor: benchmark info -- a csv path or a DataFrame
        containing either a 'daily_return' column or a 'net_value' column
        (returns are derived from the latter); anything else means no
        benchmark.
    :return: (daily_return, net_value, cum_excess_df); cum_excess_df is
        None when no usable benchmark was supplied.
    """
    data = Data()
    changepct_daily = data.CHANGEPECT_OPEN_DAILY
    # Shift left one day: weights decided today earn tomorrow's return.
    changepct_daily = changepct_daily.shift(-1, axis=1)
    changepct_daily.dropna(how='all', axis=1, inplace=True)

    changepct_daily = changepct_daily / 100

    wei_stocks, changepct_daily = align(wei_stocks, changepct_daily)

    # fee_type='No_fee' ignores commission/stamp tax; 'fee_1' includes
    # them but not market impact.
    daily_return, net_value = back_test(changepct_daily, wei_stocks,
                                        fee_type='fee_1')

    # With a benchmark, also compute the cumulative excess return.
    if isinstance(basic_return_infor, str):
        basic_return = pd.read_csv(basic_return_infor, engine='python')
        basic_return = basic_return.set_index('date')
        cum_excess_df = _cum_excess(daily_return, basic_return)
    elif isinstance(basic_return_infor, pd.DataFrame):
        cum_excess_df = _cum_excess(daily_return, basic_return_infor)
    else:
        cum_excess_df = None

    return daily_return, net_value, cum_excess_df


def _cum_excess(daily_return, basic_return):
    """Cumulative excess return of the strategy over a benchmark.

    Derives the benchmark's 'daily_return' from 'net_value' when absent.
    Returns None when neither column exists (the original raised
    NameError here), and no longer mutates the caller's DataFrame.
    The excess series is always NaN-dropped before compounding (the
    original skipped the dropna in one branch, letting NaNs poison the
    cumulative product).
    """
    if 'daily_return' not in basic_return.columns:
        if 'net_value' not in basic_return.columns:
            return None
        basic_return = basic_return.copy()
        basic_return['daily_return'] = (basic_return['net_value'] /
                                        basic_return['net_value'].shift(1) - 1)

    daily_excess_r = daily_return['daily_return'] - basic_return['daily_return']
    daily_excess_r.dropna(inplace=True)

    daily_excess_cum = (daily_excess_r + 1).cumprod()
    return pd.DataFrame({'cum_excess_ret': daily_excess_cum})
Exemple #14
0
def new_record_stock(stock_wei, save_name='每期', save_path=None):
    """Dump, for each date column, the names of stocks with non-zero
    weight to a csv, and return the resulting DataFrame."""
    data = Data()
    name_map = data.all_stocks_code
    name_map = name_map[['wind_code', 'sec_name']].set_index('wind_code')

    # The longest holding list determines the row count of the output.
    longest = np.sum(stock_wei != 0.0, axis=0).max()
    result = pd.DataFrame(index=range(0, longest))

    for date_col, weights in stock_wei.iteritems():
        held = weights[weights.index[weights != 0]]
        names = name_map.loc[held.index, :]
        names = pd.DataFrame(names.values, columns=[date_col])
        result = pd.concat([result, names], axis=1)

    if not save_path:
        save_path = r'D:\pythoncode\IndexEnhancement'
    result.to_csv(os.path.join(save_path, save_name + '选股结果.csv'),
                  encoding='gbk')
    return result
def select_import_wei(wei_se, n_max, abs_max_ratio=0.9, max_in_indus=2):
    '''
    Trim an optimised weight vector down to the most important stocks.

    Keeps, per Shenwan first-level industry, the `max_in_indus` largest
    weights, then tops the selection up with the globally largest remaining
    weights until about ``n_max * abs_max_ratio`` stocks are kept.  Stocks
    with weight > 0.001 that were not kept are returned as candidates for
    a second optimisation pass.

    :param wei_se: Series of stock weights indexed by stock code
    :param n_max: maximum number of stocks allowed in the final portfolio
    :param abs_max_ratio: fraction of n_max to fill in this first pass
    :param max_in_indus: number of top weights kept per industry
    :return: (res_wei, n_left, tobe_opt) -- kept weights, count of kept
             positive weights, and codes left for further optimisation
    '''
    data = Data()
    basic_inform = data.stock_basic_inform
    indus_map = basic_inform.loc[wei_se.index, '申万一级行业']
    # Accumulator for kept weights (NaN until explicitly assigned).
    res_wei = pd.Series(index=wei_se.index)

    # Step 1: keep the `max_in_indus` largest weights inside each industry.
    wei_se = wei_se.sort_index()
    dat_df = pd.DataFrame({'wei': wei_se, 'industry': indus_map})
    grouped = dat_df.groupby('industry')
    for ind, v in grouped:
        tmp = v['wei'].sort_values(ascending=False)
        res_wei[tmp[:max_in_indus].index] = tmp[:max_in_indus]

    num_0 = np.sum(res_wei > 0)
    # Step 2: top up with the largest remaining absolute weights until
    # roughly n_max * abs_max_ratio stocks are kept in total.
    num_1 = int(n_max * abs_max_ratio) - num_0
    tmp = list(set(wei_se.index) - set(res_wei.dropna().index))
    left_se = wei_se[tmp]
    left_se = left_se.sort_values(ascending=False)
    tmp1 = left_se[:num_1]
    res_wei[tmp1.index] = tmp1

    res_wei = res_wei.dropna()
    n_left = np.sum(res_wei > 0)
    if n_left > n_max:
        # Should not happen given the ratio cap above; pause for inspection.
        print('留下的股票过多,重新选择')
        input("暂时挂起.... ")

    # Step 3: everything else above the 0.001 threshold goes to pass two.
    tmp2 = wei_se[wei_se > 0.001]
    tobe_opt = [i for i in tmp2.index if i not in res_wei.index]

    return res_wei, n_left, tobe_opt
Exemple #16
0
def keep_industry(stock_pool, to_keep_indus_list, industry_type='sw'):
    """Restrict a boolean stock pool to the given industries.

    For every date column of *stock_pool*, stocks outside ALL industries
    in *to_keep_indus_list* are set to False (edited in place).

    :param stock_pool: boolean DataFrame (stocks x dates)
    :param to_keep_indus_list: industry names to keep
    :param industry_type: 'sw' (Shenwan) or 'zx' (CITIC) classification
    :return: the filtered stock_pool
    :raises ValueError: for an unknown industry_type (previously an
        unbound-variable NameError further down)
    """
    data = Data()

    stock_basic = data.stock_basic_inform

    if industry_type == 'sw':
        industry_df = stock_basic[['申万一级行业']]
    elif industry_type == 'zx':
        industry_df = stock_basic[['中信一级行业']]
    else:
        raise ValueError("industry_type must be 'sw' or 'zx', got "
                         + repr(industry_type))

    first_col = industry_df.columns[0]
    for col, v in stock_pool.iteritems():
        # Union of the pool stocks belonging to ANY kept industry.
        keep = set()
        for ind in to_keep_indus_list:
            members = industry_df[industry_df[first_col] == ind].index
            keep.update(i for i in members if i in stock_pool.index)
        # BUGFIX: the original switched off non-members per industry inside
        # the loop, so requesting two or more industries blanked the whole
        # pool (each pass erased the other industries' stocks).
        drop = list(set(stock_pool.index).difference(keep))
        v[drop] = False

    return stock_pool
Exemple #17
0
def section_stock_num(industry_name):
    """Count, per monthly factor file, how many stocks of *industry_name*
    appear in the file; returns a Series indexed by the file date."""
    data = Data()
    sw_1 = data.industry_sw_1
    first_col = sw_1.columns[0]
    industry_codes = list(sw_1.index[sw_1[first_col] == industry_name])

    dates = []
    counts = []
    for fname in os.listdir(origin_factor_path):
        panel = pd.read_csv(os.path.join(origin_factor_path, fname),
                            encoding='gbk')
        panel.set_index('code', inplace=True)
        # Keep only this industry's stocks that are present in the file.
        present = [code for code in industry_codes if code in panel.index]
        panel = panel.loc[present, :]

        dates.append(pd.to_datetime(fname.split('.')[0]))
        counts.append(len(panel))

    return pd.Series(counts, index=dates)
def update_industry_data():
    """Refresh the monthly Shenwan industry close-price table from Wind.

    Loads the stored table to find the last date, downloads prices from
    ~3 months before it (or from 2006 when no table exists), replaces any
    overlapping rows and writes the merged table back to csv.
    """
    w.start()
    data = Data()
    index_path = r'D:\pythoncode\IndexEnhancement\指数相关'
    try:
        indus_p = data.industry_price_monthly
        # Re-download a 90-day overlap so revised values get refreshed.
        st = indus_p.index[-1] - timedelta(90)
    except Exception:
        indus_p = pd.DataFrame()
        st = datetime(2006, 1, 1)

    ed = datetime.today() - timedelta(1)

    # Wind expects a single comma-separated code string.
    targets_str = ','.join(code_name_map_sw.keys())

    res = w.wsd(targets_str,
                "close",
                st.strftime("%Y-%m-%d"),
                ed.strftime("%Y-%m-%d"),
                "Period=M",
                usedf=True)
    res = res[1]
    res.index = pd.to_datetime(res.index)
    res = res.rename(code_name_map_sw, axis=1)

    if indus_p.empty:
        res.to_csv(os.path.join(index_path, 'industry_price_monthly.csv'),
                   encoding='gbk')
    else:
        # BUGFIX: iterate the date index, not the DataFrame itself (which
        # yields column names), when removing overlapping rows.
        to_deal_index = [i for i in indus_p.index if i in res.index]
        indus_p.drop(to_deal_index, axis=0, inplace=True)
        indus_p = pd.concat([indus_p, res], axis=0)
        indus_p.to_csv(os.path.join(index_path, 'industry_price_monthly.csv'),
                       encoding='gbk')
Exemple #19
0
def select_stocks_by_scores(stock_pool, factors, factor_weight, reversed_factors, icir_e, max_num=100,
                            each_industry_num=5, select_type='by_industry', wei_type='equal'):
    """For each date column of the boolean *stock_pool*, score the
    candidate stocks and keep only the selected ones.

    Returns a new boolean pool with True for selected stocks and False
    elsewhere.
    """
    data = Data()
    industry_sw = data.industry_sw_1

    selected_pool = pd.DataFrame()
    for date_col, flags in stock_pool.iteritems():
        candidates = list(flags[flags == True].index)
        picked = select_stocks_by_scores_singal_section(
            candidates, date_col, factors, factor_weight, reversed_factors,
            industry_sw, max_num, icir_e, each_industry_num,
            select_type=select_type, wei_type=wei_type)

        if picked:
            true_col = pd.DataFrame(np.full(len(picked), True),
                                    index=picked, columns=[date_col])
            selected_pool = pd.concat([selected_pool, true_col], axis=1)

    # Unselected (stock, date) cells become False.
    selected_pool.fillna(False, inplace=True)

    return selected_pool
def linear_programming(data_dict, industry_neutralized=False, mv_neutralized=False, equal_weighted=False):
    """
    Linear-programming solver for optimal portfolio weights, one date at a
    time.  (The neutralisation flags are not referenced in this body; the
    constraints are built inside lp_solve.)
    """
    est_stock_rets = data_dict['est_stock_rets']
    limit_fac_data = data_dict['limit_fac_data']
    index_wt = data_dict['index_wt']

    data = Data()
    basic = data.stock_basic_inform
    industry_sw = basic[['申万一级行业']]

    stock_wt = pd.DataFrame()
    for date in est_stock_rets.columns:
        est_rets = est_stock_rets[[date]].dropna()
        est_rets.columns = ['rets']
        limit_fac_panel = limit_fac_data[date].dropna()
        benchmark_wt = index_wt[[date]].dropna()
        benchmark_wt.columns = ['benchmark_wt']

        cur_wt = lp_solve(date, est_rets, limit_fac_panel, benchmark_wt,
                          industry_sw)
        cur_wt.name = date
        stock_wt = pd.concat([stock_wt, cur_wt], axis=1)

    # Zero weights become NaN so downstream code can drop them.
    stock_wt = stock_wt.where(stock_wt != 0, np.nan)
    return stock_wt
def update_index_wei():
    """Append missing month-end constituent weights of HS300 (000300.SH)
    and ZZ500 (000905.SH) from Wind and save both tables."""
    w.start()
    data = Data()
    zz500_wt = data.zz500_wt
    hs300_wt = data.hs300_wt

    mes = generate_months_ends()
    # Drop any stored columns that are not true month-end dates.
    zz500_wt = _drop_non_month_end_cols(zz500_wt, mes)
    hs300_wt = _drop_non_month_end_cols(hs300_wt, mes)

    # Months not yet stored; both tables are updated in lockstep, so the
    # zz500 table's last column serves as the watermark.
    new_mes = [m for m in mes if m > zz500_wt.columns[-1]]

    for m in new_mes:
        m_str = m.strftime("%Y-%m-%d")
        # HS300 first, then ZZ500 (same order as the original).
        hs300_wt = pd.concat(
            [hs300_wt, _fetch_index_wei_column(m, m_str, '000300.SH')], axis=1)
        zz500_wt = pd.concat(
            [zz500_wt, _fetch_index_wei_column(m, m_str, '000905.SH')], axis=1)

    data.save(hs300_wt,
              'hs300_wt',
              save_path=r'D:\pythoncode\IndexEnhancement\指数相关')
    data.save(zz500_wt,
              'zz500_wt',
              save_path=r'D:\pythoncode\IndexEnhancement\指数相关')


def _drop_non_month_end_cols(wt, month_ends):
    """Return *wt* without the columns that are not in *month_ends*."""
    to_del = [c for c in wt.columns if c not in month_ends]
    if len(to_del) > 0:
        wt = wt.drop(to_del, axis=1)
    return wt


def _fetch_index_wei_column(m, m_str, windcode):
    """Fetch one month-end constituent-weight column for *windcode*."""
    res = w.wset("indexconstituent",
                 "date=" + m_str + ";windcode=" + windcode,
                 usedf=True)
    res = res[1]
    res.set_index('wind_code', inplace=True)
    return pd.DataFrame({m: res['i_weight']})
Exemple #22
0
def signal_factor_pool(factor_range, indus_dict, factor_name, top_or_bottom, per):
    """Build a boolean stock pool from a single factor's monthly panels.

    For each monthly factor file, ranks stocks by *factor_name* and marks
    the top (or bottom) fraction *per* as True; columns are the file
    dates.  When factor_range == 'one_industry', ranking is restricted to
    indus_dict['to_handle_indus'][0].
    """
    if factor_range == 'one_industry':
        sw_1 = Data().industry_sw_1

    factor_path = r'D:\pythoncode\IndexEnhancement\因子预处理模块\因子'

    res_df = pd.DataFrame()
    for f in os.listdir(factor_path):
        panel = pd.read_csv(os.path.join(factor_path, f), engine='python', encoding='gbk')
        if factor_name not in panel.columns:
            print('在{}数据中未找到{}因子'.format(f, factor_name))

        panel = panel[['code', 'name', factor_name]].set_index('code')
        panel.dropna(axis=0, how='any', inplace=True)

        if factor_range == 'one_industry':
            members = list(sw_1.index[sw_1[sw_1.columns[0]] == indus_dict['to_handle_indus'][0]])
            in_panel = [i for i in members if i in panel.index]
            panel = panel.loc[in_panel, :]

        le = int(len(panel) * per)
        sorted_df = panel.sort_values(by=factor_name, ascending=False)

        # NOTE(review): the two slices are asymmetric (top keeps le-1 rows,
        # bottom keeps le+1) -- preserved as-is; confirm intent.
        if top_or_bottom == 'top':
            picked = sorted_df.index[:le - 1]
        elif top_or_bottom == 'bottom':
            picked = sorted_df.index[-le - 1:]

        tmp_df = pd.DataFrame([True for i in range(0, len(picked))], index=picked.values,
                              columns=[datetime.strptime(f.split('.')[0], "%Y-%m-%d")])
        res_df = pd.concat([res_df, tmp_df], axis=1)
        res_df.fillna(False, inplace=True)

    return res_df
Exemple #23
0
def financial_condition_pool(selection_dict, start_date, end_date):
    """Build a boolean stock pool from per-plate financial screening rules.

    *selection_dict* maps a plate name ('all' or an industry name) to a
    dict of conditions.  Condition keys starting with 'scope' are range
    checks (factor-name, min, max); keys starting with 'rise' are growth
    checks (factor-name, threshold).  Conditions within a plate are ANDed;
    plates are concatenated column-wise.  ST stocks are then removed,
    report dates are shifted to announcement months and the monthly frame
    is forward-extended.

    NOTE(review): the factor/condition dispatch relies on eval() over
    strings from selection_dict -- only safe with trusted configuration.
    """
    data = Data()
    stock_basic = data.stock_basic_inform
    firstindustry = stock_basic[['中信一级行业']]

    all_stocks_code = stock_basic[['sec_name'.upper(), 'ipo_date'.upper()]]

    # ROE (trailing twelve months)
    roettm = data.roettm
    # net profit YoY growth
    netprofitgrowrate = data.netprofitgrowrate
    # basic EPS YoY growth
    basicepsyoy = data.basicepsyoy
    # gross margin (TTM)
    grossincome = data.grossincomeratiottm
    # debt-to-asset ratio
    debtassetsratio = data.debtassetsratio
    # valuation
    pe = data.pe

    cond_total = pd.DataFrame()
    for plate_name, conditions_dict in selection_dict.items():
        pe_con = None
        con_plate = None
        if plate_name == 'all':
            codes_in_industry = list(firstindustry.index)
        else:
            # NOTE(review): firstindustry[ind] indexes *columns*, not rows;
            # this branch likely needs firstindustry.loc[ind, ...] and will
            # raise for plate names other than 'all' -- verify before use.
            codes_in_industry = [ind for ind in firstindustry.index if firstindustry[ind] == plate_name]

        for conditions_type, tuples in conditions_dict.items():
            if conditions_type.split('_')[0] == 'scope':
                myfun = 'scopy_condition'
                # Remember a 'pe' entry for special handling.
                # NOTE(review): has_pe / pe_con are set but never read.
                for t in tuples:
                    if t == 'pe':
                        has_pe = 1
                        pe_con = copy.deepcopy(t)
                        continue

                # Fetch the factor table named by tuples[0] and apply the
                # [minV, maxV] range test.
                res = eval('select_stocks(' + tuples[0] + ', codes_in_industry, start_date, end_date)')
                res_con = eval(myfun + '(res, minV=' + str(tuples[1]) + ', maxV= ' + str(tuples[2]) + ')')

                # AND with the plate's running condition frame.
                if isinstance(con_plate, pd.DataFrame):
                    con_plate = con_plate & res_con
                else:
                    con_plate = res_con

            elif conditions_type.split('_')[0] == 'rise':
                myfun = 'rise_condition'

                res = eval('select_stocks(' + tuples[0] + ', codes_in_industry, start_date, end_date)')
                res_con = eval(myfun + '(res,' + str(tuples[1]) + ')')

                if isinstance(con_plate, pd.DataFrame):
                    con_plate = con_plate & res_con
                else:
                    con_plate = res_con

        # Merge plates column-wise.
        cond_total = pd.concat([cond_total, con_plate], axis=1)

    # 剔除上市未满N年得股票,N = 1
    # N = 1
    # for col, items in cond_total.iteritems():
    #     for i in items.index:
    #         if items[i]:
    #             if i in all_stocks_code.index:
    #                 de = col - all_stocks_code.loc[i, 'ipo_date']
    #                 if de.days < N * 365:
    #                     items[i] = False
    #             else:
    #                 # i 不在all_stocks_code里面,是因为all_stocks_code没有更新,说明该股票是最近上市的股票,
    #                 # 直接全部复制为False
    #                 items[i] = False

    # Drop ST stocks.
    for col, items in cond_total.iteritems():
        for i in items.index:
            if i in all_stocks_code.index and 'ST' in all_stocks_code.loc[i, 'sec_name'.upper()]:
                items[i] = False

    # Shift report periods to announcement months.
    cond_total = adjust_months(cond_total)
    # Forward-extend to monthly frequency.
    cond_total = append_df(cond_total)

    return cond_total
Exemple #24
0
def code_to_name(code_list):
    """Map stock codes to their display names via the basic-info table."""
    data = Data()
    basic = data.stock_basic_inform
    name_col = basic[['SEC_NAME']]
    return list(name_col.loc[code_list, 'SEC_NAME'])
def optimization_fun(ret,
                     e,
                     bench_wei,
                     pre_w=None,
                     is_enhance=True,
                     lamda=10,
                     c=0.015,
                     turnover=None,
                     te=None,
                     industry_max_expose=0,
                     risk_factor_dict={},
                     limit_factor_df=None,
                     in_benchmark=True,
                     in_benchmark_wei=0.8,
                     max_num=None):
    """Solve for optimal portfolio weights with cvxpy.

    Maximises expected return minus a quadratic risk penalty, subject to
    benchmark/industry/turnover/tracking-error constraints (built by
    generates_constraints / generates_problem).  If the first solve is not
    optimal, constraints are progressively relaxed (risk factors, then
    turnover, then industry exposure) up to three times.  When the result
    holds more than *max_num* names, a second mixed-integer pass trims the
    portfolio to *max_num*.

    :param ret: Series of expected stock returns
    :param e: risk (covariance) DataFrame, index/columns are stock codes
    :param bench_wei: Series of benchmark weights
    :param pre_w: previous period's weights (Series) or None
    :param lamda: risk-aversion coefficient
    :param c: transaction-cost coefficient passed to generates_problem
    :param turnover: turnover limit, or None for unconstrained
    :param te: tracking-error limit, or None
    :param industry_max_expose: allowed industry-exposure deviation
    :param risk_factor_dict: per-factor exposure limits
    :param limit_factor_df: factor values used by the exposure limits
    :param in_benchmark: restrict selection to benchmark constituents
    :param in_benchmark_wei: minimum total weight held in constituents
        when selection outside the benchmark is allowed
    :param max_num: maximum number of holdings, or None for no limit
    :return: Series of optimised weights indexed by stock code
    """
    if in_benchmark:
        # Selection restricted to constituents: shrink every input to the
        # benchmark's index.
        wei_tmp = bench_wei.dropna()
        bug_maybe = [i for i in wei_tmp.index if i not in e.index]
        if len(bug_maybe) > 0:
            print('存在下列股票不在组合里,请检查')
            print(bug_maybe)

        e_tmp = e.loc[wei_tmp.index, wei_tmp.index].fillna(0)
        ret_tmp = ret[wei_tmp.index].fillna(0)
        # BUGFIX: `if pre_w:` on a Series raises "truth value is ambiguous";
        # test the type explicitly instead.
        if isinstance(pre_w, pd.Series):
            pre_w = pre_w[wei_tmp.index].fillna(0)
        # Every selectable stock is a constituent in this branch.
        # BUGFIX: previously left undefined here, so building para_dict
        # below crashed with NameError whenever in_benchmark was True.
        is_in_bench = pd.Series(1, index=wei_tmp.index)
    else:
        # Align the main inputs on a common index.
        n_index = [i for i in e.index if i in ret.index]
        e_tmp = e.loc[n_index, n_index]
        ret_tmp = ret[n_index]
        wei_tmp = bench_wei[n_index].fillna(0)
        if isinstance(pre_w, pd.Series):
            # BUGFIX: the original indexed pre_w with the *length* of this
            # list instead of the list of dropped codes itself.
            dropped = [i for i in pre_w.index if i not in n_index]
            if np.any(pre_w[dropped] > 0.001):
                input('input:存在部分有权重的股票在上期,而不再当期的数据中,请检查')
            pre_w = pre_w[n_index].fillna(0)
        # Indicator "is a benchmark constituent", used to bound the total
        # weight invested inside the benchmark.
        is_in_bench = deepcopy(wei_tmp)
        is_in_bench[is_in_bench > 0] = 1

    data = Data()
    basic = data.stock_basic_inform
    industry_sw = basic[['申万一级行业']]
    # Industry dummies for the selectable universe; unmapped stocks are
    # assigned to the catch-all '综合' industry.
    industry_map = industry_sw.loc[ret_tmp.index, :]
    industry_map.fillna('综合', inplace=True)
    dummies = pd.get_dummies(industry_map[industry_map.columns[0]])

    # Per-stock cap: 3/4 of the stock's industry weight in the benchmark,
    # with a 0.02 fallback for industries absent from the benchmark.
    ind_wei = np.dot(dummies.T, wei_tmp)
    ind_wei_se = pd.Series(index=dummies.columns, data=ind_wei)
    industry_map['max_wei'] = None
    for i in industry_map.index:
        try:
            industry_map.loc[
                i,
                'max_wei'] = 0.75 * ind_wei_se[industry_map.loc[i, '申万一级行业']]
        except Exception:
            industry_map.loc[i, 'max_wei'] = 0.02
    max_wei = industry_map['max_wei'].values

    x = cp.Variable(len(ret_tmp), nonneg=True)

    q = ret_tmp.values
    P = lamda * e_tmp.values

    ind_wei_su = pd.Series(ind_wei, index=dummies.columns)
    dum = dummies.T.values

    para_dict = {
        'x': x,
        'max_wei': max_wei,
        'in_benchmark_wei': in_benchmark_wei,
        'is_in_bench': is_in_bench,
        'ret_e': ret_tmp,
        'dum': dum,
        'wei_tmp': wei_tmp,
        'ind_wei': ind_wei,
        'risk_factor_dict': risk_factor_dict,
        'limit_factor_df': limit_factor_df,
        'pre_w': pre_w,
        'P': P,
        'total_wei': 1,
    }
    con_dict = {
        'in_benchmark': in_benchmark,
        'industry_max_expose': industry_max_expose,
        'turnover': turnover,
        'te': te,
    }

    constraints = generates_constraints(para_dict, con_dict)
    prob = generates_problem(q, x, P, c, pre_w, constraints, te)

    print('开始优化...')
    time_start = time.time()
    prob.solve()
    status = prob.status
    # Not optimal? progressively relax constraints, at most three retries.
    iters = 0
    while status != 'optimal' and iters < 3:
        if len(risk_factor_dict) > 0 and iters == 0:
            # Loosen every risk-factor exposure limit by 0.5.
            tmp_d = deepcopy(risk_factor_dict)
            for k, v in tmp_d.items():
                tmp_d[k] = v + 0.5
            para_dict['risk_factor_dict'] = tmp_d
        elif turnover is not None and iters == 1:
            # BUGFIX: the original tested `not turnover`, which both skipped
            # relaxation when a limit existed and crashed on None + 0.2
            # when it did not.
            turnover = turnover + 0.2
            con_dict['turnover'] = turnover
        elif iters == 2:
            industry_max_expose = industry_max_expose + 0.05
            con_dict['industry_max_expose'] = industry_max_expose

        iters = iters + 1
        constraints = generates_constraints(para_dict, con_dict)
        prob = generates_problem(q, x, P, c, pre_w, constraints, te)
        print('第{}次优化'.format(iters))
        prob.solve()
        status = prob.status

    time_end = time.time()
    print('优化结束,用时', time_end - time_start)
    print('优化结果为{}'.format(status))

    wei_ar = np.array(x.value).flatten()
    wei_se = pd.Series(wei_ar, index=ret_tmp.index)

    # Second, mixed-integer pass only when the holding count exceeds
    # max_num.  (BUGFIX: guard max_num against None before comparing.)
    if max_num is not None and np.sum(x.value > 0.001) > max_num:
        print('进行第二轮股票数量的优化')
        tobe_opt = list(wei_se[wei_se > 0.001].index)
        print('第二次优化为从{}支股票中优化选择出{}支'.format(len(tobe_opt), max_num))

        # Restrict every input to the shortlisted candidates; the problem
        # size shrinks considerably.
        e_tmp2 = e_tmp.loc[tobe_opt, tobe_opt]
        ret_tmp2 = ret_tmp[tobe_opt]
        is_in_bench2 = is_in_bench[tobe_opt]

        dummies2 = pd.get_dummies(industry_map.loc[tobe_opt,
                                                   industry_map.columns[0]])
        dum2 = dummies2.T.values
        # Renormalise the industry weights over the surviving industries.
        new_ind = ind_wei_su[dummies2.columns]
        new_ind = new_ind / new_ind.sum()
        ind_wei2 = new_ind.values

        # The first-pass caps (0.75 x industry weight) can become too tight
        # after trimming; triple them for this pass.
        max_wei2 = 3 * industry_map.loc[tobe_opt, 'max_wei'].values
        total_wei = 1
        if isinstance(pre_w, pd.Series):
            pre_w = pre_w[tobe_opt]

        P2 = lamda * e_tmp2.values
        x = cp.Variable(len(ret_tmp2), nonneg=True)
        y = cp.Variable(len(ret_tmp2), boolean=True)  # selection indicator
        para_dict2 = {
            'x': x,
            'y': y,
            'y_sum': max_num,
            'max_wei': max_wei2,
            'in_benchmark_wei': in_benchmark_wei,
            'is_in_bench': is_in_bench2,
            'ret_e': ret_tmp2,
            'dum': dum2,
            'wei_tmp': wei_tmp,
            'ind_wei': ind_wei2,
            'risk_factor_dict': risk_factor_dict,
            'limit_factor_df': limit_factor_df,
            'pre_w': pre_w,
            # NOTE(review): the original passed the first-pass P here (not
            # P2) -- kept as-is; confirm generates_constraints' usage.
            'P': P,
            'total_wei': total_wei
        }
        con_dict2 = {
            'in_benchmark': in_benchmark,
            'industry_max_expose': industry_max_expose,
            'turnover': turnover,
            'te': te,
        }
        q2 = ret_tmp2.values
        cons = generates_constraints(para_dict2, con_dict2)
        prob = cp.Problem(cp.Maximize(q2.T * x - cp.quad_form(x, P2)), cons)
        prob.solve(solver=cp.ECOS_BB, feastol=1e-10)
        print(prob.status)
        if prob.status != 'optimal':
            input('input:二次股票数量优化时,未得出最优解,请检查')

        # BUGFIX: propagate the second-pass solution into the returned
        # weights (the original discarded it and returned the first-pass
        # weights unchanged).  Non-shortlisted stocks get weight 0.
        if x.value is not None:
            wei_ar2 = np.array(x.value).flatten()
            wei_se = pd.Series(wei_ar2, index=ret_tmp2.index)
            wei_se = wei_se.reindex(ret_tmp.index).fillna(0)

    return wei_se
 def load_pct(self):
     """Load percentage price-change data at the configured frequency."""
     if self.freq == 'M':
         # Monthly: shift left one period so each column holds the NEXT
         # period's change, as required for backtesting.
         source = Data()
         self.changePCT_np = source.changepct_monthly.shift(-1, axis=1)
     if self.freq == 'D':
         print('未实现')
def get_firt_industry_list():
    """Return the distinct Shenwan level-1 industry names.

    Reads the stock basic-information table and collects the unique
    values of its '申万一级行业' column.  Note: building the list via a
    set means the ordering is not guaranteed.
    """
    basics = Data().stock_basic_inform
    return list(set(basics['申万一级行业'].values))
Exemple #28
0
def main(p_dict, fp, is_ind_neu, is_size_neu, is_plate_neu, special_plate=None,
         selection=None):
    """Pre-process one raw factor cross-section file and save the result.

    Parameters
    ----------
    p_dict : dict
        Must contain 'file_path' (directory holding the raw csv files)
        and 'save_path' (directory the processed file is written to).
    fp : str
        File name of the cross-section csv (e.g. '2009-01-23.csv').
    is_ind_neu : bool
        Apply industry neutralization (needed for stock multi-factor
        work, not for industry multi-factor work).
    is_size_neu : bool
        Apply size neutralization.
    is_plate_neu : bool
        Kept for interface compatibility; not used in this function.
    special_plate : str, optional
        If given, keep only stocks whose Shenwan level-1 industry equals
        this value.
    selection : optional
        Kept for interface compatibility; not used in this function.

    Processing order: fill missing values, winsorize, neutralize,
    standardize.  Financial-date alignment is assumed to have been done
    upstream, so it is not repeated here.
    """
    file_path = p_dict['file_path']
    save_path = p_dict['save_path']

    # Read the raw factor cross-section data.
    try:
        data = pd.read_csv(os.path.join(file_path, fp), engine='python',
                           encoding='gbk')
    except Exception:
        # Re-raise: without the raw data nothing below can run.  (The
        # previous silent print left `data` unbound -> NameError later.)
        print('读取原始因子截面数据失败:%s' % os.path.join(file_path, fp))
        raise

    if 'No' in data.columns:
        data = data.set_index('No')

    # If a specific plate is requested, drop stocks from other plates.
    if special_plate:
        data_ = Data()
        stock_basic = data_.stock_basic_inform
        sw_1 = stock_basic[['申万一级行业']]
        stock_list = list(sw_1.index[sw_1[sw_1.columns[0]] == special_plate])
        # Also drops stocks not yet listed in this period.
        codes = [i for i in data.index if data.loc[i, 'Code'] in stock_list]
        data = data.loc[codes, :]

        data.index = range(0, len(data))

    # Split the cross-section into the columns to pre-process and the
    # columns passed through unchanged.
    data_to_process, data_unchanged = get_factor_data(data)

    # Pre-processing steps, applied in order.
    data_to_process = fill_na(data_to_process)                                    # fill missing values
    if len(data_to_process) == 0:
        print('警告:%s 缺失值填充后无数据' % fp)
    data_to_process = winsorize(data_to_process)                                  # clip extreme values
    if is_ind_neu or is_size_neu:
        data_to_process = neutralize(data_to_process, ind_neu=is_ind_neu, size_neu=is_size_neu)  # neutralize
    data_to_process = standardize(data_to_process)                                # standardize

    # Merge the processed part with the unchanged part, aligned on index.
    if len(data_unchanged) > 0:
        data_final = pd.concat([data_to_process, data_unchanged.loc[data_to_process.index]], axis=1)
    else:
        data_final = data_to_process

    # Ensure the saved file carries a 1-based 'No' index column.
    if data_final.index.name != 'No':
        data_final.index = range(1, len(data_final)+1)
        data_final.index.name = 'No'

    data_final.to_csv(os.path.join(save_path, fp), encoding='gbk')