def initial_env(self):
    """Build the price environment: ETF close prices aligned to the prediction dates."""
    df_env = pd.DataFrame(index=self.df_pred.index)
    df_stk = djq_data_processor.get_data(self.etf)
    df_stk = df_stk.set_index('date')
    df_env = df_env.join(df_stk.close)
    # Fill gaps forward first, then backward for any leading NaNs
    df_env = df_env.fillna(method='ffill')
    df_env = df_env.fillna(method='backfill')
    return df_env
def initial_env(self):
    """Build the price environment: one close-price column per ETF, aligned to the prediction dates."""
    df_env = pd.DataFrame(index=self.df_pred.index)
    for name, etf_name in self.etf_names.items():
        try:
            df_stk = djq_data_processor.get_data(etf_name, inx=False)
        except Exception:
            raise ValueError('Cannot find the file!')
        df_stk = df_stk[['date', 'close']]
        df_stk = df_stk.set_index('date')
        df_stk = df_stk.rename(columns={'close': name})
        df_stk = df_stk.sort_values('date')
        # df_stk = df_stk.pct_change() + 1
        df_env = df_env.join(df_stk)
    # Fill gaps forward first, then backward for any leading NaNs
    df_env = df_env.fillna(method='ffill')
    df_env = df_env.fillna(method='backfill')
    return df_env
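# Illustrative sketch (not part of the original module): how initial_env aligns an ETF's
# close prices to the prediction-date index and fills gaps. The dates, column name and
# prices below are invented for the example; only pandas is assumed.
import pandas as pd

def demo_initial_env():
    pred_index = pd.Index(['2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07'], name='date')
    df_env = pd.DataFrame(index=pred_index)
    # Pretend this came from djq_data_processor.get_data(); one trading day is missing.
    df_stk = pd.DataFrame({'date': ['2021-01-04', '2021-01-06', '2021-01-07'],
                           'close': [10.0, 10.4, 10.2]}).set_index('date')
    df_env = df_env.join(df_stk.rename(columns={'close': 'etf_a'}))
    # Gaps inherited from the join are filled forward, then backward for leading NaNs.
    df_env = df_env.fillna(method='ffill').fillna(method='backfill')
    return df_env

# demo_initial_env() ->
#             etf_a
# 2021-01-04   10.0
# 2021-01-05   10.0   <- forward-filled from the previous close
# 2021-01-06   10.4
# 2021-01-07   10.2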
def cls_to_weighted_pct(self, df):
    """
    The estimated result is a class interval number; map each class back to its
    corresponding pct change, then compute the index pct change using market-value weights.
    :param df: pandas.DataFrame, estimated result, one column per stock code
    :return: numpy.ndarray, estimated change of the index for each row of df
    """
    df2 = df.astype(float)
    for stk in df.columns:
        # Look up the pct change associated with the predicted tier of this stock
        df2[stk] = [self.book[self.book.code == stk]['tier' + str(int(c))].values[0]
                    for c in df[stk]]
    mkt = djq_data_processor.get_data('market')
    mkt = mkt.set_index('code')
    return np.average(df2, weights=mkt.loc[df.columns]['mktcap'], axis=1)
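# Illustrative sketch (not part of the class): mapping predicted class labels back to
# per-tier pct changes and weighting them by market cap, as cls_to_weighted_pct does.
# The codes, tier values and market caps below are invented for the example.
import numpy as np
import pandas as pd

book = pd.DataFrame({'code': ['000001', '000002'],
                     'tier0': [-0.02, -0.03],   # expected pct change for each predicted class
                     'tier1': [0.00, 0.01],
                     'tier2': [0.03, 0.04]})
pred = pd.DataFrame({'000001': [2, 1], '000002': [0, 2]})   # predicted classes per day
mapped = pred.astype(float)
for stk in pred.columns:
    mapped[stk] = [book[book.code == stk]['tier' + str(int(c))].values[0] for c in pred[stk]]
mktcap = pd.Series([300.0, 100.0], index=['000001', '000002'])  # market-value weights
index_chg = np.average(mapped, weights=mktcap[pred.columns], axis=1)
# index_chg[0] = (0.03 * 300 + (-0.03) * 100) / 400 = 0.015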
def mkt_cmp(df, mkt='399300', start_date='2020-01-01'):
    '''
    Draw the portfolio price chart compared with the market, then draw the linear
    regression line of portfolio returns against market returns and show the beta/alpha values.
    :param df: pandas.Series, index = date, time series of the NPV of the portfolio
    :param mkt: str with length=6, China stock index code number
    :param start_date: 'YYYY-mm-dd'
    :return: None
    '''
    assert isinstance(df, pd.Series)
    plt.figure(figsize=(10, 8))
    df.index = pd.to_datetime(df.index)
    df_mkt = djq_data_processor.get_data(mkt, inx=True)
    # df_mkt.date = df_mkt.date.dt.strftime('%Y/%m/%d')
    df_mkt = df_mkt.set_index('date')
    df_mkt.index = pd.to_datetime(df_mkt.index)
    if not set(df.index) & set(df_mkt.index):
        print('Input data error')
        return
    df = df[df.index >= start_date]
    df_mkt = df_mkt.loc[list(df.index)]
    df_mkt['close'] = df_mkt['close'] / df_mkt['close'][0]
    ax1 = plt.subplot(2, 1, 1)
    # ax1.xaxis.set_major_locator(ticker.MultipleLocator(base=10))
    plt.plot(df)
    plt.plot(df_mkt.close)
    plt.legend(['portfolio_profit', 'market_profit'], loc='best')
    # plt.xticks(rotation=330)
    df_chg = df.pct_change().dropna()
    mkt_chg = df_mkt.close.pct_change().dropna()
    plt.subplot(2, 1, 2)
    plt.scatter(mkt_chg, df_chg)
    cov_a_b = np.cov(df_chg, mkt_chg)[0][1]
    beta = cov_a_b / mkt_chg.var()
    alpha = df_chg.mean() - beta * mkt_chg.mean()
    x = np.linspace(min(mkt_chg), max(mkt_chg), 50)
    y1 = beta * x + alpha
    plt.plot(x, y1, color='red')
    plt.title('Security Characteristic Line with beta=%.4f, alpha=%.4f' % (beta, alpha))
    plt.show()
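# Illustrative usage sketch: computing beta/alpha the same way mkt_cmp does, from two
# NPV series. The dates and values are made up; the matplotlib plotting is omitted here.
import numpy as np
import pandas as pd

dates = pd.date_range('2020-01-01', periods=5, freq='D')
portfolio = pd.Series([1.00, 1.01, 1.00, 1.03, 1.05], index=dates)
market = pd.Series([1.00, 1.02, 0.99, 1.01, 1.02], index=dates)

df_chg = portfolio.pct_change().dropna()     # daily portfolio returns
mkt_chg = market.pct_change().dropna()       # daily market returns
beta = np.cov(df_chg, mkt_chg)[0][1] / mkt_chg.var()
alpha = df_chg.mean() - beta * mkt_chg.mean()
print('beta=%.4f, alpha=%.4f' % (beta, alpha))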
def data_prepare(self, code, drop=True, real_time=True):
    """
    :param code: China stock code with 6 digits, read from the local data set
    :param drop: set True to drop the rows of days with missing information
    :param real_time: add real-time data as the newest row when the market is open
    :return: train_data, train_label, test_data, test_label, df_train, df_test, class thresholds
    """
    xlst = zsys.ohlcVLst + zsys.stcokcharts_indicators + zsys.last_data  # + zsys.TDS_talib_indicators_all
    train_start = '2012-01-01'
    train_end = '2019-12-31'
    n_pca = 50
    pca = True
    min_profit_ratio = self.target_day // 3
    # Parse the data_params options: feature list, date range, minimum profit, PCA/LDA
    for part in self.data_params:
        if part.startswith('xlst'):
            xlst_name = part[5:].split('+')
            xlst = []
            for name in xlst_name:
                xlst += {'ohlcV': zsys.ohlcVLst,
                         'chart': zsys.stcokcharts_indicators,
                         'last': zsys.last_data,
                         'all': zsys.TDS_talib_indicators_all,
                         'ohlc': zsys.ohlcLst,
                         'talib5': zsys.TDS_talib_multi_indicators_5,
                         'talib15': zsys.TDS_talib_multi_indicators_15,
                         'talib30': zsys.TDS_talib_multi_indicators_30,
                         'talibstatic': zsys.TDS_talib_multi_indicators_Static}[name]
        elif part.startswith('date'):
            start, end = part[5:].split('-')
            train_start = start + '-01-01'
            train_end = end + '-12-31'
        elif part.startswith('minprofit'):
            min_profit_ratio = int(part[10:])
        elif part.startswith('pca'):
            # e.g. 'pca50' -> 50 components (the original checked part[4:], an off-by-one)
            if part[3:].isnumeric():
                n_pca = int(part[3:])
        elif part.startswith('lda'):
            pca = False
            n_pca = self.classify - 1

    # Data preparation
    try:
        df = djq_data_processor.get_data(code)
    except Exception:
        raise ValueError('Cannot find the file!')
    if real_time and list(df['date'])[-1] != time.strftime('%Y-%m-%d'):
        # Estimate how much of the trading day (09:30-11:30, 13:00-15:00) has elapsed
        open_time = time.strptime(time.strftime('%Y-%m-%d') + ' 09:30:00', '%Y-%m-%d %H:%M:%S')
        now = time.strptime(time.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        diff = min(max(0, time.mktime(now) - time.mktime(open_time)), 2 * 60 * 60)
        open_time = time.strptime(time.strftime('%Y-%m-%d') + ' 13:00:00', '%Y-%m-%d %H:%M:%S')
        diff += min(max(0, time.mktime(now) - time.mktime(open_time)), 2 * 60 * 60)
        new = ts.get_realtime_quotes([code])
        if new['date'][0] != list(df['date'])[-1] and diff and float(new['open'][0]):
            # Scale today's partial volume up to a full-day estimate
            time_multiple = 4 * 60 * 60 / diff
            line = pd.Series(dict(zip(['date', 'open', 'high', 'low', 'close', 'volume'],
                                      [time.strftime('%Y-%m-%d'),
                                       float(new['open'][0]),
                                       float(new['high'][0]),
                                       float(new['low'][0]),
                                       float(new['price'][0]),
                                       time_multiple * float(new['volume'][0][:-2])])))
            df = df.append(line, ignore_index=True)
            df = df.sort_values('date', ascending=False)
            df = df.reset_index(drop=True)
    if df.shape[0] < 252:
        raise ValueError('Not enough train data!')
    df = get_all_finanical_indicators(df)
    djq_data_processor.get_label(df, target_day=self.target_day)
    df = df[df.date >= train_start]
    if drop:
        df = df.dropna()
    # transfer_label_to_classification(df, classify=self.classify)
    df['y_pct_change'] = df['y'].copy()
    df_train = df[df.date <= train_end].copy()
    if df_train.shape[0] < 252:
        raise ValueError('Not enough train data!')
    df_test = df[df.date > train_end].copy()
    # Bin the training labels into equal-sized classes; reuse the same thresholds for the test set
    split, thresholds = pd.qcut(df_train['y'], self.classify,
                                labels=range(self.classify), retbins=True)
    if thresholds[-2] < min_profit_ratio:
        pass
        # raise ValueError('Too small profit!')
    df_train['y'] = np.array(split)
    df_test['y'] = np.array(pd.cut(df_test['y'], thresholds, labels=range(self.classify)))

    # Data cleaning: standardise, then reduce dimensionality
    std = preprocessing.StandardScaler()
    x_train = df_train[xlst].values
    x_train = std.fit_transform(x_train)
    if pca:
        dimension_reducer = PCA(n_components=n_pca)
        dimension_reducer.fit(x_train)
    else:
        # LDA dimension reduction (supervised, needs the class labels)
        dimension_reducer = LDA(n_components=n_pca)
        dimension_reducer.fit(x_train, df_train['y'])
    x_train = dimension_reducer.transform(x_train)
    # The test set is processed with the same transforms as the training set
    x_test = df_test[xlst].values
    if x_test.shape[0]:
        x_test = std.transform(x_test)
        x_test = dimension_reducer.transform(x_test)
    return (self.subclassifiers_transfer(code, x_train), df_train['y'].values,
            self.subclassifiers_transfer(code, x_test), df_test['y'].values,
            df_train, df_test, thresholds)
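# Illustrative sketch (not part of the class): how data_prepare turns the continuous label
# into classes. qcut on the training labels returns both the class assignments and the bin
# thresholds, and the same thresholds are reused with cut on the test labels. The label
# values below are synthetic.
import numpy as np
import pandas as pd

classify = 3
y_train = pd.Series([-4.0, -1.0, 0.5, 1.0, 2.5, 6.0])
y_test = pd.Series([-2.0, 0.8, 5.0])

split, thresholds = pd.qcut(y_train, classify, labels=range(classify), retbins=True)
y_train_cls = np.array(split)                                   # equal-sized classes 0..classify-1
y_test_cls = np.array(pd.cut(y_test, thresholds, labels=range(classify)))
# Test samples outside the training range map to NaN; thresholds[-2] is the lower edge of
# the top class, which data_prepare compares against min_profit_ratio.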