def get_tick_source_data(security_id=u"300634.XSHE", date='20191122'):
    """Fetch one day's tick-level bidVolume1 series and print a quick summary.

    Generalized from the original hard-coded ticker/date: defaults preserve
    the previous behaviour, so existing zero-argument callers are unaffected.

    :param security_id: "<ticker>.<exchange>" security identifier.
    :param date: trade date in 'yyyymmdd' format.
    :return: the fetched DataFrame (previously the function returned None;
             returning it is backward-compatible and aids interactive use).
    """
    df = DataAPI.MktTicksHistOneDayGet(securityID=security_id, date=date,
                                       startSecOffset="", endSecOffset="",
                                       field=u"bidVolume1", pandas="1")
    print(df.columns)
    print(df['bidVolume1'].mean())
    return df
def get_features(security_id=u"300634.XSHE", date='20191122'):
    """Build (train_x, train_y) windows for one security/date from tick data.

    Full-day ticks are fetched and enriched with derived per-tick features,
    then grouped into 5-minute windows; each window is paired with the
    matching 5-minute bar's vwap return (percent) as its label.

    :param security_id: "<ticker>.<exchange>" string, e.g. "300634.XSHE".
    :param date: trade date in 'yyyymmdd' format.
    :return: (train_x, train_y) — train_x is a list of tick-row windows,
             train_y a list of single-element lists with the vwap return.
    """
    ticker, exchange_cd = security_id.split('.')
    df = DataAPI.MktTicksHistOneDayGet(securityID=security_id, date=date,
                                       startSecOffset="", endSecOffset="",
                                       field=u"", pandas="1")
    df_min = DataAPI.SHSZBarHistOneDayGet(tradeDate=date, exchangeCD=exchange_cd,
                                          ticker=ticker, unit="5",
                                          startTime=u"", endTime=u"",
                                          field=u"", pandas="1")
    datatimes = list(df['dataTime'])
    # Last tick's 'value' — presumably the cumulative daily turnover; used
    # below to normalise per-tick volume. TODO confirm against DataAPI schema.
    total_vol = list(df['value'])[-1]
    # 'HH:MM' label for every tick (seconds dropped).
    data_min = ['{0}:{1}'.format(item.split(':')[0], item.split(':')[1])
                for item in datatimes]

    # --- tick-level feature calculation ---
    df['dataMin'] = data_min
    df['avgPrice'] = df['value'] / df['volume']
    df['amplitude'] = (df['highPrice'] - df['lowPrice']) / df['lastPrice']
    df['spread'] = df['askPrice1'] - df['bidPrice1']
    df['openDiff'] = (df['openPrice'] - df['prevClosePrice']) / df['prevClosePrice']
    df['trackError'] = (df['lastPrice'] - df['avgPrice']) / df['avgPrice']
    df['askTrackError1'] = (df['askPrice1'] - df['avgPrice']) / df['avgPrice']
    df['bidTrackError1'] = (df['bidPrice1'] - df['avgPrice']) / df['avgPrice']
    # Depth over the five visible book levels on each side.
    df['totalAskVolume'] = sum(df['askVolume%d' % i] for i in range(1, 6))
    df['totalBidVolume'] = sum(df['bidVolume%d' % i] for i in range(1, 6))
    df['volumeImbalance1'] = (df['askVolume1'] - df['bidVolume1']) / (
        df['askVolume1'] + df['bidVolume1'])
    df['volumeImbalanceTotal'] = (df['totalAskVolume'] - df['totalBidVolume']) / (
        df['totalAskVolume'] + df['totalBidVolume'])
    df['volumePerDeal'] = df['volume'] / df['deal']
    df['volumeRatio'] = df['volume'] / total_vol
    # (removed: a no-op expression `int(list(set(df['dataMin']))[0]...) % 5`
    # that computed a value and discarded it.)

    # --- 5-minute bar vwap returns (label source) ---
    min_vwap = list(df_min['vwap'])
    min_vwap.insert(0, min_vwap[0])  # shift by one bar; first bar's ret is 0
    df_min['ret'] = (df_min['vwap'] / min_vwap[:-1] - 1) * 100
    dict_min_ret = dict(zip(df_min['barTime'], df_min['ret']))

    # Drop identifier and raw order-book columns. A comprehension replaces the
    # original chain of 27 list.remove() calls: same order-preserving result,
    # but it no longer raises if a column is absent from the API response.
    drop_cols = {'dataDate', 'exchangeCD', 'ticker', 'dataTime', 'dataMin',
                 'shortNM', 'currencyCD'}
    for side in ('ask', 'bid'):
        for kind in ('Price', 'Volume'):
            drop_cols.update('%s%s%d' % (side, kind, i) for i in range(1, 6))
    columns = [col for col in df.columns if col not in drop_cols]
    df.sort_values(by='dataTime', ascending=True, inplace=True)
    data_min = list(df['dataMin'])
    df = df[columns]
    print(df.columns)
    print(df.shape)
    rows = list(df.values)
    print(rows[:3])

    train_x = []
    train_y = []
    _start = 0  # removed unused `_end`
    n_row = len(rows)
    total_row = 0
    for idx, val in enumerate(rows):
        hh, mm = data_min[idx].split(':')
        if int(hh) == 14 and int(mm) == 57:
            break  # stop before the closing call auction
        if int(hh) == 9 and int(mm) <= 30:
            _start = idx  # skip the opening auction; keep moving the window start
            continue
        if total_row >= 40:
            break  # cap samples per day
        if idx == n_row - 1:
            # Last row of the day closes the current window.
            if dict_min_ret.get('{0}:{1}'.format(hh, mm)):
                train_y.append([dict_min_ret.get('{0}:{1}'.format(hh, mm))])
                train_x.append(list(rows[_start:idx]))
                total_row += 1
            _start = idx
        else:
            hh_, mm_ = data_min[idx + 1].split(':')
            # Close a window on the final tick of each 5-minute boundary.
            if int(mm) % 5 == 0 and int(mm_) != int(mm):
                if dict_min_ret.get('{0}:{1}'.format(hh, mm)):
                    train_y.append([dict_min_ret.get('{0}:{1}'.format(hh, mm))])
                    train_x.append(list(rows[_start:idx + 1]))
                    total_row += 1
                # NOTE(review): window pointer resets at every boundary, even
                # when no label exists for that minute — confirm this matches
                # the original (collapsed) indentation intent.
                _start = idx + 1
    return train_x, train_y
def get_features_by_date(security_id=u"300634.XSHE", date='20191122', min_unit="1", tick=False, df_min=None, df_bc_min=None, win_len=20):
    '''
    Compute tick-level or minute-level features for one security on one date.

    Example call:
    - tick level features:
        get_features_by_date(security_id=u"ticker.mkt", date='yyyymmdd', min_unit="1", tick=True)
    - min level features:
        get_features_by_date(security_id=u"ticker.mkt", date='yyyymmdd', min_unit="1", tick=False, df_min=xx, df_bc_min=xx)

    :param security_id: "<ticker>.<exchange>" security identifier.
    :param date: trade date, 'yyyymmdd' or 'yyyy-mm-dd' (dashes are stripped).
    :param min_unit: bar size in minutes; NOTE(review): unused in this body.
    :param tick: if True, return the enriched tick DataFrame immediately.
    :param df_min: minute-bar DataFrame for the security (required when tick=False).
    :param df_bc_min: benchmark minute-bar DataFrame (required when tick=False).
    :param win_len: window length forwarded to _cal_min_features.
    :return: tick DataFrame when tick=True, otherwise the merged, cleaned
             minute-level feature DataFrame.
    '''
    logger.info('Start processing sec id:{0} for date:{1}'.format(security_id, date))
    # Full-day tick snapshot; DataAPI expects 'yyyymmdd', so dashes are stripped.
    df = DataAPI.MktTicksHistOneDayGet(securityID=security_id, date=date.replace('-', ''), startSecOffset="", endSecOffset="", field=u"", pandas="1")
    datatimes = list(df['dataTime'])
    # NOTE(review): total_vol is assigned but never used in this function.
    total_vol = list(df['value'])[-1]
    # 'HH:MM' label for every tick (seconds dropped).
    data_min = ['{0}:{1}'.format(item.split(':')[0], item.split(':')[1]) for item in datatimes]
    # Tick feature calculation: _cal_tick_features mutates df in place.
    df['barTime'] = data_min
    _cal_tick_features(df)
    if tick:
        return df
    # Per-bar vwap; assumes df_min has totalValue/totalVolume columns — confirm upstream schema.
    df_min['vwap'] = df_min['totalValue'] / df_min['totalVolume']
    # Positional column assignment: presumably df_bc_min rows line up 1:1 with df_min — TODO confirm.
    df_min['bcClosePrice'] = df_bc_min['closePrice']
    # Minute features accumulated from tick level.
    df_agg = _cal_min_features_by_ticks(df)
    # Minute-level features computed on df_min (in place, given no return is captured).
    _cal_min_features(df_min, win_len)
    # Keep only the minutes present in both the bar frame and the tick stream.
    common_min_lst = set(df_min['barTime']).intersection(set(data_min))
    df_min = df_min[df_min['barTime'].isin(common_min_lst)].sort_values(by='barTime', ascending=True)
    df_agg = df_agg[df_agg['barTime'].isin(common_min_lst)].sort_values(by='barTime', ascending=True)
    df_min = df_min.reset_index()
    df_agg = df_agg.reset_index()
    # Column-wise join; relies on both frames now sharing the same row order.
    df_min = pd.concat([df_min, df_agg], axis=1, ignore_index=False)
    df_min['volumePerDeal'] = df_min['totalVolume'] / df_min['deal_sum']
    df_min['valuePerDeal'] = df_min['totalValue'] / df_min['deal_sum']
    df_min = df_min.drop(REMOVE_MIN_COLS, axis=1)
    # Normalise +/-inf (from zero denominators) to NaN so dropna can remove them.
    df_min = df_min.replace(np.inf, np.nan)
    df_min = df_min.replace(-np.inf, np.nan)
    col_before_drop = df_min.columns
    # Drop the columns that are all None.
    df_min.dropna(axis=1, how='all', inplace=True)
    # Drop the rows that contain any None.
    df_min.dropna(axis=0, how='any', inplace=True)
    col_after_drop = df_min.columns
    if set(col_before_drop) - set(col_after_drop):
        logger.info('Drop the empty columns:{0}'.format(set(col_before_drop) - set(col_after_drop)))
    if not tick:
        return df_min
    else:
        # NOTE(review): this branch is unreachable — tick=True already returned
        # df above, so tick is always False here. Kept verbatim pending the TODO.
        # TODO refactor this session, this is to generate train_x and train_y for time series models(RNN)
        min_vwap = list(df_min['vwap'])
        min_vwap.insert(0, min_vwap[0])
        df_min['ret'] = (df_min['vwap'] / min_vwap[:-1] - 1) * 100
        dict_min_ret = dict(zip(df_min['barTime'], df_min['ret']))
        # NOTE(review): the set difference discards the original column order.
        columns = list(set(df.columns) - set(REMOVE_TICK_COLS))
        df.sort_values(by='dataTime', ascending=True, inplace=True)
        data_min = list(df['barTime'])
        df = df[columns]
        rows = list(df.values)
        train_x = []
        train_y = []
        # NOTE(review): _end is never used.
        _start, _end = 0, 0
        n_row = len(rows)
        total_row = 0
        for idx, val in enumerate(rows):
            hh, mm = data_min[idx].split(':')
            if int(hh) == 14 and int(mm) == 57:
                break  # stop before the closing call auction
            if int(hh) == 9 and int(mm) <= 30:
                _start = idx  # skip the opening auction; keep moving the window start
                continue
            if total_row >= 40:
                break  # cap samples per day
            if idx == n_row - 1:
                # Last row of the day closes the current window.
                if dict_min_ret.get('{0}:{1}'.format(hh, mm)):
                    train_y.append([dict_min_ret.get('{0}:{1}'.format(hh, mm))])
                    train_x.append(list(rows[_start: idx]))
                    total_row += 1
                _start = idx
            else:
                hh_, mm_ = data_min[idx + 1].split(':')
                # Close a window on the final tick of each 5-minute boundary.
                if int(mm) % 5 == 0 and int(mm_) != int(mm):
                    if dict_min_ret.get('{0}:{1}'.format(hh, mm)):
                        train_y.append([dict_min_ret.get('{0}:{1}'.format(hh, mm))])
                        train_x.append(list(rows[_start: idx + 1]))
                        total_row += 1
                    _start = idx + 1
        # NOTE(review): message says 'Start' but this logs completion — likely copy-paste.
        logger.info('Start processing sec id:{0} for date:{1}'.format(security_id, date))
        return train_x, train_y, columns