def _czce_df_read(url, skiprow, encode='utf-8'):
    """
    Fetch a Zhengzhou Commodity Exchange (CZCE) web page and parse its tables.

    Parameters
    ------
        url: page address, string
        skiprow: number of leading rows to skip in every table, int
        encode: encoding forwarded to ``requests_link``, string
    Return
    -------
        list of DataFrame, as returned by ``pandas.read_html``
        (parsed with ``header=0`` and ``index_col=0``)
    """
    r = requests_link(url, encode)
    # NOTE: ``tupleize_cols`` is deliberately not passed. The keyword was
    # removed from ``read_html`` in pandas 1.0, so supplying it raises
    # TypeError on current pandas; with a single header row it had no effect.
    data = pd.read_html(
        r.text,
        match='.+',
        flavor=None,
        header=0,
        index_col=0,
        skiprows=skiprow,
        attrs=None,
        parse_dates=False,
        thousands=', ',
        encoding="gbk",
        decimal='.',
        converters=None,
        na_values=None,
        keep_default_na=True,
    )
    return data
def get_cffex_rank_table(date=None, vars=cons.vars):
    """
    Fetch the member ranking details (volume / long / short open interest)
    published by the China Financial Futures Exchange (CFFEX).

    Parameters
    ------
        date: trading day, format YYYY-MM-DD or YYYYMMDD or a datetime.date
              object; today when None
        vars: list of variety codes; only the entries listed in
              cons.market_var['cffex'] are requested
        Data is available from 20100416 and is updated around 16:30 on each
        trading day.
    Return
    -------
        dict mapping contract symbol -> DataFrame (empty dict on a
        non-trading day). Each frame is the result of ``_tableCut_cal`` and is
        expected to contain:
            rank                        ranking                         int
            vol_party_name              member ranked by volume         string
            vol                         member's volume                 int
            vol_chg                     change of member's volume       int
            long_party_name             member ranked by long position  string
            long_openIntr               member's long open interest     int
            long_openIntr_chg           change of long open interest    int
            short_party_name            member ranked by short position string
            short_openIntr              member's short open interest    int
            short_openIntr_chg          change of short open interest   int
            symbol                      contract                        string
            var                         variety                         string
            date                        date                            string YYYYMMDD
    """
    vars = [i for i in vars if i in cons.market_var['cffex']]
    date = cons.convert_date(date) if date is not None else datetime.date.today()
    if date.strftime('%Y%m%d') not in calendar:
        print('%s非交易日' % date.strftime('%Y%m%d'))
        return {}

    D = {}
    for var in vars:
        url = cons.CFFEX_VOLRANK_URL % (date.strftime('%Y%m'), date.strftime('%d'), var)
        r = requests_link(url, encoding='gbk')
        if '网页错误' in r.text:
            # The exchange answered with its error page: nothing for this variety.
            continue

        # The CSV payload starts right after the '交易日,' header cell.
        parts = r.text.split('\n交易日,')
        if len(parts) < 2:
            # Marker missing: treat like "no data" instead of raising IndexError.
            continue

        table = pd.read_csv(StringIO(parts[1]))
        table = table.dropna(how='any')
        table = table.applymap(lambda x: x.strip() if isinstance(x, str) else x)

        for symbol in set(table['合约']):
            # Work on an independent copy so renaming columns never touches `table`.
            tableCut = table[table['合约'] == symbol].copy()
            tableCut.columns = ['symbol', 'rank'] + rank_columns
            tableCut = _tableCut_cal(tableCut, symbol)
            D[symbol] = tableCut.reset_index(drop=True)
    return D
def get_czce_rank_table(date=None, vars=cons.vars):
    """
    Fetch the top-20 member ranking details published by the Zhengzhou
    Commodity Exchange (CZCE).
    Note: the exchange publishes rankings both per variety and per contract.

    Parameters
    ------
        date: trading day, format YYYY-MM-DD or YYYYMMDD or a datetime.date
              object; today when None
        vars: list of variety codes such as RB, AL; only sections whose
              variety is in this list are returned.
              NOTE: for dates on or before 2010-08-25 the old page layout is
              parsed and `vars` is not applied.
        Data is available from 20050509 and is updated around 16:30 on each
        trading day.
    Return
    -------
        dict mapping symbol -> DataFrame (empty dict on a non-trading day or
        when the parsed table has fewer than 6 columns). For dates after
        2010-08-25 each frame is the result of ``_tableCut_cal`` and is
        expected to contain:
            rank                        ranking                         int
            vol_party_name              member ranked by volume         string
            vol                         member's volume                 int
            vol_chg                     change of member's volume       int
            long_party_name             member ranked by long position  string
            long_openIntr               member's long open interest     int
            long_openIntr_chg           change of long open interest    int
            short_party_name            member ranked by short position string
            short_openIntr              member's short open interest    int
            short_openIntr_chg          change of short open interest   int
            symbol                      contract                        string
            var                         variety                         string
            date                        date                            string YYYYMMDD
        For the old layout the frames carry the `rank_columns` columns plus
        `rank`, `symbol` and `var`.
    """
    date = cons.convert_date(date) if date is not None else datetime.date.today()
    if date.strftime('%Y%m%d') not in calendar:
        print('%s非交易日' % date.strftime('%Y%m%d'))
        return {}

    if date <= datetime.date(2010, 8, 25):
        # Old page layout: one HTML table per symbol, titles held in <b> tags.
        url = cons.CZCE_VOLRANK_URL_1 % (date.strftime('%Y%m%d'))
        data = _czce_df_read(url, skiprow=0)
        r = requests_link(url, 'utf-8')
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml', from_encoding="gb2312")
        symbols = []
        for link in soup.find_all('b'):
            strings = str(link).split(' ')
            if len(strings) > 5:
                try:
                    symbol = chinese_to_english(strings[4])
                except Exception:
                    # No translation available: keep the raw token as symbol.
                    symbol = strings[4]
                symbols.append(symbol)

        D = {}
        for i, symbol in enumerate(symbols):
            # data[0] is skipped: the i-th title is paired with table i + 1.
            tableCut = data[i + 1]
            tableCut.columns = rank_columns
            # Copy the slice so the .loc assignments below write to an
            # independent frame rather than to a view of data[i + 1].
            tableCut = tableCut.iloc[:-1, :].copy()
            tableCut.loc[:, 'rank'] = tableCut.index
            tableCut.loc['合计', 'rank'] = 999
            tableCut.loc[
                '合计',
                ['vol_party_name', 'long_party_name', 'short_party_name']
            ] = None
            tableCut.loc[:, 'symbol'] = symbol
            tableCut.loc[:, 'var'] = symbol2varietie(symbol)
            D[symbol] = tableCut.reset_index(drop=True)
        return D
    elif date <= datetime.date(2015, 11, 11):
        url = cons.CZCE_VOLRANK_URL_2 % (date.year, date.strftime('%Y%m%d'))
        data = _czce_df_read(url, skiprow=1)[1]
    elif date < datetime.date(2017, 12, 28):
        url = cons.CZCE_VOLRANK_URL_3 % (date.year, date.strftime('%Y%m%d'))
        data = _czce_df_read(url, skiprow=1)[0]
    else:
        url = cons.CZCE_VOLRANK_URL_3 % (date.year, date.strftime('%Y%m%d'))
        data = _czce_df_read(url, skiprow=0)[0]

    if len(data.columns) < 6:
        return {}

    # Independent copy of the first nine columns, so the assignments below do
    # not act on a view of `data`.
    table = data.iloc[:, :9].copy()
    table.columns = rank_columns
    table.loc[:, 'rank'] = table.index
    table[intColumns] = table[intColumns].astype(str)
    table[intColumns] = table[intColumns].applymap(
        lambda x: x.replace(',', ''))
    table = table.applymap(lambda x: 0 if x == '-' else x)

    # Index labels containing '合约' or '品种' mark the start of a section.
    # Non-string labels are ignored (the `in` test would raise TypeError).
    # The leading 0 stands for the first section, whose title is taken from
    # the index name instead of a row label.
    indexs = [
        label for label in table.index
        if isinstance(label, str) and ('合约' in label or '品种' in label)
    ]
    indexs.insert(0, 0)

    # Loop invariants hoisted out of the loop.
    title_pattern = re.compile(':(.*) ')
    rank_labels = {str(n) for n in range(21)}

    D = {}
    for i, start in enumerate(indexs):
        if start == 0:
            tableCut = table.loc[:indexs[i + 1], :]
            string = tableCut.index.name
        elif i < len(indexs) - 1:
            tableCut = table.loc[start:indexs[i + 1], :]
            string = tableCut.index[0]
        else:
            tableCut = table.loc[start:, :]
            string = tableCut.index[0]

        if 'PTA' in string:
            symbol = 'TA'
        else:
            try:
                symbol = chinese_to_english(
                    find_chinese(title_pattern.findall(string)[0]))
            except Exception:
                # Translation failed: use the text captured from the title.
                symbol = title_pattern.findall(string)[0]

        var = symbol2varietie(symbol)
        if var in vars:
            tableCut = tableCut.dropna(how='any').iloc[1:, :]
            # Keep only rows whose index label is one of '0' .. '20'.
            tableCut = tableCut.loc[
                [x for x in tableCut.index if x in rank_labels], :]
            tableCut = _tableCut_cal(tableCut, symbol)
            D[symbol] = tableCut.reset_index(drop=True)
    return D
def get_shfe_rank_table(date=None, vars=cons.vars):
    """
    Fetch the top-20 member ranking details published by the Shanghai Futures
    Exchange (SHFE).
    Note: the exchange only publishes rankings per contract inside each
    variety; it does not publish a total ranking per variety.

    Parameters
    ------
        date: trading day, format YYYY-MM-DD or YYYYMMDD or a datetime.date
              object; today when None
        vars: list of variety codes such as RB, AL; only varieties present in
              the downloaded data are returned
        Data is available from 20020107 and is updated around 16:30 on each
        trading day.
    Return
    -------
        dict mapping contract symbol -> DataFrame. An empty dict is returned on
        a non-trading day, when the response cannot be parsed as JSON, or when
        the payload has fewer than 3 columns. Each frame contains at least:
            rank                        ranking                         int
            vol_party_name              member ranked by volume         string
            vol                         member's volume                 int
            vol_chg                     change of member's volume       int
            long_party_name             member ranked by long position  string
            long_openIntr               member's long open interest     int
            long_openIntr_chg           change of long open interest    int
            short_party_name            member ranked by short position string
            short_openIntr              member's short open interest    int
            short_openIntr_chg          change of short open interest   int
            symbol                      contract                        string
            var                         variety                         string
        Any other field of the JSON payload that is not explicitly dropped
        below is kept as-is. No `date` column is added by this function.
    """
    date = cons.convert_date(date) if date is not None else datetime.date.today()
    if date.strftime('%Y%m%d') not in calendar:
        print('%s非交易日' % date.strftime('%Y%m%d'))
        return {}

    url = cons.SHFE_VOLRANK_URL % (date.strftime('%Y%m%d'))
    r = requests_link(url, 'utf-8')
    try:
        context = json.loads(r.text)
    except Exception:
        # Best effort: an unusable response means "no data", but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return {}

    df = pd.DataFrame(context['o_cursor'])
    df = df.rename(
        columns={
            'CJ1': 'vol',
            'CJ1_CHG': 'vol_chg',
            'CJ2': 'long_openIntr',
            'CJ2_CHG': 'long_openIntr_chg',
            'CJ3': 'short_openIntr',
            'CJ3_CHG': 'short_openIntr_chg',
            'PARTICIPANTABBR1': 'vol_party_name',
            'PARTICIPANTABBR2': 'long_party_name',
            'PARTICIPANTABBR3': 'short_party_name',
            'PRODUCTNAME': 'product1',
            'RANK': 'rank',
            'INSTRUMENTID': 'symbol',
            'PRODUCTSORTNO': 'product2',
        })
    if len(df.columns) < 3:
        return {}

    # Normalise text cells: trim whitespace, then turn empty strings into None.
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df = df.applymap(lambda x: None if x == '' else x)
    df['var'] = df['symbol'].apply(symbol2varietie)
    # Only rows with a positive rank are kept.
    df = df[df['rank'] > 0]

    # Drop helper columns when present; missing ones are simply ignored.
    unwanted = [
        'PARTICIPANTID1', 'PARTICIPANTID2', 'PARTICIPANTID3', 'product1',
        'product2'
    ]
    df = df.drop([col for col in unwanted if col in df.columns], axis=1)

    # Build the lookup once instead of re-creating a list for every variety.
    available_vars = set(df['var'])
    D = {}
    for var in vars:
        if var not in available_vars:
            continue
        df_var = df[df['var'] == var]
        for symbol in set(df_var['symbol']):
            df_symbol = df_var[df_var['symbol'] == symbol]
            D[symbol] = df_symbol.reset_index(drop=True)
    return D