def legacy_kdata_to_csv():
    """Convert legacy per-quarter sina json kdata files into csv files.

    Skips aggregate ('all') files and csvs that already exist. File names are
    expected to look like '<year>_<quarter>_… .json'.
    """
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            # NOTE: renamed from `dir` — don't shadow the builtin
            the_dir = get_kdata_dir_old(security_item, fuquan)
            if not os.path.exists(the_dir):
                continue
            files = [os.path.join(the_dir, f) for f in os.listdir(the_dir)
                     if ('all' not in f and 'json' in f
                         and os.path.isfile(os.path.join(the_dir, f)))]
            for the_file in files:
                tmp = os.path.basename(the_file).split('_')
                # both branches shared identical read/convert/write logic;
                # only the fuquan flag and target columns differ
                csv_path = get_kdata_path(security_item, tmp[0], tmp[1],
                                          'hfq' if fuquan else 'bfq')
                if os.path.exists(csv_path):
                    continue
                df = pd.read_json(the_file, dtype={'code': str})
                logger.info("{} to {}".format(the_file, csv_path))
                if fuquan:
                    df = df.loc[:, ['timestamp', 'code', 'low', 'open', 'close',
                                    'high', 'volume', 'turnover', 'securityId',
                                    'fuquan']]
                    df.columns = KDATA_COLUMN_SINA_FQ
                else:
                    df = df.loc[:, KDATA_COLUMN_SINA]
                df.to_csv(csv_path, index=False)
def legacy_kdata_to_csv():
    """Convert legacy per-quarter json kdata files into csv files.

    Only non-aggregate json files ('all' excluded) are converted, and an
    existing csv is never overwritten.
    """
    for index, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            kdata_dir = get_kdata_dir_old(security_item, fuquan)
            if not os.path.exists(kdata_dir):
                continue
            json_files = [os.path.join(kdata_dir, name)
                          for name in os.listdir(kdata_dir)
                          if 'all' not in name and 'json' in name
                          and os.path.isfile(os.path.join(kdata_dir, name))]
            for json_file in json_files:
                # file name pattern: <year>_<quarter>[_...].json
                parts = os.path.basename(json_file).split('_')
                fuquan_flag = 'hfq' if fuquan else 'bfq'
                csv_path = get_kdata_path(security_item, parts[0], parts[1],
                                          fuquan_flag)
                if os.path.exists(csv_path):
                    continue
                df = pd.read_json(json_file, dtype={'code': str})
                logger.info("{} to {}".format(json_file, csv_path))
                if fuquan:
                    df = df.loc[:, ['timestamp', 'code', 'low', 'open', 'close',
                                    'high', 'volume', 'turnover', 'securityId',
                                    'fuquan']]
                    df.columns = KDATA_COLUMN_FQ
                else:
                    df = df.loc[:, KDATA_COLUMN]
                df.to_csv(csv_path, index=False)
def yield_request(self, item, trading_dates=None, fuquan=None):
    """Yield sina day-k download requests for one security.

    trading_dates: explicit dates forcing a re-download of their quarters;
    when omitted, all quarters since the list date are considered.
    fuquan: restrict to one of {'bfq', 'hfq'}; default downloads both.
    """
    # BUG FIX: the default was a mutable `[]`, shared across calls; None with
    # the same falsy check preserves behavior and is backward-compatible
    the_quarters = []
    force_download = False
    if trading_dates:
        force_download = True
        for the_date in trading_dates:
            the_quarters.append(get_year_quarter(the_date))
    else:
        the_quarters = get_quarters(item['listDate'])
    the_quarters = set(the_quarters)
    if fuquan:
        fuquans = [fuquan]
    else:
        fuquans = ['bfq', 'hfq']
    # get day k data
    for year, quarter in the_quarters:
        for fuquan in fuquans:
            data_path = get_kdata_path(item, source='sina', year=year,
                                       quarter=quarter, fuquan=fuquan)
            data_exist = os.path.exists(data_path) or kdata_exist(
                item, year, quarter, fuquan, source='sina')
            if not data_exist or force_download:
                url = self.get_k_data_url(item['code'], year, quarter, fuquan)
                yield Request(url=url, headers=DEFAULT_KDATA_HEADER,
                              meta={'path': data_path, 'item': item,
                                    'fuquan': fuquan},
                              callback=self.download_day_k_data)
def check_convert_result():
    """Verify the merged day-k csv equals the concatenation of the per-period csvs."""
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                continue
            df_result = pd.read_csv(dayk_path)
            columns = (data_contract.KDATA_COLUMN_FQ if fuquan == 'hfq'
                       else data_contract.KDATA_COLUMN)
            df = pd.DataFrame(columns=columns)
            kdata_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if os.path.exists(kdata_dir):
                csv_files = [os.path.join(kdata_dir, name)
                             for name in os.listdir(kdata_dir)
                             if 'day' not in name and 'csv' in name
                             and os.path.isfile(os.path.join(kdata_dir, name))]
                for csv_file in csv_files:
                    df = df.append(pd.read_csv(csv_file), ignore_index=True)
                assert_df(df, df_result)
                logger.info("{} merge as one ok".format(security_item['code']))
def get_kdata(security_item, the_date=None, start_date=None, end_date=None,
              fuquan='bfq', dtype=None, source='163', level='day'):
    """Load the kdata csv for a security and return it indexed by timestamp.

    security_item may be a security item, an id string ('stock…') or a code.
    Returns an empty DataFrame when the csv does not exist, a single row for
    the_date, otherwise the [start_date, end_date] slice.
    """
    if type(security_item) == str:
        # ids contain 'stock'; anything else is treated as a bare code
        if 'stock' in security_item:
            security_item = get_security_item(id=security_item)
        else:
            security_item = get_security_item(code=security_item)
    the_path = files_contract.get_kdata_path(security_item, source=source,
                                             fuquan=fuquan)
    if not os.path.isfile(the_path):
        return pd.DataFrame()
    df = pd.read_csv(the_path, dtype=dtype if dtype else {"code": str})
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()
    if the_date:
        return df.loc[the_date] if the_date in df.index else pd.DataFrame()
    begin = start_date if start_date else security_item['listDate']
    finish = end_date if end_date else datetime.datetime.today()
    return df.loc[begin:finish]
def yield_request(self, item, start_date=None, end_date=None):
    """Yield a 163 day-k download request for one security.

    With no explicit range the request is only issued when no csv exists yet;
    an explicit start/end always triggers a (re-)download.
    """
    data_path = get_kdata_path(item, source='163')
    start = (start_date.strftime('%Y%m%d') if start_date
             else item['listDate'].replace('-', ''))
    end = (end_date.strftime('%Y%m%d') if end_date
           else datetime.today().strftime('%Y%m%d'))
    if os.path.exists(data_path) and not start_date and not end_date:
        return
    exchange_flag = 0 if item['exchange'] == 'sh' else 1
    url = self.get_k_data_url(exchange_flag, item['code'], start, end)
    yield Request(url=url,
                  meta={'path': data_path, 'item': item},
                  callback=self.download_day_k_data)
def yield_request(self, item, trading_dates=None, fuquan=None):
    """Yield sina day-k download requests for one security.

    trading_dates: explicit dates whose quarters are force re-downloaded;
    defaults to every quarter since the list date.
    fuquan: restrict to one of {'bfq', 'hfq'}; default downloads both.
    """
    # BUG FIX: mutable default argument `trading_dates=[]` replaced with None;
    # the falsy check below keeps the behavior identical
    the_quarters = []
    force_download = False
    if trading_dates:
        force_download = True
        for the_date in trading_dates:
            the_quarters.append(get_year_quarter(the_date))
    else:
        the_quarters = get_quarters(item['listDate'])
    the_quarters = set(the_quarters)
    if fuquan:
        fuquans = [fuquan]
    else:
        fuquans = ['bfq', 'hfq']
    # get day k data
    for year, quarter in the_quarters:
        for fuquan in fuquans:
            data_path = get_kdata_path(item, source='sina', year=year,
                                       quarter=quarter, fuquan=fuquan)
            data_exist = os.path.exists(data_path) or kdata_exist(
                item, year, quarter, fuquan, source='sina')
            if not data_exist or force_download:
                url = self.get_k_data_url(item['code'], year, quarter, fuquan)
                yield Request(url=url, headers=DEFAULT_KDATA_HEADER,
                              meta={'path': data_path, 'item': item,
                                    'fuquan': fuquan},
                              callback=self.download_day_k_data)
def spider_closed(self, spider, reason):
    """Attach identity columns to the pe frame, persist it, and log shutdown."""
    item = self.security_item
    self.df_pe['close'] = self.df_close['close']
    # column assignment order fixed: it determines the csv column order
    for col, key in (('code', 'code'), ('securityId', 'id'), ('name', 'name')):
        self.df_pe[col] = item[key]
    self.df_pe.to_csv(get_kdata_path(item), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
def spider_closed(self, spider, reason):
    """Finalize the pe frame with security identity columns and write it out."""
    pe_frame = self.df_pe
    pe_frame['close'] = self.df_close['close']
    pe_frame['code'] = self.security_item['code']
    pe_frame['securityId'] = self.security_item['id']
    pe_frame['name'] = self.security_item['name']
    target = get_kdata_path(self.security_item)
    pe_frame.to_csv(target, index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
def add_factor_to_163(security_item):
    """Copy the hfq factor column from the sina kdata into the 163 csv.

    Skips the work when the 163 csv already has a fully populated factor
    column. Assignment aligns on the (timestamp) index of both frames.
    """
    path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
    df_163 = pd_read_csv(path_163)
    if 'factor' in df_163.columns:
        df = df_163[df_163['factor'].isna()]
        if df.empty:
            # BUG FIX: the message used a str.format placeholder but passed the
            # code as a logging %-arg, so '{}' was never interpolated
            logger.info("{} 163 factor is ok".format(security_item['code']))
            return
    path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
    df_sina = pd_read_csv(path_sina)
    # keep only the first row per index value before index-aligned assignment
    df_sina = df_sina[~df_sina.index.duplicated(keep='first')]
    df_163['factor'] = df_sina['factor']
    df_163.to_csv(path_163, index=False)
def add_factor_to_163(security_item):
    """Copy the hfq factor column from the sina kdata into the 163 csv.

    No-op when the 163 csv already has a fully populated factor column.
    """
    path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
    df_163 = pd.read_csv(path_163, dtype=str)
    df_163 = time_index_df(df_163)
    if 'factor' in df_163.columns:
        df = df_163[df_163['factor'].isna()]
        if df.empty:
            # BUG FIX: '{}' placeholder was combined with logging-style args
            # and never interpolated; use str.format explicitly
            logger.info("{} 163 factor is ok".format(security_item['code']))
            return
    path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
    df_sina = pd.read_csv(path_sina, dtype=str)
    df_sina = time_index_df(df_sina)
    # index-aligned assignment: rows match on the timestamp index
    df_163['factor'] = df_sina['factor']
    df_163.to_csv(path_163, index=False)
def add_factor_to_163(security_item):
    """Copy the hfq factor column from the sina kdata into the 163 csv.

    Returns early when the 163 csv already carries a complete factor column.
    """
    path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
    df_163 = pd.read_csv(path_163, dtype=str)
    df_163 = time_index_df(df_163)
    if 'factor' in df_163.columns:
        df = df_163[df_163['factor'].isna()]
        if df.empty:
            # BUG FIX: format placeholder was never filled (mixed str.format
            # template with %-style logging arguments)
            logger.info("{} 163 factor is ok".format(security_item['code']))
            return
    path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
    df_sina = pd.read_csv(path_sina, dtype=str)
    df_sina = time_index_df(df_sina)
    # index-aligned assignment: rows match on the timestamp index
    df_163['factor'] = df_sina['factor']
    df_163.to_csv(path_163, index=False)
def fetch_kdata(exchange_str='bitstamp'):
    """Fetch daily OHLCV bars for the configured cryptocurrency pairs on one exchange.

    Only pairs listed in CRYPTOCURRENCY_PAIR are fetched; bars are collected
    up to yesterday and appended to the existing kdata csv.
    """
    # NOTE: eval on a config-provided exchange name — trusted input only
    ccxt_exchange = eval("ccxt.{}()".format(exchange_str))
    if not ccxt_exchange.has['fetchOHLCV']:
        logger.warning("exchange:{} not support fetchOHLCV".format(exchange_str))
        return
    for _, security_item in get_security_list(
            security_type='cryptocurrency', exchanges=[exchange_str]).iterrows():
        try:
            if security_item['name'] not in CRYPTOCURRENCY_PAIR:
                continue
            start_date, df = get_latest_download_trading_date(security_item)
            # daily bars are only fetched up to yesterday
            end_date = pd.Timestamp.today() - pd.DateOffset(1)
            if start_date and (start_date > end_date):
                logger.info("{} kdata is ok".format(security_item['code']))
                continue
            try:
                kdatas = ccxt_exchange.fetch_ohlcv(security_item['name'],
                                                   timeframe='1d')
                # for rateLimit
                time.sleep(5)
            except Exception:
                # BUG FIX: logger.exception was given the exception object as a
                # stray positional (format) argument; the traceback is logged
                # automatically by logger.exception
                logger.exception("fetch_kdata for {} {} failed".format(
                    exchange_str, security_item['name']))
                continue
            for kdata in kdatas:
                timestamp = pd.Timestamp.fromtimestamp(int(kdata[0] / 1000))
                # today's bar is still forming — skip it
                if is_same_date(timestamp, pd.Timestamp.today()):
                    continue
                kdata_json = {
                    'timestamp': to_time_str(timestamp),
                    'code': security_item['code'],
                    'name': security_item['name'],
                    'open': kdata[1],
                    'high': kdata[2],
                    'low': kdata[3],
                    'close': kdata[4],
                    'volume': kdata[5],
                    'securityId': security_item['id'],
                    'preClose': None,
                    'change': None,
                    'changePct': None
                }
                df = df.append(kdata_json, ignore_index=True)
            if not df.empty:
                df = df.loc[:, KDATA_COMMON_COL]
                kdata_df_save(df, get_kdata_path(security_item),
                              calculate_change=True)
                logger.info("fetch_kdata for exchange:{} security:{} success".format(
                    exchange_str, security_item['name']))
        except Exception:
            # BUG FIX: was logger.info with a 2-placeholder format fed 3 args,
            # silently dropping the error; log at exception level instead
            logger.exception("fetch_kdata for exchange:{} security:{} failed".format(
                exchange_str, security_item['name']))
def restore_kdata():
    """Normalize saved 163 and sina kdata csvs for a code range.

    Drops the legacy 'id' column, removes duplicate timestamps and rewrites
    timestamps as canonical time strings.
    """
    for index, security_item in get_security_list(start_code='600000',
                                                  end_code='600017').iterrows():
        path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
        df = pd.read_csv(path_163, dtype=str)
        df = time_index_df(df)
        if 'id' in df.columns:
            df = df.drop(['id'], axis=1)
        df = df[~df.index.duplicated(keep='first')]
        # BUG FIX: the normalized timestamps were computed but never assigned
        # back (the sina branch below did assign them)
        df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
        df.to_csv(path_163, index=False)
        for fuquan in ('hfq', 'bfq'):
            path_sina = get_kdata_path(security_item, source='sina',
                                       fuquan=fuquan)
            df = pd.read_csv(path_sina, dtype=str)
            df = time_index_df(df)
            if 'id' in df.columns:
                df = df.drop(['id'], axis=1)
            df = df[~df.index.duplicated(keep='first')]
            df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
            df.to_csv(path_sina, index=False)
def merge_to_current_kdata(security_item, df, fuquan='bfq'):
    """Merge new rows into the stored sina kdata csv.

    On duplicate timestamps the incoming rows win ('keep=last' after append).
    """
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()
    current = get_kdata(security_item, source='sina', fuquan=fuquan, dtype=str)
    merged = current.append(df)
    merged = merged.drop_duplicates(subset='timestamp', keep='last')
    merged = merged.sort_index()
    target_path = files_contract.get_kdata_path(security_item, source='sina',
                                                fuquan=fuquan)
    merged.to_csv(target_path, index=False)
def merge_to_current_kdata(security_item, df, fuquan='bfq'):
    """Append new rows to the stored sina kdata; newer rows replace duplicates."""
    incoming = df.set_index(df['timestamp'], drop=False)
    incoming.index = pd.to_datetime(incoming.index)
    incoming = incoming.sort_index()
    stored = get_kdata(security_item, source='sina', fuquan=fuquan, dtype=str)
    stored = stored.append(incoming)
    stored = stored.drop_duplicates(subset='timestamp', keep='last')
    stored = stored.sort_index()
    out_path = files_contract.get_kdata_path(security_item, source='sina',
                                             fuquan=fuquan)
    stored.to_csv(out_path, index=False)
def check_result():
    """Warn about securities that are missing day-k csvs or tick files."""
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                logger.warn(get_security_dir(security_item))
        tick_dir = get_tick_dir(security_item)
        if os.path.exists(tick_dir):
            tick_files = [os.path.join(tick_dir, name)
                          for name in os.listdir(tick_dir)
                          if 'csv' in name
                          and os.path.isfile(os.path.join(tick_dir, name))]
            if not tick_files:
                logger.warn(get_security_dir(security_item))
def check_result():
    """Flag securities whose day-k csv or tick data is missing."""
    for index, security_item in get_security_list().iterrows():
        # each fuquan flavor must have its own merged day-k csv
        for fuquan in ('bfq', 'hfq'):
            if not os.path.exists(get_kdata_path(security_item, fuquan=fuquan)):
                logger.warn(get_security_dir(security_item))
        the_dir = get_tick_dir(security_item)
        if os.path.exists(the_dir):
            has_csv = any('csv' in name
                          and os.path.isfile(os.path.join(the_dir, name))
                          for name in os.listdir(the_dir))
            if not has_csv:
                logger.warn(get_security_dir(security_item))
def merge_kdata_to_one(security_item=None, replace=False, fuquan='bfq'):
    """Merge per-quarter sina kdata csvs into a single day-k csv per security.

    security_item: restrict the merge to one security; default merges all.
    replace: overwrite the day-k csv instead of merging into it.
    """
    # BUG FIX: the original compared type(security_item) with the *string*
    # 'NoneType', which is always True, making the full-list branch unreachable
    if security_item is not None:
        items = pd.DataFrame().append(security_item).iterrows()
    else:
        items = get_security_list().iterrows()
    if fuquan:
        fuquans = [fuquan]
    else:
        fuquans = ['bfq', 'hfq']
    for index, security_item in items:
        for fuquan in fuquans:
            dayk_path = get_kdata_path(security_item, source='sina',
                                       fuquan=fuquan)
            if fuquan == 'hfq':
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA_FQ)
            else:
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA)
            the_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if os.path.exists(the_dir):
                files = [os.path.join(the_dir, f) for f in os.listdir(the_dir)
                         if ('dayk.csv' not in f
                             and os.path.isfile(os.path.join(the_dir, f)))]
                for f in files:
                    df = df.append(pd.read_csv(f, dtype=str), ignore_index=True)
                if df.size > 0:
                    df = df.set_index(df['timestamp'])
                    df.index = pd.to_datetime(df.index)
                    df = df.sort_index()
                    logger.info("{} to {}".format(security_item['code'],
                                                  dayk_path))
                    if replace:
                        df.to_csv(dayk_path, index=False)
                    else:
                        StockKDataSinaSpider.merge_to_current_kdata(
                            security_item, df, fuquan=fuquan)
                # per-quarter files are consumed by the merge
                for f in files:
                    logger.info("remove {}".format(f))
                    os.remove(f)
                if fuquan == 'hfq':
                    StockKDataSinaSpider.add_factor_to_163(security_item)
def yield_request(self, item, the_years=None):
    """Yield per-year 163 day-k requests for one security.

    the_years: explicit years to fetch; default covers list year (or 2005
    when the list date is unknown) through the current year.
    """
    data_path = get_kdata_path(item, source='163')
    if not the_years:
        if not pd.isna(item['listDate']):
            # 163 could just provide the date after year 2002
            # BUG FIX: listDate is a 'YYYY-MM-DD' string elsewhere in this
            # module, so int() on the whole value raises — parse the year part
            list_year = int(str(item['listDate'])[:4])
            the_years = range(max(list_year, 2002),
                              pd.Timestamp.today().year + 1)
        else:
            the_years = range(2005, pd.Timestamp.today().year + 1)
    for the_year in the_years:
        url = self.get_k_data_url(the_year, item['code'])
        yield Request(url=url,
                      meta={'path': data_path, 'item': item},
                      callback=self.download_day_k_data)
def merge_to_current_kdata(security_item, df, fuquan='bfq'):
    """Merge new rows into the stored sina kdata, trimming to the canonical columns."""
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()
    merged = get_kdata(security_item, source='sina', fuquan=fuquan)
    merged = merged.append(df)
    merged = merged.drop_duplicates(subset='timestamp', keep='last')
    merged = merged.sort_index()
    out_path = files_contract.get_kdata_path(security_item, source='sina',
                                             fuquan=fuquan)
    columns = (data_contract.KDATA_COLUMN_SINA_FQ if fuquan == 'hfq'
               else data_contract.KDATA_COLUMN_SINA)
    merged = merged.loc[:, columns]
    merged.to_csv(out_path, index=False)
def start_requests(self):
    """Yield ths day-k download requests for every security, hfq and bfq."""
    for _, item in get_security_list().iterrows():
        for fuquan in ['hfq', 'bfq']:
            data_path = get_kdata_path(item, fuquan=fuquan, source='ths')
            data_exist = os.path.isfile(data_path)
            # NOTE(review): 'or True' forces a download on every run and makes
            # the else branch dead code — looks like a debug leftover; confirm
            # before removing (kept to preserve behavior)
            if not data_exist or True:
                # get day k data
                flag = 2 if fuquan == 'hfq' else 0
                url = self.get_k_data_url(item['code'], flag)
                yield Request(url=url, headers=TONGHUASHUN_KDATA_HEADER,
                              meta={'path': data_path, 'item': item,
                                    'fuquan': fuquan},
                              callback=self.download_day_k_data)
            else:
                self.logger.info("{} kdata existed".format(item['code']))
def start_requests(self):
    """Queue ths day-k downloads for all securities in both fuquan flavors."""
    for _, item in get_security_list().iterrows():
        for fuquan in ['hfq', 'bfq']:
            data_path = get_kdata_path(item, fuquan=fuquan, source='ths')
            data_exist = os.path.isfile(data_path)
            # NOTE(review): the 'or True' makes this condition always true and
            # the else branch unreachable — likely a debugging leftover; kept
            # as-is to preserve behavior, confirm before cleaning up
            if not data_exist or True:
                if fuquan == 'hfq':
                    flag = 2
                else:
                    flag = 0
                url = self.get_k_data_url(item['code'], flag)
                meta = {'path': data_path, 'item': item, 'fuquan': fuquan}
                yield Request(url=url, headers=TONGHUASHUN_KDATA_HEADER,
                              meta=meta, callback=self.download_day_k_data)
            else:
                self.logger.info("{} kdata existed".format(item['code']))
def merge_kdata_to_one(security_item=None, replace=False, fuquan='bfq'):
    """Merge per-quarter sina kdata csvs into one day-k csv per security.

    security_item: restrict to a single security; default processes them all.
    replace: overwrite the day-k csv instead of merging into the current one.
    """
    # BUG FIX: `type(security_item) != 'NoneType'` compares a type object to a
    # string and is always True; use an identity check against None instead
    if security_item is not None:
        items = pd.DataFrame().append(security_item).iterrows()
    else:
        items = get_security_list().iterrows()
    if fuquan:
        fuquans = [fuquan]
    else:
        fuquans = ['bfq', 'hfq']
    for index, security_item in items:
        for fuquan in fuquans:
            dayk_path = get_kdata_path(security_item, source='sina',
                                       fuquan=fuquan)
            if fuquan == 'hfq':
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_FQ)
            else:
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN)
            the_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if os.path.exists(the_dir):
                files = [os.path.join(the_dir, f) for f in os.listdir(the_dir)
                         if ('dayk.csv' not in f
                             and os.path.isfile(os.path.join(the_dir, f)))]
                for f in files:
                    df = df.append(pd.read_csv(f, dtype=str), ignore_index=True)
                if df.size > 0:
                    df = df.set_index(df['timestamp'])
                    df.index = pd.to_datetime(df.index)
                    df = df.sort_index()
                    logger.info("{} to {}".format(security_item['code'],
                                                  dayk_path))
                    if replace:
                        df.to_csv(dayk_path, index=False)
                    else:
                        merge_to_current_kdata(security_item, df, fuquan=fuquan)
                # the per-quarter sources are consumed once merged
                for f in files:
                    logger.info("remove {}".format(f))
                    os.remove(f)
                if fuquan == 'hfq':
                    add_factor_to_163(security_item)
def yield_request(self, item, the_years=None):
    """Yield per-year 163 day-k requests for one security.

    the_years: explicit years to fetch; default spans list year (2005 when
    the list date is missing) through the current year.
    """
    data_path = get_kdata_path(item, source='163')
    if not the_years:
        if not pd.isna(item['listDate']):
            # 163 could just provide the date after year 2002
            # BUG FIX: listDate is formatted 'YYYY-MM-DD' elsewhere in this
            # module; int() on it raises ValueError — extract the year first
            list_year = int(str(item['listDate'])[:4])
            the_years = range(max(list_year, 2002),
                              pd.Timestamp.today().year + 1)
        else:
            the_years = range(2005, pd.Timestamp.today().year + 1)
    for the_year in the_years:
        url = self.get_k_data_url(the_year, item['code'])
        yield Request(url=url,
                      meta={'path': data_path, 'item': item},
                      callback=self.download_day_k_data)
def yield_request(self, item, start_date=None, end_date=None):
    """Yield a 163 day-k request; explicit ranges always force a download."""
    data_path = get_kdata_path(item, source='163')
    if start_date:
        start = start_date.strftime('%Y%m%d')
    else:
        # listDate is 'YYYY-MM-DD'; 163 wants 'YYYYMMDD'
        start = item['listDate'].replace('-', '')
    if end_date:
        end = end_date.strftime('%Y%m%d')
    else:
        end = datetime.today().strftime('%Y%m%d')
    should_fetch = (not os.path.exists(data_path)) or start_date or end_date
    if should_fetch:
        exchange_flag = 0 if item['exchange'] == 'sh' else 1
        url = self.get_k_data_url(exchange_flag, item['code'], start, end)
        yield Request(url=url, meta={'path': data_path, 'item': item},
                      callback=self.download_day_k_data)
def get_kdata(security_item, the_date=None, start_date=None, end_date=None,
              fuquan='bfq', dtype=None, source='163', level='day'):
    """Load a security's kdata csv, indexed by timestamp.

    Accepts a security item, an id string ('stock…') or a bare code. Returns
    a single row for the_date, a date-range slice otherwise, or an empty
    DataFrame when the csv is missing.
    """
    if type(security_item) == str:
        if 'stock' in security_item:
            security_item = get_security_item(id=security_item)
        else:
            security_item = get_security_item(code=security_item)
    the_path = files_contract.get_kdata_path(security_item, source=source,
                                             fuquan=fuquan)
    if not os.path.isfile(the_path):
        return pd.DataFrame()
    if not dtype:
        dtype = {"code": str, 'timestamp': str}
    df = pd.read_csv(the_path, dtype=dtype)
    df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()
    if the_date:
        return df.loc[the_date] if the_date in df.index else pd.DataFrame()
    if not start_date:
        # securities with no list date (NaN) fall back to 2002-01-01
        if type(security_item['listDate']) != str and np.isnan(
                security_item['listDate']):
            start_date = '2002-01-01'
        else:
            start_date = security_item['listDate']
    if not end_date:
        end_date = datetime.datetime.today()
    if start_date and end_date:
        df = df.loc[start_date:end_date]
    return df
def check_convert_result():
    """Check that each merged day-k csv matches the concat of its source csvs."""
    for index, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                continue
            df_result = pd.read_csv(dayk_path)
            if fuquan == 'hfq':
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_FQ)
            else:
                df = pd.DataFrame(columns=data_contract.KDATA_COLUMN)
            src_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if os.path.exists(src_dir):
                src_files = [os.path.join(src_dir, name)
                             for name in os.listdir(src_dir)
                             if 'day' not in name and 'csv' in name
                             and os.path.isfile(os.path.join(src_dir, name))]
                for src_file in src_files:
                    df = df.append(pd.read_csv(src_file), ignore_index=True)
                assert_df(df, df_result)
                logger.info("{} merge as one ok".format(security_item['code']))
def spider_closed(self, spider, reason):
    """Trim the accumulated index kdata to the canonical columns and persist it."""
    trimmed = self.current_df.loc[:, KDATA_COLUMN_INDEX]
    self.current_df = trimmed
    print(trimmed)
    trimmed.to_csv(get_kdata_path(item=self.security_item), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
def parse_shfe_data(force_parse=False):
    """Parse cached SHFE .xls archives into per-contract future kdata csvs.

    Zip caches are unzipped first (each zip is expected to hold exactly one
    .xls, renamed next to the zip). With force_parse=True every .xls in the
    cache dir is re-parsed, not only the freshly unzipped ones.
    """
    the_dir = get_exchange_cache_dir(security_type='future', exchange='shfe')
    need_parse_files = []
    # unzip any archive that has no corresponding .xls next to it yet
    for the_zip_file in [
        os.path.join(the_dir, f) for f in os.listdir(the_dir)
        if f.endswith('.zip')
    ]:
        dst_file = the_zip_file.replace('.zip', ".xls")
        if not os.path.exists(dst_file):
            dst_dir = the_zip_file.replace('.zip', "")
            os.makedirs(dst_dir)
            unzip(the_zip_file, dst_dir)
            files = [
                os.path.join(dst_dir, f) for f in os.listdir(dst_dir)
                if f.endswith('.xls')
            ]
            # only rename when the zip held exactly one sheet, as expected
            if len(files) == 1:
                os.rename(files[0], dst_file)
            need_parse_files.append(dst_file)
    if force_parse:
        need_parse_files = [
            os.path.join(the_dir, f) for f in os.listdir(the_dir)
            if f.endswith('.xls')
        ]
    for the_file in need_parse_files:
        logger.info("parse {}".format(the_file))
        # skiprows/skip_footer trim the exchange's header and summary rows;
        # the contract column ('合约') becomes the index
        df = pd.read_excel(the_file, skiprows=2, skip_footer=4,
                           index_col='合约', converters={'日期': str})
        # the contract cell is only filled on its first row — forward-fill it
        df.index = pd.Series(df.index).fillna(method='ffill')
        df = df.loc[:, [
            '日期', '前收盘', '前结算', '开盘价', '最高价', '最低价', '收盘价',
            '结算价', '涨跌1', '涨跌2', '成交量', '成交金额', '持仓量'
        ]]
        df.columns = [
            'timestamp', 'preClose', 'preSettlement', 'open', 'high', 'low',
            'close', 'settlement', 'change', 'change1', 'volume', 'turnover',
            'openInterest'
        ]
        # unify the date format to ease importing into es
        # df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
        unique_index = df.index.drop_duplicates()
        security_list = get_security_list(security_type='future',
                                          exchanges=['shfe'])
        for the_contract in unique_index:
            logger.info("start handling {} in {}".format(
                the_contract, the_file))
            security_item = {
                'code': the_contract,
                'name': get_future_name(the_contract),
                'id': 'future_{}_{}'.format('shfe', the_contract),
                'exchange': 'shfe',
                'type': 'future'
            }
            # save the contract meta if it is not listed yet
            if (not security_list.empty) and ('code' in security_list.columns):
                security_list = security_list.set_index(security_list['code'],
                                                        drop=False)
                if the_contract not in security_list.index:
                    security_list = security_list.append(security_item,
                                                         ignore_index=True)
                    security_list = security_list.sort_index()
                    security_list.to_csv(get_security_list_path(
                        'future', 'shfe'), index=False)
            the_df = df.loc[the_contract, ]
            the_df['code'] = the_contract
            the_df['name'] = get_future_name(the_contract)
            the_df['securityId'] = 'future_{}_{}'.format('shfe', the_contract)
            the_df['changePct'] = the_df['change'] / the_df['preClose']
            the_df['changePct1'] = the_df['change1'] / the_df['preSettlement']
            kdata_path = get_kdata_path(item=security_item, source='exchange')
            # TODO: this logic should be handled in one shared place
            kdata_dir = get_kdata_dir(item=security_item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)
            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
            else:
                saved_df = pd.DataFrame()
            saved_df = saved_df.append(the_df, ignore_index=True)
            saved_df = saved_df.loc[:, KDATA_FUTURE_COL]
            if not saved_df.empty:
                kdata_df_save(saved_df, kdata_path)
            logger.info("end handling {} in {}".format(
                the_contract, the_file))
def parse_shfe_day_data(force_parse=False):
    """Parse cached SHFE daily json snapshots into per-contract kdata csvs.

    Already-parsed dates are tracked in a 'parsed' json file inside the cache
    dir; force_parse=True re-processes every cached date.
    """
    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='shfe',
                                       the_year=datetime.datetime.today().year,
                                       data_type="day_kdata")
    the_parsed_path = os.path.join(cache_dir, 'parsed')
    the_parsed = []
    if os.path.exists(the_parsed_path):
        with open(the_parsed_path) as data_file:
            the_parsed = json.load(data_file)
    if force_parse:
        the_dates = [f for f in os.listdir(cache_dir) if f != 'parsed' and f]
    else:
        the_dates = [f for f in os.listdir(cache_dir)
                     if f != 'parsed' and f not in the_parsed]
    for the_date in the_dates:
        the_path = os.path.join(cache_dir, the_date)
        logger.info("start handling {}".format(the_path))
        with open(the_path, 'r', encoding='UTF8') as f:
            tmp_str = f.read()
            the_json = json.loads(tmp_str)
            the_datas = the_json['o_curinstrument']
            # columns: date, code, name, low, open, close, high, volume(lots),
            # turnover(yuan), id, preClose, change, changePct(%), openInterest,
            # settlement, preSettlement, change vs settlement, changePct vs it
            KDATA_COLUMN_FUTURE = [
                'timestamp', 'code', 'name', 'low', 'open', 'close', 'high',
                'volume', 'turnover', 'securityId', 'preClose', 'change',
                'changePct', 'openInterest', 'settlement', 'preSettlement',
                'change1', 'changePct1'
            ]
            for the_data in the_datas:
                # rows whose delivery month is not a 4-digit contract month
                # are aggregate/summary rows — skip them
                if not re.match(r"\d{4}", the_data['DELIVERYMONTH']):
                    continue
                code = "{}{}".format(
                    the_data['PRODUCTID'][:the_data['PRODUCTID'].index('_')],
                    the_data['DELIVERYMONTH'])
                logger.info("start handling {} for {}".format(code, the_date))
                name = get_future_name(code)
                security_id = "future_shfe_{}".format(code)
                security_list = get_security_list(security_type='future',
                                                  exchanges=['shfe'])
                security_item = {
                    'code': code,
                    'name': name,
                    'id': security_id,
                    'exchange': 'shfe',
                    'type': 'future'
                }
                # save the contract meta if it is not listed yet
                if security_list is not None and 'code' in security_list.columns:
                    security_list = security_list.set_index(
                        security_list['code'], drop=False)
                    if code not in security_list.index:
                        security_list = security_list.append(
                            security_item, ignore_index=True)
                        security_list.to_csv(get_security_list_path(
                            'future', 'shfe'), index=False)
                kdata_path = get_kdata_path(item=security_item,
                                            source='exchange')
                # TODO: this logic should be handled in one shared place
                kdata_dir = get_kdata_dir(item=security_item)
                if not os.path.exists(kdata_dir):
                    os.makedirs(kdata_dir)
                if os.path.exists(kdata_path):
                    saved_df = pd.read_csv(kdata_path, dtype=str)
                    saved_df = saved_df.set_index(saved_df['timestamp'],
                                                  drop=False)
                else:
                    saved_df = pd.DataFrame()
                if saved_df.empty or the_date not in saved_df.index:
                    # missing price/volume fields default to 0
                    low_price = the_data['LOWESTPRICE'] or 0
                    open_price = the_data['OPENPRICE'] or 0
                    close_price = the_data['CLOSEPRICE'] or 0
                    high_price = the_data['HIGHESTPRICE'] or 0
                    volume = the_data['VOLUME'] or 0
                    # string change values mean "no change data available"
                    if type(the_data['ZD1_CHG']) == str:
                        change = 0
                    else:
                        change = the_data['ZD1_CHG']
                    if type(the_data['ZD2_CHG']) == str:
                        change1 = 0
                    else:
                        change1 = the_data['ZD2_CHG']
                    pre_close = close_price - change
                    pre_settlement = the_data['PRESETTLEMENTPRICE']
                    # first trading day has no previous close/settlement
                    if pre_close != 0:
                        change_pct = change / pre_close
                    else:
                        change_pct = 0
                    if pre_settlement != 0:
                        change_pct1 = change1 / pre_settlement
                    else:
                        change_pct1 = 0
                    # renamed from `the_json` to avoid shadowing the loaded json
                    kdata_json = {
                        "timestamp": to_time_str(the_date),
                        "code": code,
                        "name": name,
                        "low": low_price,
                        "open": open_price,
                        "close": close_price,
                        "high": high_price,
                        "volume": volume,
                        # turnover is estimated as average OHLC price * volume
                        # BUG FIX: parentheses were missing, so only high/4 was
                        # divided instead of the whole OHLC sum
                        "turnover": ((low_price + open_price + close_price
                                      + high_price) / 4) * volume,
                        "securityId": security_id,
                        "preClose": pre_close,
                        "change": change,
                        "changePct": change_pct,
                        "openInterest": the_data['OPENINTEREST'],
                        "settlement": the_data['SETTLEMENTPRICE'],
                        "preSettlement": the_data['PRESETTLEMENTPRICE'],
                        "change1": change1,
                        "changePct1": change_pct1
                    }
                    saved_df = saved_df.append(kdata_json, ignore_index=True)
                    saved_df = saved_df.loc[:, KDATA_COLUMN_FUTURE]
                    saved_df = saved_df.drop_duplicates(subset='timestamp',
                                                        keep='last')
                    saved_df = saved_df.set_index(saved_df['timestamp'],
                                                  drop=False)
                    saved_df.index = pd.to_datetime(saved_df.index)
                    saved_df = saved_df.sort_index()
                    saved_df.to_csv(kdata_path, index=False)
                    logger.info("end handling {} for {}".format(
                        code, the_date))
                if the_date not in the_parsed:
                    the_parsed.append(the_date)
        if the_parsed:
            result_list = drop_duplicate(the_parsed)
            result_list = sorted(result_list)
            with open(the_parsed_path, 'w') as outfile:
                json.dump(result_list, outfile)
        logger.info("end handling {}".format(the_path))
def get_kdata(security_item, exchange=None, the_date=None, start_date=None, end_date=None, fuquan='bfq', dtype=None, source=None, level='day'): """ get kdata. Parameters ---------- security_item : SecurityItem or str the security item,id or code exchange : str the exchange,set this for cryptocurrency the_date : TimeStamp str or TimeStamp get the kdata for the exact date start_date : TimeStamp str or TimeStamp start date end_date : TimeStamp str or TimeStamp end date fuquan : str {"qfq","hfq","bfq"},default:"bfq" dtype : type the data type for the csv column,default: None source : str the data source,{'163','sina','exchange'},just used for internal merge level : str or int the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day' Returns ------- DataFrame """ # 由于数字货币的交易所太多,必须指定exchange security_item = to_security_item(security_item, exchange) source = adjust_source(security_item, source) # 163的数据是合并过的,有复权因子,都存在'bfq'目录下,只需从一个地方取数据,并做相应转换 if source == '163': the_path = files_contract.get_kdata_path(security_item, source=source, fuquan='bfq') else: the_path = files_contract.get_kdata_path(security_item, source=source, fuquan=fuquan) if os.path.isfile(the_path): if not dtype: dtype = {"code": str, 'timestamp': str} df = pd.read_csv(the_path, dtype=dtype) if 'factor' in df.columns and source == '163' and security_item[ 'type'] == 'stock': df_kdata_has_factor = df[df['factor'].notna()] if df_kdata_has_factor.shape[0] > 0: latest_factor = df_kdata_has_factor.tail(1).factor.iat[0] else: latest_factor = None df.timestamp = df.timestamp.apply(lambda x: to_time_str(x)) df = df.set_index(df['timestamp'], drop=False) df.index = pd.to_datetime(df.index) df = df.sort_index() if the_date: if the_date in df.index: df = df.loc[df['timestamp'] == the_date] else: return None else: if not start_date and not pd.isna(security_item['listDate']): start_date = security_item['listDate'] if not end_date: end_date = datetime.datetime.today() if start_date and end_date: df = 
df.loc[start_date:end_date] # 复权处理 if source == '163' and security_item['type'] == 'stock': if 'factor' in df.columns: # 后复权是不变的 df['hfqClose'] = df.close * df.factor df['hfqOpen'] = df.open * df.factor df['hfqHigh'] = df.high * df.factor df['hfqLow'] = df.low * df.factor # 前复权需要根据最新的factor往回算,当前价格不变 if latest_factor: df['qfqClose'] = df.hfqClose / latest_factor df['qfqOpen'] = df.hfqOpen / latest_factor df['qfqHigh'] = df.hfqHigh / latest_factor df['qfqLow'] = df.hfqLow / latest_factor else: logger.exception("missing latest factor for {}".format( security_item['id'])) return df return pd.DataFrame()
def get_kdata(security_item, exchange=None, the_date=None, start_date=None,
              end_date=None, fuquan='bfq', source=None, level='day',
              generate_id=False):
    """
    get kdata.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code
    exchange : str
        the exchange,set this for cryptocurrency
    the_date : TimeStamp str or TimeStamp
        get the kdata for the exact date
    start_date : TimeStamp str or TimeStamp
        start date
    end_date : TimeStamp str or TimeStamp
        end date
    fuquan : str
        {"qfq","hfq","bfq"},default:"bfq"
    source : str
        the data source,{'163','sina','exchange'},just used for internal merge
    level : str or int
        the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day'

    Returns
    -------
    DataFrame
    """
    # there are too many cryptocurrency exchanges, so the exchange must be
    # specified explicitly for them
    security_item = to_security_item(security_item, exchange)
    source = adjust_source(security_item, source)
    # 163 data is pre-merged and carries the fuquan factor; it is all stored
    # under the 'bfq' directory, so read from one place and convert as needed
    if source == '163':
        the_path = files_contract.get_kdata_path(security_item, source=source,
                                                 fuquan='bfq')
    else:
        the_path = files_contract.get_kdata_path(security_item, source=source,
                                                 fuquan=fuquan)
    if os.path.isfile(the_path):
        df = pd_utils.pd_read_csv(the_path, generate_id=generate_id)
        if 'factor' in df.columns and source == '163' and security_item[
                'type'] == 'stock':
            # capture the most recent non-null factor before any date slicing;
            # it drives the qfq computation below
            df_kdata_has_factor = df[df['factor'].notna()]
            if df_kdata_has_factor.shape[0] > 0:
                latest_factor = df_kdata_has_factor.tail(1).factor.iat[0]
            else:
                latest_factor = None
        if the_date:
            if the_date in df.index:
                df = df.loc[the_date:the_date, :]
            else:
                # NOTE(review): returns None here but an empty DataFrame when
                # the file is missing — callers must handle both
                return None
        else:
            if start_date or end_date:
                df = df_for_date_range(df, start_date=start_date,
                                       end_date=end_date)
        # fuquan (price adjustment) handling
        if source == '163' and security_item['type'] == 'stock':
            if 'factor' in df.columns:
                # hfq (backward-adjusted) prices never change once computed
                df['hfqClose'] = df.close * df.factor
                df['hfqOpen'] = df.open * df.factor
                df['hfqHigh'] = df.high * df.factor
                df['hfqLow'] = df.low * df.factor
                # qfq (forward-adjusted) works back from the latest factor,
                # keeping the current price unchanged
                if latest_factor:
                    df['qfqClose'] = df.hfqClose / latest_factor
                    df['qfqOpen'] = df.hfqOpen / latest_factor
                    df['qfqHigh'] = df.hfqHigh / latest_factor
                    df['qfqLow'] = df.hfqLow / latest_factor
                else:
                    logger.exception("missing latest factor for {}".format(
                        security_item['id']))
        return df
    return pd.DataFrame()
def get_kdata(security_item, the_date=None, start_date=None, end_date=None,
              fuquan='bfq', dtype=None, source='163', level='day'):
    """
    get kdata.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code
    the_date : TimeStamp str or TimeStamp
        get the kdata for the exact date
    start_date : TimeStamp str or TimeStamp
        start date
    end_date : TimeStamp str or TimeStamp
        end date
    fuquan : str
        {"qfq","hfq","bfq"},default:"bfq"
    dtype : type
        the data type for the csv column,default: None
    source : str
        the data source,{'163','sina'},default: '163'
    level : str or int
        the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day'

    Returns
    -------
    DataFrame
    """
    security_item = to_security_item(security_item)
    # 163 data is pre-merged and carries the fuquan factor; it is all stored
    # under the 'bfq' directory, so read from one place and convert as needed
    if source == '163':
        the_path = files_contract.get_kdata_path(security_item, source=source,
                                                 fuquan='bfq')
    else:
        the_path = files_contract.get_kdata_path(security_item, source=source,
                                                 fuquan=fuquan)
    if os.path.isfile(the_path):
        if not dtype:
            dtype = {"code": str, 'timestamp': str}
        df = pd.read_csv(the_path, dtype=dtype)
        df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))
        df = df.set_index(df['timestamp'], drop=False)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()
        if the_date:
            if the_date in df.index:
                return df.loc[the_date]
            else:
                return pd.DataFrame()
        if not start_date:
            if security_item['type'] == 'stock':
                # stocks with no list date (NaN) fall back to 2002-01-01
                if type(security_item['listDate']) != str and np.isnan(
                        security_item['listDate']):
                    start_date = '2002-01-01'
                else:
                    start_date = security_item['listDate']
            else:
                # non-stock securities default to the last 30 days
                start_date = datetime.datetime.today() - datetime.timedelta(
                    days=30)
        if not end_date:
            end_date = datetime.datetime.today()
        if start_date and end_date:
            df = df.loc[start_date:end_date]
        #
        if source == '163' and security_item['type'] == 'stock':
            # bfq is the raw stored form — no conversion needed
            if fuquan == 'bfq':
                return df
            if 'factor' in df.columns:
                # NOTE(review): the factor is taken from the *sliced* frame —
                # the newer get_kdata variant captures the latest factor
                # before date filtering; confirm which is intended for qfq
                current_factor = df.tail(1).factor.iat[0]
                # hfq (backward-adjusted) prices never change; note this
                # mutates the frame in place
                df.close *= df.factor
                df.open *= df.factor
                df.high *= df.factor
                df.low *= df.factor
                if fuquan == 'qfq':
                    # qfq works back from the latest factor
                    df.close /= current_factor
                    df.open /= current_factor
                    df.high /= current_factor
                    df.low /= current_factor
        return df
    return pd.DataFrame()
def spider_closed(self, spider, reason):
    """Trim the accumulated index kdata to the canonical columns and persist it."""
    final_df = self.current_df.loc[:, KDATA_INDEX_COL]
    self.current_df = final_df
    print(final_df)
    final_df.to_csv(get_kdata_path(item=self.security_item), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)