async def LoadData(asset_db_writer, daily_bar_writer, show_progress,
                   start_session, end_session):
    """Load BitMEX bars and futures metadata and hand them to the writers.

    Parameters
    ----------
    asset_db_writer
        Receives the accumulated futures table and the derived root-symbol
        table through ``write(futures=..., root_symbols=...)``.
    daily_bar_writer
        Receives the per-sid OHLCV data of every frame the provider yields.
    show_progress : bool
        Forwarded to the progress bar and to ``daily_bar_writer.write``.
    start_session, end_session
        Passed unchanged to ``provider.BitmexDataProvider``.

    Notes
    -----
    The provider is closed in a ``finally`` clause so it is released even
    when fetching, converting or writing fails part-way through.
    """
    bmdp = provider.BitmexDataProvider(start_session, end_session)
    try:
        futures_df = pd.DataFrame()
        urls = await bmdp.GetTradeFileUrls()
        with cli.maybe_show_progress(
                range(len(urls)),
                show_progress=show_progress,
                label="Loading BitMEX data",
                item_show_func=lambda x: urls[x] if x is not None else '',
        ) as progress:
            for _ in progress:
                # TODO: Pass granularity at command line.
                # NOTE(review): the whole LoadData stream is consumed on
                # every progress tick - confirm the provider only yields
                # not-yet-seen data on later passes.
                async for ohlcv in bmdp.LoadData(
                        granularity=provider.BitmexDataProvider.Granularity.DAY):
                    new_details_df = pd.DataFrame()
                    asset_details = await bmdp.GetAssetDetails(ohlcv)
                    for asset_detail in asset_details.values():
                        detail_data = GetFutureNeededAssetDetails(asset_detail)
                        if detail_data is not None:
                            # futures sid is the futures_df.index
                            new_details_df = new_details_df.append(
                                pd.DataFrame(pd.Series(detail_data)).T,
                                ignore_index=True)
                    futures_df = (futures_df.append(new_details_df).rename_axis(
                        'sid').drop_duplicates('symbol'))
                    futures_df['sid'] = futures_df.index
                    # flatten the multi-index
                    ohlcv.columns = ohlcv.columns.droplevel()
                    daily_bar_writer.write(GetOHLCVPerSid(ohlcv, futures_df),
                                           show_progress=show_progress)
        root_symbols_df = futures_df[['root_symbol', 'exchange']].drop_duplicates()
        root_symbols_df['root_symbol_id'] = root_symbols_df.index.values
        asset_db_writer.write(futures=futures_df, root_symbols=root_symbols_df)
    finally:
        # Always release the provider, including on failure.
        await bmdp.Close()
def write(self, data, assets=None, show_progress=False,
          invalid_data_behavior='warn'):
    """Convert every chunk to a ctable and pass the stream to the writer.

    Parameters
    ----------
    data : iterable[tuple[int, pandas.DataFrame or bcolz.ctable]]
        Chunks to write, each a ``(sid, data)`` pair.
    assets : set[int], optional
        The assets expected in ``data``. When given, its size is used as
        the progress bar length and it is forwarded to the internal writer.
    show_progress : bool, optional
        Whether to display a progress bar while writing.
    invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
        Forwarded to ``to_ctable``; what to do when data falls outside the
        range of a uint32.

    Returns
    -------
    table : bcolz.ctable
        Whatever ``self._write_internal`` returns (the newly-written table).
    """
    # Conversion stays lazy: a chunk is turned into a ctable only when the
    # progress wrapper pulls it.
    converted_chunks = (
        (sid, to_ctable(frame, invalid_data_behavior))
        for sid, frame in data
    )
    progress = maybe_show_progress(
        converted_chunks,
        show_progress=show_progress,
        item_show_func=self.progress_bar_item_show_func,
        label=self.progress_bar_message,
        length=None if assets is None else len(assets),
    )
    with progress as chunk_iter:
        return self._write_internal(chunk_iter, assets)
def _pricing_iter():
    """Yield ``(sid, frame)`` pairs of Yahoo pricing data, one per symbol.

    Each frame comes from ``cache`` when present; otherwise it is fetched
    with ``DataReader``, sorted by index and stored in the cache. The first
    and last index values plus the symbol are written into row ``sid`` of
    ``metadata``, and the capitalised Yahoo columns are renamed in place.
    """
    lowercase_columns = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ') as it, \
            requests.Session() as session:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            path = _cachpath(symbol, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                df = cache[path] = DataReader(
                    symbol, 'yahoo', start, end, session=session,
                ).sort_index()

            # first index entry = first trade, last index entry = last trade
            first_trade = df.index[0]
            last_trade = df.index[-1]
            metadata.iloc[sid] = first_trade, last_trade, symbol

            df.rename(columns=lowercase_columns, inplace=True)
            yield sid, df
def write(self, data, show_progress=False, invalid_data_behavior='warn'):
    """Write a stream of minute data, one ``write_sid`` call per element.

    Parameters
    ----------
    data : iterable[(int, pd.DataFrame)]
        The data to write. Each element is a ``(sid, data)`` tuple where
        ``data`` has the following format:
          columns : ('open', 'high', 'low', 'close', 'volume')
              open : float64
              high : float64
              low  : float64
              close : float64
              volume : float64|int64
          index : DatetimeIndex of market minutes.
        A given sid may appear more than once in ``data``; however, the
        dates must be strictly increasing.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    invalid_data_behavior : str, optional
        Forwarded unchanged to ``write_sid`` for every element.
    """
    def _describe(entry):
        # The progress bar may pass None; otherwise show the sid only.
        return entry if entry is None else str(entry[0])

    progress = maybe_show_progress(
        data,
        show_progress=show_progress,
        item_show_func=_describe,
        label="Merging minute equity files:",
    )
    sid_writer = self.write_sid
    with progress as entries:
        for entry in entries:
            sid_writer(*entry, invalid_data_behavior=invalid_data_behavior)
def insert(dest, codes):
    """Insert minute-level data for the given stock codes.

    A new ``BcolzMinuteBarWriter`` is created at ``dest`` from the
    calendar, start and end returned by ``info_func``; each code with
    non-empty data is then written through ``write_sid``.
    """
    calendar, start, end = info_func()
    writer = BcolzMinuteBarWriter(
        dest,
        calendar,
        start,
        end,
        CN_EQUITIES_MINUTES_PER_DAY,
    )
    progress = maybe_show_progress(
        codes,
        show_progress=True,
        item_show_func=lambda code: code,
        label="【新增】分钟级别数据",
    )
    date_format = r"%Y-%m-%d"
    minute_index = calendar.minutes_for_sessions_in_range(
        start.strftime(date_format), end.strftime(date_format))
    with progress as it:
        for code in it:
            sid = int(code)
            frame = fetch_single_minutely_equity(code, start.date(), end.date())
            if not frame.empty:
                # The data must be converted to UTC. Index minute data
                # contains every minute, so it is cut down to trading
                # minutes by reindexing (forward-filled).
                frame = (frame.tz_localize('Asia/Shanghai')
                         .tz_convert('UTC')
                         .reindex(minute_index, method='ffill'))
                writer.write_sid(sid, frame)
def insert_equity_extra_data_sf1(sharadar_metadata_df, sf1_df, cursor,
                                 show_progress=True):
    """Store SF1 fundamental values in ``equity_supplementary_mappings``.

    Parameters
    ----------
    sharadar_metadata_df : pd.DataFrame
        Metadata frame; its ``relatedtickers`` column is used (together
        with the frame itself) by ``lookup_sid`` to resolve the sid.
    sf1_df : pd.DataFrame
        Fundamental data with at least the ``ticker``, ``datekey``,
        ``dimension``, ``lastupdated`` and ``calendardate`` columns.
    cursor
        DB cursor; one ``INSERT OR REPLACE`` is executed per stored value.
    show_progress : bool, optional
        Whether to display a progress bar over the tickers.

    Notes
    -----
    The stored field name is ``<column>_<dimension lower-cased>`` and the
    start date is ``datekey + ONE_DAY``. Float NaN values (Python or NumPy
    floats) are skipped.
    """
    # end_date not used (set -1)
    sql = "INSERT OR REPLACE INTO equity_supplementary_mappings (sid, field, start_date, end_date, value) VALUES(?, ?, ?, -1, ?)"

    tickers = sf1_df['ticker'].unique()
    related_tickers = sharadar_metadata_df['relatedtickers'].dropna()
    # Add a space at the begin and end of relatedtickers, search for ' TICKER '
    related_tickers = ' ' + related_tickers.astype(str) + ' '

    with maybe_show_progress(tickers, show_progress,
                             label='Parsing fundamental data: ') as it:
        for ticker in it:
            # Non in-place operations: the boolean filter returns a copy,
            # and mutating it in place triggers SettingWithCopyWarning.
            df_ticker = (sf1_df[sf1_df['ticker'] == ticker]
                         .set_index('datekey')
                         .sort_index(ascending=False)
                         .drop(['ticker', 'lastupdated', 'calendardate'], axis=1))
            sid = lookup_sid(sharadar_metadata_df, related_tickers, ticker)

            for datekey, row in df_ticker.iterrows():
                # Both values are constant for the whole row.
                field_suffix = '_' + row['dimension'].lower()
                start_date = (datekey + ONE_DAY).value
                for column in row.index:
                    if column == 'dimension':
                        continue
                    value = row[column]
                    # isinstance also covers numpy.float64, which the
                    # former ``type(value) == float`` comparison missed.
                    if isinstance(value, float) and np.isnan(value):
                        continue
                    cursor.execute(
                        sql,
                        (sid, column + field_suffix, start_date, str(value)))
def write(self, data, show_progress=False):
    """Write a stream of minute data, one ``write_sid`` call per element.

    Parameters
    ----------
    data : iterable[(int, pd.DataFrame)]
        The data to write. Each element is a ``(sid, data)`` tuple where
        ``data`` has the following format:
          columns : ('open', 'high', 'low', 'close', 'volume')
              open : float64
              high : float64
              low  : float64
              close : float64
              volume : float64|int64
          index : DatetimeIndex of market minutes.
        A given sid may appear more than once in ``data``; however, the
        dates must be strictly increasing.
    show_progress : bool, optional
        Whether or not to show a progress bar while writing.
    """
    def _describe(entry):
        # The progress bar may pass None; otherwise show the sid only.
        return entry if entry is None else str(entry[0])

    progress = maybe_show_progress(
        data,
        show_progress=show_progress,
        item_show_func=_describe,
        label="Merging minute equity files:",
    )
    sid_writer = self.write_sid
    with progress as entries:
        for entry in entries:
            sid_writer(*entry)
def fetch_symbol_metadata_frame(api_key,
                                cache,
                                retries=5,
                                environ=None,
                                show_progress=False):
    """
    Download Quandl symbol metadata.

    Parameters
    ----------
    api_key : str
        The quandl api key to use. If this is None then no api key will be
        sent.
    cache : DataFrameCache
        The cache to use for persisting the intermediate data.
    retries : int, optional
        The number of times to retry each request before failing.
    environ : mapping[str -> str], optional
        The environment to use to find the zipline home. By default this
        is ``os.environ``.
    show_progress : bool, optional
        Show a progress bar for the download of this data.

    Returns
    -------
    metadata_frame : pd.DataFrame
        A dataframe with the following columns:
          symbol: the asset's symbol
          asset_name: the full name of the asset
          start_date: the first date of data for this asset
          end_date: the last date of data for this asset
          auto_close_date: end_date + one day
          exchange: the exchange for the asset; this is always 'quandl'
        The index of the dataframe will be used for symbol->sid mappings but
        otherwise does not have specific meaning.
    """
    raw_iter = _fetch_raw_metadata(api_key, cache, retries, environ)

    def item_show_func(_, _it=iter(count())):
        # The default argument keeps a single counter alive across calls.
        # The text must be returned, otherwise the bar has nothing to show.
        return 'Downloading page: %d' % next(_it)

    with maybe_show_progress(raw_iter,
                             show_progress,
                             item_show_func=item_show_func,
                             label='Downloading WIKI metadata: ') as blocks:
        # NOTE(review): DataFrame.sort was removed from newer pandas
        # releases in favour of sort_values - confirm the pinned pandas
        # version before upgrading.
        data = pd.concat(blocks, ignore_index=True).rename(
            columns={
                'dataset_code': 'symbol',
                'name': 'asset_name',
                'oldest_available_date': 'start_date',
                'newest_available_date': 'end_date',
            }).sort('symbol')

    data = data[~data.symbol.isin(excluded_symbols)]
    # cut out all the other stuff in the name column
    # we need to escape the paren because it is actually splitting on a regex
    data.asset_name = data.asset_name.str.split(r' \(', 1).str.get(0)
    data['exchange'] = 'quandl'
    data['auto_close_date'] = data['end_date'] + pd.Timedelta(days=1)
    return data
def append(dest, codes):
    """Append minute-level data for the given stock codes.

    An existing minute-bar store at ``dest`` is opened; for every code,
    data is fetched from the day after the last date already stored for
    that sid (or from the overall start when nothing is stored) up to the
    end returned by ``info_func``.
    """
    calendar, first_session, last_session = info_func()
    writer = BcolzMinuteBarWriter.open(dest, last_session)
    progress = maybe_show_progress(
        codes,
        show_progress=True,
        item_show_func=lambda code: code,
        label="【更新】分钟级别数据",
    )
    date_format = r"%Y-%m-%d"
    minute_index = calendar.minutes_for_sessions_in_range(
        first_session.strftime(date_format),
        last_session.strftime(date_format))
    with progress as it:
        for code in it:
            sid = int(code)
            last_dt = writer.last_date_in_output_for_sid(sid)
            # NaT means nothing has been written for this sid yet.
            start = first_session if last_dt is pd.NaT else last_dt + calendar.day
            if start > last_session:
                # already up to date
                continue
            frame = fetch_single_minutely_equity(
                code, start.date(), last_session.date())
            if frame.empty:
                continue
            # The data must be converted to UTC. Index minute data
            # contains every minute, so it is cut down to trading minutes
            # by reindexing (forward-filled).
            frame = (frame.tz_localize('Asia/Shanghai')
                     .tz_convert('UTC')
                     .reindex(minute_index, method='ffill'))
            writer.write_sid(sid, frame)
def _pricing_iter():
    """Yield ``(sid, frame)`` for every option symbol of the multi-symbol CSV.

    The frame of each symbol is obtained from ``LoadOneSymbol``. Its first
    and last index values, the auto-close date (one day after the last
    index value) and the contract attributes read from the first row are
    written into row ``sid`` of ``metadata``.
    """
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Processing CSV: ') as it:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            df = LoadOneSymbol(df_multiSymbol, symbol)

            # the start date is the date of the first trade and
            # the end date is the date of the last trade
            start_date = df.index[0]
            end_date = df.index[-1]
            # The auto_close date is the day after the last trade.
            ac_date = end_date + pd.Timedelta(days=1)

            # .iloc makes the "first row" lookup explicitly positional;
            # a bare [0] on a date-indexed Series depends on an ambiguous
            # label/position fallback.
            ex_date = pd.to_datetime(df["expiration"].iloc[0])
            strike = df["strike"].iloc[0]
            callput = df["callput"].iloc[0]
            style = df["style"].iloc[0]
            underlying = df["underlying"].iloc[0]
            exchange = df["exchange"].iloc[0]

            metadata.iloc[sid] = start_date, end_date, ac_date, ex_date, strike, \
                callput, style, underlying, exchange, symbol

            yield sid, df
def fetch_symbol_metadata_frame(api_key,
                                cache,
                                retries=5,
                                environ=None,
                                show_progress=False):
    """
    Download Quandl symbol metadata.

    Parameters
    ----------
    api_key : str
        The quandl api key to use. If this is None then no api key will be
        sent.
    cache : DataFrameCache
        The cache to use for persisting the intermediate data.
    retries : int, optional
        The number of times to retry each request before failing.
    environ : mapping[str -> str], optional
        The environment to use to find the zipline home. By default this
        is ``os.environ``.
    show_progress : bool, optional
        Show a progress bar for the download of this data.

    Returns
    -------
    metadata_frame : pd.DataFrame
        A dataframe with the following columns:
          symbol: the asset's symbol
          asset_name: the full name of the asset
          start_date: the first date of data for this asset
          end_date: the last date of data for this asset
          auto_close_date: end_date + one day
          exchange: the exchange for the asset; this is always 'QUANDL'
        The index of the dataframe will be used for symbol->sid mappings but
        otherwise does not have specific meaning.
    """
    raw_iter = _fetch_raw_metadata(api_key, cache, retries, environ)

    def item_show_func(_, _it=iter(count())):
        # The default argument keeps a single counter alive across calls.
        # The text must be returned, otherwise the bar has nothing to show.
        return 'Downloading page: %d' % next(_it)

    with maybe_show_progress(raw_iter,
                             show_progress,
                             item_show_func=item_show_func,
                             label='Downloading WIKI metadata: ') as blocks:
        # NOTE(review): DataFrame.sort was removed from newer pandas
        # releases in favour of sort_values - confirm the pinned pandas
        # version before upgrading.
        data = pd.concat(blocks, ignore_index=True).rename(columns={
            'dataset_code': 'symbol',
            'name': 'asset_name',
            'oldest_available_date': 'start_date',
            'newest_available_date': 'end_date',
        }).sort('symbol')

    data = data[~data.symbol.isin(excluded_symbols)]
    # cut out all the other stuff in the name column
    # we need to escape the paren because it is actually splitting on a regex
    data.asset_name = data.asset_name.str.split(r' \(', 1).str.get(0)
    data['exchange'] = 'QUANDL'
    data['auto_close_date'] = data['end_date'] + pd.Timedelta(days=1)
    return data
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's CSV file found in ``csvdir``.

    Parameters
    ----------
    csvdir : str
        Directory that is listed once and searched for ``<symbol>.csv``.
    symbols : iterable[str]
        Symbols to load; the sid is the position in iteration order.
    metadata : pd.DataFrame
        Row ``sid`` receives (start_date, end_date, auto_close_date, symbol).
    divs_splits : dict
        ``divs_splits["splits"]`` / ``divs_splits["divs"]`` are replaced by
        frames extended with this symbol's splits / dividends.
    show_progress : bool
        Whether to display a progress bar.

    Raises
    ------
    ValueError
        If no file matching ``<symbol>.csv`` exists in ``csvdir``.
    """
    with maybe_show_progress(symbols, show_progress,
                             label="Loading custom pricing data: ") as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug("%s: sid %s" % (symbol, sid))

            wanted = "%s.csv" % symbol
            # Prefer the exact file name, then names starting with it, and
            # only then the legacy substring match. A plain substring test
            # alone would pick "AA.csv" for symbol "A".
            matches = (
                [name for name in files if name == wanted]
                or [name for name in files if name.startswith(wanted)]
                or [name for name in files if wanted in name]
            )
            if not matches:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))
            fname = matches[0]

            dfr = read_csv(
                os.path.join(csvdir, fname),
                parse_dates=[0],
                infer_datetime_format=True,
                index_col=0,
            ).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if "split" in dfr.columns:
                # stored ratio is the inverse of the CSV's split value
                tmp = 1.0 / dfr[dfr["split"] != 1.0]["split"]
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=["effective_date"])
                split["ratio"] = tmp.tolist()
                split["sid"] = sid

                splits = divs_splits["splits"]
                # continue the running integer index of the existing table
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits["splits"] = splits.append(split)

            if "dividend" in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr["dividend"] != 0.0]["dividend"]
                div = DataFrame(data=tmp.index.tolist(), columns=["ex_date"])
                div["record_date"] = NaT
                div["declared_date"] = NaT
                div["pay_date"] = NaT
                div["amount"] = tmp.tolist()
                div["sid"] = sid

                divs = divs_splits["divs"]
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits["divs"] = divs.append(div)

            yield sid, dfr
def _pricing_iter():
    """Yield ``(sid, frame)`` of daily bars from the historical data service.

    Symbols whose download raises are logged, printed and skipped without
    consuming a sid. For the others the index is shifted, the metadata row
    ``sid`` is filled and the ``Bar`` columns are renamed in place.
    """
    bar_columns = {
        Bar.open: 'open',
        Bar.high: 'high',
        Bar.low: 'low',
        Bar.close: 'close',
        Bar.volume: 'volume',
    }
    sid = 0
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Tradea Database pricing data ') as it, \
            requests.Session() as session:
        for symbol in it:
            logger.debug('zipline bundle downloading %s' % symbol)
            try:
                instrument = Instrument(
                    symbol=symbol, asset_type=AssetType.us_equity)
                df = self.historical_market_data_service.getHistoricalData(
                    instrument,
                    period=Period.day,
                    number_of_periods=1,
                    fromDate=start,
                    toDate=end,
                    bar_type=BarType.time_bar,
                    force_download=False,
                    cleanOutliers=False)
            except Exception as e:
                failure = 'Error downloading bundle zipline %s : %s' % (
                    symbol, str(e))
                logger.error(failure)
                print(failure)
                # the sid is not advanced for a failed symbol
                continue

            # NOTE(review): the index is moved 3 hours forward and one day
            # back - presumably a session alignment; confirm the intent.
            df.index = (df.index.copy() + pd.DateOffset(hours=3)) \
                - pd.DateOffset(days=1)

            # first index entry = first trade, last index entry = last trade
            first_trade = df.index[0]
            last_trade = df.index[-1]
            # The auto_close date is the day after the last trade.
            auto_close = last_trade + pd.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, symbol

            df.rename(columns=bar_columns, inplace=True)
            yield sid, df
            sid += 1
def _pricing_iter(self):
    """Yield ``(sid, frame)`` for each symbol's ``<symbol>.csv`` file.

    The file is read from ``self.csvdir``. Row ``sid`` of ``self.metadata``
    receives (start_date, end_date, auto_close_date, symbol). When the CSV
    has a ``split`` / ``dividend`` column, the matching rows are appended
    to ``self.splits`` / ``self.dividends`` (created on first use).
    """
    with maybe_show_progress(self.symbols, self.show_progress,
                             label='Loading custom pricing data: ') as it:
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            dfr = read_csv(os.path.join(self.csvdir, '%s.csv' % symbol),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            # the start date is the date of the first trade and
            # the end date is the date of the last trade
            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            self.metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                if self.splits is None:
                    self.splits = DataFrame()
                # NOTE(review): the raw 'split' value is stored as the
                # ratio without inversion - confirm this is the convention
                # the adjustment writer expects.
                tmp = dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                # continue the running integer index of the existing table
                index = Index(
                    range(self.splits.shape[0],
                          self.splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                self.splits = self.splits.append(split)

            if 'dividend' in dfr.columns:
                # Created once here; the former second None-check further
                # down could never be true and has been dropped.
                if self.dividends is None:
                    self.dividends = DataFrame()
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                ind = Index(
                    range(self.dividends.shape[0],
                          self.dividends.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                self.dividends = self.dividends.append(div)

            yield sid, dfr
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's CSV file found in ``csvdir``.

    Parameters
    ----------
    csvdir : str
        Directory that is listed once and searched for ``<symbol>.csv``.
    symbols : iterable[str]
        Symbols to load; the sid is the position in iteration order.
    metadata : pd.DataFrame
        Row ``sid`` receives (start_date, end_date, auto_close_date, symbol).
    divs_splits : dict
        ``divs_splits['splits']`` / ``divs_splits['divs']`` are replaced by
        frames extended with this symbol's splits / dividends.
    show_progress : bool
        Whether to display a progress bar.

    Raises
    ------
    ValueError
        If no file matching ``<symbol>.csv`` exists in ``csvdir``.
    """
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug(f'{symbol}: sid {sid}')

            wanted = '%s.csv' % symbol
            # Prefer the exact file name, then names starting with it, and
            # only then the legacy substring match. A plain substring test
            # alone would pick "AA.csv" for symbol "A".
            matches = (
                [name for name in files if name == wanted]
                or [name for name in files if name.startswith(wanted)]
                or [name for name in files if wanted in name]
            )
            if not matches:
                raise ValueError(f"{symbol}.csv file is not in {csvdir}")
            fname = matches[0]

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                # stored ratio is the inverse of the CSV's split value
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # continue the running integer index of the existing table
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = splits.append(split)

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = divs.append(div)

            yield sid, dfr
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's CSV file found in ``csvdir``.

    Parameters
    ----------
    csvdir : str
        Directory that is listed once and searched for ``<symbol>.csv``.
    symbols : iterable[str]
        Symbols to load; the sid is the position in iteration order.
    metadata : pd.DataFrame
        Row ``sid`` receives (start_date, end_date, auto_close_date, symbol).
    divs_splits : dict
        ``divs_splits['splits']`` / ``divs_splits['divs']`` are replaced by
        frames extended with this symbol's splits / dividends.
    show_progress : bool
        Whether to display a progress bar.

    Raises
    ------
    ValueError
        If no file matching ``<symbol>.csv`` exists in ``csvdir``.
    """
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            wanted = '%s.csv' % symbol
            # Prefer the exact file name, then names starting with it, and
            # only then the legacy substring match. A plain substring test
            # alone would pick "AA.csv" for symbol "A".
            matches = (
                [name for name in files if name == wanted]
                or [name for name in files if name.startswith(wanted)]
                or [name for name in files if wanted in name]
            )
            if not matches:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))
            fname = matches[0]

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                # stored ratio is the inverse of the CSV's split value
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # continue the running integer index of the existing table
                index = Index(range(splits.shape[0],
                                    splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = splits.append(split)

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = divs.append(div)

            yield sid, dfr
def _pricing_iter():
    """Yield ``(sid, frame)`` of daily bars for every row of ``tickers_df``.

    Bars come from ``cache`` when present, otherwise from the
    ``stock_spy`` table through ``sqlite_conn`` (and are then cached).
    Row ``sid`` of ``metadata`` receives (start_date, end_date,
    auto_close_date, symbol); the yielded frame is restricted to the
    OHLCV columns and reindexed to the calendar sessions between the
    first and last bar, with missing values filled with 0.0.
    """
    with maybe_show_progress(
            tickers_df.iterrows(),
            show_progress,
            label='Fetch stocks pricing data from db: ') as it:
        # Iterate the wrapped iterator, not tickers_df again: otherwise
        # the progress bar is created but never advances.
        for index, row in it:
            symbol = row['asset_name']
            path = _cachpath(symbol, 'ohlcv')
            try:
                data = cache[path]
            except KeyError:
                # NOTE(review): the symbol is interpolated into the SQL
                # text; use a bound parameter if names can ever come from
                # untrusted input.
                sql_text = "SELECT tran_date as date, open, high, low, close, volume FROM `stock_spy` WHERE name='{0}' order by tran_date desc".format(
                    symbol)
                data = cache[path] = pd.read_sql(
                    sql_text,
                    con=sqlite_conn,
                    index_col='date',
                    parse_dates=['date']).sort_index()

            if traceDebug:
                print("read {} sql and get df data:".format(symbol))
                print(data)

            # the start date is the date of the first trade and
            # the end date is the date of the last trade
            start_date = pd.to_datetime(data.iloc[0].name)
            end_date = pd.to_datetime(data.iloc[-1].name)
            if traceDebug:
                print("start_date: ")
                print(start_date)

            # The auto_close date is the day after the last trade.
            ac_date = end_date + pd.Timedelta(days=1)
            sid = row['sid']
            if traceDebug:
                # the placeholders are now actually filled in
                print("sid-{}:symbol-{}".format(sid, symbol))
                print("start_date", type(start_date), start_date)
                print("end_date", type(end_date), end_date)
                print("ac_date", type(ac_date), ac_date)

            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            new_index = ['open', 'high', 'low', 'close', 'volume']
            data_df = data.reindex(columns=new_index, copy=False)  # fix bug
            sessions = calendar.sessions_in_range(start_date, end_date)
            data_df = data_df.reindex(
                sessions.tz_localize(None),
                copy=False,
            ).fillna(0.0)

            yield sid, data_df
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir):
    """Ingest tushare daily bars for the hard-coded symbol list.

    For each symbol, forward-adjusted ('qfq') daily bars between
    ``start_session`` and ``end_session`` are requested through
    ``ts.pro_bar``, reduced to the OHLCV columns indexed by day, and
    written with ``daily_bar_writer``. The collected metadata is written
    to ``asset_db_writer`` with exchange 'SSE', and the adjustment writer
    is called with ``None``.
    """
    symbols = ['600019.SH']
    pro_api = ts.pro_api(tushare_token)

    metadata_dtype = [
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]
    metadata = DataFrame(empty(len(symbols), dtype=metadata_dtype))

    with maybe_show_progress(
            symbols, show_progress,
            label='Loading CN A %s pricing data: ' % (symbols)) as it:
        for sid, symbol in enumerate(it):
            bars = ts.pro_bar(pro_api=pro_api,
                              ts_code=symbol,
                              asset='E',
                              start_date=start_session.strftime('%Y%m%d'),
                              end_date=end_session.strftime('%Y%m%d'),
                              adj='qfq')
            # derive the columns under the names used further down
            bars['day'] = pd.to_datetime(bars['trade_date'])
            bars['volume'] = bars['vol']
            bars['id'] = bars['ts_code']

            wanted = ['day', 'open', 'high', 'low', 'close', 'volume']
            bars = bars.filter(items=wanted).set_index('day').sort_index()

            start_date = bars.index[0]
            end_date = bars.index[-1]
            if start_date > end_date:
                end_date = start_date

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            daily_bar_writer.write([(sid, bars)], show_progress=show_progress)

    metadata['exchange'] = 'SSE'
    asset_db_writer.write(equities=metadata)
    adjustment_writer.write(None)
def _pricing_iter():
    """Yield ``(sid, frame)`` of pricing data, one per symbol.

    Frames come from ``cache`` or, on a miss, from ``get_data`` (and are
    then cached). Rows without volume are dropped to find the first and
    last trade; the frame is then reindexed to the trading days from the
    first trade on, with missing volume set to 0 and other gaps
    forward-filled.
    """
    lowercase_columns = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ') as it, \
            requests.Session() as session:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            print(symbol)
            path = _cachpath(symbol, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                df = cache[path] = get_data(symbol, start, end)

            # first/last trade are taken from the rows that have volume
            traded = df[df.Volume > 0]
            start_date = traded.index[0]
            end_date = traded.index[-1]

            df = traded.reindex(trading_days[(trading_days >= start_date)])
            df.Volume = df.Volume.fillna(0)
            df = df.ffill()

            # The auto_close date is the day after the last trade.
            ac_date = end_date + pd.Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            df.rename(columns=lowercase_columns, inplace=True)
            yield sid, df
def _read_and_convert(self, calendar, show_progress):
    """Yield ``(symbol_index, frame)`` for every symbol in ``self._symbols``.

    Each frame is obtained from ``self._downloader`` and, when a filter is
    configured, replaced by the filter's return value. The symbol metadata
    is updated before the pair is yielded. ``calendar`` is not used here.
    """
    with maybe_show_progress(
            self._symbols,
            show_progress,
            label='Downloading from {}: '.format(self._exchange)) as it:
        for symbol_index, symbol in enumerate(it):
            frame = self._downloader(symbol)

            # the optional filter returns the frame to keep
            frame = frame if self._filter is None else self._filter(frame)

            self._update_symbol_metadata(symbol_index, symbol, frame)
            yield symbol_index, frame
def _pricing_iter(mongo_db, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's collection in ``mongo_db``.

    The frame is read with ``read_mongo`` and sorted by index. Row ``sid``
    of ``metadata`` receives (start_date, end_date, auto_close_date,
    symbol). When the frame has a ``split`` / ``dividend`` column, the
    matching rows are appended to ``divs_splits['splits']`` /
    ``divs_splits['divs']``.
    """
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            bars = read_mongo(mongo_db[symbol]).sort_index()

            first_day = bars.index[0]
            last_day = bars.index[-1]
            # auto close happens the day after the last bar
            auto_close = last_day + Timedelta(days=1)
            metadata.iloc[sid] = first_day, last_day, auto_close, symbol

            if 'split' in bars.columns:
                # stored ratio is the inverse of the recorded split value
                inverse_ratio = 1. / bars[bars['split'] != 1.0]['split']
                new_splits = DataFrame(data=inverse_ratio.index.tolist(),
                                       columns=['effective_date'])
                new_splits['ratio'] = inverse_ratio.tolist()
                new_splits['sid'] = sid

                known_splits = divs_splits['splits']
                # continue the running integer index of the existing table
                split_index = Index(
                    range(known_splits.shape[0],
                          known_splits.shape[0] + new_splits.shape[0]))
                new_splits.set_index(split_index, inplace=True)
                divs_splits['splits'] = known_splits.append(new_splits)

            if 'dividend' in bars.columns:
                # columns: ex_date, record_date, declared_date, pay_date,
                # amount, sid
                payouts = bars[bars['dividend'] != 0.0]['dividend']
                new_divs = DataFrame(data=payouts.index.tolist(),
                                     columns=['ex_date'])
                new_divs['record_date'] = NaT
                new_divs['declared_date'] = NaT
                new_divs['pay_date'] = NaT
                new_divs['amount'] = payouts.tolist()
                new_divs['sid'] = sid

                known_divs = divs_splits['divs']
                div_index = Index(
                    range(known_divs.shape[0],
                          known_divs.shape[0] + new_divs.shape[0]))
                new_divs.set_index(div_index, inplace=True)
                divs_splits['divs'] = known_divs.append(new_divs)

            yield sid, bars
def create_equities_df(df, tickers, sessions, sharadar_metadata_df, show_progress):
    """Build the equities metadata frame, one row per ticker, indexed by sid.

    For every ticker the sid is read from the 'sid' index level of its
    price rows, the matching Sharadar metadata row (``permaticker == sid``)
    supplies name, dates and exchange, and ``synch_to_calendar`` is called
    with the ticker's first and last price date.

    Returns
    -------
    pd.DataFrame
        Frame with the ``METADATA_HEADERS`` columns.
    """
    # The index of this frame is the sid.
    equities_df = pd.DataFrame(columns=METADATA_HEADERS)

    with maybe_show_progress(tickers, show_progress,
                             label='Loading custom pricing data: ') as it:
        for ticker in it:
            ticker_rows = df[df['ticker'] == ticker].sort_index()
            sid = ticker_rows.index.get_level_values('sid')[0]

            candidates = sharadar_metadata_df[
                sharadar_metadata_df['permaticker'] == sid]
            info = candidates.iloc[0, :]

            asset_name = info.loc['name']
            # firstpricedate doubles as creation date and first trade date
            start_date = info.loc['firstpricedate']
            # The last date we have trade data for this asset.
            end_date = info.loc['lastpricedate']
            first_traded = start_date
            # Positions are closed the day after the last price date.
            auto_close_date = end_date + pd.Timedelta(days=1)

            # Missing exchanges (None or the string 'None') become 'OTC'.
            exchange = info.loc['exchange']
            if exchange is None or exchange == 'None':
                exchange = 'OTC'

            # Synch to the official exchange calendar, if necessary
            price_dates = ticker_rows.index.get_level_values('date')
            synch_to_calendar(sessions, price_dates[0], price_dates[-1],
                              ticker_rows, df)

            # Add a row to the metadata DataFrame.
            equities_df.loc[sid] = (ticker, asset_name, start_date, end_date,
                                    first_traded, auto_close_date, exchange)

    return equities_df
def _pricing_iter():
    """Yield ``(sid, frame)`` of daily bars read from the database.

    Bars come from ``cache`` when present, otherwise from the table named
    after the symbol through ``conn`` (and are then cached). Row ``sid``
    of ``metadata`` receives (start_date, end_date, auto_close_date,
    symbol); the yielded frame is restricted to the OHLCV columns and
    reindexed to the calendar sessions between the first and last bar,
    with missing values filled with 0.0.
    """
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Fetch stocks pricing data from db: ') as it, \
            requests.Session() as session:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            path = _cachpath(symbol, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                query = "select * from '%s' order by date desc" % symbol
                df = cache[path] = pd.read_sql(
                    sql=query,
                    con=conn,
                    index_col='date',
                    parse_dates=['date'],
                ).sort_index()

            if boDebug:
                print("read_sqllite df", type(df), "length", len(df))

            # first index entry = first trade, last index entry = last trade
            start_date = df.index[0]
            end_date = df.index[-1]
            # The auto_close date is the day after the last trade.
            ac_date = end_date + pd.Timedelta(days=1)
            if boDebug:
                print("start_date", type(start_date), start_date)
                print("end_date", type(end_date), end_date)
                print("ac_date", type(ac_date), ac_date)

            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            df = df.reindex(columns=ohlcv_columns, copy=False)
            # align to the calendar sessions; absent sessions become 0.0
            sessions = calendar.sessions_in_range(start_date, end_date)
            df = df.reindex(sessions.tz_localize(None), copy=False).fillna(0.0)

            yield sid, df
def _pricing_iter(metadata, symbols, show_progress, start_session,
                  end_session, cache):
    """Yield ``(sid, minute_bars)`` once per symbol and day.

    ``_get_metadata`` is called once per symbol. For every day produced by
    ``pd.date_range(..., closed='left')`` the bars are taken from
    ``cache`` under the key ``<symbol>-<YYYY-MM-DD>``, being fetched with
    ``_get_minute_bar`` and stored first when the key is missing.
    """
    with maybe_show_progress(symbols, show_progress,
                             label='BitMex pricing data: ') as it:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            _get_metadata(sid, symbol, metadata)

            days = pd.date_range(start_session,
                                 end_session,
                                 freq='D',
                                 closed='left')
            for day in days:
                key = symbol + '-' + day.strftime("%Y-%m-%d")
                if key not in cache:
                    cache[key] = _get_minute_bar(symbol, day)
                yield sid, cache[key]
def _pricing_iter():
    """Yield ``(sid, frame)`` for each symbol's ``<symbol>.csv`` in ``csvdir``.

    The first CSV column is parsed as dates and used as the index, which
    is then sorted. Row ``sid`` of ``metadata`` receives (start_date,
    end_date, auto_close_date, symbol).
    """
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            csv_path = os.path.join(csvdir, '%s.csv' % symbol)
            bars = pandas.read_csv(csv_path,
                                   parse_dates=[0],
                                   infer_datetime_format=True,
                                   index_col=0)
            bars = bars.sort_index()

            # first index entry = first trade, last index entry = last trade
            first_trade = bars.index[0]
            last_trade = bars.index[-1]
            # auto close happens the day after the last trade
            auto_close = last_trade + pandas.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, symbol

            yield sid, bars
def _pricing_iter():
    """Yield ``(sid, frame)`` pairs of Yahoo pricing data, one per ticker.

    Frames come from ``cache`` or, on a miss, from ``DataReader`` for the
    session range (sorted by index, then cached). Row ``sid`` of
    ``metadata`` receives (start_date, end_date, auto_close_date, ticker)
    and the Yahoo columns are renamed in place, with 'Adj Close' becoming
    'price'.
    """
    renamed_columns = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
        'Adj Close': 'price',
    }
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ') as it, \
            requests.Session() as session:
        # sids are the positions of the tickers in iteration order
        for sid, ticker in enumerate(it):
            path = _cache_path(ticker, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                df = cache[path] = DataReader(
                    ticker, 'yahoo', start_session, end_session,
                    session=session,
                ).sort_index()

            # first index entry = first trade, last index entry = last trade
            first_trade = df.index[0]
            last_trade = df.index[-1]
            # auto close happens the day after the last trade
            auto_close = last_trade + pd.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, ticker

            df.rename(columns=renamed_columns, inplace=True)
            yield sid, df
def _pricing_iter():
    """Yield ``(sid, frame)`` pairs of IEX pricing data, one per symbol.

    Frames come from ``cache`` or, on a miss, from ``get_historical_data``
    (sorted by index, then cached). The index is converted with
    ``pd.to_datetime`` before the first and last entries are recorded in
    row ``sid`` of ``metadata`` together with the auto-close date and the
    symbol.
    """
    lowercase_columns = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading IEX pricing data: ') as it, \
            requests.Session() as session:
        # sids are the positions of the symbols in iteration order
        for sid, symbol in enumerate(it):
            path = _cachpath(symbol, 'ohlcv')
            try:
                df = cache[path]
            except KeyError:
                df = cache[path] = get_historical_data(
                    symbol,
                    start=start,
                    end=None,
                    output_format='pandas',
                ).sort_index()
            # the date arithmetic below needs a datetime-like index
            df.index = pd.to_datetime(df.index)

            # first index entry = first trade, last index entry = last trade
            first_trade = df.index[0]
            last_trade = df.index[-1]
            # auto close happens the day after the last trade
            auto_close = last_trade + pd.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, symbol

            df.rename(columns=lowercase_columns, inplace=True)
            yield sid, df
def insert_daily_metrics(sharadar_metadata_df, daily_df, cursor,
                         show_progress=True):
    """Store daily metric values in ``equity_supplementary_mappings``.

    Parameters
    ----------
    sharadar_metadata_df : pd.DataFrame
        Metadata frame; its ``relatedtickers`` column is used (together
        with the frame itself) by ``lookup_sid`` to resolve the sid.
    daily_df : pd.DataFrame
        Daily metrics with at least the ``ticker``, ``date`` and
        ``lastupdated`` columns; every remaining column is a field.
    cursor
        DB cursor; one ``INSERT OR REPLACE`` is executed per stored value.
    show_progress : bool, optional
        Whether to display a progress bar over the tickers.

    Notes
    -----
    Float NaN values are skipped. The NaN test is only applied to floats,
    so a non-float value no longer makes ``np.isnan`` raise.
    """
    # end_date not used (set -1)
    sql = "INSERT OR REPLACE INTO equity_supplementary_mappings (sid, field, start_date, end_date, value) VALUES(?, ?, ?, -1, ?)"

    tickers = daily_df['ticker'].unique()
    related_tickers = sharadar_metadata_df['relatedtickers'].dropna()
    # Add a space at the begin and end of relatedtickers, search for ' TICKER '
    related_tickers = ' ' + related_tickers.astype(str) + ' '

    with maybe_show_progress(tickers, show_progress,
                             label='Parsing fundamental data: ') as it:
        for ticker in it:
            # Non in-place operations: the boolean filter returns a copy,
            # and mutating it in place triggers SettingWithCopyWarning.
            df_ticker = (daily_df[daily_df['ticker'] == ticker]
                         .set_index('date')
                         .sort_index(ascending=False)
                         .drop(['ticker', 'lastupdated'], axis=1))
            sid = lookup_sid(sharadar_metadata_df, related_tickers, ticker)

            for date, row in df_ticker.iterrows():
                # constant for the whole row
                start_date = date.value
                for field in row.index:
                    value = row[field]
                    if isinstance(value, float) and np.isnan(value):
                        continue
                    cursor.execute(sql, (sid, field, start_date, str(value)))
def _pricing_iter():
    """Yield ``(sid, frame)`` for each entry of ``symbols`` from a Yahoo CSV.

    Sids count up from 0.  On a cache miss the frame is loaded with
    ``load_yahoo_csv`` from a fixed test-resource path (the same file for
    every symbol) and stored in ``cache``.

    Side effects: row ``sid`` of ``metadata`` receives
    ``(start_date, end_date, auto_close_date, symbol)`` and the frame's
    columns are renamed in place.
    """
    # NOTE(review): this mapping sends Open/High/Low to
    # adjusted_close/ask/bid - confirm this is intended.
    renames = {
        'Open': 'adjusted_close',
        'High': 'ask',
        'Low': 'bid',
        'Close': 'close',
        'Volume': 'volume',
    }
    progress = maybe_show_progress(
        symbols,
        show_progress,
        label='Downloading Yahoo pricing data: ',
    )
    # The HTTP session is opened alongside the progress bar; it is not
    # used by the CSV loader.
    with progress as it, requests.Session() as session:
        for sid, symbol in enumerate(it):
            cache_key = _cachpath(symbol, 'csv')
            try:
                frame = cache[cache_key]
            except KeyError:
                frame = cache[cache_key] = load_yahoo_csv(
                    r"/mnt/hgfs/595P/project_Z/tests/resources/yahoo-test.csv",
                    identifier_col="Date")

            # First and last index entries are the first and last trade.
            first_trade = frame.index[0]
            last_trade = frame.index[-1]
            # The auto_close date is the day after the last trade.
            auto_close = last_trade + pd.Timedelta(days=1)
            metadata.iloc[sid] = first_trade, last_trade, auto_close, symbol

            frame.rename(columns=renames, inplace=True)
            yield sid, frame
def _read_and_convert(self, symbols, show_progress):
    """Generate ``(symbol_index, frame)`` for each symbol that has a CSV file.

    ``symbol_index`` is the position of the symbol in ``symbols``.
    Symbols whose ``<csvdir>/<symbol>.csv`` file does not exist are
    skipped without being yielded, so yielded indices may have gaps.

    Each frame is read with ``self._index_column`` as index, sorted by
    index, renamed through ``self._column_mapper`` when that is truthy,
    then passed to ``self._filter`` and ``self._update_symbol_metadata``
    before being yielded.
    """
    csv_dir = self._get_csvdir(show_progress)
    progress = maybe_show_progress(
        symbols, show_progress, label='Loading csv files: ')
    with progress as it:
        for symbol_index, symbol in enumerate(it):
            file_path = '{0}/{1}.csv'.format(csv_dir, symbol)
            if not os.path.exists(file_path):
                continue

            # NOTE(review): the dtype key is spelled "Volumn"; if the CSV
            # column is actually "Volume" this coercion likely never
            # applies - confirm against the input files.
            raw = pd.read_csv(
                file_path,
                index_col=self._index_column,
                parse_dates=True,
                dtype={"Volumn": np.uint64},
            )
            df_data = raw.sort_index()

            # Rename columns only when a mapper is configured.
            if self._column_mapper:
                df_data.rename(columns=self._column_mapper, inplace=True)

            self._filter(df_data)
            self._update_symbol_metadata(symbol_index, symbol, df_data)
            yield symbol_index, df_data
def ingest(
        environ,
        asset_db_writer,
        minute_bar_writer,  # unused
        daily_bar_writer,
        adjustment_writer,
        fundamental_writer,
        calendar,
        start_session,
        end_session,
        cache,
        show_progress,
        output_dir,
        # pass these as defaults to make them 'nonlocal' in py2
        start=start,
        end=end):
    """Download Yahoo pricing and adjustment data and hand it to the writers.

    Steps, in order:

    1. daily bars for every entry of ``symbols`` are written through
       ``daily_bar_writer`` (sids are positions in ``symbols``);
    2. the per-asset metadata collected while writing bars is tagged with
       the exchange ``"YAHOO"`` and written through ``asset_db_writer``;
    3. ``'yahoo-actions'`` data is downloaded, separated into splits and
       dividends and written through ``adjustment_writer``.

    ``start`` falls back to ``start_session`` when it is None.  ``end`` is
    forwarded to ``DataReader`` unchanged, including None.
    NOTE(review): confirm whether ``end`` was meant to fall back to
    ``end_session``.
    """
    if start is None:
        start = start_session

    metadata_dtype = [
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]
    metadata = pd.DataFrame(np.empty(len(symbols), dtype=metadata_dtype))

    def _pricing_iter():
        """Yield ``(sid, frame)`` per symbol and fill ``metadata`` row ``sid``."""
        renames = {
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume': 'volume',
        }
        progress = maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ',
        )
        with progress as it, requests.Session() as session:
            for sid, symbol in enumerate(it):
                cache_key = _cachpath(symbol, 'ohlcv')
                try:
                    frame = cache[cache_key]
                except KeyError:
                    fetched = DataReader(
                        symbol,
                        'yahoo',
                        start,
                        end,
                        session=session,
                    )
                    frame = cache[cache_key] = fetched.sort_index()

                # First and last index entries are the first/last trade.
                first_trade = frame.index[0]
                last_trade = frame.index[-1]
                # The auto_close date is the day after the last trade.
                auto_close = last_trade + pd.Timedelta(days=1)
                metadata.iloc[sid] = (
                    first_trade, last_trade, auto_close, symbol)

                frame.rename(columns=renames, inplace=True)
                yield sid, frame

    daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

    # Maps symbol -> sid (the positional index of the metadata row).
    symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)

    # Hardcode the exchange to "YAHOO" for all assets and (elsewhere)
    # register "YAHOO" to resolve to the NYSE calendar, because these are
    # all equities and thus can use the NYSE calendar.
    metadata['exchange'] = "YAHOO"
    asset_db_writer.write(equities=metadata)

    adjustments = []
    adjustment_progress = maybe_show_progress(
        symbols,
        show_progress,
        label='Downloading Yahoo adjustment data: ',
    )
    with adjustment_progress as it, requests.Session() as session:
        for symbol in it:
            cache_key = _cachpath(symbol, 'adjustment')
            try:
                actions = cache[cache_key]
            except KeyError:
                fetched = DataReader(
                    symbol,
                    'yahoo-actions',
                    start,
                    end,
                    session=session,
                )
                actions = cache[cache_key] = fetched.sort_index()

            actions['sid'] = symbol_map[symbol]
            adjustments.append(actions)

    adj_df = pd.concat(adjustments)
    adj_df.index.name = 'date'
    adj_df = adj_df.reset_index()

    is_split = adj_df.action == 'SPLIT'
    splits = (
        adj_df[is_split]
        .rename(columns={'value': 'ratio', 'date': 'effective_date'})
        .drop('action', axis=1)
    )

    is_dividend = adj_df.action == 'DIVIDEND'
    dividends = (
        adj_df[is_dividend]
        .rename(columns={'value': 'amount', 'date': 'ex_date'})
        .drop('action', axis=1)
    )
    # we do not have this data in the yahoo dataset
    for missing_column in ('record_date', 'declared_date', 'pay_date'):
        dividends[missing_column] = pd.NaT

    adjustment_writer.write(splits=splits, dividends=dividends)
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` read from ``<symbol>.csv`` files in ``csvdir``.

    ``sid`` is the position of the symbol in ``symbols``.  Each CSV is
    parsed with its first column as a date index and sorted by index.

    Side effects
    ------------
    * row ``sid`` of ``metadata`` is set to
      ``(start_date, end_date, auto_close_date, symbol)``;
    * when the frame has a ``'split'`` column, rows where it differs from
      1.0 are added to ``divs_splits['splits']`` with ``ratio = 1 / split``;
    * when the frame has a ``'dividend'`` column, rows where it differs
      from 0.0 are added to ``divs_splits['divs']``; ``pay_date`` is set
      equal to ``ex_date`` and the other dates are ``NaT``.

    Raises
    ------
    ValueError
        If ``<symbol>.csv`` is not present in ``csvdir``.
    """
    # Imported locally: DataFrame.append was removed in pandas 2.0, so the
    # accumulators below are extended with concat instead.
    from pandas import concat

    with maybe_show_progress(symbols,
                             show_progress,
                             label='Loading custom pricing data: ') as it:
        files = set(os.listdir(csvdir))
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            # Exact file-name match only; a substring match would confuse
            # symbols such as "A" and "AA".
            fname = '%s.csv' % symbol
            if fname not in files:
                raise ValueError("%s.csv file is not in %s" %
                                 (symbol, csvdir))

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # Continue the running integer index of the accumulator.
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = concat([splits, split])

            if 'dividend' in dfr.columns:
                # ex_date amount sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                # The input carries no pay date; the ex-date is used instead.
                div['pay_date'] = div['ex_date']
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = concat([divs, div])

            yield sid, dfr
def _pricing_iter():
    """Yield ``(sid, frame)`` of daily bars, using tushare when Yahoo fails.

    Sids count up from 0 in the order of ``symbols``.  A frame is taken
    from ``cache`` when present.  Otherwise Yahoo is queried through
    ``DataReader`` (symbols starting with ``'6'`` get the ``.ss`` suffix,
    all others ``.sz``); if that raises, ``tushare.get_h_data`` is tried.
    If both sources fail the process exits through ``sys.exit()``.

    Side effects: row ``sid`` of ``metadata`` is set to
    ``(start_date, end_date, auto_close_date, symbol)``.  The yielded
    frame has columns open/high/low/close/volume, is reindexed to the
    calendar sessions between its first and last date, and has missing
    values filled with 0.0.
    """
    yahoo_renames = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Adj Close': 'close',
        'Volume': 'volume',
    }
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']

    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo pricing data: ') as it, \
            requests.Session() as session:
        for sid, symbol in enumerate(it):
            path = _cachpath(symbol, 'ohlcv')
            # Stays None on a cache hit; resolved from the columns below.
            provider = None
            try:
                df = cache[path]
            except KeyError:
                provider = "yahoo"
                try:
                    print("To Download symbol:", symbol, path)
                    raw = DataReader(
                        name=symbol + '.ss'
                        if symbol.startswith('6') else symbol + '.sz',
                        data_source='yahoo',
                        start=start,
                        end=end,
                        retry_count=1,
                        session=session,
                    )
                    # Check before sorting so an empty result reports
                    # itself instead of an AttributeError on None.
                    if raw is None:  # FIXIT timeout maybe
                        raise Exception("Empty Result!", symbol)
                    df = cache[path] = raw.sort_index()
                except Exception as e:
                    print(
                        'Got a Exception - reason "%s" for stock(%s) in yahoo, try tushare'
                        % (str(e), symbol))
                    import tushare as ts
                    try:
                        raw = ts.get_h_data(
                            symbol,
                            start=start.strftime("%Y-%m-%d")
                            if start is not None else None,
                            end=end.strftime("%Y-%m-%d")
                            if end is not None else None,
                            retry_count=5,
                            pause=1)
                        if raw is None:  # FIXIT timeout maybe
                            raise Exception("Empty Result!", symbol)
                        df = cache[path] = raw.sort_index()
                        provider = 'tushare'
                    except Exception as e1:
                        print(
                            'Got a Exception - reason "%s" for stock(%s) in tushare, ignore it'
                            % (str(e1), symbol))
                        sys.exit()
                print("Got stock(%s) from provide(%s)" % (symbol, provider))

            if provider is None:
                # A cached frame does not record its source.  Frames cached
                # from Yahoo keep the capitalised column names.
                # NOTE(review): assumes tushare frames never contain an
                # 'Open' column - confirm.
                provider = 'yahoo' if 'Open' in df.columns else 'tushare'

            # the start date is the date of the first trade and
            # the end date is the date of the last trade
            start_date = df.index[0]
            end_date = df.index[-1]
            # The auto_close date is the day after the last trade.
            ac_date = end_date + pd.Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if provider == 'tushare':
                # Keep only the OHLCV columns, in the expected order.
                df = df.reindex(columns=ohlcv_columns, copy=False)
            else:
                df.rename(columns=yahoo_renames, inplace=True)

            sessions = calendar.sessions_in_range(start_date, end_date)
            df = df.reindex(
                sessions.tz_localize(None),
                copy=False,
            ).fillna(0.0)

            yield sid, df
def tiingo_metadata(tickers='ALL', asset_finder=None):
    """Build the asset-metadata frame for NYSE/NASDAQ stocks listed by tiingo.

    Parameters
    ----------
    tickers : ``'ALL'`` or a list-like of ticker strings
        When not ``'ALL'``, only these tickers are kept.
    asset_finder : passed through to ``asset_to_sid_map``.

    Returns
    -------
    (tickers_df, assets_to_sids)
        ``tickers_df`` is indexed by sid and has the columns ``symbol``,
        ``exchange``, ``start_date``, ``end_date``, ``first_traded`` and
        ``auto_close_date`` (``end_date`` plus one day), plus any other
        columns of the tiingo listing that are not dropped here.
        ``assets_to_sids`` is the mapping returned by ``asset_to_sid_map``.

    Tickers that occur more than once in the listing are removed from it
    and looked up individually with ``CLIENT.get_ticker_metadata``, once
    per distinct ticker.
    """
    listing = pd.DataFrame(CLIENT.list_stock_tickers())
    wanted = (listing['exchange'].isin(['NYSE', 'NASDAQ'])
              & (listing['assetType'] == 'Stock'))
    tickers_df = listing.loc[wanted]

    # Compare against 'ALL' only for strings, so that array-like input is
    # never evaluated as an (ambiguous) element-wise comparison.
    if not (isinstance(tickers, str) and tickers == 'ALL'):
        tickers_df = tickers_df.loc[tickers_df['ticker'].isin(tickers)]

    # Work on an independent frame: the assignments below must not target
    # a filtered selection of the full listing.
    tickers_df = tickers_df.copy()
    tickers_df['startDate'] = pd.to_datetime(tickers_df['startDate'])
    tickers_df['endDate'] = pd.to_datetime(tickers_df['endDate'])
    tickers_df = tickers_df.dropna()

    # we currently don't support it when a symbol is held by more than
    # one security at a time. for the ones duplicated, we choose to
    # get the currently traded
    repeated = tickers_df.loc[tickers_df.duplicated(subset=['ticker'])]
    # unique(): a ticker listed three or more times appears several times
    # in ``repeated`` but must be fetched, and emitted, only once.
    duplicate_tickers = repeated['ticker'].unique()
    tickers_df = tickers_df[~tickers_df['ticker'].isin(duplicate_tickers)]
    tickers_df = tickers_df.drop(columns=['assetType', 'priceCurrency'])

    ex_duplicates = []
    with maybe_show_progress(
            duplicate_tickers,
            True,
            item_show_func=lambda e: e if e is None else str(e),
            label='Retrieving metadata for duplicate tickers: ') as it:
        for ticker in it:
            ticker_meta = CLIENT.get_ticker_metadata(ticker)
            ex_duplicates.append({
                'ticker': ticker_meta['ticker'],
                'exchange': ticker_meta['exchangeCode'],
                'startDate': pd.to_datetime(ticker_meta['startDate']),
                'endDate': pd.to_datetime(ticker_meta['endDate']),
            })

    tickers_df = pd.concat([tickers_df, pd.DataFrame(ex_duplicates)])
    tickers_df = tickers_df.dropna().reset_index(drop=True)
    tickers_df = tickers_df.rename(columns={
        'ticker': 'symbol',
        'startDate': 'start_date',
        'endDate': 'end_date'
    })

    assets_to_sids = asset_to_sid_map(asset_finder,
                                      tickers_df['symbol'].values)
    tickers_df['sid'] = [
        assets_to_sids[symbol] for symbol in tickers_df['symbol']
    ]
    tickers_df = tickers_df.set_index('sid')

    tickers_df['first_traded'] = tickers_df['start_date']
    tickers_df['auto_close_date'] = tickers_df['end_date'] + Timedelta(days=1)
    return tickers_df, assets_to_sids
def _pricing_iter(symbols,
                  divs_splits,
                  show_progress,
                  metadata,
                  sids_written,
                  assets_to_sids={}):
    """Yield ``(sid, frame)`` of daily tiingo prices for each symbol.

    Parameters
    ----------
    symbols : iterable of ticker strings.
    divs_splits : dict holding the ``'splits'`` and ``'divs'`` accumulator
        frames; both are extended here.
    show_progress : forwarded to ``maybe_show_progress``.
    metadata : not used by this function.
    sids_written : list; the sid of every yielded frame is appended.
    assets_to_sids : mapping symbol -> sid.  Only read here, never
        mutated, so the shared default is harmless.

    Symbols with no data are skipped.  Symbols whose download raises are
    reported and skipped.  If an error happens after the download (for
    example while building the adjustments) it is reported and the frame
    is still yielded in whatever state it reached.
    """
    start_date = pd.to_datetime('2000-1-1', utc=True)
    end_date = pd.to_datetime('today', utc=True) + Timedelta(days=20)
    cal: TradingCalendar = trading_calendars.get_calendar('NYSE')
    sessions = cal.sessions_in_range(start_date, end_date)

    with maybe_show_progress(symbols,
                             show_progress,
                             item_show_func=lambda e: e
                             if e is None else str(e),
                             label='Loading tiingo pricing data: ') as it:
        for symbol in it:
            sid = assets_to_sids[symbol]
            # Reset per symbol so a failed download can never fall through
            # with the previous symbol's frame (or an unbound name).
            df = None
            try:
                df = pd.DataFrame(
                    CLIENT.get_ticker_price(symbol,
                                            fmt='json',
                                            startDate=start_date,
                                            frequency='daily'))
                if df.empty:
                    print(f'No data for {symbol}, skpping...')
                    continue

                df.index = pd.to_datetime(df['date'], utc=True)
                df.drop(columns=[
                    'date', 'adjOpen', 'adjHigh', 'adjLow', 'adjClose',
                    'adjVolume'
                ],
                        inplace=True)
                df.rename(columns={
                    'splitFactor': 'split',
                    'divCash': 'dividend'
                },
                          inplace=True)
                df = fill_daily_gaps(df)
                df = drop_extra_sessions(df)

                if 'split' in df.columns:
                    tmp = 1. / df[df['split'] != 1.0]['split']
                    split = DataFrame(
                        data=tmp.index.tz_convert(None).tolist(),
                        columns=['effective_date'])
                    split['ratio'] = tmp.tolist()
                    split['sid'] = sid

                    splits = divs_splits['splits']
                    # Continue the accumulator's running integer index.
                    index = Index(
                        range(splits.shape[0],
                              splits.shape[0] + split.shape[0]))
                    split.set_index(index, inplace=True)
                    # pd.concat: DataFrame.append is gone in pandas 2.0.
                    divs_splits['splits'] = pd.concat([splits, split])

                if 'dividend' in df.columns:
                    # ex_date amount sid record_date declared_date pay_date
                    tmp = df[df['dividend'] != 0.0]['dividend']
                    div = DataFrame(
                        data=tmp.index.tz_convert(None).tolist(),
                        columns=['ex_date'])

                    # Placeholder date for the fields tiingo does not give.
                    natValue = pd.to_datetime('1800-1-1')
                    div['record_date'] = natValue
                    div['declared_date'] = natValue
                    # Pay date is taken as 10 sessions after the ex-date.
                    div['pay_date'] = [
                        sessions[sessions.get_loc(ex_date) +
                                 10].tz_convert(None)
                        for ex_date in div['ex_date']
                    ]
                    div['amount'] = tmp.tolist()
                    div['sid'] = sid

                    divs = divs_splits['divs']
                    ind = Index(
                        range(divs.shape[0], divs.shape[0] + div.shape[0]))
                    div.set_index(ind, inplace=True)
                    divs_splits['divs'] = pd.concat([divs, div])
            except KeyboardInterrupt:
                exit()
            except Exception as e:
                print(f'\nException for symbol {symbol}')
                print(e)

            if df is None:
                # The download itself failed: nothing to write for this sid.
                continue

            sids_written.append(sid)
            yield sid, df
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,  # unused
           daily_bar_writer,
           adjustment_writer,
           calendar,
           cache,
           show_progress,
           output_dir,
           # pass these as defaults to make them 'nonlocal' in py2
           start=start,
           end=end):
    """Download Yahoo pricing and adjustment data and hand it to the writers.

    Daily bars for every entry of ``symbols`` are written through
    ``daily_bar_writer``, the metadata collected on the way through
    ``asset_db_writer``, and the splits/dividends extracted from the
    ``'yahoo-actions'`` data through ``adjustment_writer``.

    ``start`` falls back to ``calendar[0]`` when it is None.  ``end`` is
    forwarded to ``DataReader`` unchanged, including None.

    ``show_progress`` controls every progress bar, including the one
    shown by the daily bar writer.
    """
    if start is None:
        start = calendar[0]

    metadata = pd.DataFrame(np.empty(len(symbols), dtype=[
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]))

    def _pricing_iter():
        """Yield ``(sid, frame)`` per symbol and fill ``metadata`` row ``sid``."""
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo pricing data: ') as it, \
                requests.Session() as session:
            for sid, symbol in enumerate(it):
                path = _cachpath(symbol, 'ohlcv')
                try:
                    df = cache[path]
                except KeyError:
                    df = cache[path] = DataReader(
                        symbol,
                        'yahoo',
                        start,
                        end,
                        session=session,
                    ).sort_index()

                # the start date is the date of the first trade and
                # the end date is the date of the last trade
                start_date = df.index[0]
                end_date = df.index[-1]
                # The auto_close date is the day after the last trade.
                ac_date = end_date + pd.Timedelta(days=1)
                metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                df.rename(
                    columns={
                        'Open': 'open',
                        'High': 'high',
                        'Low': 'low',
                        'Close': 'close',
                        'Volume': 'volume',
                    },
                    inplace=True,
                )
                yield sid, df

    # Honour the caller's flag instead of always forcing the bar on.
    daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

    # Maps symbol -> sid (the positional index of the metadata row).
    symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)

    asset_db_writer.write(equities=metadata)

    adjustments = []
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo adjustment data: ') as it, \
            requests.Session() as session:
        for symbol in it:
            path = _cachpath(symbol, 'adjustment')
            try:
                df = cache[path]
            except KeyError:
                df = cache[path] = DataReader(
                    symbol,
                    'yahoo-actions',
                    start,
                    end,
                    session=session,
                ).sort_index()

            df['sid'] = symbol_map[symbol]
            adjustments.append(df)

    adj_df = pd.concat(adjustments)
    adj_df.index.name = 'date'
    adj_df.reset_index(inplace=True)

    splits = adj_df[adj_df.action == 'SPLIT']
    splits = splits.rename(
        columns={'value': 'ratio', 'date': 'effective_date'},
    )
    splits.drop('action', axis=1, inplace=True)

    dividends = adj_df[adj_df.action == 'DIVIDEND']
    dividends = dividends.rename(
        columns={'value': 'amount', 'date': 'ex_date'},
    )
    dividends.drop('action', axis=1, inplace=True)
    # we do not have this data in the yahoo dataset
    dividends['record_date'] = pd.NaT
    dividends['declared_date'] = pd.NaT
    dividends['pay_date'] = pd.NaT

    adjustment_writer.write(splits=splits, dividends=dividends)
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,  # unused
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir,
           # pass these as defaults to make them 'nonlocal' in py2
           start=start,
           end=end):
    """Load pricing and dividend data for ``symbols`` and feed the writers.

    Daily bars come from ``get_data(symbol, start, end)``; rows with zero
    volume are dropped, the frame is reindexed to the ``'SH'`` calendar
    sessions from the first trade onwards, missing volume is set to 0 and
    the remaining gaps are forward-filled.  Assets are written with the
    exchange name ``"hs300"``.  Adjustments come from the ``"Divid"``
    request and are limited to dates after 2010-01-01.

    ``start`` falls back to ``start_session`` when it is None.  ``end`` is
    forwarded unchanged, including None.
    """
    if start is None:
        start = start_session

    metadata = pd.DataFrame(np.empty(len(symbols), dtype=[
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]))

    trading_days = get_calendar('SH').all_sessions
    trading_days = trading_days.astype("datetime64[ns]")

    def _pricing_iter():
        """Yield ``(sid, frame)`` per symbol and fill ``metadata`` row ``sid``."""
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo pricing data: ') as it, \
                requests.Session() as session:
            for sid, symbol in enumerate(it):
                # Function-call form works on both Python 2 and 3.
                print(symbol)
                path = _cachpath(symbol, 'ohlcv')
                try:
                    df = cache[path]
                except KeyError:
                    df = cache[path] = get_data(
                        symbol,
                        start,
                        end
                    )

                # Only rows with actual trading count towards the
                # first/last trade dates.
                df = df[df.Volume > 0]
                start_date = df.index[0]
                end_date = df.index[-1]

                # Align to the exchange sessions from the first trade on;
                # non-traded sessions get volume 0 and the last known
                # prices.
                df = df.reindex(trading_days[(trading_days >= start_date)])
                df.Volume = df.Volume.fillna(0)
                df = df.ffill()

                # The auto_close date is the day after the last trade.
                ac_date = end_date + pd.Timedelta(days=1)
                metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                df.rename(
                    columns={
                        'Open': 'open',
                        'High': 'high',
                        'Low': 'low',
                        'Close': 'close',
                        'Volume': 'volume',
                    },
                    inplace=True,
                )
                yield sid, df

    daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

    # Maps symbol -> sid (the positional index of the metadata row).
    symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)

    # Every asset is tagged with the exchange name "hs300".
    metadata['exchange'] = "hs300"
    asset_db_writer.write(equities=metadata)

    adjustments = []
    with maybe_show_progress(
            symbols,
            show_progress,
            label='Downloading Yahoo adjustment data: ') as it, \
            requests.Session() as session:
        for symbol in it:
            path = _cachpath(symbol, 'adjustment')
            try:
                data = cache[path]
            except KeyError:
                data = cache[path] = request(
                    "123.56.77.52:10030",
                    "Divid",
                    {"symbol": symbol}
                )
            # Convert on both paths: the cache holds the raw payload, so a
            # cache hit needs the same DataFrame conversion as a miss.
            df = pd.DataFrame(data).sort_index()

            df['sid'] = symbol_map[symbol]
            adjustments.append(df)

    adj_df = pd.concat(adjustments)
    adj_df.index.name = 'date'
    adj_df.reset_index(inplace=True)
    adj_df.date = pd.to_datetime(adj_df.date)
    adj_df = adj_df[adj_df.date > pd.Timestamp("2010-01-01")]

    splits = adj_df[adj_df.action == 'SPLIT']
    splits = splits.rename(
        columns={'value': 'ratio', 'date': 'effective_date'},
    )
    splits.drop('action', axis=1, inplace=True)

    dividends = adj_df[adj_df.action == 'DIVIDEND']
    dividends = dividends.rename(
        columns={'value': 'amount', 'date': 'ex_date'},
    )
    dividends.drop('action', axis=1, inplace=True)
    # this data is not available from the dividend source
    dividends['record_date'] = pd.NaT
    dividends['declared_date'] = pd.NaT
    dividends['pay_date'] = pd.NaT

    adjustment_writer.write(splits=splits, dividends=dividends)