def _get_expiry_dates_and_links(self): """ Gets available expiry dates. Returns ------- Tuple of: List of datetime.date objects Dict of datetime.date objects as keys and corresponding links """ url = self._OPTIONS_BASE_URL.format(sym=self.symbol) root = self._parse_url(url) try: links = root.xpath('//*[@id="options_menu"]/form/select/option') except IndexError: # pragma: no cover raise RemoteDataError('Expiry dates not available') expiry_dates = [ dt.datetime.strptime(element.text, "%B %d, %Y").date() for element in links ] links = [element.attrib['data-selectbox-link'] for element in links] if len(expiry_dates) == 0: raise RemoteDataError('Data not available') # pragma: no cover expiry_links = dict(zip(expiry_dates, links)) self._expiry_links = expiry_links self._expiry_dates = expiry_dates return expiry_dates, expiry_links
def _dl_mult_symbols(self, symbols):
    """Download several symbols in chunks, substituting NaN frames for
    symbols that fail, and return a wide frame with (Attributes, Symbols)
    column levels."""
    stocks = {}
    failed = []
    passed = []
    for sym_group in _in_chunks(symbols, self.chunksize):
        for sym in sym_group:
            try:
                stocks[sym] = self._read_one_data(self.url,
                                                  self._get_params(sym))
            except (IOError, KeyError):
                msg = "Failed to read symbol: {0!r}, replacing with NaN."
                warnings.warn(msg.format(sym), SymbolWarning)
                failed.append(sym)
            else:
                passed.append(sym)

    if not passed:
        msg = "No data fetched using {0!r}"
        raise RemoteDataError(msg.format(self.__class__.__name__))
    try:
        if stocks and failed and passed:
            # Use the first successful frame as an all-NaN template for
            # every failed symbol so the concat stays aligned.
            template = stocks[passed[0]].copy()
            template[:] = np.nan
            for sym in failed:
                stocks[sym] = template
        if PANDAS_0230:
            result = concat(stocks, sort=True).unstack(level=0)
        else:
            result = concat(stocks).unstack(level=0)
        result.columns.names = ["Attributes", "Symbols"]
        return result
    except AttributeError:
        # cannot construct a panel with just 1D nans indicating no data
        msg = "No data fetched using {0!r}"
        raise RemoteDataError(msg.format(self.__class__.__name__))
def _dl_mult_symbols(self, symbols):
    """Download several symbols in chunks and return them as a Panel,
    replacing symbols that fail with NaN frames."""
    stocks = {}
    failed = []
    passed = []
    for sym_group in _in_chunks(symbols, self.chunksize):
        for sym in sym_group:
            try:
                stocks[sym] = self._read_one_data(self.url,
                                                  self._get_params(sym))
            except IOError:
                msg = 'Failed to read symbol: {0!r}, replacing with NaN.'
                warnings.warn(msg.format(sym), SymbolWarning)
                failed.append(sym)
            else:
                passed.append(sym)

    if not passed:
        msg = "No data fetched using {0!r}"
        raise RemoteDataError(msg.format(self.__class__.__name__))
    try:
        if stocks and failed and passed:
            # All-NaN copy of a successful frame stands in for failures.
            template = stocks[passed[0]].copy()
            template[:] = np.nan
            for sym in failed:
                stocks[sym] = template
        return Panel(stocks).swapaxes('items', 'minor')
    except AttributeError:
        # cannot construct a panel with just 1D nans indicating no data
        msg = "No data fetched using {0!r}"
        raise RemoteDataError(msg.format(self.__class__.__name__))
def _dl_mult_symbols(self, symbols): stocks = {} failed = [] passed = [] for sym in symbols: try: dfs = self._read_one_data(sym) for k in dfs: dfs[k]['Ticker'] = sym if k not in stocks: stocks[k] = [] stocks[k].append(dfs[k]) passed.append(sym) except IOError: msg = 'Failed to read symbol: {0!r}, replacing with NaN.' warnings.warn(msg.format(sym), SymbolWarning) failed.append(sym) if len(passed) == 0: msg = "No data fetched using {0!r}" raise RemoteDataError(msg.format(self.__class__.__name__)) else: for k in stocks: dfs[k] = concat(stocks[k]).set_index(['Ticker', 'Date']) return dfs
def Read_stock(stocks, start, end):
    """Download daily closing prices for each ticker and align them to a
    common date window.

    Parameters
    ----------
    stocks : dict
        Keys are ticker symbols (values are unused here).
    start, end : pd.Timestamp
        Requested date range; the returned range is shrunk to the span
        covered by every ticker.

    Returns
    -------
    tuple
        (dict of ticker -> Series of closes, adjusted start, adjusted end)
    """
    stock_data = {}
    adj_start = start
    adj_end = end
    if adj_start < pd.Timestamp('1980-01-01'):
        # set lower bound for starting period
        adj_start = pd.Timestamp('1980-01-01')
    try:
        for ticker in stocks.keys():
            df = pd.DataFrame()
            df[ticker] = web.DataReader(ticker, 'yahoo',
                                        adj_start, adj_end)['Close']
            df = df.fillna(method='ffill')
            stock_data[ticker] = df
            # Narrow the shared window to dates every ticker covers.
            if adj_start < stock_data[ticker].index[0]:
                adj_start = stock_data[ticker].index[0]
            if adj_end > stock_data[ticker].index[-1]:
                adj_end = stock_data[ticker].index[-1]
        for ticker in stock_data.keys():
            # adjust dataframe to same length
            stock_data[ticker] = stock_data[ticker].truncate(
                before=adj_start, after=adj_end)
        if len(stock_data) > 0:
            combine_stock = pd.concat(
                [stock_data[x] for x in stock_data.keys()], axis=1)
            combine_stock = combine_stock.fillna(method='ffill')
            for ticker in combine_stock.columns:
                stock_data[ticker] = combine_stock[ticker]
    except (OverflowError, RemoteDataError, KeyError):
        # FIX: re-raise unchanged. The old ``raise KeyError(err)`` style
        # discarded the original traceback and wrapped the exception
        # object inside a new one, mangling the message. Callers still
        # see the same exception types.
        raise
    else:
        return stock_data, adj_start, adj_end
def _get_response(self, url, params=None, headers=None):
    """Send a raw HTTP GET and return the requests.Response.

    Parameters
    ----------
    url : str
        target URL
    params : dict or None
        parameters passed to the URL
    headers : dict or None
        extra HTTP headers for the request

    Raises
    ------
    RemoteDataError
        If no attempt returned HTTP 200 after all retries.
    """
    # initial attempt + retry
    pause = self.pause
    for _ in range(self.retry_count + 1):
        response = self.session.get(url, params=params, headers=headers)
        if response.status_code == requests.codes.ok:
            return response
        time.sleep(pause)

        # Increase time between subsequent requests, per subclass.
        pause *= self.pause_multiplier
        # Get a new breadcrumb if necessary, in case ours is invalidated.
        # FIX: ``params`` is keyed like a mapping here; the original
        # ``isinstance(params, list)`` check could never be satisfied
        # together with the key assignment below.
        if isinstance(params, dict) and 'crumb' in params:
            params['crumb'] = self._get_crumb(self.retry_count)
    if params is not None and len(params) > 0:
        url = url + "?" + urlencode(params)
    raise RemoteDataError('Unable to read URL: {0}'.format(url))
def _read_one_data(self, ftppath, params):
    """Retrieve one SEC index file (plain, zip, or gzip) over FTP and
    parse it into a DataFrame with normalized file paths."""
    if re.search(_ZIP_RE, ftppath) is not None:
        index_file = self._read_zipfile(ftppath)
    elif re.search(_GZ_RE, ftppath) is not None:
        index_file = self._read_gzfile(ftppath)
    else:
        # Plain text listing: stream line by line into an in-memory file.
        index_file = StringIO()
        raw_lines = []
        try:
            self._sec_ftp_session.retrlines('RETR ' + ftppath,
                                            raw_lines.append)
        except EOFError:
            raise RemoteDataError('FTP server has closed the connection.')
        for raw_line in raw_lines:
            index_file.write(raw_line + '\n')
        index_file.seek(0)
    index_file = self._remove_header(index_file)
    index = read_csv(index_file, delimiter='|', header=None,
                     index_col=False, names=_COLUMNS,
                     low_memory=False, dtype=_COLUMN_TYPES)
    index['filename'] = index['filename'].map(self._fix_old_file_paths)
    return index
def _option_from_url(self, url):
    """Parse the option-chain JSON at ``url`` into call/put DataFrames
    and record the underlying price and quote time on ``self``."""
    jd = self._parse_url(url)
    result = jd['optionChain']['result']
    try:
        calls = result['options']['calls']
        puts = result['options']['puts']
    except IndexError:
        raise RemoteDataError('Option json not available '
                              'for url: %s' % url)

    quote = result['quote']
    # NOTE(review): selecting the *regular* market price when
    # marketState == 'PRE' looks inverted — confirm against the feed.
    in_pre = quote['marketState'] == 'PRE'
    self.underlying_price = (quote['regularMarketPrice'] if in_pre
                             else quote['preMarketPrice'])
    quote_unix_time = (quote['regularMarketTime'] if in_pre
                       else quote['preMarketTime'])
    self.quote_time = dt.datetime.fromtimestamp(quote_unix_time)

    calls = self._process_data(_parse_options_data(calls))
    puts = self._process_data(_parse_options_data(puts))
    return {'calls': calls, 'puts': puts}
def _option_from_url(self, url):
    """Load the option-chain JSON from ``url`` and return the processed
    calls and puts frames, storing price/time metadata on ``self``."""
    payload = self._parse_url(url)
    result = payload["optionChain"]["result"]
    try:
        raw_calls = result["options"]["calls"]
        raw_puts = result["options"]["puts"]
    except IndexError:
        raise RemoteDataError("Option json not available "
                              "for url: %s" % url)

    # NOTE(review): the PRE/regular branch choice looks inverted
    # (regular price used when marketState == "PRE") — verify upstream.
    if result["quote"]["marketState"] == "PRE":
        self.underlying_price = result["quote"]["regularMarketPrice"]
        quote_unix_time = result["quote"]["regularMarketTime"]
    else:
        self.underlying_price = result["quote"]["preMarketPrice"]
        quote_unix_time = result["quote"]["preMarketTime"]
    self.quote_time = dt.datetime.fromtimestamp(quote_unix_time)

    processed_calls = self._process_data(_parse_options_data(raw_calls))
    processed_puts = self._process_data(_parse_options_data(raw_puts))
    return {"calls": processed_calls, "puts": processed_puts}
def _option_frames_from_url(self, url):
    """Scrape the calls/puts HTML tables from an options page and return
    them as processed DataFrames."""
    root = self._parse_url(url)
    try:
        calls_table = root.xpath(
            '//*[@id="optionsCallsTable"]/div[2]/div/table')[0]
        puts_table = root.xpath(
            '//*[@id="optionsPutsTable"]/div[2]/div/table')[0]
    except IndexError:
        raise RemoteDataError('Option Table not available for url: %s'
                              % url)

    # Fetch the underlying quote only once per instance.
    if not hasattr(self, 'underlying_price'):
        try:
            (self.underlying_price,
             self.quote_time) = self._underlying_price_and_time_from_url(url)
        except IndexError:
            self.underlying_price, self.quote_time = np.nan, np.nan

    calls = self._process_data(_parse_options_data(calls_table), 'call')
    puts = self._process_data(_parse_options_data(puts_table), 'put')
    return {'calls': calls, 'puts': puts}
def download_data_from_yahoo(symbol, adjust=True, backfill=False,
                             start_date=config.start_date,
                             end_date=config.end_date):
    """Download daily OHLCV history for ``symbol`` from Yahoo via yfinance
    and write it to a .csv file.

    Parameters
    ----------
    symbol : str
        Ticker to download (currency conversion rates use the
        "USDCAD=X" style).
    adjust : bool
        Whether to include the AdjClose column. Uses the local
        dividend-based adjustment, not Yahoo's.
    backfill : bool, optional
    start_date : date, optional
    end_date : date, optional

    Returns
    -------
    pandas.DataFrame
        High, low, open, close, volume, and adjusted close by date.
    """
    # Widen the requested window to the configured bounds so the cached
    # csv needs refreshing less often.
    if start_date > config.start_date:
        start_date = config.start_date
    if end_date < config.end_date:
        end_date = config.end_date

    with utils.HiddenPrints():
        df = yf.Ticker(symbol).history(auto_adjust=False,
                                       start=start_date, end=end_date)
    if symbol.upper() in yf.shared._ERRORS:
        raise RemoteDataError("No data fetched for symbol " + symbol +
                              " using yfinance")

    df.drop("Adj Close", axis=1, inplace=True)
    df.drop("Stock Splits", axis=1, inplace=True)
    if adjust:
        # Adjusted close = close plus cumulative dividends paid to date.
        df["AdjClose"] = df["Close"].add(df["Dividends"].cumsum())
    if backfill:
        utils.backfill(df)
    utils.debug(df)
    df.to_csv(utils.get_file_path(config.prices_data_path,
                                  price_table_filename, symbol=symbol))
    return df
def _read_lines(self, out): try: df = pd.DataFrame.from_dict(out[self.data_key], orient='index') except KeyError: raise RemoteDataError() df.sort_index(ascending=True, inplace=True) df.index = [id[3:] for id in df.index] return df
def _download_nasdaq_symbols(timeout):
    """Download the NASDAQ ticker listing over FTP.

    Parameters
    ----------
    timeout : float
        The time to wait for the FTP connection.

    Returns
    -------
    pandas.DataFrame
        Ticker table indexed by symbol, with Y/N flags converted to bool
        and categorical columns cast to ``category`` dtype.

    Raises
    ------
    RemoteDataError
        On connection or download failure, or a malformed listing file.
    """
    try:
        ftp_session = FTP(_NASDAQ_FTP_SERVER, timeout=timeout)
        ftp_session.login()
    except all_errors as err:
        raise RemoteDataError("Error connecting to %r: %s"
                              % (_NASDAQ_FTP_SERVER, err))

    lines = []
    try:
        ftp_session.retrlines("RETR " + _NASDAQ_TICKER_LOC, lines.append)
    except all_errors as err:
        raise RemoteDataError("Error downloading from %r: %s"
                              % (_NASDAQ_FTP_SERVER, err))
    finally:
        ftp_session.close()

    # Sanity Checking: the listing always ends with a creation-time footer.
    if not lines[-1].startswith("File Creation Time:"):
        raise RemoteDataError("Missing expected footer. Found %r" % lines[-1])

    # Convert Y/N to True/False.
    converter_map = dict(
        (col, _bool_converter) for col, t in _TICKER_DTYPE if t is bool)

    # For pandas >= 0.20.0, the Python parser issues a warning if
    # both a converter and dtype are specified for the same column.
    # However, this measure is probably temporary until the read_csv
    # behavior is better formalized.
    with warnings.catch_warnings(record=True):
        data = read_csv(
            StringIO("\n".join(lines[:-1])),
            sep="|",  # FIX: positional sep was deprecated and then
                      # removed from pandas read_csv; pass by keyword.
            dtype=_TICKER_DTYPE,
            converters=converter_map,
            index_col=1,
        )

    # Properly cast enumerations
    for cat in _CATEGORICAL:
        data[cat] = data[cat].astype("category")
    return data
def _read_lines(self, out): if "Information" in out: raise RemoteDataError() else: out.pop("Meta Data") df = pd.DataFrame(out) columns = [ "RT", "1D", "5D", "1M", "3M", "YTD", "1Y", "3Y", "5Y", "10Y" ] df.columns = columns return df
def _parse_url(self, url):
    """Download and parse ``url`` with lxml, returning the document root."""
    try:
        from lxml.html import parse
    except ImportError:  # pragma: no cover
        raise ImportError("Please install lxml if you want to use the "
                          "{0!r} class".format(self.__class__.__name__))
    try:
        doc = parse(url)
    except _network_error_classes:  # pragma: no cover
        raise RemoteDataError("Unable to parse URL "
                              "{0!r}".format(url))
    root = doc.getroot()
    if root is None:  # pragma: no cover
        raise RemoteDataError("Parsed URL {0!r} has no root"
                              "element".format(url))
    return root
def _read_zipfile(self, ftppath): zipf = BytesIO() try: self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write) except EOFError: raise RemoteDataError('FTP server has closed the connection.') zipf.seek(0) with ZipFile(zipf, 'r') as zf: data = zf.open(zf.namelist()[0]).read().decode() return StringIO(data)
def _read_lines(self, out): try: df = pd.DataFrame.from_dict(out[self.data_key], orient="index") except KeyError: if "Error Message" in out: raise ValueError("The requested symbol {} could not be " "retrieved. Check valid ticker" ".".format(self.symbols)) else: raise RemoteDataError() df = df[sorted(df.columns)] df.columns = [id[3:] for id in df.columns] return df
def _download_nasdaq_symbols(timeout):
    """Download the NASDAQ ticker listing over FTP.

    Parameters
    ----------
    timeout : float
        The time to wait for the FTP connection.

    Returns
    -------
    pandas.DataFrame
        Ticker table indexed by symbol, with Y/N flags converted to bool
        and categorical columns cast to ``category`` dtype.

    Raises
    ------
    RemoteDataError
        On connection or download failure, or a malformed listing file.
    """
    try:
        ftp_session = FTP(_NASDAQ_FTP_SERVER, timeout=timeout)
        ftp_session.login()
    except all_errors as err:
        # FIX: '$s' was a typo for the '%s' printf-style placeholder,
        # so the underlying error was silently dropped from the message.
        raise RemoteDataError('Error connecting to %r: %s'
                              % (_NASDAQ_FTP_SERVER, err))

    lines = []
    try:
        ftp_session.retrlines('RETR ' + _NASDAQ_TICKER_LOC, lines.append)
    except all_errors as err:
        # FIX: same '$s' -> '%s' typo as above.
        raise RemoteDataError('Error downloading from %r: %s'
                              % (_NASDAQ_FTP_SERVER, err))
    finally:
        ftp_session.close()

    # Sanity Checking: the listing always ends with a creation-time footer.
    if not lines[-1].startswith('File Creation Time:'):
        raise RemoteDataError('Missing expected footer. Found %r'
                              % lines[-1])

    # Convert Y/N to True/False.
    converter_map = dict(
        (col, _bool_converter) for col, t in _TICKER_DTYPE if t is bool)

    data = read_csv(StringIO('\n'.join(lines[:-1])),
                    sep='|',  # FIX: positional sep is deprecated/removed
                              # in modern pandas; pass by keyword.
                    dtype=_TICKER_DTYPE,
                    converters=converter_map,
                    index_col=1)

    # Properly cast enumerations
    for cat in _CATEGORICAL:
        data[cat] = data[cat].astype('category')
    return data
def read(self):
    """Open an FTP session to the SEC server and dispatch on the
    requested symbol scope ('full' or 'daily'), always closing the
    session afterwards."""
    try:
        self._sec_ftp_session = FTP(_SEC_FTP, timeout=self.timeout)
        self._sec_ftp_session.login()
    except EOFError:
        raise RemoteDataError('FTP server has closed the connection.')
    try:
        if self.symbols == 'full':
            return self._read_one_data(self.url, self.params)
        if self.symbols == 'daily':
            return self._read_daily_data(self.url, self.params)
    finally:
        self._sec_ftp_session.close()
def _read_one_data(self, symbol): """ read one data from specified symbol """ url = 'https://finance.yahoo.com/quote/{}/history'.format(symbol) params = self._get_params(symbol) resp = self._get_response(url, params=params) ptrn = r'root\.App\.main = (.*?);\n}\(this\)\);' try: j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1)) data = j['context']['dispatcher']['stores']['HistoricalPriceStore'] except KeyError: msg = 'No data fetched for symbol {} using {}' raise RemoteDataError(msg.format(symbol, self.__class__.__name__)) # price data prices = DataFrame(data['prices']) prices.columns = map(str.capitalize, prices.columns) prices['Date'] = to_datetime(prices['Date'], unit='s').dt.date prices = prices[prices['Data'].isnull()] prices = prices[[ 'Date', 'High', 'Low', 'Open', 'Close', 'Volume', 'Adjclose' ]] prices = prices.rename(columns={'Adjclose': 'Adj Close'}) dfs = {'prices': prices} # dividends & splits data if self.get_actions: actions = DataFrame(data['eventsData']) actions.columns = map(str.capitalize, actions.columns) actions['Date'] = to_datetime(actions['Date'], unit='s').dt.date types = actions['Type'].unique() if 'DIVIDEND' in types: divs = actions[actions.Type == 'DIVIDEND'].copy() divs = divs[['Date', 'Amount']].reset_index(drop=True) dfs['dividends'] = divs if 'SPLIT' in types: splits = actions[actions.Type == 'SPLIT'].copy() splits['SplitRatio'] = splits['Splitratio'].apply( lambda x: eval(x)) splits = splits[[ 'Date', 'Denominator', 'Numerator', 'SplitRatio' ]] splits = splits.reset_index(drop=True) dfs['splits'] = splits return dfs
def _get_mlsd(self, dir): dir_list = [] try: self._sec_ftp_session.retrlines('MLSD' + ' ' + dir, dir_list.append) except EOFError: raise RemoteDataError('FTP server has closed the connection.') dict_list = [] for line in dir_list: entry = self._process_mlsd_line(line) entry['path'] = dir dict_list.append(entry) return dict_list
def _read_gzfile(self, ftppath): zipf = BytesIO() try: self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write) except EOFError: raise RemoteDataError('FTP server has closed the connection.') zipf.seek(0) zf = gzip.GzipFile(fileobj=zipf, mode='rb') try: data = zf.read().decode('iso-8859-1') finally: zf.close() return StringIO(data)
def _parse_url(self, url):
    """Download ``url``, parse it with lxml, and return the xml root."""
    try:
        from lxml.html import parse
    except ImportError:  # pragma: no cover
        raise ImportError("Please install lxml if you want to use the "
                          "{0!r} class".format(self.__class__.__name__))
    root = parse(self._read_url_as_StringIO(url)).getroot()
    if root is None:  # pragma: no cover
        raise RemoteDataError("Parsed URL {0!r} has no root"
                              "element".format(url))
    return root
def _parse_url(self, url): """ Downloads and parses a URL into a json object. Parameters ---------- url : String The url to load and parse Returns ------- A JSON object """ jd = json.loads(self._read_url_as_StringIO(url).read()) if jd is None: # pragma: no cover raise RemoteDataError("Parsed URL {0!r} is not " "a valid json object".format(url)) return jd
def _get_response(self, url, params=None):
    """Send a raw HTTP GET and return the requests.Response.

    Parameters
    ----------
    url : str
        target URL
    params : dict or None
        parameters passed to the URL

    Raises
    ------
    RemoteDataError
        If no attempt returned HTTP 200 after all retries.
    """
    attempts = self.retry_count + 1  # initial attempt + retries
    for _ in range(attempts):
        response = self.session.get(url, params=params)
        if response.status_code == requests.codes.ok:
            return response
        time.sleep(self.pause)
    raise RemoteDataError('Unable to read URL: {0}'.format(url))
def _get_response(self, url, params=None, headers=None):
    """Send a raw HTTP GET and return the requests.Response.

    Parameters
    ----------
    url : str
        target URL
    params : dict or None
        parameters passed to the URL
    headers : dict or None
        extra HTTP headers for the request

    Raises
    ------
    RemoteDataError
        If no attempt returned HTTP 200 after all retries; the last
        response body, when available, is appended to the message.
    """
    # initial attempt + retry
    pause = self.pause
    last_response_text = ""
    for _ in range(self.retry_count + 1):
        response = self.session.get(url, params=params, headers=headers,
                                    timeout=self.timeout)
        if response.status_code == requests.codes.ok:
            return response

        if response.encoding:
            # FIX: keep the body as str. Encoding it to bytes made the
            # final message render as "b'...'" via str.format below.
            last_response_text = response.text
        time.sleep(pause)

        # Increase time between subsequent requests, per subclass.
        pause *= self.pause_multiplier
        # Get a new breadcrumb if necessary, in case ours is invalidated.
        # FIX: ``params`` is keyed like a mapping here; the original
        # ``isinstance(params, list)`` check could never be satisfied
        # together with the key assignment below.
        if isinstance(params, dict) and "crumb" in params:
            params["crumb"] = self._get_crumb(self.retry_count)

        # If our output error function returns True, exit the loop.
        if self._output_error(response):
            break

    if params is not None and len(params) > 0:
        url = url + "?" + urlencode(params)
    msg = "Unable to read URL: {0}".format(url)
    if last_response_text:
        msg += "\nResponse Text:\n{0}".format(last_response_text)
    raise RemoteDataError(msg)
def _get_expiry_dates(self): """ Gets available expiry dates. Returns ------- List of datetime.date objects """ url = self._OPTIONS_BASE_URL.format(sym=self.symbol) jd = self._parse_url(url) expiry_dates =\ [dt.datetime.utcfromtimestamp(ts).date() for ts in jd['optionChain']['result'][0]['expirationDates']] if len(expiry_dates) == 0: raise RemoteDataError('Data not available') # pragma: no cover self._expiry_dates = expiry_dates return expiry_dates
def _dl_mult_symbols(self, symbols): stocks = {} failed = [] passed = [] for sym in symbols: try: df = self._read_one_data(sym) df['PairCode'] = sym stocks[sym] = df passed.append(sym) except IOError: msg = 'Failed to read symbol: {0!r}, replacing with NaN.' warnings.warn(msg.format(sym), SymbolWarning) failed.append(sym) if len(passed) == 0: msg = "No data fetched using {0!r}" raise RemoteDataError(msg.format(self.__class__.__name__)) else: return concat(stocks).set_index(['PairCode', 'Date'])
def get_prices(symbols, dt_start, dt_end):
    """Return the 'adjusted' prices for the given timespan and symbols.

    Args:
        symbols [str]: The list of symbols
        dt_start (datetime): Start of the timespan
        dt_end (datetime): End of the timespan

    Returns:
        A pandas dataframe with the closing prices for the given symbols
        in the given timespan.

    Raises:
        RemoteDataError: If the prices could not be read (e.g. invalid
            symbols); the original error is attached as the cause.
    """
    try:
        prices = load_bars_from_yahoo(stocks=symbols, start=dt_start,
                                      end=dt_end)
    except RemoteDataError as e:
        msg = "An error occurred reading the prices for the given symbols." \
              "Please make sure that the stock symbols are valid: {}".format(e)
        logging.getLogger().warning(msg)
        # FIX: chain the original exception so the root-cause traceback
        # is preserved instead of being replaced by the wrapper.
        raise RemoteDataError(msg) from e
    return prices
def _dl_mult_symbols(self, symbols):
    """Download several symbols in chunks and return a dict of frames,
    with ``None`` standing in for symbols that failed to download."""
    stocks = {}
    failed = []
    passed = []
    for sym_group in _in_chunks(symbols, self.chunksize):
        for sym in sym_group:
            try:
                stocks[sym] = self._read_one_data(self.url,
                                                  self._get_params(sym))
            except IOError:
                msg = 'Failed to read symbol: {0!r}, replacing with NaN.'
                warnings.warn(msg.format(sym), SymbolWarning)
                failed.append(sym)
            else:
                passed.append(sym)

    if not passed:
        msg = "No data fetched using {0!r}"
        raise RemoteDataError(msg.format(self.__class__.__name__))
    if stocks and failed:
        # Placeholder entries keep failed symbols visible to callers.
        for sym in failed:
            stocks[sym] = None
    return stocks