def _retry_read_url(url, retry_count, pause, name):
    """Open url (and retry).

    Fetch ``url`` up to ``retry_count`` times, parse the CSV payload into a
    date-indexed, chronologically ordered DataFrame, and return it.  Raises
    IOError when every attempt fails with a network error.
    """
    for _attempt in range(retry_count):
        # Brief pause before each attempt; kludge to close the socket ASAP.
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                payload = resp.read()
        except _network_error_classes:
            continue  # transient network failure -- try again

        frame = read_csv(StringIO(bytes_to_str(payload)),
                         index_col=0, parse_dates=True,
                         na_values='-')[::-1]
        # Yahoo! Finance sometimes does this awesome thing where they
        # return 2 rows for the most recent business day
        if len(frame) > 2 and frame.index[-1] == frame.index[-2]:  # pragma: no cover
            frame = frame[:-1]

        # Get rid of unicode characters in index name.
        try:
            frame.index.name = frame.index.name.decode(
                'unicode_escape').encode('ascii', 'ignore')
        except AttributeError:
            # Python 3 string has no decode method.
            frame.index.name = frame.index.name.encode(
                'ascii', 'ignore').decode()
        return frame

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" %
                  (retry_count, name, url))
def _retry_read_url(url, retry_count, pause, name):
    # Retry-fetch ``url`` and parse the CSV body into a reversed,
    # date-indexed DataFrame; raise IOError when all attempts fail.
    attempts_left = retry_count
    while attempts_left > 0:
        attempts_left -= 1
        time.sleep(pause)  # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                body = resp.read()
        except _network_error_classes:
            continue  # swallow transient network errors and retry

        result = read_csv(StringIO(bytes_to_str(body)),
                          index_col=0, parse_dates=True,
                          na_values='-')[::-1]
        # Yahoo! Finance sometimes does this awesome thing where they
        # return 2 rows for the most recent business day
        if len(result) > 2 and result.index[-1] == result.index[-2]:  # pragma: no cover
            result = result[:-1]

        # Get rid of unicode characters in index name.
        try:
            result.index.name = result.index.name.decode(
                'unicode_escape').encode('ascii', 'ignore')
        except AttributeError:
            # Python 3 string has no decode method.
            result.index.name = result.index.name.encode(
                'ascii', 'ignore').decode()
        return result

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" %
                  (retry_count, name, url))
def get_dividends_yahoo(sid, start, end):
    # Taken from get_data_yahoo in Pandas library and adjust a single parameter to get dividends
    from pandas.compat import StringIO, bytes_to_str
    from pandas.io.common import urlopen

    start, end = pd.to_datetime(start), pd.to_datetime(end)

    # Yahoo! query months are zero-based; '&g=v' selects dividends
    # (THE CHANGE relative to get_data_yahoo).
    parts = ['s=%s' % sid,
             '&a=%s' % (start.month - 1),
             '&b=%s' % start.day,
             '&c=%s' % start.year,
             '&d=%s' % (end.month - 1),
             '&e=%s' % end.day,
             '&f=%s' % end.year,
             '&g=v',
             '&ignore=.csv']
    url = 'http://ichart.finance.yahoo.com/table.csv?' + ''.join(parts)

    with urlopen(url) as resp:
        payload = resp.read()

    rs = pd.read_csv(StringIO(bytes_to_str(payload)), index_col=0,
                     parse_dates=True, na_values='-')[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
        rs = rs[:-1]
    return rs
def datasets(self):
    # Lazily fetch and cache the list of available datasets ("marts").
    # NOTE(review): the population loop at the bottom is commented out, so
    # after the first call this always returns an empty list -- looks like
    # work in progress; confirm before relying on the result.
    if self._datasets is None:
        response = self._requests_get('/ExplorerHandler.ashx?t=marts')
        # format is json, but `Nodes` is not double-quoted
        # Thus, unable to parse by response.json()
        # replace 1st Nodes to "Nodes"
        content = bytes_to_str(response.content)
        content = content.replace(str('Nodes'), str('"Nodes"'), 1)
        import json
        result = json.loads(content)
        nodes = result['Nodes']

        # import html.parser
        # import HTMLParser
        # parser = HTMLParser.HTMLParser()

        def delabel(node):
            # Currently a no-op; presumably intended to HTML-unescape
            # node['label'] via the commented-out parser above -- confirm.
            # print(node['label'])
            # print(parser.feed(node['label']))
            return node
        nodes = [delabel(n) for n in nodes]
        # NOTE(review): this frame is built but never used.
        datasets = pd.DataFrame(result['Nodes'])

        self._datasets = []
        # for dataflow in root.iter(sdmx._STRUCTURE + 'Dataflow'):
        #     name = sdmx._get_english_name(dataflow)
        #     id = dataflow.get('id')
        #     resource = EurostatResource(name=name, id=id)
        #     self._datasets.append(resource)
    return self._datasets
def read_clipboard(**kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Returns
    -------
    parsed : DataFrame
    """
    # Default to whitespace splitting unless the caller supplied a delimiter.
    # Raw string avoids the invalid-escape-sequence warning for '\s'.
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = r'\s+'
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass
    return read_table(StringIO(text), **kwargs)
def read_clipboard(**kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    If unspecified, `sep` defaults to '\s+'

    Returns
    -------
    parsed : DataFrame
    """
    # Default to whitespace splitting unless the caller supplied a delimiter.
    # Raw string avoids the invalid-escape-sequence warning for '\s'.
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = r'\s+'
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass
    return read_table(StringIO(text), **kwargs)
def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            # NOTE: 'encoding' was popped above, so this always falls back
            # to the display.encoding option.
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        # BUG FIX: use a literal tab character, not the two-character raw
        # string r'\t' (which read_table treats as a regex and parses with
        # the slower python engine).
        sep = '\t'

    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    return read_table(StringIO(text), sep=sep, **kwargs)
def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = '\t'

    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    return read_table(StringIO(text), sep=sep, **kwargs)
def _read_url_as_StringIO(self, url, params=None):
    """Fetch ``url`` and return the response body as a rewound StringIO."""
    response = self._get_response(url, params=params)
    content = response.content
    # Byte payloads are decoded; text passes through untouched.
    if isinstance(content, compat.binary_type):
        content = bytes_to_str(content)
    buffer = StringIO()
    buffer.write(content)
    buffer.seek(0)
    return buffer
def test_repr_binary_type():
    """pprint_thing on a decoded byte payload matches repr / plain output."""
    import string
    try:
        # PY3: bytes() requires an explicit encoding for str input.
        raw = bytes(string.ascii_letters,
                    encoding=cf.get_option('display.encoding'))
    except TypeError:
        # PY2: str -> str, no encoding argument accepted.
        raw = bytes(string.ascii_letters)
    decoded = str(compat.bytes_to_str(raw))
    assert printing.pprint_thing(decoded, quote_strings=True) == repr(decoded)
    assert printing.pprint_thing(decoded, quote_strings=False) == decoded
def test_repr_binary_type():
    """pprint_thing on a decoded binary payload matches repr / plain output."""
    import string
    bin_cls = compat.binary_type
    try:
        # PY3: the bytes constructor needs an explicit encoding.
        payload = bin_cls(string.ascii_letters,
                          encoding=cf.get_option('display.encoding'))
    except TypeError:
        # PY2: plain str construction, no encoding argument.
        payload = bin_cls(string.ascii_letters)
    decoded = compat.text_type(compat.bytes_to_str(payload))
    assert_equal(com.pprint_thing(decoded, quote_strings=True), repr(decoded))
    assert_equal(com.pprint_thing(decoded, quote_strings=False), decoded)
def __next__(self):
    # mmap.readline hands back bytes, but Python's CSV reader expects
    # str, so decode before handing the line on.
    line = compat.bytes_to_str(self.mmap.readline())
    if line == '':
        # Reading past the end of an mmap returns an empty string
        # rather than raising, so translate that into exhaustion.
        raise StopIteration
    return line
def process_http_error(ex):
    """Translate an HTTP error from the BigQuery API into GenericGBQException.

    See `BigQuery Troubleshooting Errors
    <https://cloud.google.com/bigquery/troubleshooting-errors>`__

    Always raises; never returns.
    """
    status = json.loads(bytes_to_str(ex.content))['error']
    errors = status.get('errors', None)
    if errors:
        # Report the first structured error (the raise exits the loop on
        # its first iteration).
        for error in errors:
            reason = error['reason']
            message = error['message']
            raise GenericGBQException(
                "Reason: {0}, Message: {1}".format(reason, message))
    # BUG FIX: previously raised GenericGBQException(errors), but `errors`
    # is always empty/None on this path, yielding a useless message.
    # Report the error status payload instead.
    raise GenericGBQException(status)
def read_clipboard(**kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    If unspecified, `sep` defaults to '\s+'

    Returns
    -------
    parsed : DataFrame
    """
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        kwargs['sep'] = '\t'

    # Fall back to whitespace splitting (raw string avoids the
    # invalid-escape-sequence warning for '\s').
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = r'\s+'
    return read_table(StringIO(text), **kwargs)
def read_clipboard(**kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_table. See read_table for the
    full argument list

    If unspecified, `sep` defaults to '\s+'

    Returns
    -------
    parsed : DataFrame
    """
    from pandas.util.clipboard import clipboard_get
    from pandas.io.parsers import read_table
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if compat.PY3:
        try:
            text = compat.bytes_to_str(
                text,
                encoding=(kwargs.get('encoding') or
                          get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_table
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        kwargs['sep'] = '\t'

    # Fall back to whitespace splitting (raw string avoids the
    # invalid-escape-sequence warning for '\s').
    if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
        kwargs['sep'] = r'\s+'
    return read_table(StringIO(text), **kwargs)
def _read_url_as_StringIO(self, url, params=None):
    """Fetch ``url``, sanitize the response, and return it as a rewound
    StringIO.

    Raises
    ------
    IOError
        When the sanitized response body is empty.
    """
    response = self._get_response(url, params=params)
    text = self._sanitize_response(response)
    if not text:
        service = self.__class__.__name__
        # NOTE(review): reports self.url rather than the `url` argument;
        # presumably the two match -- confirm against callers.
        raise IOError("{} request returned no data; check URL for invalid "
                      "inputs: {}".format(service, self.url))
    buffer = StringIO()
    if isinstance(text, compat.binary_type):
        buffer.write(bytes_to_str(text))
    else:
        buffer.write(text)
    buffer.seek(0)
    return buffer
def _retry_read_url(url, retry_count, pause, name):
    # Retry-fetch ``url`` and parse the CSV body into a reversed,
    # date-indexed DataFrame; raise IOError when all attempts fail.
    for _attempt in range(retry_count):
        time.sleep(pause)  # kludge to close the socket ASAP
        try:
            with urlopen(url) as resp:
                payload = resp.read()
        except _network_error_classes:
            continue  # transient network failure -- retry

        frame = read_csv(StringIO(bytes_to_str(payload)),
                         index_col=0, parse_dates=True)[::-1]
        # Yahoo! Finance sometimes does this awesome thing where they
        # return 2 rows for the most recent business day
        if len(frame) > 2 and frame.index[-1] == frame.index[-2]:  # pragma: no cover
            frame = frame[:-1]
        return frame

    raise IOError("after %d tries, %s did not "
                  "return a 200 for url %r" %
                  (retry_count, name, url))
def _read_url_as_StringIO(self, url, params=None, min=0, errors='ignore'):
    """Override of the base-class method of the same name.

    Decodes the response text using the encoding supplied by the derived
    class (``self._encoding``).  When the sanitized body is no longer than
    ``self._read_url_as_StringIO_min_len``, either raises IOError or
    returns None depending on ``self._read_url_as_StringIO_less_min_len``.

    NOTE(review): the ``min`` and ``errors`` parameters are unused here;
    kept only for interface compatibility (``min`` also shadows the
    builtin).
    """
    response = self._get_response(url, params=params)
    text = self._sanitize_response(response)
    if len(text) <= self._read_url_as_StringIO_min_len:
        if self._read_url_as_StringIO_less_min_len:
            service = self.__class__.__name__
            raise IOError("{} request returned no data; check URL for "
                          "invalid inputs: {}".format(service, self.url))
        return None
    buffer = StringIO()
    if isinstance(text, compat.binary_type):
        buffer.write(bytes_to_str(text, encoding=self._encoding))
    else:
        buffer.write(text)
    buffer.seek(0)
    return buffer
def csv_to_df(text):
    """Parse a CSV payload (bytes or str) into a UTC-localized, reversed,
    date-indexed DataFrame with 'Adj. *' columns renamed to 'Adj *'."""
    frame = pd.read_csv(StringIO(bytes_to_str(text)), index_col=0,
                        parse_dates=True, infer_datetime_format=True,
                        na_values='-')[::-1]
    # Yahoo! Finance sometimes does this awesome thing where they
    # return 2 rows for the most recent business day
    if len(frame) > 2 and frame.index[-1] == frame.index[-2]:  # pragma: no cover
        frame = frame[:-1]
    # Get rid of unicode characters in index name.
    try:
        frame.index.name = frame.index.name.decode(
            'unicode_escape').encode('ascii', 'ignore')
    except AttributeError:
        # Python 3 string has no decode method.
        frame.index.name = frame.index.name.encode('ascii', 'ignore').decode()
    frame.rename(columns={'Adj. Open': 'Adj Open',
                          'Adj. High': 'Adj High',
                          'Adj. Low': 'Adj Low',
                          'Adj. Close': 'Adj Close',
                          'Adj. Volume': 'Adj Volume'},
                 inplace=True)
    return frame.tz_localize(pytz.UTC)
def _read_raw(self, **kwargs):
    """Download the resource once and cache the body in a StringIO.

    Streams with a progress bar when a content-length header is present;
    otherwise (or on any streaming failure) falls back to the fully
    buffered ``response.content``.
    """
    if self._raw_content is None:
        response = self._requests_get()
        content_length = response.headers.get("content-length")
        out = StringIO()
        try:
            content_length = int(content_length)  # raises if header missing
            pb = network.ProgressBar(total=content_length)
            for chunk in response.iter_content(self._chunk_size):
                if chunk:
                    out.write(chunk)
                    pb.update(self._chunk_size)
            self._raw_content = out
        except Exception:
            # no content_length or any errors.
            # BUG FIX: start from a fresh buffer so chunks already written
            # before the failure are not duplicated ahead of the full body.
            out = StringIO()
            if isinstance(response.content, binary_type):
                out.write(bytes_to_str(response.content))
            else:
                out.write(response.content)
            self._raw_content = out
    return self._raw_content
def _read_raw(self, **kwargs):
    """Download the resource once and cache the body in a StringIO.

    Streams with a progress bar when a content-length header is present;
    otherwise (or on any streaming failure) falls back to the fully
    buffered ``response.content``.
    """
    if self._raw_content is None:
        response = self._requests_get()
        content_length = response.headers.get('content-length')
        out = StringIO()
        try:
            content_length = int(content_length)  # raises if header missing
            pb = network.ProgressBar(total=content_length)
            for chunk in response.iter_content(self._chunk_size):
                if chunk:
                    out.write(chunk)
                    pb.update(self._chunk_size)
            self._raw_content = out
        except Exception:
            # no content_length or any errors.
            # BUG FIX: start from a fresh buffer so chunks already written
            # before the failure are not duplicated ahead of the full body.
            out = StringIO()
            if isinstance(response.content, binary_type):
                out.write(bytes_to_str(response.content))
            else:
                out.write(response.content)
            self._raw_content = out
    return self._raw_content
def _get_data(symbol, start=None, end=None, retry_count=3, pause=0.001):
    """
    Returns DataFrame of historical corporate actions (dividends and stock
    splits) from symbols, over date range, start to end. All dates in the
    resulting DataFrame correspond with dividend and stock split ex-dates.

    Parameters
    ----------
    sym : string with a single Single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, of the pause between retries.
    """
    start, end = _sanitize_dates(start, end)
    # Yahoo! query months are zero-based; '&g=v' requests corporate actions.
    url = (_URL + 's=%s' % symbol +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v')

    for _attempt in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                payload = resp.read()
        except _network_error_classes:
            continue  # transient network failure -- retry

        index = []
        entries = []
        for row in csv.reader(StringIO(bytes_to_str(payload))):
            # Ignore lines that aren't dividends or splits (Yahoo
            # add a bunch of irrelevant fields.)
            if len(row) != 3 or row[0] not in ('DIVIDEND', 'SPLIT'):
                continue
            action, date, value = row
            if action == 'DIVIDEND':
                index.append(to_datetime(date))
                entries.append({'action': action, 'value': float(value)})
            elif action == 'SPLIT' and ':' in value:
                # Convert the split ratio to a fraction. For example a
                # 4:1 split expressed as a fraction is 1/4 = 0.25.
                denominator, numerator = value.split(':', 1)
                index.append(to_datetime(date))
                entries.append({'action': action,
                                'value': float(numerator) / float(denominator)})
        return DataFrame(entries, index=index)

    raise IOError("after %d tries, Yahoo! did not "
                  "return a 200 for url %r" % (retry_count, url))
def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv. See read_csv for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except Exception:
            # best effort decode -- keep the raw clipboard contents
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        # BUG FIX: use a literal tab, not the two-character raw string
        # r'\t' -- a single-character sep lets the fast c engine handle
        # the common Excel paste instead of falling through to the
        # python-engine branch below.
        sep = '\t'

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex).
    # BUG FIX: guard against sep=None (possible when the caller passes
    # delim_whitespace), which previously crashed on len(None).
    if sep is not None and len(sep) > 1 and kwargs.get('engine') is None:
        kwargs['engine'] = 'python'
    elif sep is not None and len(sep) > 1 and kwargs.get('engine') == 'c':
        warnings.warn('read_clipboard with regex separator does not work'
                      ' properly with c engine')

    # In PY2, the c table reader first encodes text with UTF-8 but Python
    # table reader uses the format of the passed string. For consistency,
    # encode strings for python engine so that output from python and c
    # engines produce consistent results
    if kwargs.get('engine') == 'python' and PY2:
        text = text.encode('utf-8')

    return read_csv(StringIO(text), sep=sep, **kwargs)
def read_clipboard(sep=r'\s+', **kwargs):  # pragma: no cover
    r"""
    Read text from clipboard and pass to read_csv. See read_csv for the
    full argument list

    Parameters
    ----------
    sep : str, default '\s+'.
        A string or regex delimiter. The default of '\s+' denotes
        one or more whitespace characters.

    Returns
    -------
    parsed : DataFrame
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # only utf-8 is valid for passed value because that's what clipboard
    # supports
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise NotImplementedError(
            'reading from clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_get
    from pandas.io.parsers import read_csv
    text = clipboard_get()

    # try to decode (if needed on PY3)
    # Strange. linux py33 doesn't complain, win py33 does
    if PY3:
        try:
            text = compat.bytes_to_str(
                text, encoding=(kwargs.get('encoding') or
                                get_option('display.encoding')))
        except AttributeError:
            pass

    # Excel copies into clipboard with \t separation
    # inspect no more then the 10 first lines, if they
    # all contain an equal number (>0) of tabs, infer
    # that this came from excel and set 'sep' accordingly
    lines = text[:10000].split('\n')[:-1][:10]

    # Need to remove leading white space, since read_csv
    # accepts:
    #    a  b
    # 0  1  2
    # 1  3  4
    counts = {x.lstrip().count('\t') for x in lines}
    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
        sep = '\t'

    # Edge case where sep is specified to be None, return to default
    if sep is None and kwargs.get('delim_whitespace') is None:
        sep = r'\s+'

    # Regex separator currently only works with python engine.
    # Default to python if separator is multi-character (regex).
    # BUG FIX: guard against sep=None (possible when the caller passes
    # delim_whitespace), which previously crashed on len(None).
    if sep is not None and len(sep) > 1 and kwargs.get('engine') is None:
        kwargs['engine'] = 'python'
    elif sep is not None and len(sep) > 1 and kwargs.get('engine') == 'c':
        warnings.warn('read_clipboard with regex separator does not work'
                      ' properly with c engine')

    # In PY2, the c table reader first encodes text with UTF-8 but Python
    # table reader uses the format of the passed string. For consistency,
    # encode strings for python engine so that output from python and c
    # engines produce consistent results
    if kwargs.get('engine') == 'python' and PY2:
        text = text.encode('utf-8')

    return read_csv(StringIO(text), sep=sep, **kwargs)
def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3,
                           pause=0.001):
    """
    Returns DataFrame of historical corporate actions (dividends and stock
    splits) from symbols, over date range, start to end.

    Parameters
    ----------
    sym : string with a single Single stock symbol (ticker).
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, of the pause between retries.
    """
    start, end = _sanitize_dates(start, end)
    # Yahoo! query months are zero-based; '&g=v' requests corporate actions.
    url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol +
           '&a=%s' % (start.month - 1) +
           '&b=%s' % start.day +
           '&c=%s' % start.year +
           '&d=%s' % (end.month - 1) +
           '&e=%s' % end.day +
           '&f=%s' % end.year +
           '&g=v')

    for _attempt in range(retry_count):
        time.sleep(pause)
        try:
            with urlopen(url) as resp:
                payload = resp.read()
        except _network_error_classes:
            continue  # transient network failure -- retry

        index = []
        entries = []
        for row in csv.reader(StringIO(bytes_to_str(payload))):
            # Ignore lines that aren't dividends or splits (Yahoo
            # add a bunch of irrelevant fields.)
            if len(row) != 3 or row[0] not in ('DIVIDEND', 'SPLIT'):
                continue
            action, date, value = row
            if action == 'DIVIDEND':
                index.append(to_datetime(date))
                entries.append({'action': action, 'value': float(value)})
            elif action == 'SPLIT' and ':' in value:
                # Convert the split ratio to a fraction. For example a
                # 4:1 split expressed as a fraction is 1/4 = 0.25.
                denominator, numerator = value.split(':', 1)
                index.append(to_datetime(date))
                entries.append({'action': action,
                                'value': float(numerator) / float(denominator)})
        return DataFrame(entries, index=index)

    raise IOError("after %d tries, Yahoo! did not "
                  "return a 200 for url %r" % (retry_count, url))