def _read(obj):
    """Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(obj):
        with urlopen(obj) as resp:
            return resp.read()
    if hasattr(obj, 'read'):
        return obj.read()
    if isinstance(obj, char_types):
        # A string is either a path to an existing file or the raw
        # text itself; isfile can choke on odd inputs, so swallow
        # those and fall back to treating the string as text.
        try:
            points_at_file = os.path.isfile(obj)
        except (TypeError, ValueError):
            points_at_file = False
        if points_at_file:
            with open(obj, 'rb') as handle:
                return handle.read()
        return obj
    raise TypeError("Cannot read object of type %r" % type(obj).__name__)
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str

    Raises
    ------
    ValueError
        If ``io`` is a url that cannot be opened.
    TypeError
        If ``io`` is not a url, a readable object, a path or a string.
    """
    if _is_url(io):
        try:
            with urlopen(io) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, "read"):
        raw_text = io.read()
    elif isinstance(io, basestring):
        # A string is either a path to an existing file or raw text.
        # Checking isinstance first keeps os.path.isfile from blowing
        # up on arbitrary non-string objects.
        if os.path.isfile(io):
            with open(io) as f:
                raw_text = f.read()
        else:
            raw_text = io
    else:
        # No !r conversion: the literal quotes already wrap the name,
        # so !r would have produced doubled quotes like ''str''.
        raise TypeError("Cannot read object of type "
                        "'{0.__class__.__name__}'".format(io))
    return raw_text
def __init__(self, io, **kwds): import xlrd # throw an ImportError if we need to ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) if ver < (0, 9): # pragma: no cover raise ImportError("pandas requires xlrd >= 0.9.0 for excel " "support, current version " + xlrd.__VERSION__) self.io = io engine = kwds.pop('engine', None) if engine is not None and engine != 'xlrd': raise ValueError("Unknown engine: %s" % engine) if isinstance(io, compat.string_types): if _is_url(io): data = _urlopen(io).read() self.book = xlrd.open_workbook(file_contents=data) else: self.book = xlrd.open_workbook(io) elif engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): # N.B. xlrd.Book has a read attribute too data = io.read() self.book = xlrd.open_workbook(file_contents=data) else: raise ValueError('Must explicitly set engine if not passing in' ' buffer or path for io.')
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str

    Raises
    ------
    TypeError
        If ``io`` is not a url, a readable object, a path or a string.
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    else:
        # os.path.isfile raises TypeError/ValueError for objects it
        # cannot interpret as a path (e.g. ints); treat those as
        # "not a file" so the explicit TypeError below fires instead
        # of an opaque traceback from isfile.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            is_file = False
        if is_file:
            with open(io) as f:
                raw_text = f.read()
        elif isinstance(io, compat.string_types):
            raw_text = io
        else:
            # No !r: the literal quotes already wrap the name; !r
            # produced doubled quotes like ''str''.
            raise TypeError("Cannot read object of type "
                            "'{0.__class__.__name__}'".format(io))
    return raw_text
def _write_cell(self, s, kind='td', indent=0, tags=None):
    """Render a single table cell and emit it via ``self.write``.

    Parameters
    ----------
    s : object
        Cell value; converted to text with ``pprint_thing``.
    kind : str, default 'td'
        HTML tag name ('td' or 'th').
    indent : int, default 0
        Indentation level forwarded to ``self.write``.
    tags : str, optional
        Extra attributes inserted into the opening tag.
    """
    if tags is not None:
        start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
    else:
        start_tag = '<{kind}>'.format(kind=kind)

    if self.escape:
        # escape & first to prevent double escaping of &
        # (the previous map escaped each character to itself, a no-op)
        esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
                           ('>', r'&gt;')])
    else:
        esc = {}
    rs = pprint_thing(s, escape_chars=esc).strip()

    if self.render_links and _is_url(rs):
        # The href must use the unescaped text, otherwise entities end
        # up inside the link target.
        rs_unescaped = pprint_thing(s, escape_chars={}).strip()
        start_tag += '<a href="{url}" target="_blank">'.format(
            url=rs_unescaped)
        end_a = '</a>'
    else:
        end_a = ''

    self.write(u'{start}{rs}{end_a}</{kind}>'.format(
        start=start_tag, rs=rs, end_a=end_a, kind=kind), indent)
def _build_doc(self):
    """Parse ``self.io`` with lxml and return the document root.

    Raises
    ------
    XMLSyntaxError
        If parsing succeeded but the result has no ``text_content``.
    UnicodeDecodeError, IOError
        Re-raised when ``self.io`` is a URL and parsing failed; for
        non-URL input these trigger a fallback to ``fromstring``.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # recover=True lets lxml tolerate malformed markup instead of raising.
    parser = HTMLParser(recover=True, encoding=self.encoding)
    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # parse() returns an ElementTree; unwrap to the root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        # Only reached when the try-block succeeded; the fromstring
        # fallback above is not re-validated by this check.
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
def _write_cell(self, s, kind='td', indent=0, tags=None, i=None, j=None):
    """Emit one HTML table cell, optionally flanked by an inline miniplot.

    Parameters
    ----------
    s : object
        Cell content; written as-is (see NOTE on escaping below).
    kind : str, default 'td'
        HTML tag name.
    indent : int, default 0
        Indentation level passed to ``self.write``.
    tags : str, optional
        Extra attributes for the opening tag of the value cell.
    i, j : int, optional
        Row/column position of the cell in ``self.tr_frame``; both must
        be given (along with ``self.plot_type``) for a miniplot cell to
        be rendered.
    """
    if self.plot_type is not None and i is not None and j is not None:
        # Derive a color key from the cell's row/column codes; with a
        # MultiIndex each level contributes one code.
        if self.is_multi_c:
            column_code = [c[j] for c in self.tr_frame.columns.codes]
        else:
            column_code = [j]
        if self.is_multi_r:
            row_code = [c[i] for c in self.tr_frame.index.codes]
        else:
            row_code = [i]
        column_code = np.asarray(column_code)
        row_code = np.asarray(row_code)
        # color_indices picks two entries from the stacked codes; the +1
        # shift keeps 0 from being used as a colormap input.
        color = np.hstack((column_code, row_code))[self.color_indices]
        color += np.ones((2, ), dtype=np.uint8)
        color = self.colormap(color[0], color[1])
        miniplot_str = self.miniplot(self.tr_frame.iloc[i, j],
                                     self.rheight, self.pwidth, color,
                                     self.plot_type)
    if tags is not None:
        start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
    else:
        start_tag = '<{kind}>'.format(kind=kind)
    # The miniplot cell always gets only the padding style, regardless
    # of the caller-supplied tags (deliberately cleared here).
    tags = ""
    start_miniplot_tag = '<{kind} {tags}>'.format(
        kind=kind, tags=tags + ' style="padding-right: 1em;"')
    if self.escape:
        # escape & first to prevent double escaping of &
        # NOTE(review): these escape targets look garbled — each char
        # maps to itself; presumably '&amp;'/'&lt;'/'&gt;' were intended.
        esc = OrderedDict([('&', r'&'), ('<', r'<'), ('>', r'>')])
    else:
        esc = {}
    # NOTE(review): ``esc`` is built but never applied — ``s`` is written
    # unescaped below. Confirm whether dropping pprint_thing escaping
    # here was intentional.
    rs = s
    if self.render_links and _is_url(rs):
        rs_unescaped = pprint_thing(s, escape_chars={}).strip()
        rs = '<a href="{url}" target="_blank">{rs}</a>'.format(
            url=rs_unescaped, rs=rs)
    # ppos controls whether the miniplot cell precedes ("left") or
    # follows ("right") the value cell.
    if self.plot_type is not None and i is not None and j is not None and self.ppos == "left":
        self.write(
            '{start}{rs}</{kind}>'.format(start=start_miniplot_tag,
                                          rs=miniplot_str,
                                          kind=kind), indent)
    self.write(
        '{start}{rs}</{kind}>'.format(start=start_tag, rs=rs,
                                      kind=kind), indent)
    if self.plot_type is not None and i is not None and j is not None and self.ppos == "right":
        self.write(
            '{start}{rs}</{kind}>'.format(start=start_miniplot_tag,
                                          rs=miniplot_str,
                                          kind=kind), indent)
def _build_doc(self):
    """Parse ``self.io`` with lxml and return the document root.

    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # recover=False: malformed markup raises instead of being repaired.
    parser = HTMLParser(recover=False, encoding=self.encoding)
    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # parse() returns an ElementTree; unwrap to the root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # the input IS a url: decide whether lxml simply cannot
            # handle its scheme, or something else (e.g. the network)
            # failed
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
def __init__(self, filepath_or_buffer):
    """Reader using xlrd engine.

    Parameters
    ----------
    filepath_or_buffer : string, path object or Workbook
        Object to be parsed.

    Raises
    ------
    ImportError
        If xlrd is missing or older than 1.0.0.
    ValueError
        If the input is neither a workbook, a readable buffer nor a path.
    """
    err_msg = "Install xlrd >= 1.0.0 for Excel support"
    try:
        import xlrd
    except ImportError:
        raise ImportError(err_msg)
    else:
        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(err_msg + ". Current version " +
                              xlrd.__VERSION__)
    from pandas.io.excel._base import ExcelFile
    # If filepath_or_buffer is a url, want to keep the data as bytes so
    # can't pass to get_filepath_or_buffer()
    # NOTE(review): this variant calls ``urlopen`` while a sibling reader
    # uses ``_urlopen`` — confirm which helper this module provides.
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = urlopen(filepath_or_buffer)
    elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)
    if isinstance(filepath_or_buffer, xlrd.Book):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        if hasattr(filepath_or_buffer, 'seek'):
            try:
                # GH 19779: rewind so the whole stream is read
                filepath_or_buffer.seek(0)
            except UnsupportedOperation:
                # HTTPResponse does not support seek()
                # GH 20434
                pass
        data = filepath_or_buffer.read()
        self.book = xlrd.open_workbook(file_contents=data)
    elif isinstance(filepath_or_buffer, compat.string_types):
        self.book = xlrd.open_workbook(filepath_or_buffer)
    else:
        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')
def __init__(self, filepath_or_buffer):
    """Reader using xlrd engine.

    Parameters
    ----------
    filepath_or_buffer : string, path object or Workbook
        Object to be parsed.
    """
    err_msg = "Install xlrd >= 1.0.0 for Excel support"
    try:
        import xlrd
    except ImportError:
        raise ImportError(err_msg)
    if xlrd.__VERSION__ < LooseVersion("1.0.0"):
        raise ImportError(err_msg + ". Current version " +
                          xlrd.__VERSION__)

    from pandas.io.excel._base import ExcelFile

    # URLs are fetched as raw bytes, so they bypass
    # get_filepath_or_buffer().
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = _urlopen(filepath_or_buffer)
    elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
            filepath_or_buffer)

    if isinstance(filepath_or_buffer, xlrd.Book):
        self.book = filepath_or_buffer
        return
    if hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        if hasattr(filepath_or_buffer, 'seek'):
            try:
                # GH 19779: rewind so the whole stream is consumed
                filepath_or_buffer.seek(0)
            except UnsupportedOperation:
                # HTTPResponse does not support seek() -- GH 20434
                pass
        contents = filepath_or_buffer.read()
        self.book = xlrd.open_workbook(file_contents=contents)
        return
    if isinstance(filepath_or_buffer, compat.string_types):
        self.book = xlrd.open_workbook(filepath_or_buffer)
        return
    raise ValueError('Must explicitly set engine if not passing in'
                     ' buffer or path for io.')
def _read(obj):
    """Read text from a url, a file-like object, a path, or a string.

    Returns the fetched/read content, or the string itself when it is
    not a path to an existing file.
    """
    if _is_url(obj):
        with urlopen(obj) as remote:
            content = remote.read()
        return content
    if hasattr(obj, 'read'):
        return obj.read()
    if isinstance(obj, char_types):
        # isfile may reject odd string-likes; treat failures as
        # "not a file" and hand the text back unchanged.
        try:
            names_a_file = os.path.isfile(obj)
        except (TypeError, ValueError):
            names_a_file = False
        if names_a_file:
            with open(obj, 'rb') as fh:
                return fh.read()
        return obj
    raise TypeError("Cannot read object of type %r" % type(obj).__name__)
def __init__(self, filepath_or_buffer):
    """Load *filepath_or_buffer* into ``self.book``.

    Parameters
    ----------
    filepath_or_buffer : str, file-like, ExcelFile or workbook
        URL, local path, readable buffer, or an already-parsed workbook
        of ``self._workbook_class``.

    Raises
    ------
    ValueError
        If the input is neither a workbook, a readable buffer nor a path.
    """
    from io import UnsupportedOperation

    # If filepath_or_buffer is a url, load the data into a BytesIO
    if _is_url(filepath_or_buffer):
        filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
    elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
        filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

    if isinstance(filepath_or_buffer, self._workbook_class):
        self.book = filepath_or_buffer
    elif hasattr(filepath_or_buffer, "read"):
        # N.B. xlrd.Book has a read attribute too
        try:
            # Rewind so the whole stream is handed to the engine.
            filepath_or_buffer.seek(0)
        except (AttributeError, UnsupportedOperation):
            # Non-seekable streams (e.g. HTTPResponse, GH 20434) are
            # read from their current position instead of crashing.
            pass
        self.book = self.load_workbook(filepath_or_buffer)
    elif isinstance(filepath_or_buffer, str):
        self.book = self.load_workbook(filepath_or_buffer)
    else:
        raise ValueError(
            "Must explicitly set engine if not passing in buffer or path for io."
        )
def __init__(self, filepath_or_buffer):
    """Resolve *filepath_or_buffer* and keep the opened workbook on
    ``self.book``.
    """
    src = filepath_or_buffer
    if _is_url(src):
        # Download up front; BytesIO gives us a seekable in-memory buffer.
        src = BytesIO(urlopen(src).read())
    elif not isinstance(src, (ExcelFile, self._workbook_class)):
        src, _, _, _ = get_filepath_or_buffer(src)

    if isinstance(src, self._workbook_class):
        self.book = src
        return
    if hasattr(src, "read"):
        # N.B. xlrd.Book has a read attribute too
        src.seek(0)
        self.book = self.load_workbook(src)
        return
    if isinstance(src, str):
        self.book = self.load_workbook(src)
        return
    raise ValueError('Must explicitly set engine if not passing in'
                     ' buffer or path for io.')
def filepath_to_buffer(filepath, encoding=None, compression=None,
                       timeout=None, start_byte=0):
    """Resolve *filepath* into a readable buffer.

    Parameters
    ----------
    filepath : str or buffer-like
        HTTP(S) URL, s3 url, buffer url, local path, or an already-open
        buffer (returned unchanged).
    encoding : str, optional
        Passed through unchanged (the s3 branch may replace it).
    compression : str, optional
        May be overridden to 'gzip' based on HTTP response headers.
    timeout : float, optional
        Timeout for HTTP requests.
    start_byte : int, default 0
        Offset to start reading from (HTTP Range header / file seek).

    Returns
    -------
    tuple
        ``(buffer, encoding, compression, size)``.

    Raises
    ------
    ValueError
        If a local path does not exist.
    """
    if not is_str(filepath):
        # Already buffer-like; assumes the object exposes .size()
        # -- TODO confirm all non-str callers pass such objects.
        return filepath, encoding, compression, filepath.size()
    if _is_url(filepath):
        headers = None
        if start_byte:
            headers = {"Range": "bytes={}-".format(start_byte)}
        req = requests.get(filepath, stream=True, headers=headers,
                           timeout=timeout)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            compression = 'gzip'
        # Content-Length may be absent; default 0 then int() keeps the
        # return type consistent.
        size = req.headers.get('Content-Length', 0)
        return req.raw, encoding, compression, int(size)
    if _is_s3_url(filepath):
        # NOTE(review): this import looks unused by the call below;
        # kept in case it is needed for its side effects.
        from pandas.io import s3
        reader, encoding, compression = s3_get_filepath_or_buffer(
            filepath, encoding=encoding, compression=compression)
        return reader, encoding, compression, reader.size
    if _is_buffer_url(filepath):
        buffer = _url_to_buffer(filepath)
        return buffer, encoding, compression, buffer.size()
    filepath = os.path.expanduser(filepath)
    if not os.path.exists(filepath):
        raise ValueError("wrong filepath: {}".format(filepath))
    size = os.stat(filepath).st_size
    stream = io.FileIO(filepath)
    if start_byte:
        stream.seek(start_byte)
    return stream, encoding, compression, size
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like

    Returns
    -------
    raw_text : str
    """
    if _is_url(io):
        with urlopen(io) as response:
            return response.read()
    if hasattr(io, 'read'):
        return io.read()
    if os.path.isfile(io):
        with open(io) as fh:
            return fh.read()
    if isinstance(io, string_types):
        return io
    raise TypeError("Cannot read object of type %r" % type(io).__name__)