def maybe_read_encoded_stream(reader, encoding=None, compression=None):
    """Read an encoded stream from *reader* and transform the bytes to
    unicode if required based on the encoding.

    Parameters
    ----------
    reader : a streamable file-like object
    encoding : optional, the encoding to attempt to read
    compression : optional; 'gzip' keeps the payload as raw bytes

    Returns
    -------
    a tuple of (a stream of decoded bytes, the encoding which was used)
    """
    raw = reader.read()
    if compat.PY3 or encoding is not None:  # pragma: no cover
        errors = 'strict' if encoding else 'replace'
        if not encoding:
            encoding = 'utf-8'
        if compression == 'gzip':
            # gzip payloads stay binary; decompression happens downstream
            reader = BytesIO(raw)
        else:
            reader = StringIO(raw.decode(encoding, errors))
    else:
        if compression == 'gzip':
            reader = BytesIO(raw)
        encoding = None
    return reader, encoding
def test_variable_width_unicode():
    # read_fwf must infer the same columns as explicit colspecs on
    # unicode (Hebrew) fixed-width input.
    raw = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    enc = "utf8"
    shared = dict(header=None, encoding=enc)

    explicit = read_fwf(BytesIO(raw.encode(enc)),
                        colspecs=[(0, 4), (5, 9)], **shared)
    inferred = read_fwf(BytesIO(raw.encode(enc)), **shared)
    tm.assert_frame_equal(inferred, explicit)
def test_variable_width_unicode(self):
    # Bytes-oriented round trip; meaningful only under Python 3.
    if not compat.PY3:
        raise nose.SkipTest(
            'Bytes-related test - only needs to work on Python 3')
    raw = """
שלום שלום
ום   שלל
של   ום
""".strip('\r\n')

    expected = read_fwf(BytesIO(raw.encode('utf8')),
                        colspecs=[(0, 4), (5, 9)],
                        header=None, encoding='utf8')
    result = read_fwf(BytesIO(raw.encode('utf8')),
                      header=None, encoding='utf8')
    tm.assert_frame_equal(expected, result)
def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None):
    # Credentials come from the AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
    # and AWS_S3_HOST environment variables.
    parsed_url = parse_url(filepath_or_buffer)
    s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')

    try:
        conn = boto.connect_s3(host=s3_host)
    except boto.exception.NoAuthHandlerFound:
        # Fall back to anonymous access for public buckets.
        conn = boto.connect_s3(host=s3_host, anon=True)

    b = conn.get_bucket(parsed_url.netloc, validate=False)
    wants_gzip = (compression == 'gzip' or
                  (compression == 'infer' and
                   filepath_or_buffer.endswith(".gz")))
    if compat.PY2 and wants_gzip:
        # py2 path: materialize the object so gzip decoding works later
        k = boto.s3.key.Key(b, parsed_url.path)
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
    else:
        k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
        k.open('r')  # Expose read errors immediately
        filepath_or_buffer = k
    return filepath_or_buffer, None, compression
def test_encode(self, html_encoding_file):
    # The expected encoding is embedded in the filename: <name>_<enc>.<ext>
    stem = os.path.splitext(os.path.basename(html_encoding_file))[0]
    _, encoding = stem.split('_')

    try:
        with open(html_encoding_file, 'rb') as fobj:
            from_string = self.read_html(fobj.read(), encoding=encoding,
                                         index_col=0).pop()

        with open(html_encoding_file, 'rb') as fobj:
            from_file_like = self.read_html(BytesIO(fobj.read()),
                                            encoding=encoding,
                                            index_col=0).pop()

        from_filename = self.read_html(html_encoding_file,
                                       encoding=encoding,
                                       index_col=0).pop()
        tm.assert_frame_equal(from_string, from_file_like)
        tm.assert_frame_equal(from_string, from_filename)
    except Exception:
        # seems utf-16/32 fail on windows; everything else re-raises
        if is_platform_windows() and ('16' in encoding or '32' in encoding):
            pytest.skip()
        raise
def setup(self, engine):
    # Benchmark fixture: N hourly rows x C float columns plus one
    # object column, written once so the read benchmark has input.
    N, C = 2000, 5
    float_cols = ['float{}'.format(i) for i in range(C)]
    self.df = DataFrame(np.random.randn(N, C),
                        columns=float_cols,
                        index=date_range('20000101', periods=N, freq='H'))
    self.df['object'] = tm.makeStringIndex(N)

    # Pre-written workbook consumed by the read benchmark.
    self.bio_read = BytesIO()
    self.writer_read = ExcelWriter(self.bio_read, engine=engine)
    self.df.to_excel(self.writer_read, sheet_name='Sheet1')
    self.writer_read.save()
    self.bio_read.seek(0)

    # Fresh buffer/writer for the write benchmark.
    self.bio_write = BytesIO()
    self.bio_write.seek(0)
    self.writer_write = ExcelWriter(self.bio_write, engine=engine)
def setup(self):
    # Benchmark fixture: two 100k x 5 float frames (df2 gains an extra
    # object column) and a pre-written in-memory xlsxwriter workbook.
    self.f = '__test__.msg'

    def remove(f):
        """Best-effort removal of a stale fixture file."""
        try:
            # was os.remove(self.f): the argument was silently ignored
            os.remove(f)
        except OSError:
            # narrow from a bare `except:` — only missing-file errors
            # are expected here
            pass

    # these were assigned twice in the original; once is enough
    self.N = 100000
    self.C = 5
    self.index = date_range('20000101', periods=self.N, freq='H')
    self.df = DataFrame(
        {'float{0}'.format(i): randn(self.N) for i in range(self.C)},
        index=self.index)
    self.df2 = DataFrame(
        {'float{0}'.format(i): randn(self.N) for i in range(self.C)},
        index=self.index)
    self.df2['object'] = [('%08x' % randrange(16 ** 8))
                          for _ in range(self.N)]
    remove(self.f)

    self.bio = BytesIO()
    self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
    self.df[:2000].to_excel(self.writer)
    self.writer.save()
def test_buffer_rd_bytes_bad_unicode(self):
    # see gh-22748
    stream = BytesIO(b"\xB0")
    if PY3:
        stream = TextIOWrapper(stream, encoding='ascii',
                               errors='surrogateescape')
    with pytest.raises(UnicodeError):
        self.read_csv(stream, encoding='UTF-8')
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        # Exercise the encoded-bytes path.
        data = BytesIO(u(data).encode(encoding))
        if compat.PY3:
            from io import TextIOWrapper
            data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(data, index_col=0, sep=None,
                             skiprows=2, encoding=encoding)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))
    tm.assert_frame_equal(result, expected)
def setup(self): self.f = '__test__.msg' def remove(f): try: os.remove(self.f) except: pass self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) self.N = 100000 self.C = 5 self.index = date_range('20000101', periods=self.N, freq='H') self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) self.df2['object'] = [('%08x' % randrange((16**8))) for _ in range(self.N)] remove(self.f) self.bio = BytesIO()
def _pickle_array(arr):
    """Serialize *arr* (viewed as a plain ndarray) and return the bytes."""
    buf = BytesIO()
    write_array(buf, arr.view(np.ndarray))
    return buf.getvalue()
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    bad_stream = TextIOWrapper(BytesIO(b"\xB0"), encoding="ascii",
                               errors="surrogateescape")
    msg = "'utf-8' codec can't encode character"
    with pytest.raises(UnicodeError, match=msg):
        c_parser_only.read_csv(bad_stream, encoding="UTF-8")
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or
                         pathlib.Path), or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    compression : optional; 'infer' resolves gzip from response headers
                  or a ``.gz`` suffix

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # cat on the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            # was a bare `except:`, which also swallowed SystemExit,
            # KeyboardInterrupt, etc.; only a missing module applies here
            raise ImportError("boto is required to handle s3 files")

        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            # anonymous access for public buckets
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
def test_variable_width_unicode():
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")
    # Inferred column specs must match explicit ones on unicode input.
    raw = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    enc = "utf8"
    common = dict(header=None, encoding=enc)

    expected = read_fwf(BytesIO(raw.encode(enc)),
                        colspecs=[(0, 4), (5, 9)], **common)
    result = read_fwf(BytesIO(raw.encode(enc)), **common)
    tm.assert_frame_equal(result, expected)
def _read_zipfile(self, url):
    """Download *url* and return the decoded contents of the first
    member of the zip archive in the response body."""
    zipf = BytesIO(self._get_response(url).content)
    with ZipFile(zipf, 'r') as zf:
        # the original leaked the member handle; close it deterministically
        with zf.open(zf.namelist()[0]) as member:
            data = member.read().decode()
    return data
def test_BytesIO_input(self):
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")
    payload = BytesIO("שלום::1234\n562::123".encode('cp1255'))
    result = self.read_table(payload, sep="::", encoding='cp1255')
    expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
    tm.assert_frame_equal(result, expected)
def test_read_csv_chunked_download(self, s3_resource, caplog):
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    str_buf = StringIO()
    df.to_csv(str_buf)
    # the original assigned `buf = BytesIO()` first and immediately
    # overwrote it; the dead assignment is removed
    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv",
                                                 Body=buf)
    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in
                set(x.args[-2:] for x in caplog.records))
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or
                         pathlib.Path), or buffer
    compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath_ or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = urlopen(filepath_or_buffer)
        if req.headers.get('Content-Encoding', None) == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        raise ValueError("Invalid file path or buffer object type: "
                         "{_type}".format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
def write_graph(self, graph_object, graph_name='Graph', image_width=5.25):
    """Render *graph_object*'s figure into the document under a bulleted
    caption, then persist the document to ``self.docname``.

    Parameters
    ----------
    graph_object : an object exposing ``get_figure()`` (e.g. a matplotlib
        Axes) — presumably; TODO confirm against callers
    graph_name : str, caption text added as a 'List Bullet' paragraph
    image_width : float, picture width in inches
    """
    # `with` guarantees the buffer is closed even if savefig or any
    # docx call raises (the original leaked it on the error path).
    with BytesIO() as memfile:
        graph_object.get_figure().savefig(memfile)
        self.document.add_paragraph(graph_name, style='List Bullet')
        self.document.add_picture(memfile, width=Inches(image_width))
        self.document.save(self.docname)
def test_BytesIO_input(self):
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")
    payload = BytesIO("שלום\nשלום".encode('utf8'))
    result = read_fwf(payload, widths=[2, 2], encoding='utf8')
    expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(result, expected)
def _unpickle_array(bytes):
    """Inverse of _pickle_array: rebuild an ndarray from raw bytes."""
    arr = read_array(BytesIO(bytes))

    # All datetimes should be stored as M8[ns]. When unpickling with
    # numpy1.6, it will read these as M8[us]. So this ensures all
    # datetime64 types are read as M8[ns]
    return arr.view(_NS_DTYPE) if is_datetime64_dtype(arr) else arr
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    parser = python_parser_only
    payload = ("1" + sep + "2").encode(encoding)

    result = parser.read_csv(BytesIO(payload), sep=sep,
                             names=["a", "b"], encoding=encoding)
    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)
def fastmsgpack_loads(data):
    """Unpack msgpack *data*; a singleton result is unwrapped."""
    raw = list(msgpack_unpack(
        BytesIO(_l1(data)),
        object_hook=object_hook,
    ))
    # raw will always be a list, which is most likely a list containing
    # a single dataframe or series; unwrap that common case
    return raw[0] if len(raw) == 1 else raw
def test_utf16_example(self):
    path = tm.get_data_path('utf16_ex.txt')

    # it works! and is the right length
    result = self.read_table(path, encoding='utf-16')
    self.assertEqual(len(result), 50)

    if not compat.PY3:
        # the original `open(path, 'rb').read()` leaked the file handle;
        # close it deterministically
        with open(path, 'rb') as f:
            buf = BytesIO(f.read())
        result = self.read_table(buf, encoding='utf-16')
        self.assertEqual(len(result), 50)
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
    # see gh-16135
    s3_object = s3_resource.meta.client.get_object(Bucket='pandas-test',
                                                   Key='tips.csv')
    body = BytesIO(s3_object["Body"].read())
    result = read_csv(body, encoding='utf8')

    assert isinstance(result, DataFrame)
    assert not result.empty
    tm.assert_frame_equal(result, read_csv(tips_file))
def test_streaming_s3_objects():
    # GH17135
    # botocore gained iteration support in 1.10.47, can now be used in read_*
    pytest.importorskip('botocore', minversion='1.10.47')
    from botocore.response import StreamingBody

    payloads = (
        b'foo,bar,baz\n1,2,3\n4,5,6\n',
        b'just,the,header\n',
    )
    for payload in payloads:
        body = StreamingBody(BytesIO(payload),
                             content_length=len(payload))
        read_csv(body)
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer
    passthru otherwise.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    compression : optional; 'infer' resolves gzip from response headers

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """
    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
        # cat on the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            # was a bare `except:`, which also swallowed SystemExit,
            # KeyboardInterrupt, etc.; only a missing module applies here
            raise ImportError("boto is required to handle s3 files")

        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            # anonymous access for public buckets
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
def test_encoding_non_utf8_multichar_sep(self):
    # see gh-3404
    expected = DataFrame({'a': [1], 'b': [2]})
    seps = ['::', '#####', '!!!', '123', '#1!c5',
            '%!c!d', '@@#4:2', '_!pd#_']
    encodings = ['utf-16', 'utf-16-be', 'utf-16-le', 'utf-32', 'cp037']

    for sep in seps:
        data = '1' + sep + '2'
        for encoding in encodings:
            result = self.read_csv(BytesIO(data.encode(encoding)),
                                   sep=sep, names=['a', 'b'],
                                   encoding=encoding)
            tm.assert_frame_equal(result, expected)
def main():
    """Download one month of TrueFX tick data (AUD/USD, Jan 2014) through
    a requests-cache session, parse it and print the resulting frame."""
    expire_after = timedelta(days=1)
    # `filename` was reused later for the csv member name in the original;
    # keep distinct names for the cache and the archive member
    cache_name = 'cache_py2' if PY2 else 'cache'
    session = requests_cache.CachedSession(cache_name=cache_name,
                                           expire_after=expire_after)

    dt = pd.to_datetime("2014-01-01")
    symbol = "AUD/USD"
    symbol = symbol.replace("/", "").upper()
    year = dt.year
    month = dt.month
    # English month name in upper case, e.g. 'JANUARY'
    month_name = datetime.datetime(year=1970, month=month,
                                   day=1).strftime('%B').upper()

    # e.g. http://www.truefx.com/dev/data/2014/JANUARY-2014/AUDUSD-2014-01.zip
    url = ("http://www.truefx.com/dev/data/{year:04d}/"
           "{month_name}-{year:04d}/"
           "{symbol}-{year:04d}-{month:02d}.zip").format(
        year=year, month=month, symbol=symbol, month_name=month_name)
    response = session.get(url)
    zip_data = BytesIO(response.content)

    csv_name = "{symbol}-{year:04d}-{month:02d}.csv".format(
        year=year, month=month, symbol=symbol)
    with ZipFile(zip_data, 'r') as zf:
        # close the member handle too (the original leaked it)
        with zf.open(csv_name) as zfile:
            data = zfile.readlines()

    # one raw "SYMBOL,timestamp,bid,ask" line per row; split into columns
    df = pd.DataFrame(data)
    df[0] = df[0].str.decode('utf8')
    df[0] = df[0].str.replace('\n', '')
    df[0] = df[0].map(lambda s: s.split(','))
    df['Symbol'] = df[0].map(lambda t: t[0])
    df['Date'] = df[0].map(lambda t: pd.to_datetime(t[1]))
    df['Bid'] = df[0].map(lambda t: t[2]).astype(float)
    df['Ask'] = df[0].map(lambda t: t[3]).astype(float)
    del df[0]
    df = df.set_index('Date')
    print(df)
def test_sniff_delimiter(self):
    # Baseline: sniffed separator on a clean pipe-delimited payload.
    text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    baseline = self.read_csv(StringIO(text), index_col=0, sep=None)
    self.assert_index_equal(baseline.index,
                            Index(['foo', 'bar', 'baz'], name='index'))

    # Explicit delimiter must agree with the sniffed one.
    explicit = self.read_csv(StringIO(text), index_col=0, delimiter='|')
    tm.assert_frame_equal(baseline, explicit)

    # Sniffing must also work when leading junk rows are skipped.
    text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    skipped = self.read_csv(StringIO(text), index_col=0,
                            sep=None, skiprows=2)
    tm.assert_frame_equal(baseline, skipped)

    # And on an encoded-bytes stream.
    text = u("""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""").encode('utf-8')

    stream = BytesIO(text)
    if compat.PY3:
        # somewhat False since the code never sees bytes
        from io import TextIOWrapper
        stream = TextIOWrapper(stream, encoding='utf-8')

    decoded = self.read_csv(stream, index_col=0, sep=None,
                            skiprows=2, encoding='utf-8')
    tm.assert_frame_equal(baseline, decoded)