Example #1
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
    """read an encoded stream from the reader and transform the bytes to
    unicode if required based on the encoding

        Parameters
        ----------
        reader : a streamable file-like object
        encoding : optional, the encoding to attempt to read

        Returns
        -------
        a tuple of (a stream of decoded bytes, the encoding which was used)

    """

    if compat.PY3 or encoding is not None:  # pragma: no cover
        if encoding:
            errors = 'strict'
        else:
            errors = 'replace'
            encoding = 'utf-8'

        if compression == 'gzip':
            reader = BytesIO(reader.read())
        else:
            reader = StringIO(reader.read().decode(encoding, errors))
    else:
        if compression == 'gzip':
            reader = BytesIO(reader.read())
        encoding = None
    return reader, encoding
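The branching above reduces to one core pattern: compressed payloads stay as raw bytes in a BytesIO, while text is decoded into a StringIO. A minimal standard-library sketch of the decode step (decode_stream is an illustrative name, not pandas API):

from io import StringIO

def decode_stream(raw_bytes, encoding=None):
    # Mirror the fallback above: an explicit encoding decodes strictly;
    # otherwise use utf-8 and replace undecodable bytes.
    errors = 'strict' if encoding else 'replace'
    return StringIO(raw_bytes.decode(encoding or 'utf-8', errors))

stream = decode_stream(b'caf\xc3\xa9')
assert stream.read() == 'café'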
Example #2
def test_variable_width_unicode():
    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)],
                        **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)
Example #3
    def test_variable_width_unicode(self):
        if not compat.PY3:
            raise nose.SkipTest(
                'Bytes-related test - only needs to work on Python 3')
        test = """
שלום שלום
ום   שלל
של   ום
""".strip('\r\n')
        expected = read_fwf(BytesIO(test.encode('utf8')),
                            colspecs=[(0, 4), (5, 9)],
                            header=None, encoding='utf8')
        tm.assert_frame_equal(expected, read_fwf(
            BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
Example #4
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):

    # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
    # are environment variables
    parsed_url = parse_url(filepath_or_buffer)
    s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')

    try:
        conn = boto.connect_s3(host=s3_host)
    except boto.exception.NoAuthHandlerFound:
        conn = boto.connect_s3(host=s3_host, anon=True)

    b = conn.get_bucket(parsed_url.netloc, validate=False)
    if compat.PY2 and (compression == 'gzip' or
                       (compression == 'infer'
                        and filepath_or_buffer.endswith(".gz"))):
        k = boto.s3.key.Key(b, parsed_url.path)
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
    else:
        k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
        k.open('r')  # Expose read errors immediately
        filepath_or_buffer = k
    return filepath_or_buffer, None, compression
Example #5
    def test_encode(self, html_encoding_file):
        _, encoding = os.path.splitext(
            os.path.basename(html_encoding_file)
        )[0].split('_')

        try:
            with open(html_encoding_file, 'rb') as fobj:
                from_string = self.read_html(fobj.read(), encoding=encoding,
                                             index_col=0).pop()

            with open(html_encoding_file, 'rb') as fobj:
                from_file_like = self.read_html(BytesIO(fobj.read()),
                                                encoding=encoding,
                                                index_col=0).pop()

            from_filename = self.read_html(html_encoding_file,
                                           encoding=encoding,
                                           index_col=0).pop()
            tm.assert_frame_equal(from_string, from_file_like)
            tm.assert_frame_equal(from_string, from_filename)
        except Exception:
            # utf-16/32 seem to fail on Windows
            if is_platform_windows():
                if '16' in encoding or '32' in encoding:
                    pytest.skip()
            raise
Example #6
    def setup(self, engine):
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)
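The write-then-rewind pattern in this setup is the generic way to round-trip an Excel file through memory. A minimal sketch, assuming a modern pandas with the openpyxl engine installed (newer pandas replaces writer.save() with a context manager):

from io import BytesIO

import pandas as pd

bio = BytesIO()
with pd.ExcelWriter(bio, engine='openpyxl') as writer:
    pd.DataFrame({'a': [1, 2]}).to_excel(writer, sheet_name='Sheet1')

bio.seek(0)  # rewind so the buffer can be read back
df = pd.read_excel(bio)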
Example #7
    def setup(self):
        self.f = '__test__.msg'

        def remove(f):
            try:
                os.remove(f)
            except OSError:
                pass

        self.N = 100000
        self.C = 5
        self.index = date_range('20000101', periods=self.N, freq='H')
        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                  for i in range(self.C)]),
                            index=self.index)
        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                   for i in range(self.C)]),
                             index=self.index)
        self.df2['object'] = [('%08x' % randrange((16**8)))
                              for _ in range(self.N)]
        remove(self.f)
        self.bio = BytesIO()
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()
Example #8
    def test_buffer_rd_bytes_bad_unicode(self):
        # see gh-22748
        t = BytesIO(b"\xB0")
        if PY3:
            t = TextIOWrapper(t, encoding='ascii', errors='surrogateescape')
        with pytest.raises(UnicodeError):
            self.read_csv(t, encoding='UTF-8')
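The failure this test pins down is easy to reproduce on its own: surrogateescape smuggles the undecodable byte into a lone surrogate, which the UTF-8 codec then refuses to encode. A standard-library sketch:

import io

t = io.TextIOWrapper(io.BytesIO(b"\xB0"),
                     encoding="ascii", errors="surrogateescape")
s = t.read()  # '\udcb0', a lone surrogate
try:
    s.encode("utf-8")
except UnicodeEncodeError as exc:
    print(exc)  # "'utf-8' codec can't encode character '\udcb0' ..."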
Example #9
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        data = u(data).encode(encoding)
        data = BytesIO(data)

        if compat.PY3:
            from io import TextIOWrapper
            data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(data, index_col=0, sep=None,
                             skiprows=2, encoding=encoding)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))
    tm.assert_frame_equal(result, expected)
Example #10
    def setup(self):
        self.f = '__test__.msg'

        def remove(f):
            try:
                os.remove(f)
            except OSError:
                pass

        self.N = 100000
        self.C = 5
        self.index = date_range('20000101', periods=self.N, freq='H')
        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                  for i in range(self.C)]),
                            index=self.index)
        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                   for i in range(self.C)]),
                             index=self.index)
        self.df2['object'] = [('%08x' % randrange((16**8)))
                              for _ in range(self.N)]
        remove(self.f)
        self.bio = BytesIO()
Example #11
def _pickle_array(arr):
    arr = arr.view(np.ndarray)

    buf = BytesIO()
    write_array(buf, arr)

    return buf.getvalue()
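getvalue() is what turns the in-memory writer into plain bytes; the matching reader is _unpickle_array in Example #21. A round-trip sketch using NumPy's public save/load instead of the internal write_array/read_array pair:

from io import BytesIO

import numpy as np

buf = BytesIO()
np.save(buf, np.arange(5))   # any writer that takes a file object works
raw = buf.getvalue()         # the serialized bytes

arr = np.load(BytesIO(raw))  # rebuild the array from the bytes
assert (arr == np.arange(5)).all()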
Example #12
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    t = BytesIO(b"\xB0")
    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
    msg = "'utf-8' codec can't encode character"
    with pytest.raises(UnicodeError, match=msg):
        c_parser_only.read_csv(t, encoding="UTF-8")
Example #13
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a URL, translate it and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # append the compression to the tuple returned by the helper
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
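The URL branch reduces to: open the response, trust a gzip Content-Encoding header when compression is 'infer', and hand the payload back as a buffer. A stripped-down standard-library sketch (fetch_buffer is an illustrative name, not pandas API):

from io import BytesIO
from urllib.request import urlopen

def fetch_buffer(url, compression='infer'):
    resp = urlopen(url)
    if compression == 'infer':
        # Only the response header decides; anything else means uncompressed.
        is_gzip = resp.headers.get('Content-Encoding') == 'gzip'
        compression = 'gzip' if is_gzip else None
    return BytesIO(resp.read()), compression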
Example #14
def test_variable_width_unicode():
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)],
                        **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)
Example #15
    def _read_zipfile(self, url):

        zipf = BytesIO(self._get_response(url).content)

        with ZipFile(zipf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

        return data
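Since ZipFile accepts any seekable file object, the archive never touches disk. A self-contained sketch that also builds the zip in memory, so it runs without a network:

from io import BytesIO
from zipfile import ZipFile

payload = BytesIO()
with ZipFile(payload, 'w') as zf:
    zf.writestr('data.csv', 'a,b\n1,2\n')

payload.seek(0)  # rewind before reading the archive back
with ZipFile(payload, 'r') as zf:
    data = zf.open(zf.namelist()[0]).read().decode()

assert data == 'a,b\n1,2\n'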
Example #16
    def test_BytesIO_input(self):
        if not compat.PY3:
            pytest.skip("Bytes-related test - only needs to work on Python 3")

        data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
        result = self.read_table(data, sep="::", encoding='cp1255')
        expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
        tm.assert_frame_equal(result, expected)
Example #17
    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB; s3fs uses 5 MB chunks
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv",
                                                     Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
            read_csv("s3://pandas-test/large-file.csv", nrows=5)
            # log of fetch_range (start, stop)
            assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
Example #18
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None,
                           mode=None):
    """
    If the filepath_or_buffer is a URL, translate it and return the buffer.
    Otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {'gzip', 'bz2', 'zip', 'xz', None}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of (a filepath or buffer or S3File instance,
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer,
                  (compat.string_types, compat.binary_type, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
Example #19
    def write_graph(self, graph_object, graph_name='Graph', image_width=5.25):

        memfile = BytesIO()
        graph_object.get_figure().savefig(memfile)

        self.document.add_paragraph(graph_name, style='List Bullet')
        self.document.add_picture(memfile, width=Inches(image_width))
        self.document.save(self.docname)
        memfile.close()
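The savefig-into-BytesIO step is independent of python-docx and can be tried on its own. A sketch assuming matplotlib is installed; the Agg backend keeps it headless:

from io import BytesIO

import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3])

memfile = BytesIO()
fig.savefig(memfile, format='png')
memfile.seek(0)  # rewind so a consumer such as add_picture can read it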
Example #20
    def test_BytesIO_input(self):
        if not compat.PY3:
            pytest.skip("Bytes-related test - only needs to work on Python 3")

        result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
                          widths=[2, 2],
                          encoding='utf8')
        expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
        tm.assert_frame_equal(result, expected)
Example #21
def _unpickle_array(bytes):
    arr = read_array(BytesIO(bytes))

    # All datetimes should be stored as M8[ns].  When unpickling with
    # numpy 1.6, it will read these as M8[us], so this ensures all
    # datetime64 types are read as M8[ns].
    if is_datetime64_dtype(arr):
        arr = arr.view(_NS_DTYPE)

    return arr
Example #22
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(BytesIO(encoded_data), sep=sep,
                             names=["a", "b"], encoding=encoding)
    tm.assert_frame_equal(result, expected)
Example #23
def fastmsgpack_loads(data):
    raw = list(msgpack_unpack(
        BytesIO(_l1(data)),
        object_hook=object_hook,
    ))
    # raw will always be a list, which is most likely a list containing
    # a single dataframe or series
    if len(raw) == 1:
        # we only serialized one structure, just return it
        return raw[0]
    return raw
Example #24
    def test_utf16_example(self):
        path = tm.get_data_path('utf16_ex.txt')

        # it works! and is the right length
        result = self.read_table(path, encoding='utf-16')
        self.assertEqual(len(result), 50)

        if not compat.PY3:
            buf = BytesIO(open(path, 'rb').read())
            result = self.read_table(buf, encoding='utf-16')
            self.assertEqual(len(result), 50)
Example #25
    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
        # see gh-16135

        s3_object = s3_resource.meta.client.get_object(Bucket='pandas-test',
                                                       Key='tips.csv')

        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)
Example #26
def test_streaming_s3_objects():
    # GH17135
    # botocore gained iteration support in 1.10.47, can now be used in read_*
    pytest.importorskip('botocore', minversion='1.10.47')
    from botocore.response import StreamingBody

    data = [
        b'foo,bar,baz\n1,2,3\n4,5,6\n',
        b'just,the,header\n',
    ]
    for el in data:
        body = StreamingBody(BytesIO(el), content_length=len(el))
        read_csv(body)
Example #27
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a URL, translate it and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath, or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
        # append the compression to the tuple returned by the helper
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        k = boto.s3.key.Key(b)
        k.key = parsed_url.path
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
        return filepath_or_buffer, None, compression

    return _expand_user(filepath_or_buffer), None, compression
Example #28
    def test_encoding_non_utf8_multichar_sep(self):
        # see gh-3404
        expected = DataFrame({'a': [1], 'b': [2]})

        for sep in ['::', '#####', '!!!', '123', '#1!c5',
                    '%!c!d', '@@#4:2', '_!pd#_']:
            data = '1' + sep + '2'

            for encoding in ['utf-16', 'utf-16-be', 'utf-16-le',
                             'utf-32', 'cp037']:
                encoded_data = data.encode(encoding)
                result = self.read_csv(BytesIO(encoded_data),
                                       sep=sep, names=['a', 'b'],
                                       encoding=encoding)
                tm.assert_frame_equal(result, expected)
Example #29
def main():
    expire_after = timedelta(days=1)
    if PY2:
        filename = 'cache_py2'
    else:
        filename = 'cache'
    session = requests_cache.CachedSession(cache_name=filename,
                                           expire_after=expire_after)

    dt = pd.to_datetime("2014-01-01")
    symbol = "AUD/USD"
    symbol = symbol.replace("/", "").upper()
    year = dt.year
    month = dt.month
    month_name = datetime.datetime(year=1970, month=month,
                                   day=1).strftime('%B').upper()
    #url = "http://www.truefx.com/dev/data/2014/JANUARY-2014/AUDUSD-2014-01.zip"
    url = "http://www.truefx.com/dev/data/{year:04d}/{month_name}-{year:04d}/{symbol}-{year:04d}-{month:02d}.zip".format(
        year=year, month=month, symbol=symbol, month_name=month_name)
    response = session.get(url)
    zip_data = BytesIO(response.content)
    filename = "{symbol}-{year:04d}-{month:02d}.csv".format(year=year,
                                                            month=month,
                                                            symbol=symbol)

    with ZipFile(zip_data, 'r') as zf:
        #filename = zf.namelist()[0]
        zfile = zf.open(filename)
        #print(zfile)
        #(symb, dt, ask, bid) = zfile.read().split(',')
        #print(zfile.__dict__)
        data = zfile.readlines()
        #df = pd.read_csv(zfile._fileobj)  # ToFix: can't make it work correctly

    #return
    df = pd.DataFrame(data)
    #df = df[:100] # just for test
    df[0] = df[0].str.decode('utf8')
    df[0] = df[0].str.replace('\n', '')
    df[0] = df[0].map(lambda s: s.split(','))
    df['Symbol'] = df[0].map(lambda t: t[0])
    df['Date'] = df[0].map(lambda t: pd.to_datetime(t[1]))
    df['Bid'] = df[0].map(lambda t: t[2]).astype(float)
    df['Ask'] = df[0].map(lambda t: t[3]).astype(float)
    del df[0]
    df = df.set_index('Date')
    print(df)
Example #30
    def test_sniff_delimiter(self):
        text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data = self.read_csv(StringIO(text), index_col=0, sep=None)
        self.assert_index_equal(data.index,
                                Index(['foo', 'bar', 'baz'], name='index'))

        data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|')
        tm.assert_frame_equal(data, data2)

        text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
        data3 = self.read_csv(StringIO(text),
                              index_col=0,
                              sep=None,
                              skiprows=2)
        tm.assert_frame_equal(data, data3)

        text = u("""ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
""").encode('utf-8')

        s = BytesIO(text)
        if compat.PY3:
            # somewhat artificial, since the parser never sees raw bytes here
            from io import TextIOWrapper
            s = TextIOWrapper(s, encoding='utf-8')

        data4 = self.read_csv(s,
                              index_col=0,
                              sep=None,
                              skiprows=2,
                              encoding='utf-8')
        tm.assert_frame_equal(data, data4)
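The TextIOWrapper detour exists because delimiter sniffing (sep=None) runs in the Python engine, which reads text, not bytes. A condensed sketch of the same round trip on current pandas; passing engine='python' explicitly avoids the fallback warning:

from io import BytesIO, TextIOWrapper

import pandas as pd

raw = 'index|A|B|C\nfoo|1|2|3\n'.encode('utf-8')
buf = TextIOWrapper(BytesIO(raw), encoding='utf-8')
df = pd.read_csv(buf, index_col=0, sep=None, engine='python')
assert list(df.columns) == ['A', 'B', 'C']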