Example #1
class packers_write_excel_xlwt(object):
    goal_time = 0.2

    def setup(self):
        self.f = '__test__.msg'

        def remove(f):
            # Clean up any leftover file from a previous run.
            try:
                os.remove(f)
            except OSError:
                pass

        self.N = 100000
        self.C = 5
        self.index = date_range('20000101', periods=self.N, freq='H')
        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                  for i in range(self.C)]),
                            index=self.index)
        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                   for i in range(self.C)]),
                             index=self.index)
        self.df2['object'] = [('%08x' % randrange((16**8)))
                              for _ in range(self.N)]
        remove(self.f)
        self.bio = BytesIO()

    def time_packers_write_excel_xlwt(self):
        self.bio.seek(0)
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()
Example #2
class Excel(object):

    goal_time = 0.2
    params = ['openpyxl', 'xlsxwriter', 'xlwt']
    param_names = ['engine']

    def setup(self, engine):
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)

    def time_read_excel(self, engine):
        # Rewind so repeated timing runs re-read from the start of the buffer.
        self.bio_read.seek(0)
        read_excel(self.bio_read)

    def time_write_excel(self, engine):
        self.df.to_excel(self.writer_write, sheet_name='Sheet1')
        self.writer_write.save()
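
Note: a minimal, self-contained sketch of the in-memory round trip this
benchmark exercises (assuming pandas plus the openpyxl engine; on pandas
>= 1.3 ExcelWriter.save() is replaced by close(), which the context
manager calls for you):

from io import BytesIO

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

# Write the frame into an in-memory Excel file ...
bio = BytesIO()
with pd.ExcelWriter(bio, engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

# ... then rewind and read it back without touching disk.
bio.seek(0)
roundtripped = pd.read_excel(bio, index_col=0)
assert roundtripped.equals(df)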
Example #3
def _pickle_array(arr):
    arr = arr.view(np.ndarray)

    buf = BytesIO()
    write_array(buf, arr)

    return buf.getvalue()
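
Note: write_array is a pandas-internal helper; NumPy's public np.save and
np.load support the same buffer trick, since both accept any file-like
object. A sketch under that assumption:

from io import BytesIO

import numpy as np

arr = np.arange(6).reshape(2, 3)

buf = BytesIO()
np.save(buf, arr)   # serialize in .npy format into the buffer
buf.seek(0)         # rewind before reading back
restored = np.load(buf)
assert (restored == arr).all()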
Example #4
class packers_write_excel_xlwt(object):
    goal_time = 0.2

    def setup(self):
        self.f = '__test__.msg'
        self.N = 100000
        self.C = 5
        self.index = date_range('20000101', periods=self.N, freq='H')
        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
        self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
        self.remove(self.f)
        self.bio = BytesIO()

    def time_packers_write_excel_xlwt(self):
        self.bio.seek(0)
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()

    def remove(self, f):
        # Clean up any leftover file from a previous run.
        try:
            os.remove(f)
        except OSError:
            pass
Example #6
    def setup(self):
        self.f = '__test__.msg'

        def remove(f):
            # Clean up any leftover file from a previous run.
            try:
                os.remove(f)
            except OSError:
                pass

        self.N = 100000
        self.C = 5
        self.index = date_range('20000101', periods=self.N, freq='H')
        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                  for i in range(self.C)]),
                            index=self.index)
        self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                   for i in range(self.C)]),
                             index=self.index)
        self.df2['object'] = [('%08x' % randrange((16**8)))
                              for _ in range(self.N)]
        remove(self.f)
        self.bio = BytesIO()
Example #7
    def write_graph(self, graph_object, graph_name='Graph', image_width=5.25):

        memfile = BytesIO()
        graph_object.get_figure().savefig(memfile)

        self.document.add_paragraph(graph_name, style='List Bullet')
        self.document.add_picture(memfile, width=Inches(image_width))
        self.document.save(self.docname)
        memfile.close()
Example #8
class packers_read_excel(_Packers):
    def setup(self):
        self._setup()
        self.bio = BytesIO()
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()

    def time_packers_read_excel(self):
        self.bio.seek(0)
        pd.read_excel(self.bio)
Example #10
 def setup(self, engine):
     N = 2000
     C = 5
     self.df = DataFrame(np.random.randn(N, C),
                         columns=['float{}'.format(i) for i in range(C)],
                         index=date_range('20000101', periods=N, freq='H'))
     self.df['object'] = tm.makeStringIndex(N)
     self.bio_read = BytesIO()
     self.writer_read = ExcelWriter(self.bio_read, engine=engine)
     self.df.to_excel(self.writer_read, sheet_name='Sheet1')
     self.writer_read.save()
     self.bio_read.seek(0)
Example #11
 def test_stringio_writer(self):
     _skip_if_no_xlsxwriter()
     _skip_if_no_xlrd()
     
     path = BytesIO()
     with ExcelWriter(path, engine='xlsxwriter',
                      options={'in_memory': True}) as ew:
         self.frame.to_excel(ew, 'test1', engine='xlsxwriter')
         ew.save()
         path.seek(0)
         ef = ExcelFile(path)
         found_df = ef.parse('test1')
         tm.assert_frame_equal(self.frame, found_df)
     path.close()
Example #12
def maybe_read_encoded_stream(reader, encoding=None, compression=None):
    """read an encoded stream from the reader and transform the bytes to
    unicode if required based on the encoding

        Parameters
        ----------
        reader : a streamable file-like object
        encoding : optional, the encoding to attempt to read

        Returns
        -------
        a tuple of (a stream of decoded bytes, the encoding which was used)

    """

    if compat.PY3 or encoding is not None:  # pragma: no cover
        if encoding:
            errors = 'strict'
        else:
            errors = 'replace'
            encoding = 'utf-8'

        if compression == 'gzip':
            reader = BytesIO(reader.read())
        else:
            reader = StringIO(reader.read().decode(encoding, errors))
    else:
        if compression == 'gzip':
            reader = BytesIO(reader.read())
        encoding = None
    return reader, encoding
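
Note: a rough modern equivalent of the decode branch above is
io.TextIOWrapper, which decodes a binary stream incrementally instead of
slurping everything into a StringIO:

from io import BytesIO, TextIOWrapper

raw = BytesIO('col\nvalue\n'.encode('utf-8'))
text = TextIOWrapper(raw, encoding='utf-8', errors='strict')
assert text.read() == 'col\nvalue\n'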
Example #13
def test_variable_width_unicode():
    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)],
                        **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)
Example #14
    def test_variable_width_unicode(self):
        if not compat.PY3:
            raise nose.SkipTest(
                'Bytes-related test - only needs to work on Python 3')
        test = """
שלום שלום
ום   שלל
של   ום
""".strip('\r\n')
        expected = read_fwf(BytesIO(test.encode('utf8')),
                            colspecs=[(0, 4), (5, 9)],
                            header=None, encoding='utf8')
        tm.assert_frame_equal(expected, read_fwf(
            BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
Example #15
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    t = BytesIO(b"\xB0")
    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
    msg = "'utf-8' codec can't encode character"
    with pytest.raises(UnicodeError, match=msg):
        c_parser_only.read_csv(t, encoding="UTF-8")
Example #16
 def test_buffer_rd_bytes_bad_unicode(self):
     # see gh-22748
     t = BytesIO(b"\xB0")
     if PY3:
         t = TextIOWrapper(t, encoding='ascii', errors='surrogateescape')
     with pytest.raises(UnicodeError):
         self.read_csv(t, encoding='UTF-8')
Example #17
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    parser = python_parser_only
    data = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""

    if encoding is not None:
        data = u(data).encode(encoding)
        data = BytesIO(data)

        if compat.PY3:
            from io import TextIOWrapper
            data = TextIOWrapper(data, encoding=encoding)
    else:
        data = StringIO(data)

    result = parser.read_csv(data, index_col=0, sep=None,
                             skiprows=2, encoding=encoding)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["A", "B", "C"],
                         index=Index(["foo", "bar", "baz"], name="index"))
    tm.assert_frame_equal(result, expected)
Example #18
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None):

    # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
    # are environment variables
    parsed_url = parse_url(filepath_or_buffer)
    s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')

    try:
        conn = boto.connect_s3(host=s3_host)
    except boto.exception.NoAuthHandlerFound:
        conn = boto.connect_s3(host=s3_host, anon=True)

    b = conn.get_bucket(parsed_url.netloc, validate=False)
    if compat.PY2 and (compression == 'gzip' or
                       (compression == 'infer'
                        and filepath_or_buffer.endswith(".gz"))):
        k = boto.s3.key.Key(b, parsed_url.path)
        filepath_or_buffer = BytesIO(
            k.get_contents_as_string(encoding=encoding))
    else:
        k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
        k.open('r')  # Expose read errors immediately
        filepath_or_buffer = k
    return filepath_or_buffer, None, compression
Example #19
    def test_encode(self, html_encoding_file):
        _, encoding = os.path.splitext(
            os.path.basename(html_encoding_file)
        )[0].split('_')

        try:
            with open(html_encoding_file, 'rb') as fobj:
                from_string = self.read_html(fobj.read(), encoding=encoding,
                                             index_col=0).pop()

            with open(html_encoding_file, 'rb') as fobj:
                from_file_like = self.read_html(BytesIO(fobj.read()),
                                                encoding=encoding,
                                                index_col=0).pop()

            from_filename = self.read_html(html_encoding_file,
                                           encoding=encoding,
                                           index_col=0).pop()
            tm.assert_frame_equal(from_string, from_file_like)
            tm.assert_frame_equal(from_string, from_filename)
        except Exception:
            # seems utf-16/32 fail on windows
            if is_platform_windows() and (
                    '16' in encoding or '32' in encoding):
                pytest.skip()
            raise
Example #20
    def setup(self, engine):
        N = 2000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)
Example #21
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer;
    otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'

    Returns
    -------
    a filepath_or_buffer, the encoding, the compression
    """

    if _is_url(filepath_or_buffer):
        req = _urlopen(str(filepath_or_buffer))
        if compression == 'infer':
            content_encoding = req.headers.get('Content-Encoding', None)
            if content_encoding == 'gzip':
                compression = 'gzip'
            else:
                compression = None
        # append the compression to the tuple returned by the function
        to_return = (list(maybe_read_encoded_stream(req, encoding,
                                                    compression)) +
                     [compression])
        return tuple(to_return)

    if _is_s3_url(filepath_or_buffer):
        try:
            import boto
        except ImportError:
            raise ImportError("boto is required to handle s3 files")
        # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
        # are environment variables
        parsed_url = parse_url(filepath_or_buffer)

        try:
            conn = boto.connect_s3()
        except boto.exception.NoAuthHandlerFound:
            conn = boto.connect_s3(anon=True)

        b = conn.get_bucket(parsed_url.netloc, validate=False)
        if compat.PY2 and (compression == 'gzip' or
                           (compression == 'infer' and
                            filepath_or_buffer.endswith(".gz"))):
            k = boto.s3.key.Key(b, parsed_url.path)
            filepath_or_buffer = BytesIO(k.get_contents_as_string(
                encoding=encoding))
        else:
            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
            k.open('r')  # Expose read errors immediately
            filepath_or_buffer = k
        return filepath_or_buffer, None, compression

    # It is a pathlib.Path/py.path.local or string
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    return _expand_user(filepath_or_buffer), None, compression
Example #22
    def test_BytesIO_input(self):
        if not compat.PY3:
            pytest.skip("Bytes-related test - only needs to work on Python 3")

        data = BytesIO("שלום::1234\n562::123".encode('cp1255'))
        result = self.read_table(data, sep="::", encoding='cp1255')
        expected = DataFrame([[562, 123]], columns=["שלום", "1234"])
        tm.assert_frame_equal(result, expected)
Example #23
def test_variable_width_unicode():
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)],
                        **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)
Example #24
    def _read_zipfile(self, url):

        zipf = BytesIO(self._get_response(url).content)

        with ZipFile(zipf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

        return data
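
Note: a self-contained illustration of the same ZipFile-over-BytesIO trick,
building the archive in memory rather than downloading it:

from io import BytesIO
from zipfile import ZipFile

archive = BytesIO()
with ZipFile(archive, 'w') as zf:
    zf.writestr('data.csv', 'a,b\n1,2\n')

archive.seek(0)  # rewind before reopening for reading
with ZipFile(archive, 'r') as zf:
    data = zf.read(zf.namelist()[0]).decode()
assert data == 'a,b\n1,2\n'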
Example #25
    def test_read_csv_chunked_download(self, s3_resource, caplog):
        # 8 MB, S3FS uses 5MB chunks
        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
        str_buf = StringIO()

        df.to_csv(str_buf)

        buf = BytesIO(str_buf.getvalue().encode('utf-8'))

        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv",
                                                     Body=buf)

        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
            read_csv("s3://pandas-test/large-file.csv", nrows=5)
            # log of fetch_range (start, stop)
            assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
Example #26
def get_filepath_or_buffer(filepath_or_buffer,
                           encoding=None,
                           compression=None,
                           mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise pass it through.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    compression : {'gzip', 'bz2', 'zip', 'xz', None}, optional
    encoding : the encoding to use to decode bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of (a filepath or buffer or S3File instance,
              encoding : str,
              compression : str,
              should_close : bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer,
                  (compat.string_types, compat.binary_type, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
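
Note: a hedged sketch of how a caller might honor the should_close flag in
the tuple contract documented above (get_filepath_or_buffer is a pandas
internal; the pandas.io.common import path is an assumption that held for
pandas < 1.2):

from pandas.io.common import get_filepath_or_buffer

fp_or_buf, encoding, compression, should_close = get_filepath_or_buffer('data.csv')
try:
    pass  # hand fp_or_buf to a parser here
finally:
    # Only close handles the helper opened itself (e.g. URL or S3 streams);
    # plain paths and caller-owned buffers come back with should_close=False.
    if should_close:
        fp_or_buf.close()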
Example #27
    def test_BytesIO_input(self):
        if not compat.PY3:
            pytest.skip("Bytes-related test - only needs to work on Python 3")

        result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
                          widths=[2, 2],
                          encoding='utf8')
        expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
        tm.assert_frame_equal(result, expected)
Example #28
def _unpickle_array(data):
    arr = read_array(BytesIO(data))

    # All datetimes should be stored as M8[ns].  When unpickling with
    # numpy 1.6, it will read these as M8[us].  So this ensures all
    # datetime64 types are read as M8[ns]
    if is_datetime64_dtype(arr):
        arr = arr.view(_NS_DTYPE)

    return arr
Example #29
def fastmsgpack_loads(data):
    raw = list(msgpack_unpack(
        BytesIO(_l1(data)),
        object_hook=object_hook,
    ))
    # raw will always be a list, which is most likely a list containing
    # a single dataframe or series
    if len(raw) == 1:
        # we only serialized one structure, just return it
        return raw[0]
    return raw
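
Note: msgpack_unpack and _l1 here are pandas-internal wrappers. With the
standalone msgpack package the same streaming pattern looks roughly like
this (an assumed equivalent, not the code above):

from io import BytesIO

import msgpack

buf = BytesIO()
buf.write(msgpack.packb({'a': 1}))
buf.write(msgpack.packb([1, 2, 3]))
buf.seek(0)

# Unpacker yields each object serialized back-to-back in the stream.
raw = list(msgpack.Unpacker(buf, raw=False))
assert raw == [{'a': 1}, [1, 2, 3]]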
Example #30
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    # see gh-3404
    expected = DataFrame({"a": [1], "b": [2]})
    parser = python_parser_only

    data = "1" + sep + "2"
    encoded_data = data.encode(encoding)

    result = parser.read_csv(BytesIO(encoded_data), sep=sep,
                             names=["a", "b"], encoding=encoding)
    tm.assert_frame_equal(result, expected)
Example #31
    def test_utf16_example(self):
        path = tm.get_data_path('utf16_ex.txt')

        # it works! and is the right length
        result = self.read_table(path, encoding='utf-16')
        self.assertEqual(len(result), 50)

        if not compat.PY3:
            with open(path, 'rb') as fh:
                buf = BytesIO(fh.read())
            result = self.read_table(buf, encoding='utf-16')
            self.assertEqual(len(result), 50)
Example #32
 def setup(self):
     self.f = '__test__.msg'
     self.N = 100000
     self.C = 5
     self.index = date_range('20000101', periods=self.N, freq='H')
     self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                               for i in range(self.C)]),
                         index=self.index)
     self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N))
                                for i in range(self.C)]),
                          index=self.index)
     self.df2['object'] = [('%08x' % randrange((16**8)))
                           for _ in range(self.N)]
     self.remove(self.f)
     self.bio = BytesIO()
     self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
     self.df[:2000].to_excel(self.writer)
     self.writer.save()
Example #33
    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
        # see gh-16135

        s3_object = s3_resource.meta.client.get_object(Bucket='pandas-test',
                                                       Key='tips.csv')

        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
        assert isinstance(result, DataFrame)
        assert not result.empty

        expected = read_csv(tips_file)
        tm.assert_frame_equal(result, expected)
Example #34
    def test_utf16_bom_skiprows(self):
        # #2298
        data = u("""skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6""")

        data2 = u("""skip this
skip this too
A,B,C
1,2,3
4,5,6""")

        path = '__%s__.csv' % tm.rands(10)

        with tm.ensure_clean(path) as path:
            for sep, dat in [('\t', data), (',', data2)]:
                for enc in ['utf-16', 'utf-16le', 'utf-16be']:
                    encoded = dat.encode(enc)
                    with open(path, 'wb') as f:
                        f.write(encoded)

                    s = BytesIO(dat.encode('utf-8'))
                    if compat.PY3:
                        # somewhat False since the code never sees bytes
                        from io import TextIOWrapper
                        s = TextIOWrapper(s, encoding='utf-8')

                    result = self.read_csv(path,
                                           encoding=enc,
                                           skiprows=2,
                                           sep=sep)
                    expected = self.read_csv(s,
                                             encoding='utf-8',
                                             skiprows=2,
                                             sep=sep)
                    s.close()

                    tm.assert_frame_equal(result, expected)
Example #36
    def test_utf16_bom_skiprows(self):
        # #2298
        data = u(
            """skip this
skip this too
A\tB\tC
1\t2\t3
4\t5\t6"""
        )

        data2 = u(
            """skip this
skip this too
A,B,C
1,2,3
4,5,6"""
        )

        path = "__%s__.csv" % tm.rands(10)

        with tm.ensure_clean(path) as path:
            for sep, dat in [("\t", data), (",", data2)]:
                for enc in ["utf-16", "utf-16le", "utf-16be"]:
                    encoded = dat.encode(enc)
                    with open(path, "wb") as f:
                        f.write(encoded)

                    s = BytesIO(dat.encode("utf-8"))
                    if compat.PY3:
                        # somewhat False since the code never sees bytes
                        from io import TextIOWrapper

                        s = TextIOWrapper(s, encoding="utf-8")

                    result = self.read_csv(path, encoding=enc, skiprows=2, sep=sep)
                    expected = self.read_csv(s, encoding="utf-8", skiprows=2, sep=sep)
                    s.close()

                    tm.assert_frame_equal(result, expected)
Example #37
 def setup(self):
     self.f = '__test__.msg'
     self.N = 100000
     self.C = 5
     self.index = date_range('20000101', periods=self.N, freq='H')
     self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
     self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
     self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
     self.remove(self.f)
     self.bio = BytesIO()
Example #38
def test_streaming_s3_objects():
    # GH17135
    # botocore gained iteration support in 1.10.47, can now be used in read_*
    pytest.importorskip('botocore', minversion='1.10.47')
    from botocore.response import StreamingBody

    data = [
        b'foo,bar,baz\n1,2,3\n4,5,6\n',
        b'just,the,header\n',
    ]
    for el in data:
        body = StreamingBody(BytesIO(el), content_length=len(el))
        read_csv(body)
Example #39
 def setup(self):
     self.f = '__test__.msg'
     self.N = 100000
     self.C = 5
     self.index = date_range('20000101', periods=self.N, freq='H')
     self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
     self.df2 = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index)
     self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]
     self.remove(self.f)
     self.bio = BytesIO()
     self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
     self.df[:2000].to_excel(self.writer)
     self.writer.save()
Example #40
class Excel(_Packers):

    def setup(self):
        self._setup()
        self.bio = BytesIO()

    def time_write_excel_openpyxl(self):
        self.bio.seek(0)
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()

    def time_write_excel_xlsxwriter(self):
        self.bio.seek(0)
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()

    def time_write_excel_xlwt(self):
        self.bio.seek(0)
        self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt')
        self.df[:2000].to_excel(self.writer)
        self.writer.save()
Example #41
 def time_write_excel(self, engine):
     bio_write = BytesIO()
     bio_write.seek(0)
     writer_write = ExcelWriter(bio_write, engine=engine)
     self.df.to_excel(writer_write, sheet_name='Sheet1')
     writer_write.save()
Example #42
 def setup(self):
     self._setup()
     self.bio = BytesIO()
Example #43
 def setup(self):
     self._setup()
     self.bio = BytesIO()
     self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter')
     self.df[:2000].to_excel(self.writer)
     self.writer.save()
Example #44
def compserver(payload, serial):
    (allow_profiler,
     default_profiler_output,
     profile_by_default) = _get_profiler_info()
    requested_profiler_output = payload.get('profiler_output',
                                            default_profiler_output)
    profile = payload.get('profile')
    profiling = (allow_profiler and
                 (profile or (profile_by_default and requested_profiler_output)))
    if profile and not allow_profiler:
        return ('profiling is disabled on this server', RC.FORBIDDEN)

    with ExitStack() as response_construction_context_stack:
        if profiling:
            from cProfile import Profile

            if (default_profiler_output == ':response' and
                    requested_profiler_output != ':response'):
                # writing to the local filesystem is disabled
                return ("local filepaths are disabled on this server, only"
                        " ':response' is allowed for the 'profiler_output' field",
                        RC.FORBIDDEN)

            profiler_output = requested_profiler_output
            profiler = Profile()
            profiler.enable()
            # ensure that we stop profiling in the case of an exception
            response_construction_context_stack.callback(profiler.disable)

        expr = '<failed to parse expr>'

        @response_construction_context_stack.callback
        def log_time(start=time()):
            flask.current_app.logger.info('compute expr: %s\ntotal time (s): %.3f',
                                          expr,
                                          time() - start)

        ns = payload.get('namespace', {})
        compute_kwargs = payload.get('compute_kwargs') or {}
        odo_kwargs = payload.get('odo_kwargs') or {}
        dataset = _get_data()
        ns[':leaf'] = symbol('leaf', discover(dataset))

        expr = from_tree(payload['expr'], namespace=ns)
        assert len(expr._leaves()) == 1
        leaf = expr._leaves()[0]

        try:
            result = serial.materialize(compute(expr,
                                                {leaf: dataset},
                                                **compute_kwargs),
                                        expr.dshape,
                                        odo_kwargs)
        except NotImplementedError as e:
            return ("Computation not supported:\n%s" % e, RC.NOT_IMPLEMENTED)
        except Exception as e:
            return ("Computation failed with message:\n%s: %s" % (type(e).__name__, e),
                    RC.INTERNAL_SERVER_ERROR)

        response = {'datashape': pprint(expr.dshape, width=0),
                    'data': serial.data_dumps(result),
                    'names': expr.fields}

    if profiling:
        import marshal
        from pstats import Stats

        if profiler_output == ':response':
            from pandas.compat import BytesIO
            file = BytesIO()
        else:
            file = open(_prof_path(profiler_output, expr), 'wb')

        with file:
            # Use marshal to dump the stats data to the given file.
            # This is taken from cProfile which unfortunately does not have
            # an api that allows us to pass the file object directly, only
            # a file path.
            marshal.dump(Stats(profiler).stats, file)
            if profiler_output == ':response':
                response['profiler_output'] = {'__!bytes': file.getvalue()}

    return serial.dumps(response)
Example #45
def compserver(payload, serial):
    expected_keys = {u'namespace',
                     u'odo_kwargs',
                     u'compute_kwargs',
                     u'expr',
                     u'profile',
                     u'profiler_output'}
    if not set(payload.keys()) < expected_keys:
        return ('unexpected keys in payload: %r' % sorted(set(payload.keys()) -
                                                          expected_keys),
                RC.BAD_REQUEST)

    app = flask.current_app
    (allow_profiler,
     default_profiler_output,
     profile_by_default) = _get_profiler_info()
    requested_profiler_output = payload.get(u'profiler_output',
                                            default_profiler_output)
    profile = payload.get(u'profile')
    profiling = (allow_profiler and
                 (profile or (profile_by_default and requested_profiler_output)))
    if profile and not allow_profiler:
        return ('profiling is disabled on this server', RC.FORBIDDEN)

    with ExitStack() as response_construction_context_stack:
        if profiling:
            from cProfile import Profile

            if (default_profiler_output == ':response' and
                    requested_profiler_output != ':response'):
                # writing to the local filesystem is disabled
                return ("local filepaths are disabled on this server, only"
                        " ':response' is allowed for the 'profiler_output' field",
                        RC.FORBIDDEN)

            profiler_output = requested_profiler_output
            profiler = Profile()
            profiler.enable()
            # ensure that we stop profiling in the case of an exception
            response_construction_context_stack.callback(profiler.disable)

        expr = '<failed to parse expr>'

        @response_construction_context_stack.callback
        def log_time(start=time()):
            app.logger.info('compute expr: %s\ntotal time (s): %.3f',
                            expr,
                            time() - start)

        ns = payload.get(u'namespace', {})
        compute_kwargs = payload.get(u'compute_kwargs') or {}
        odo_kwargs = payload.get(u'odo_kwargs') or {}
        dataset = _get_data()
        ns[':leaf'] = symbol('leaf', discover(dataset))

        expr = from_tree(payload[u'expr'], namespace=ns)

        if len(expr._leaves()) != 1:
            return ('too many leaves, expected 1 got %d' % len(expr._leaves()),
                    RC.BAD_REQUEST)

        leaf = expr._leaves()[0]

        formatter = getattr(flask.current_app, 'log_exception_formatter',
                            _default_log_exception_formatter)
        try:
            result = serial.materialize(compute(expr,
                                                {leaf: dataset},
                                                **compute_kwargs),
                                        expr.dshape,
                                        odo_kwargs)
        except NotImplementedError as e:
            # Note: `sys.exc_info()[2]` holds the current traceback, for
            # Python 2 / 3 compatibility. It's important not to store a local
            # reference to it.
            formatted_tb = formatter(sys.exc_info()[2])
            error_msg = "Computation not supported:\n%s\n%s" % (e, formatted_tb)
            app.logger.error(error_msg)
            return (error_msg, RC.NOT_IMPLEMENTED)
        except Exception as e:
            formatted_tb = formatter(sys.exc_info()[2])
            error_msg = "Computation failed with message:\n%s: %s\n%s" % (type(e).__name__, e, formatted_tb)
            app.logger.error(error_msg)
            return (error_msg, RC.INTERNAL_SERVER_ERROR)

        response = {u'datashape': pprint(expr.dshape, width=0),
                    u'data': serial.data_dumps(result),
                    u'names': expr.fields}

    if profiling:
        import marshal
        from pstats import Stats

        if profiler_output == ':response':
            from pandas.compat import BytesIO
            file = BytesIO()
        else:
            file = open(_prof_path(profiler_output, expr), 'wb')

        with file:
            # Use marshal to dump the stats data to the given file.
            # This is taken from cProfile which unfortunately does not have
            # an api that allows us to pass the file object directly, only
            # a file path.
            marshal.dump(Stats(profiler).stats, file)
            if profiler_output == ':response':
                response[u'profiler_output'] = {'__!bytes': file.getvalue()}

    return serial.dumps(response)
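
Note: the profiler_output == ':response' branch in both versions hinges on
marshal.dump accepting any binary file object, even though cProfile itself
only dumps to file paths. The trick in isolation:

import marshal
from cProfile import Profile
from io import BytesIO
from pstats import Stats

profiler = Profile()
profiler.enable()
sum(i * i for i in range(10000))   # some work to profile
profiler.disable()

buf = BytesIO()
marshal.dump(Stats(profiler).stats, buf)   # raw stats table, in memory
assert len(buf.getvalue()) > 0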