def test_info_max_cols(self):
    df = DataFrame(np.random.randn(10, 5))
    for len_, verbose in [(5, None), (5, False), (10, True)]:
        # For verbose always  ^ setting  ^ summarize  ^ full output
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, verbose in [(10, None), (5, False), (10, True)]:
        # max_cols not exceeded
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, verbose=verbose)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

    for len_, max_cols in [(10, 5), (5, 4)]:
        # setting truncates
        with option_context('max_info_columns', 4):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)

        # setting wouldn't truncate
        with option_context('max_info_columns', 5):
            buf = StringIO()
            df.info(buf=buf, max_cols=max_cols)
            res = buf.getvalue()
            self.assertEqual(len(res.strip().split('\n')), len_)
def test_info_memory_usage_qualified(self):
    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=[1, 2, 3])
    df.info(buf=buf)
    assert '+' not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'), index=list('ABC'))
    df.info(buf=buf)
    assert '+' in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3), range(3)]))
    df.info(buf=buf)
    assert '+' not in buf.getvalue()

    buf = StringIO()
    df = DataFrame(1, columns=list('ab'),
                   index=pd.MultiIndex.from_product([range(3),
                                                     ['foo', 'bar']]))
    df.info(buf=buf)
    assert '+' in buf.getvalue()
def test_to_csv_quoting(self):
    df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    buf = StringIO()
    df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC)

    result = buf.getvalue()
    expected = ('"A","B"\n'
                '1,"foo"\n'
                '2,"bar"\n'
                '3,"baz"\n')
    self.assertEqual(result, expected)

    # quoting windows line terminators, presents with encoding?
    # #3503
    text = 'a,b,c\n1,"test \r\n",3\n'
    df = pd.read_csv(StringIO(text))
    buf = StringIO()
    df.to_csv(buf, encoding='utf-8', index=False)
    self.assertEqual(buf.getvalue(), text)

    # testing if quoting parameter is passed through with multi-indexes
    # related to issue #7791
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    df = df.set_index(['a', 'b'])
    expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
    self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected)
class UnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
        self.quoting = kwds.get("quoting", None)

    def writerow(self, row):
        def _check_as_is(x):
            return (self.quoting == csv.QUOTE_NONNUMERIC and
                    is_number(x)) or isinstance(x, str)

        row = [x if _check_as_is(x)
               else pprint_thing(x).encode("utf-8") for x in row]

        self.writer.writerow([s for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        def _check_as_is(x):
            return (self.quoting == csv.QUOTE_NONNUMERIC and
                    is_number(x)) or isinstance(x, str)

        for i, row in enumerate(rows):
            rows[i] = [x if _check_as_is(x)
                       else pprint_thing(x).encode("utf-8") for x in row]

        self.writer.writerows([[s for s in row] for row in rows])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)
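# Hedged usage sketch for the UnicodeWriter class above, not part of the
# original source. It assumes Python 2 (where csv.writer cannot handle
# unicode directly) and that csv, is_number and pprint_thing are already
# imported, as in the class definition itself.
def _unicode_writer_example():
    import io

    target = io.BytesIO()  # stands in for a real binary file handle
    writer = UnicodeWriter(target, encoding='utf-16',
                           quoting=csv.QUOTE_NONNUMERIC)
    # numbers pass through unquoted; unicode text is routed through
    # pprint_thing, then re-encoded from utf-8 to the target encoding
    writer.writerow([u'caf\xe9', 1.5])
    writer.writerows([[u'a', 1], [u'b', 2]])
    return target.getvalue()  # utf-16 encoded CSV bytes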
def test_to_html(self):
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))

    biggie.loc[:20, 'A'] = np.nan
    biggie.loc[:20, 'B'] = np.nan
    s = biggie.to_html()

    buf = StringIO()
    retval = biggie.to_html(buf=buf)
    self.assertIsNone(retval)
    self.assertEqual(buf.getvalue(), s)

    tm.assertIsInstance(s, compat.string_types)

    biggie.to_html(columns=['B', 'A'], col_space=17)
    biggie.to_html(columns=['B', 'A'],
                   formatters={'A': lambda x: '%.1f' % x})

    biggie.to_html(columns=['B', 'A'], float_format=str)
    biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

    frame = DataFrame(index=np.arange(200))
    frame.to_html()
def _coef_table(self):
    buf = StringIO()
    buf.write('%14s %10s %10s %10s %10s %10s %10s\n' %
              ('Variable', 'Coef', 'Std Err', 't-stat',
               'p-value', 'CI 2.5%', 'CI 97.5%'))
    buf.write(scom.banner(''))

    coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f'

    results = self._results

    beta = results['beta']

    for i, name in enumerate(beta.index):
        if i and not (i % 5):
            buf.write('\n' + scom.banner(''))

        std_err = results['std_err'][name]
        CI1 = beta[name] - 1.96 * std_err
        CI2 = beta[name] + 1.96 * std_err

        t_stat = results['t_stat'][name]
        p_value = results['p_value'][name]

        line = coef_template % (name,
                                beta[name], std_err, t_stat,
                                p_value, CI1, CI2)

        buf.write(line)

    if self.nw_lags is not None:
        buf.write('\n')
        buf.write('*** The calculations are Newey-West '
                  'adjusted with lags %5d\n' % self.nw_lags)

    return buf.getvalue()
def _coef_table(self):
    buffer = StringIO()
    buffer.write(
        "%13s %13s %13s %13s %13s %13s\n" %
        ("Variable", "Beta", "Std Err", "t-stat", "CI 2.5%", "CI 97.5%")
    )
    template = "%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n"

    for i, name in enumerate(self._cols):
        if i and not (i % 5):
            buffer.write("\n" + common.banner(""))

        mean_beta = self._results["mean_beta"][i]
        std_beta = self._results["std_beta"][i]
        t_stat = self._results["t_stat"][i]
        ci1 = mean_beta - 1.96 * std_beta
        ci2 = mean_beta + 1.96 * std_beta

        values = "(%s)" % name, mean_beta, std_beta, t_stat, ci1, ci2

        buffer.write(template % values)

    if self._nw_lags_beta is not None:
        buffer.write("\n")
        buffer.write("*** The Std Err, t-stat are Newey-West "
                     "adjusted with Lags %5d\n" % self._nw_lags_beta)

    return buffer.getvalue()
def save(self):
    """
    Create the writer & save
    """
    # GH21227 internal compression is not used when file-like passed.
    if self.compression and hasattr(self.path_or_buf, 'write'):
        msg = ("compression has no effect when passing file-like "
               "object as input.")
        warnings.warn(msg, RuntimeWarning, stacklevel=2)

    # when zip compression is called.
    is_zip = isinstance(self.path_or_buf, ZipFile) or (
        not hasattr(self.path_or_buf, 'write')
        and self.compression == 'zip')

    if is_zip:
        # zipfile doesn't support writing string to archive. uses string
        # buffer to receive csv writing and dump into zip compression
        # file handle. GH21241, GH21118
        f = StringIO()
        close = False
    elif hasattr(self.path_or_buf, 'write'):
        f = self.path_or_buf
        close = False
    else:
        f, handles = _get_handle(self.path_or_buf, self.mode,
                                 encoding=self.encoding,
                                 compression=self.compression)
        close = True

    try:
        writer_kwargs = dict(lineterminator=self.line_terminator,
                             delimiter=self.sep, quoting=self.quoting,
                             doublequote=self.doublequote,
                             escapechar=self.escapechar,
                             quotechar=self.quotechar)
        if self.encoding == 'ascii':
            self.writer = csvlib.writer(f, **writer_kwargs)
        else:
            writer_kwargs['encoding'] = self.encoding
            self.writer = UnicodeWriter(f, **writer_kwargs)

        self._save()

    finally:
        if is_zip:
            # GH17778 handles zip compression separately.
            buf = f.getvalue()
            if hasattr(self.path_or_buf, 'write'):
                self.path_or_buf.write(buf)
            else:
                f, handles = _get_handle(self.path_or_buf, self.mode,
                                         encoding=self.encoding,
                                         compression=self.compression)
                f.write(buf)
                close = True
        if close:
            f.close()
            for _fh in handles:
                _fh.close()
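# Hedged illustration of the zip branch handled in save() above (GH21241,
# GH21118), not from the original source; file names are hypothetical.
# With a plain path and compression='zip', the CSV text is staged in a
# StringIO and only dumped into the archive afterwards; with a file-like
# target, compression is skipped and a RuntimeWarning is emitted.
#
#     df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#     df.to_csv('out.csv.zip', compression='zip')   # path: StringIO staging
#     with open('raw.csv', 'w') as fh:
#         df.to_csv(fh, compression='zip')          # file-like: warns only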
def test_to_html(self):
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))

    biggie.loc[:20, 'A'] = np.nan
    biggie.loc[:20, 'B'] = np.nan
    s = biggie.to_html()

    buf = StringIO()
    retval = biggie.to_html(buf=buf)
    assert retval is None
    assert buf.getvalue() == s

    assert isinstance(s, compat.string_types)

    biggie.to_html(columns=['B', 'A'], col_space=17)
    biggie.to_html(columns=['B', 'A'],
                   formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

    biggie.to_html(columns=['B', 'A'], float_format=str)
    biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

    frame = DataFrame(index=np.arange(200))
    frame.to_html()
def _coef_table(self):
    buf = StringIO()
    buf.write(
        "%14s %10s %10s %10s %10s %10s %10s\n" %
        ("Variable", "Coef", "Std Err", "t-stat",
         "p-value", "CI 2.5%", "CI 97.5%")
    )
    buf.write(scom.banner(""))

    coef_template = "\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f"

    results = self._results

    beta = results["beta"]

    for i, name in enumerate(beta.index):
        if i and not (i % 5):
            buf.write("\n" + scom.banner(""))

        std_err = results["std_err"][name]
        CI1 = beta[name] - 1.96 * std_err
        CI2 = beta[name] + 1.96 * std_err

        t_stat = results["t_stat"][name]
        p_value = results["p_value"][name]

        line = coef_template % (name,
                                beta[name], std_err, t_stat,
                                p_value, CI1, CI2)

        buf.write(line)

    if self.nw_lags is not None:
        buf.write("\n")
        buf.write("*** The calculations are Newey-West "
                  "adjusted with lags %5d\n" % self.nw_lags)

    return buf.getvalue()
def _coef_table(self):
    buffer = StringIO()
    buffer.write('%13s %13s %13s %13s %13s %13s\n' %
                 ('Variable', 'Beta', 'Std Err',
                  't-stat', 'CI 2.5%', 'CI 97.5%'))
    template = '%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n'

    for i, name in enumerate(self._cols):
        if i and not (i % 5):
            buffer.write('\n' + common.banner(''))

        mean_beta = self._results['mean_beta'][i]
        std_beta = self._results['std_beta'][i]
        t_stat = self._results['t_stat'][i]
        ci1 = mean_beta - 1.96 * std_beta
        ci2 = mean_beta + 1.96 * std_beta

        values = '(%s)' % name, mean_beta, std_beta, t_stat, ci1, ci2

        buffer.write(template % values)

    if self._nw_lags_beta is not None:
        buffer.write('\n')
        buffer.write('*** The Std Err, t-stat are Newey-West '
                     'adjusted with Lags %5d\n' % self._nw_lags_beta)

    return buffer.getvalue()
def test_to_csv_numpy_16_bug(self):
    frame = DataFrame({'a': date_range('1/1/2000', periods=10)})

    buf = StringIO()
    frame.to_csv(buf)

    result = buf.getvalue()
    self.assertIn('2000-01-01', result)
def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv format
            for allowing easy pasting into excel.
            if False, write a string representation of the object to the
            clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    encoding = kwargs.pop('encoding', 'utf-8')

    # testing if an invalid encoding is passed to clipboard
    if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
        raise ValueError('clipboard only supports utf-8 encoding')

    from pandas.io.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            # clipboard_set (pyperclip) expects unicode
            obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
            text = buf.getvalue()
            if PY2:
                text = text.decode('utf-8')
            clipboard_set(text)
            return
        except TypeError:
            warnings.warn('to_clipboard in excel mode requires a single '
                          'character separator.')
    elif sep is not None:
        warnings.warn('to_clipboard with excel=False ignores the '
                      'sep argument')

    if isinstance(obj, ABCDataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
def test_to_csv_quote_none(self):
    # GH4328
    df = DataFrame({'A': ['hello', '{"hello"}']})
    for encoding in (None, 'utf-8'):
        buf = StringIO()
        df.to_csv(buf, quoting=csv.QUOTE_NONE,
                  encoding=encoding, index=False)
        result = buf.getvalue()
        expected = 'A\nhello\n{"hello"}\n'
        self.assertEqual(result, expected)
def test_to_csv_line_terminators(self):
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                   index=['one', 'two', 'three'])

    buf = StringIO()
    df.to_csv(buf, line_terminator='\r\n')
    expected = (',A,B\r\n'
                'one,1,4\r\n'
                'two,2,5\r\n'
                'three,3,6\r\n')
    self.assertEqual(buf.getvalue(), expected)

    buf = StringIO()
    df.to_csv(buf)  # The default line terminator remains \n
    expected = (',A,B\n'
                'one,1,4\n'
                'two,2,5\n'
                'three,3,6\n')
    self.assertEqual(buf.getvalue(), expected)
def test_to_csv_from_csv_categorical(self):
    # CSV with categoricals should result in the same output as when one
    # would add a "normal" Series/DataFrame.
    s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
    s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
    res = StringIO()
    s.to_csv(res)
    exp = StringIO()
    s2.to_csv(exp)
    self.assertEqual(res.getvalue(), exp.getvalue())

    df = DataFrame({"s": s})
    df2 = DataFrame({"s": s2})
    res = StringIO()
    df.to_csv(res)
    exp = StringIO()
    df2.to_csv(exp)
    self.assertEqual(res.getvalue(), exp.getvalue())
def test_to_csv_index_no_leading_comma(self):
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                   index=['one', 'two', 'three'])

    buf = StringIO()
    df.to_csv(buf, index_label=False)
    expected = ('A,B\n'
                'one,1,4\n'
                'two,2,5\n'
                'three,3,6\n')
    self.assertEqual(buf.getvalue(), expected)
import cProfile
import pstats
from contextlib import contextmanager
from io import StringIO


# Note: the @contextmanager decorator is required for the `with profiled():`
# usage this generator implies; it was missing from the bare function.
@contextmanager
def profiled():
    pr = cProfile.Profile()
    pr.enable()
    yield
    pr.disable()
    s = StringIO()
    ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
    ps.print_stats()
    # uncomment this to see who's calling what
    # ps.print_callers()
    print(s.getvalue())
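# Hedged usage sketch for profiled() above: wrap any workload in the context
# manager to print a cumulative-time profile when the block exits. The
# workload below is a stand-in, not from the original source.
def _profiled_example():
    with profiled():
        sum(i * i for i in range(10 ** 6))  # hypothetical expensive work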
def test_to_csv_index_no_leading_comma(self):
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                   index=['one', 'two', 'three'])

    buf = StringIO()
    df.to_csv(buf, index_label=False)
    expected = ('A,B\n'
                'one,1,4\n'
                'two,2,5\n'
                'three,3,6\n')
    assert buf.getvalue() == expected
def test_to_csv_gcs(mock):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                     'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'],
                       index_col=0)

    assert_frame_equal(df1, df2)
def test_to_csv_index_no_leading_comma(self):
    df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                   index=['one', 'two', 'three'])

    buf = StringIO()
    df.to_csv(buf, index_label=False)

    expected_rows = ['A,B',
                     'one,1,4',
                     'two,2,5',
                     'three,3,6']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert buf.getvalue() == expected
def test_to_csv_unicodewriter_quoting(self):
    df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    buf = StringIO()
    df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
              encoding='utf-8')

    result = buf.getvalue()
    expected_rows = ['"A","B"',
                     '1,"foo"',
                     '2,"bar"',
                     '3,"baz"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert result == expected
def test_to_csv_unicodewriter_quoting(self):
    df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

    buf = StringIO()
    df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
              encoding='utf-8')

    result = buf.getvalue()
    expected = ('"A","B"\n'
                '1,"foo"\n'
                '2,"bar"\n'
                '3,"baz"\n')
    self.assertEqual(result, expected)
def test_info_shows_column_dtypes(self):
    dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]",
              "complex128", "object", "bool"]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    for i, dtype in enumerate(dtypes):
        name = "%d %d non-null %s" % (i, n, dtype)
        assert name in res
def save(self):
    # create the writer & save
    if self.encoding is None:
        if compat.PY2:
            encoding = 'ascii'
        else:
            encoding = 'utf-8'
    else:
        encoding = self.encoding

    # PR 21300 uses string buffer to receive csv writing and dump into
    # file-like output with compression as option. GH 21241, 21118
    f = StringIO()
    if not is_file_like(self.path_or_buf):
        # path_or_buf is path
        path_or_buf = self.path_or_buf
    elif hasattr(self.path_or_buf, 'name'):
        # path_or_buf is file handle
        path_or_buf = self.path_or_buf.name
    else:
        # path_or_buf is file-like IO objects.
        f = self.path_or_buf
        path_or_buf = None

    try:
        writer_kwargs = dict(lineterminator=self.line_terminator,
                             delimiter=self.sep, quoting=self.quoting,
                             doublequote=self.doublequote,
                             escapechar=self.escapechar,
                             quotechar=self.quotechar)
        if encoding == 'ascii':
            self.writer = csvlib.writer(f, **writer_kwargs)
        else:
            writer_kwargs['encoding'] = encoding
            self.writer = UnicodeWriter(f, **writer_kwargs)

        self._save()

    finally:
        # GH 17778 handles zip compression for byte strings separately.
        buf = f.getvalue()
        if path_or_buf:
            f, handles = _get_handle(path_or_buf, self.mode,
                                     encoding=encoding,
                                     compression=self.compression)
            f.write(buf)
            f.close()
            for _fh in handles:
                _fh.close()
def test_to_csv_quote_none(self):
    # GH4328
    df = DataFrame({'A': ['hello', '{"hello"}']})
    for encoding in (None, 'utf-8'):
        buf = StringIO()
        df.to_csv(buf, quoting=csv.QUOTE_NONE,
                  encoding=encoding, index=False)

        result = buf.getvalue()
        expected_rows = ['A',
                         'hello',
                         '{"hello"}']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                     'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/example.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
def test_info_shows_column_dtypes(self):
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    for i, dtype in enumerate(dtypes):
        name = '%d %d non-null %s' % (i, n, dtype)
        assert name in res
def test_to_csv_gcs(monkeypatch):
    df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan],
                     'str': ['t', 's'],
                     'dt': date_range('2018-06-18', periods=2)})
    s = StringIO()

    class MockGCSFileSystem():
        def open(*args):
            return s

    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/test.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)
def test_repr_bool_fails(self):
    s = Series([DataFrame(np.random.randn(2, 2)) for i in range(5)])

    import sys

    buf = StringIO()
    tmp = sys.stderr
    sys.stderr = buf
    try:
        # it works (with no Cython exception barf)!
        repr(s)
    finally:
        sys.stderr = tmp
    self.assertEqual(buf.getvalue(), '')
def test_to_csv_from_csv_categorical(self):
    # CSV with categoricals should result in the same output
    # as when one would add a "normal" Series/DataFrame.
    s = Series(pd.Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
    s2 = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
    res = StringIO()

    s.to_csv(res, header=False)
    exp = StringIO()

    s2.to_csv(exp, header=False)
    assert res.getvalue() == exp.getvalue()

    df = DataFrame({"s": s})
    df2 = DataFrame({"s": s2})

    res = StringIO()
    df.to_csv(res)

    exp = StringIO()
    df2.to_csv(exp)

    assert res.getvalue() == exp.getvalue()
def _get_pretty_string(obj):
    """Return a prettier version of obj

    Parameters
    ----------
    obj : object
        Object to pretty print

    Returns
    -------
    s : str
        Pretty print object repr
    """
    sio = StringIO()
    pprint.pprint(obj, stream=sio)
    return sio.getvalue()
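# Hedged usage sketch for _get_pretty_string above (not from the original
# source): pprint sorts dict keys and wraps long containers, so the output
# is stable across runs and ends with a newline, e.g.
#
#     _get_pretty_string({'b': [1, 2, 3], 'a': 1})
#     -> "{'a': 1, 'b': [1, 2, 3]}\n"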
def test_to_csv_gcs(mock):
    df1 = DataFrame({
        'int': [1, 3],
        'float': [2.0, np.nan],
        'str': ['t', 's'],
        'dt': date_range('2018-06-18', periods=2)
    })
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'],
                       index_col=0)

    assert_frame_equal(df1, df2)
def test_info_wide(self):
    from pandas import set_option, reset_option
    io = StringIO()
    df = DataFrame(np.random.randn(5, 101))
    df.info(buf=io)

    io = StringIO()
    df.info(buf=io, max_cols=101)
    rs = io.getvalue()
    self.assertTrue(len(rs.splitlines()) > 100)

    xp = rs

    set_option('display.max_info_columns', 101)
    io = StringIO()
    df.info(buf=io)
    self.assertEqual(rs, xp)

    reset_option('display.max_info_columns')
def test_read_csv_chunked_download(self, s3_resource, caplog):
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    buf = BytesIO()
    str_buf = StringIO()

    df.to_csv(str_buf)

    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv",
        Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
def to_clipboard(obj, excel=None, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv format
            for allowing easy pasting into excel.
            if False, write a string representation of the object to the
            clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    from pandas.util.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            obj.to_csv(buf, sep=sep, **kwargs)
            clipboard_set(buf.getvalue())
            return
        except:
            pass

    if isinstance(obj, DataFrame):
        # str(df) has various unhelpful defaults, like truncation
        with option_context('display.max_colwidth', 999999):
            objstr = obj.to_string(**kwargs)
    else:
        objstr = str(obj)
    clipboard_set(objstr)
def test_info_memory(self):
    # https://github.com/pandas-dev/pandas/issues/21056
    df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    bytes = float(df.memory_usage().sum())

    expected = textwrap.dedent("""\
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 1 columns):
    a    2 non-null int64
    dtypes: int64(1)
    memory usage: {} bytes
    """.format(bytes))

    assert result == expected
def test_to_html(biggie_df_fixture):
    # TODO: split this test
    df = biggie_df_fixture
    s = df.to_html()

    buf = StringIO()
    retval = df.to_html(buf=buf)
    assert retval is None
    assert buf.getvalue() == s

    assert isinstance(s, str)

    df.to_html(columns=['B', 'A'], col_space=17)
    df.to_html(columns=['B', 'A'],
               formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

    df.to_html(columns=['B', 'A'], float_format=str)
    df.to_html(columns=['B', 'A'], col_space=12, float_format=str)
def predict(args):
    predict_curl = pycurl.Curl()
    storage = StringIO()
    values = [(args.input_node_name, (pycurl.FORM_FILE, args.file_path))]

    predict_curl.setopt(predict_curl.URL, args.server_url)
    predict_curl.setopt(predict_curl.WRITEFUNCTION, storage.write)
    predict_curl.setopt(predict_curl.HTTPPOST, values)
    predict_curl.perform()
    predict_curl.close()

    content = storage.getvalue()
    content = content.replace('\n', '')

    res = json.loads(content)
    if os.path.isfile(args.label_file_path):
        labels = [
            line.strip() for line in open(args.label_file_path).readlines()
        ]
        if labels is not None:
            # map numeric class ids back to human-readable labels
            for item in res["prediction"]:
                item['class'] = labels[int(item['class'])]
    print(res)
def test_verbose_import(self):
    text = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""

    buf = StringIO()
    sys.stdout = buf

    try:  # engines are verbose in different ways
        self.read_csv(StringIO(text), verbose=True)
        if self.engine == 'c':
            self.assertIn('Tokenization took:', buf.getvalue())
            self.assertIn('Parser memory cleanup took:', buf.getvalue())
        else:  # Python engine
            self.assertEqual(buf.getvalue(),
                             'Filled 3 NA values in column a\n')
    finally:
        sys.stdout = sys.__stdout__

    buf = StringIO()
    sys.stdout = buf

    text = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""

    try:  # engines are verbose in different ways
        self.read_csv(StringIO(text), verbose=True, index_col=0)
        if self.engine == 'c':
            self.assertIn('Tokenization took:', buf.getvalue())
            self.assertIn('Parser memory cleanup took:', buf.getvalue())
        else:  # Python engine
            self.assertEqual(buf.getvalue(),
                             'Filled 1 NA values in column a\n')
    finally:
        sys.stdout = sys.__stdout__
def to_clipboard(obj, excel=None, sep=None, **kwargs):  # pragma: no cover
    """
    Attempt to write text representation of object to the system clipboard
    The clipboard can be then pasted into Excel for example.

    Parameters
    ----------
    obj : the object to write to the clipboard
    excel : boolean, defaults to True
            if True, use the provided separator, writing in a csv format
            for allowing easy pasting into excel.
            if False, write a string representation of the object to the
            clipboard
    sep : optional, defaults to tab
    other keywords are passed to to_csv

    Notes
    -----
    Requirements for your platform
      - Linux: xclip, or xsel (with gtk or PyQt4 modules)
      - Windows:
      - OS X:
    """
    from pandas.util.clipboard import clipboard_set
    if excel is None:
        excel = True

    if excel:
        try:
            if sep is None:
                sep = '\t'
            buf = StringIO()
            obj.to_csv(buf, sep=sep, **kwargs)
            clipboard_set(buf.getvalue())
            return
        except:
            pass

    clipboard_set(str(obj))
def test_info_memory_usage(self):
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert "memory usage: " in res[-1]

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    assert "memory usage: " not in res[-1]

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(r"memory usage: [^+]+\+", res[-1])

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+\+", res[-1])

    df_with_object_index.info(buf=buf, memory_usage='deep')
    res = buf.getvalue().splitlines()
    assert re.match(r"memory usage: [^+]+$", res[-1])

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity
    DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
    DataFrame(1, index=['a'], columns=['A']).index.nbytes
    df = DataFrame(data=1,
                   index=pd.MultiIndex.from_product([['a'], range(1000)]),
                   columns=['A'])
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
def test_to_csv_quoting(self):
    df = DataFrame({
        'c_bool': [True, False],
        'c_float': [1.0, 3.2],
        'c_int': [42, np.nan],
        'c_string': ['a', 'b,c'],
    })

    expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,"b,c"
"""
    result = df.to_csv()
    assert result == expected

    result = df.to_csv(quoting=None)
    assert result == expected

    result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
    assert result == expected

    expected = """\
"","c_bool","c_float","c_int","c_string"
"0","True","1.0","42.0","a"
"1","False","3.2","","b,c"
"""
    result = df.to_csv(quoting=csv.QUOTE_ALL)
    assert result == expected

    # see gh-12922, gh-13259: make sure changes to
    # the formatters do not break this behaviour
    expected = """\
"","c_bool","c_float","c_int","c_string"
0,True,1.0,42.0,"a"
1,False,3.2,"","b,c"
"""
    result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    assert result == expected

    msg = "need to escape, but no escapechar set"
    tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                           quoting=csv.QUOTE_NONE)
    tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                           quoting=csv.QUOTE_NONE,
                           escapechar=None)

    expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,b!,c
"""
    result = df.to_csv(quoting=csv.QUOTE_NONE,
                       escapechar='!')
    assert result == expected

    expected = """\
,c_bool,c_ffloat,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,bf,c
"""
    result = df.to_csv(quoting=csv.QUOTE_NONE,
                       escapechar='f')
    assert result == expected

    # see gh-3503: quoting Windows line terminators
    # presents with encoding?
    text = 'a,b,c\n1,"test \r\n",3\n'
    df = pd.read_csv(StringIO(text))
    buf = StringIO()
    df.to_csv(buf, encoding='utf-8', index=False)
    assert buf.getvalue() == text

    # xref gh-7791: make sure the quoting parameter is passed through
    # with multi-indexes
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    df = df.set_index(['a', 'b'])

    expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
    assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
def summary(self):
    """
    This returns the formatted result of the OLS computation
    """
    template = """
%(bannerTop)s

Formula: Y ~ %(formula)s

Number of Observations:         %(nobs)d
Number of Degrees of Freedom:   %(df)d

R-squared:         %(r2)10.4f
Adj R-squared:     %(r2_adj)10.4f

Rmse:              %(rmse)10.4f

F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f

Degrees of Freedom: model %(df_model)d, resid %(df_resid)d

%(bannerCoef)s
%(coef_table)s
%(bannerEnd)s
"""
    coef_table = self._coef_table

    results = self._results

    f_stat = results['f_stat']

    bracketed = ['<%s>' % str(c) for c in results['beta'].index]

    formula = StringIO()
    formula.write(bracketed[0])
    tot = len(bracketed[0])
    line = 1
    for coef in bracketed[1:]:
        tot = tot + len(coef) + 3

        if tot // (68 * line):
            formula.write('\n' + ' ' * 12)
            line += 1

        formula.write(' + ' + coef)

    params = {
        'bannerTop': scom.banner('Summary of Regression Analysis'),
        'bannerCoef': scom.banner('Summary of Estimated Coefficients'),
        'bannerEnd': scom.banner('End of Summary'),
        'formula': formula.getvalue(),
        'r2': results['r2'],
        'r2_adj': results['r2_adj'],
        'nobs': results['nobs'],
        'df': results['df'],
        'df_model': results['df_model'],
        'df_resid': results['df_resid'],
        'coef_table': coef_table,
        'rmse': results['rmse'],
        'f_stat': f_stat['f-stat'],
        'f_stat_shape': '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']),
        'f_stat_p_value': f_stat['p-value'],
    }

    return template % params
visualizer = PredictionError(RandomForestRegressor())

# Fit
visualizer.fit(X_train, y_train)

# Score and visualize
visualizer.score(X_test, y_test)
visualizer.poof()

from sklearn import tree
from IPython.core.display import Image
from pandas.compat import StringIO
import pydotplus

# Visualize tree
dot_data = StringIO()
tree.export_graphviz(model.estimators_[0], out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
image = graph.write("random_network")

from sklearn.ensemble import AdaBoostRegressor

model = AdaBoostRegressor()
model.fit(X_train, y_train)
yhat = model.predict(X_test)
r2 = r2_score(y_test, yhat)
me = mse(y_test, yhat)
print("r2={:0.3f} MSE={:0.3f}".format(r2, me))

from yellowbrick.regressor import PredictionError

# Instantiate the visualizer
visualizer = PredictionError(AdaBoostRegressor())

# Fit
def test_dtype_name_in_info(self, data):
    buf = StringIO()
    pd.DataFrame({"A": data}).info(buf=buf)
    result = buf.getvalue()
    assert data.dtype.name in result
def save(self):
    """
    Create the writer & save
    """
    # GH21227 internal compression is not used when file-like passed.
    if self.compression and hasattr(self.path_or_buf, 'write'):
        msg = ("compression has no effect when passing file-like "
               "object as input.")
        warnings.warn(msg, RuntimeWarning, stacklevel=2)

    # when zip compression is called.
    is_zip = isinstance(self.path_or_buf, ZipFile) or (
        not hasattr(self.path_or_buf, 'write')
        and self.compression == 'zip')

    if is_zip:
        # zipfile doesn't support writing string to archive. uses string
        # buffer to receive csv writing and dump into zip compression
        # file handle. GH21241, GH21118
        f = StringIO()
        close = False
    elif hasattr(self.path_or_buf, 'write'):
        f = self.path_or_buf
        close = False
    else:
        f, handles = _get_handle(self.path_or_buf, self.mode,
                                 encoding=self.encoding,
                                 compression=self.compression)
        close = True

    try:
        writer_kwargs = dict(lineterminator=self.line_terminator,
                             delimiter=self.sep, quoting=self.quoting,
                             doublequote=self.doublequote,
                             escapechar=self.escapechar,
                             quotechar=self.quotechar)
        if self.encoding == 'ascii':
            self.writer = csvlib.writer(f, **writer_kwargs)
        else:
            writer_kwargs['encoding'] = self.encoding
            self.writer = UnicodeWriter(f, **writer_kwargs)

        self._save()

    finally:
        if is_zip:
            # GH17778 handles zip compression separately.
            buf = f.getvalue()
            if hasattr(self.path_or_buf, 'write'):
                self.path_or_buf.write(buf)
            else:
                f, handles = _get_handle(self.path_or_buf, self.mode,
                                         encoding=self.encoding,
                                         compression=self.compression)
                f.write(buf)
                close = True
        if close:
            f.close()
            for _fh in handles:
                _fh.close()
def test_info_memory_usage(self):
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
              'complex128', 'object', 'bool']
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()

    # display memory usage case
    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " in res[-1])

    # do not display memory usage case
    df.info(buf=buf, memory_usage=False)
    res = buf.getvalue().splitlines()
    self.assertTrue("memory usage: " not in res[-1])

    df.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # memory usage is a lower bound, so print it as XYZ+ MB
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df.iloc[:, :5].info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()

    # excluded column with object dtype, so estimate is accurate
    self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
    df_with_object_index.info(buf=buf, memory_usage=True)
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

    df_with_object_index.info(buf=buf, memory_usage='deep')
    res = buf.getvalue().splitlines()
    self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

    self.assertGreater(
        df_with_object_index.memory_usage(index=True, deep=True).sum(),
        df_with_object_index.memory_usage(index=True).sum())

    df_object = pd.DataFrame({'a': ['a']})
    self.assertGreater(df_object.memory_usage(deep=True).sum(),
                       df_object.memory_usage().sum())

    # Test a DataFrame with duplicate columns
    dtypes = ['int64', 'int64', 'int64', 'float64']
    data = {}
    n = 100
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.randint(2, size=n).astype(dtype)
    df = DataFrame(data)
    df.columns = dtypes

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    self.assertEqual(df_size, exp_size)

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    self.assertEqual(size_df, np.size(df.memory_usage()))

    # assert deep works only on object
    self.assertEqual(df.memory_usage().sum(),
                     df.memory_usage(deep=True).sum())

    # test for validity
    DataFrame(1, index=['a'], columns=['A']).memory_usage(index=True)
    DataFrame(1, index=['a'], columns=['A']).index.nbytes
    df = DataFrame(data=1,
                   index=pd.MultiIndex.from_product([['a'], range(1000)]),
                   columns=['A'])
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
    self.assertTrue(abs(diff) < 100)
def test_dump_to_file(self):
    f = StringIO()
    ujson.dump([1, 2, 3], f)
    assert "[1,2,3]" == f.getvalue()