def _get(self, symbol, year, month, filename_cache, as_):
    """Fetch one month of data for *symbol*, preferring the local file cache.

    Parameters
    ----------
    symbol, year, month :
        Passed to ``self.url`` to build the download URL.
    filename_cache : str
        Path of the cache file; used when it exists and is non-empty.
    as_ : str
        ``'bytes'`` -> ``compat.BytesIO`` buffer, ``'string'`` ->
        ``compat.StringIO`` buffer, anything else -> raw ``response.content``.

    Returns
    -------
    (data, from_file_cache) : tuple
        ``data`` is an open file object when served from cache (the caller
        owns it and must close it), otherwise a buffer or raw bytes.
        ``from_file_cache`` flags which path was taken.

    Raises
    ------
    NotImplementedError
        When the HTTP status code is not OK (exception type kept for
        backward compatibility with existing callers).
    """
    url = self.url(symbol, year, month)
    if os.path.isfile(filename_cache) and os.path.getsize(filename_cache) > 0:
        logger.debug("loading file '%s'" % filename_cache)
        # Cache hit: return the open file descriptor; caller must close it.
        fd = open(filename_cache, 'r')
        from_file_cache = True
        return fd, from_file_cache
    else:
        logger.debug("querying url '%s'" % url)
        # Bug fix: the URL was previously requested twice in a row,
        # doubling download time and server load; issue the GET once.
        response = self.session.get(url)
        if not response.status_code == requests.codes.ok:
            msg = "status_code is %d instead of %d" % (response.status_code,
                                                       requests.codes.ok)
            raise NotImplementedError(msg)
        from_file_cache = False
        if as_ == 'bytes':
            data = compat.BytesIO(response.content)
        elif as_ == 'string':
            # NOTE(review): feeding ``response.content`` (bytes on Py3) to
            # StringIO relies on the project's ``compat`` shim -- confirm.
            data = compat.StringIO(response.content)
        else:
            data = response.content
        return data, from_file_cache
def _parse_data(data):
    """Parse raw CSV quote data into a DataFrame indexed by symbol.

    Expected (headerless) columns are: Symbol, Date (epoch milliseconds),
    Bid, Bid_point, Ask, Ask_point, High, Low, Open.  The Date column is
    converted to datetime64 before indexing.
    """
    columns = ['Symbol', 'Date', 'Bid', 'Bid_point',
               'Ask', 'Ask_point', 'High', 'Low', 'Open']
    frame = pd.read_csv(compat.StringIO(data), header=None, names=columns)
    frame['Date'] = pd.to_datetime(frame['Date'], unit='ms')
    return frame.set_index('Symbol')
def test_max_nan_bug():
    # Regression test: groupby(...).max() over an object column containing
    # missing values must not propagate NaN into the aggregated result.
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""
    frame = pd.read_csv(compat.StringIO(raw), parse_dates=[0])
    grouped = frame.groupby('Date')
    result = grouped[['File']].max()
    expected = grouped['File'].max().to_frame()
    tm.assert_frame_equal(result, expected)
    assert not result['File'].isna().any()
def test_info_categorical_column(self):
    # Smoke test: DataFrame.info() must work on a frame that holds a
    # categorical column, both before and after boolean filtering.
    n = 2500
    df = DataFrame({'int64': np.random.randint(100, size=n)})
    codes = np.random.randint(0, 10, size=n)
    df['category'] = Series(
        np.array(list('abcdefghij')).take(codes)).astype('category')
    # Exercise isna() on the categorical column (result intentionally
    # discarded -- the call itself must not raise).
    df.isna()
    buf = StringIO()
    df.info(buf=buf)
    df2 = df[df['category'] == 'd']
    buf = compat.StringIO()
    df2.info(buf=buf)
def test_apply_issues():
    # GH 5788: groupby(...).apply(idxmax) must match groupby(...).idxmax()
    s = """2011.05.16,00:00,1.40893
2011.05.16,01:00,1.40760
2011.05.16,02:00,1.40750
2011.05.16,03:00,1.40649
2011.05.17,02:00,1.40893
2011.05.17,03:00,1.40760
2011.05.17,04:00,1.40750
2011.05.17,05:00,1.40649
2011.05.18,02:00,1.40893
2011.05.18,03:00,1.40760
2011.05.18,04:00,1.40750
2011.05.18,05:00,1.40649"""
    frame = pd.read_csv(compat.StringIO(s), header=None,
                        names=['date', 'time', 'value'],
                        parse_dates=[['date', 'time']])
    frame = frame.set_index('date_time')
    expected = frame.groupby(frame.index.date).idxmax()
    result = frame.groupby(frame.index.date).apply(lambda grp: grp.idxmax())
    tm.assert_frame_equal(result, expected)

    # GH 5789: grouping on a string date column must not auto-coerce dates
    frame = pd.read_csv(compat.StringIO(s), header=None,
                        names=['date', 'time', 'value'])
    exp_idx = pd.Index(['2011.05.16', '2011.05.17', '2011.05.18'],
                       dtype=object, name='date')
    expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
    result = frame.groupby('date').apply(
        lambda grp: grp['time'][grp['value'].idxmax()])
    tm.assert_series_equal(result, expected)
def _lexer_split_from_str(dt_str):
    """Tokenize *dt_str* with dateutil's ``_timelex``.

    The value is wrapped in ``StringIO(str(...))`` because dateutil 2.2's
    lexer expects a stream rather than a plain string.
    """
    stream = compat.StringIO(str(dt_str))
    return _timelex.split(stream)