def load_from_csv(self, tickers, index, fields=Fields.QUOTES, **kwargs):
    ''' Return a quote panel '''
    # TODO Replace adj_close with actual_close
    # TODO Add reindex methods, and start, end, delta parameters
    reverse = kwargs.get('reverse', False)
    verbose = kwargs.get('verbose', False)
    symbols = None
    if self.connected['database']:
        symbols, markets = self.db.getTickersCodes(tickers)
    if not symbols:
        self._logger.error('** No database connection nor symbol information provided')
        return None
    timestamps = du.getNYSEdays(index[0], index[-1], dt.timedelta(hours=16))
    csv = da.DataAccess('Yahoo')
    df = csv.get_data(timestamps, symbols.values(), fields, verbose=verbose)
    quotes_dict = dict()
    for ticker in tickers:
        j = 0
        quotes_dict[ticker] = dict()
        for field in fields:
            serie = df[j][symbols[ticker]].groupby(
                index.freq.rollforward).aggregate(np.mean)
            # TODO add a function parameter to decide what to do about
            # missing values
            clean_serie = serie.fillna(method='pad')
            quotes_dict[ticker][field] = clean_serie
            j += 1
    if reverse:
        return Panel.from_dict(quotes_dict, intersect=True, orient='minor')
    return Panel.from_dict(quotes_dict, intersect=True)
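# A hypothetical usage sketch for load_from_csv above: 'agent' stands in for
# whatever object exposes the method, and the ticker names are placeholders.
# The index must carry a freq attribute, since the loader groups each series
# by index.freq.rollforward before padding missing values.
index = pd.date_range('2011-01-03', '2011-06-30', freq='B')
quotes = agent.load_from_csv(['google', 'apple'], index, verbose=True)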
def test_swaplevel_panel(self):
    panel = Panel({"ItemA": self.frame, "ItemB": self.frame * 2})
    result = panel.swaplevel(0, 1, axis="major")
    expected = panel.copy()
    expected.major_axis = expected.major_axis.swaplevel(0, 1)
    tm.assert_panel_equal(result, expected)
def test_swaplevel_panel(self):
    panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})
    result = panel.swaplevel(0, 1, axis='major')
    expected = panel.copy()
    expected.major_axis = expected.major_axis.swaplevel(0, 1)
    tm.assert_panel_equal(result, expected)
def execute(self, strategy):
    forecasts = {}
    for forecaster in self.forecasters:
        forecasts[forecaster.name] = forecaster(strategy).data
    forecasts = Panel(forecasts)
    mean_fcst = self.normalise(forecasts.mean(axis='items'))
    return Signal(mean_fcst, [-20, 20], forecasts)
def test_panel_join_many(self):
    tm.K = 10
    panel = tm.makePanel()
    tm.K = 4

    panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]

    joined = panels[0].join(panels[1:])
    tm.assert_panel_equal(joined, panel)

    panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]

    data_dict = {}
    for p in panels:
        data_dict.update(p.iteritems())

    joined = panels[0].join(panels[1:], how='inner')
    expected = Panel.from_dict(data_dict, intersect=True)
    tm.assert_panel_equal(joined, expected)

    joined = panels[0].join(panels[1:], how='outer')
    expected = Panel.from_dict(data_dict, intersect=False)
    tm.assert_panel_equal(joined, expected)

    # edge cases
    self.assertRaises(ValueError, panels[0].join, panels[1:],
                      how='outer', lsuffix='foo', rsuffix='bar')
    self.assertRaises(ValueError, panels[0].join, panels[1:], how='right')
def create_data():
    """ create the pickle/msgpack data """
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one',
                     'two']])),
        names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64),
                                index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  per=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = dict(float=DataFrame(dict(A=series['float'],
                                      B=series['float'] + 1)),
                 int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
                 mixed=DataFrame(dict([(k, data[k])
                                       for k in ['A', 'B', 'C', 'D']])),
                 mi=DataFrame(dict(A=np.arange(5).astype(np.float64),
                                   B=np.arange(5).astype(np.int64)),
                              index=MultiIndex.from_tuples(
                                  tuple(zip(*[['bar', 'bar', 'baz', 'baz',
                                               'baz'],
                                              ['one', 'two', 'one', 'two',
                                               'three']])),
                                  names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(dict(
                     A=Categorical(['foo', 'bar', 'baz']),
                     B=np.arange(3).astype(np.int64))),
                 mixed_dup=mixed_dup_df)

    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    panel = dict(float=Panel(dict(ItemA=frame['float'],
                                  ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
def test_resample_panel(self):
    rng = date_range("1/1/2000", "6/30/2000")
    n = len(rng)

    panel = Panel(
        np.random.randn(3, n, 5),
        items=["one", "two", "three"],
        major_axis=rng,
        minor_axis=["a", "b", "c", "d", "e"],
    )

    result = panel.resample("M", axis=1)

    def p_apply(panel, f):
        result = {}
        for item in panel.items:
            result[item] = f(panel[item])
        return Panel(result, items=panel.items)

    expected = p_apply(panel, lambda x: x.resample("M"))
    tm.assert_panel_equal(result, expected)

    panel2 = panel.swapaxes(1, 2)
    result = panel2.resample("M", axis=2)
    expected = p_apply(panel2, lambda x: x.resample("M", axis=1))
    tm.assert_panel_equal(result, expected)
def test_resample_panel():
    rng = date_range('1/1/2000', '6/30/2000')
    n = len(rng)

    with catch_warnings(record=True):
        simplefilter("ignore", FutureWarning)
        panel = Panel(np.random.randn(3, n, 5),
                      items=['one', 'two', 'three'],
                      major_axis=rng,
                      minor_axis=['a', 'b', 'c', 'd', 'e'])

        result = panel.resample('M', axis=1).mean()

        def p_apply(panel, f):
            result = {}
            for item in panel.items:
                result[item] = f(panel[item])
            return Panel(result, items=panel.items)

        expected = p_apply(panel, lambda x: x.resample('M').mean())
        tm.assert_panel_equal(result, expected)

        panel2 = panel.swapaxes(1, 2)
        result = panel2.resample('M', axis=2).mean()
        expected = p_apply(panel2, lambda x: x.resample('M', axis=1).mean())
        tm.assert_panel_equal(result, expected)
def test_panel_setitem(self):
    with catch_warnings(record=True):
        # GH 7763
        # loc and setitem have setting differences
        np.random.seed(0)
        index = range(3)
        columns = list('abc')

        panel = Panel({'A': DataFrame(np.random.randn(3, 3),
                                      index=index, columns=columns),
                       'B': DataFrame(np.random.randn(3, 3),
                                      index=index, columns=columns),
                       'C': DataFrame(np.random.randn(3, 3),
                                      index=index, columns=columns)})

        replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns)
        expected = Panel({'A': replace, 'B': replace, 'C': replace})

        p = panel.copy()
        for idx in list('ABC'):
            p[idx] = replace
        tm.assert_panel_equal(p, expected)

        p = panel.copy()
        for idx in list('ABC'):
            p.loc[idx, :, :] = replace
        tm.assert_panel_equal(p, expected)
def execute(self, strategy):
    measures = {}
    for pars in self.par_pairs:
        name = "ewmac_{}_{}".format(max(pars), min(pars))
        ewmac = EWMAC(EMA(max(pars)), EMA(min(pars)), self.vol)
        measures[name] = ewmac(strategy).data
    measures = Panel(measures)
    return Signal(measures.mean(axis='items'), [-20, 20], measures)
def test_resample_panel_numpy(self):
    rng = date_range('1/1/2000', '6/30/2000')
    n = len(rng)

    panel = Panel(np.random.randn(3, n, 5),
                  items=['one', 'two', 'three'],
                  major_axis=rng,
                  minor_axis=['a', 'b', 'c', 'd', 'e'])

    result = panel.resample('M', how=lambda x: x.mean(), axis=1)
    expected = panel.resample('M', how='mean', axis=1)
    tm.assert_panel_equal(result, expected)
def test_sparse_panel(self):
    items = ["x", "y", "z"]
    p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
    sp = p.to_sparse()
    self._check_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True)

    sp2 = p.to_sparse(kind="integer")
    self._check_roundtrip(sp2, tm.assert_panel_equal, check_panel_type=True)

    sp3 = p.to_sparse(fill_value=0)
    self._check_roundtrip(sp3, tm.assert_panel_equal, check_panel_type=True)
def run(mpfile, **kwargs):
    from pandas import Panel, np
    # read_csv expects a path or buffer, so wrap the in-memory CSV text
    from StringIO import StringIO

    meta_data = mpfile.document['_hdata'].pop('input')
    file_path = os.path.join(os.environ['HOME'], 'work',
                             meta_data['file_path'])
    if not os.path.exists(file_path):
        print 'Please upload', file_path
        return

    table_columns = meta_data['table_columns'].split(' -- ')
    identifier = mpfile.ids[0]

    with tarfile.open(file_path, "r:gz") as tar:
        for member in tar.getmembers():
            name = os.path.splitext(member.name)[0]
            print 'load', name, '...'
            f = tar.extractfile(member)
            if 'pump' in name:
                #fstr = f.read()
                #fstr = ''.join([f.readline() for x in xrange(10)])
                # only load a small area
                list1, list2 = range(1), range(6)
                tuples = [(x, y) for x in list1 for y in list2]
                delta = 150
                for x, y in tuples:
                    lines = []
                    for i in xrange((x + 1) * delta):
                        line = f.readline()
                        if i > x * delta:
                            lines.append(line)
                    sub_lines = []
                    for line in lines:
                        sub_line = line.strip().split(',')[y * delta:(y + 1) * delta]
                        sub_lines.append(','.join(sub_line))
                    fstr = '\n'.join(sub_lines)
                    print 'read_csv ...'
                    df = read_csv(StringIO(fstr), header=None)
                    arr = [[[cell] for cell in row] for row in df.values]
                    sub_name = '{}_{}_{}'.format(name, x, y)
                    df = Panel(arr, minor_axis=[sub_name]).transpose(2, 0, 1).to_frame()
                    print df.head()
                    print 'add', sub_name, '...'
                    mpfile.add_data_table(identifier, df, sub_name)
                    f.seek(0)
            else:
                fstr = f.read()
                df = read_csv(StringIO(fstr), names=table_columns)
                print 'add', name, '...'
                mpfile.add_data_table(identifier, df, name)

    print 'Added data from {}'.format(file_path)
def test_resample_panel_numpy(self):
    rng = date_range("1/1/2000", "6/30/2000")
    n = len(rng)

    panel = Panel(
        np.random.randn(3, n, 5),
        items=["one", "two", "three"],
        major_axis=rng,
        minor_axis=["a", "b", "c", "d", "e"],
    )

    result = panel.resample("M", how=lambda x: x.mean(), axis=1)
    expected = panel.resample("M", how="mean", axis=1)
    tm.assert_panel_equal(result, expected)
def test_pipe_panel(self):
    with catch_warnings(record=True):
        wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})})
        f = lambda x, y: x + y
        result = wp.pipe(f, 2)
        expected = wp + 2
        assert_panel_equal(result, expected)

        result = wp.pipe((f, 'y'), x=1)
        expected = wp + 1
        assert_panel_equal(result, expected)

        with pytest.raises(ValueError):
            result = wp.pipe((f, 'y'), x=1, y=1)
def test_sparse_panel(self):
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        items = ["x", "y", "z"]
        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))
        sp = p.to_sparse()
        self._check_roundtrip(sp, tm.assert_panel_equal,
                              check_panel_type=True)

        sp2 = p.to_sparse(kind="integer")
        self._check_roundtrip(sp2, tm.assert_panel_equal,
                              check_panel_type=True)

        sp3 = p.to_sparse(fill_value=0)
        self._check_roundtrip(sp3, tm.assert_panel_equal,
                              check_panel_type=True)
def test_sparse_panel(self):
    items = ['x', 'y', 'z']
    p = Panel(dict((i, tm.makeDataFrame()) for i in items))
    sp = p.to_sparse()
    self._check_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True)

    sp2 = p.to_sparse(kind='integer')
    self._check_roundtrip(sp2, tm.assert_panel_equal, check_panel_type=True)

    sp3 = p.to_sparse(fill_value=0)
    self._check_roundtrip(sp3, tm.assert_panel_equal, check_panel_type=True)
class PanelMethods(object):

    params = ['items', 'major', 'minor']
    param_names = ['axis']

    def setup(self, axis):
        with warnings.catch_warnings(record=True):
            self.panel = Panel(np.random.randn(100, 1000, 100))

    def time_pct_change(self, axis):
        with warnings.catch_warnings(record=True):
            self.panel.pct_change(1, axis=axis)

    def time_shift(self, axis):
        with warnings.catch_warnings(record=True):
            self.panel.shift(1, axis=axis)
def rolling_corr_pairwise(df, window, min_periods=None):
    """
    Computes pairwise rolling correlation matrices as Panel whose items are
    dates.

    Parameters
    ----------
    df : DataFrame
    window : int
    min_periods : int, default None

    Returns
    -------
    correls : Panel
    """
    from pandas import Panel
    from collections import defaultdict

    all_results = defaultdict(dict)

    for i, k1 in enumerate(df.columns):
        for k2 in df.columns[i:]:
            corr = rolling_corr(df[k1], df[k2], window,
                                min_periods=min_periods)
            all_results[k1][k2] = corr
            all_results[k2][k1] = corr

    return Panel.from_dict(all_results).swapaxes('items', 'major')
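# A minimal usage sketch for rolling_corr_pairwise, assuming an old pandas
# release where rolling_corr and Panel still exist; the column names and
# window length are arbitrary illustrations.
df = DataFrame(np.random.randn(100, 3), columns=['a', 'b', 'c'],
               index=date_range('2000-01-01', periods=100))
correls = rolling_corr_pairwise(df, window=20, min_periods=10)
# items are dates, so correls[df.index[-1]] is the 3x3 correlation
# matrix computed over the final window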
def load_secoora_ncs(run_name):
    """
    Loads local files using the run_name date.
    NOTE: Consider moving this inside the notebook.

    """
    fname = '{}-{}.nc'.format
    OBS_DATA = nc2df(os.path.join(run_name,
                                  fname(run_name, 'OBS_DATA')))
    SECOORA_OBS_DATA = nc2df(os.path.join(run_name,
                                          fname(run_name,
                                                'SECOORA_OBS_DATA')))

    ALL_OBS_DATA = concat([OBS_DATA, SECOORA_OBS_DATA], axis=1)
    index = ALL_OBS_DATA.index

    dfs = dict(OBS_DATA=ALL_OBS_DATA)
    for fname in glob(os.path.join(run_name, "*.nc")):
        if 'OBS_DATA' in fname:
            continue
        else:
            model = fname.split('.')[0].split('-')[-1]
            df = nc2df(fname)
            # FIXME: Horrible work around duplicate times.
            if len(df.index.values) != len(np.unique(df.index.values)):
                kw = dict(subset='index', keep='last')
                df = df.reset_index().drop_duplicates(**kw).set_index('index')
            kw = dict(method='time', limit=30)
            df = df.reindex(index).interpolate(**kw).ix[index]
            dfs.update({model: df})

    return Panel.fromDict(dfs).swapaxes(0, 2)
def __init__(self, tickers, start_date, end_date):
    ''' Constructor '''
    self.start = start_date
    self.end = end_date
    self.downloader = Data.Handler(
        "/home/mark/Data/MarketData/Stocks/Python/")
    self.instruments = Panel({ticker: DataFrame(None)
                              for ticker in sorted(tickers)})
def test_panel_assignment(self):
    with catch_warnings(record=True):
        # GH3777
        wp = Panel(np.random.randn(2, 5, 4),
                   items=['Item1', 'Item2'],
                   major_axis=date_range('1/1/2000', periods=5),
                   minor_axis=['A', 'B', 'C', 'D'])
        wp2 = Panel(np.random.randn(2, 5, 4),
                    items=['Item1', 'Item2'],
                    major_axis=date_range('1/1/2000', periods=5),
                    minor_axis=['A', 'B', 'C', 'D'])

        # TODO: unused?
        # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']]

        with pytest.raises(NotImplementedError):
            wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = wp2.loc[
                ['Item1', 'Item2'], :, ['A', 'B']]
def test_big_table(self):
    raise nose.SkipTest('no big table')

    # create and write a big table
    wp = Panel(np.random.randn(20, 1000, 1000),
               items=['Item%s' % i for i in xrange(20)],
               major_axis=date_range('1/1/2000', periods=1000),
               minor_axis=['E%s' % i for i in xrange(1000)])

    wp.ix[:, 100:200, 300:400] = np.nan

    try:
        store = HDFStore(self.scratchpath)
        store._debug_memory = True
        store.append('wp', wp)
        recons = store.select('wp')
    finally:
        store.close()
        os.remove(self.scratchpath)
def test_panel_setitem_with_multiindex(self):
    with catch_warnings(record=True):
        # 10360
        # failing with a multi-index
        arr = np.array([[[1, 2, 3], [0, 0, 0]],
                        [[0, 0, 0], [0, 0, 0]]],
                       dtype=np.float64)

        # reg index
        axes = dict(items=['A', 'B'], major_axis=[0, 1],
                    minor_axis=['X', 'Y', 'Z'])
        p1 = Panel(0., **axes)
        p1.iloc[0, 0, :] = [1, 2, 3]
        expected = Panel(arr, **axes)
        tm.assert_panel_equal(p1, expected)

        # multi-indexes
        axes['items'] = MultiIndex.from_tuples(
            [('A', 'a'), ('B', 'b')])
        p2 = Panel(0., **axes)
        p2.iloc[0, 0, :] = [1, 2, 3]
        expected = Panel(arr, **axes)
        tm.assert_panel_equal(p2, expected)

        axes['major_axis'] = MultiIndex.from_tuples(
            [('A', 1), ('A', 2)])
        p3 = Panel(0., **axes)
        p3.iloc[0, 0, :] = [1, 2, 3]
        expected = Panel(arr, **axes)
        tm.assert_panel_equal(p3, expected)

        axes['minor_axis'] = MultiIndex.from_product(
            [['X'], range(3)])
        p4 = Panel(0., **axes)
        p4.iloc[0, 0, :] = [1, 2, 3]
        expected = Panel(arr, **axes)
        tm.assert_panel_equal(p4, expected)

        arr = np.array([[[1, 0, 0], [2, 0, 0]],
                        [[0, 0, 0], [0, 0, 0]]],
                       dtype=np.float64)
        p5 = Panel(0., **axes)
        p5.iloc[0, :, 0] = [1, 2]
        expected = Panel(arr, **axes)
        tm.assert_panel_equal(p5, expected)
def as_dataframe(self):
    """
    Creates a dataframe object for a shapefile's main layer using
    layer_as_dataframe. This object is cached on disk for layer use, but
    the cached copy will only be picked up if the shapefile's mtime is
    older than the dataframe's mtime.

    :return: either a pandas DataFrame object if there is but one raster
        band or a Panel if there are N.
    """
    dfx_path = self.get_filename('dfx')
    tiff_path = self.get_filename('tif')

    if hasattr(self, '_df'):
        return self._df
    elif os.path.exists(dfx_path) and \
            os.stat(dfx_path).st_mtime >= os.stat(tiff_path).st_mtime:
        # use the module-level pandas.read_pickle; Panel itself has no
        # read_pickle classmethod
        self._df = pandas.read_pickle(dfx_path)
        return self._df
    else:
        ds = gdal.Open(tiff_path)
        try:
            df = Panel(ds.ReadAsArray())
            df.to_pickle(dfx_path)
            self._df = df
            return self._df
        except Exception:
            # a 2D (single-band) array cannot build a Panel; fall back
            df = DataFrame(ds.ReadAsArray())
            df.to_pickle(dfx_path)
            self._df = df
            return self._df
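# A sketch of the caching contract described in the docstring above; the
# RasterLayer wrapper name and file name are hypothetical.
layer = RasterLayer('elevation.shp')
bands = layer.as_dataframe()  # first call: opens the GeoTIFF, writes the dfx pickle
bands = layer.as_dataframe()  # later calls: served from memory or the fresher pickle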
def apply(self, func, *args, **kwargs):
    result = {}
    for key, df in self.obj.iteritems():
        grp = DataFrameGroupBy(df, grouper=self.grouper)
        if callable(func):
            # assumed behavior for the callable path, which the original
            # left unhandled: delegate to GroupBy.apply
            res = grp.apply(func, *args, **kwargs)
        else:
            # func is the name of a groupby method, e.g. 'mean'
            f = getattr(grp, func)
            res = f(*args, **kwargs)
        result[key] = res
    return Panel.from_dict(result)
def test_panel_aggregation():
    ind = pd.date_range('1/1/2000', periods=100)
    data = np.random.randn(2, len(ind), 4)
    wp = Panel(data, items=['Item1', 'Item2'], major_axis=ind,
               minor_axis=['A', 'B', 'C', 'D'])

    tg = TimeGrouper('M', axis=1)
    _, grouper, _ = tg._get_grouper(wp)
    bingrouped = wp.groupby(grouper)
    binagg = bingrouped.mean()

    def f(x):
        assert (isinstance(x, Panel))
        return x.mean(1)

    result = bingrouped.agg(f)
    tm.assert_panel_equal(result, binagg)
def _check(frame):
    dense_frame = frame.to_dense()

    wp = Panel.from_dict({"foo": frame})
    from_dense_lp = wp.to_frame()

    from_sparse_lp = spf.stack_sparse_frame(frame)

    self.assert_(np.array_equal(from_dense_lp.values,
                                from_sparse_lp.values))
def testFamaMacBethRolling(self):
    # self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y,
    #                               nw_lags_beta=2)

    # df = DataFrame(np.random.randn(50, 10))
    x = dict((k, DataFrame(np.random.randn(50, 10))) for k in "abcdefg")
    x = Panel.from_dict(x)
    y = (DataFrame(np.random.randn(50, 10)) +
         DataFrame(0.01 * np.random.randn(50, 10)))
    self.checkFamaMacBethExtended("rolling", x, y, nw_lags_beta=2)
    self.checkFamaMacBethExtended("expanding", x, y, nw_lags_beta=2)
def _check(frame):
    dense_frame = frame.to_dense()  # noqa

    wp = Panel.from_dict({'foo': frame})
    from_dense_lp = wp.to_frame()

    from_sparse_lp = spf.stack_sparse_frame(frame)

    self.assert_numpy_array_equal(from_dense_lp.values,
                                  from_sparse_lp.values)
def create_data():
    """ create the pickle/msgpack data """
    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T',
                                           periods=10))

    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo',
                     u'foo', u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one',
                     u'two', u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(
                      tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                      names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")

    frame = dict(
        float=DataFrame({u'A': series[u'float'],
                         u'B': series[u'float'] + 1}),
        int=DataFrame({u'A': series[u'int'],
                       u'B': series[u'int'] + 1}),
        mixed=DataFrame({k: data[k] for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                      u'B': np.arange(5).astype(np.int64)},
                     index=MultiIndex.from_tuples(
                         tuple(zip(*[[u'bar', u'bar', u'baz', u'baz',
                                      u'baz'],
                                     [u'one', u'two', u'one', u'two',
                                      u'three']])),
                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({u'A': Categorical([u'foo', u'bar',
                                                    u'baz']),
                                 u'B': np.arange(3).astype(np.int64)}),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame({u'A': Timestamp('20130102',
                                                tz='US/Eastern'),
                                u'B': Timestamp('20130603', tz='CET')},
                               index=range(5)),
        dt_mixed2_tzs=DataFrame({u'A': Timestamp('20130102',
                                                 tz='US/Eastern'),
                                 u'B': Timestamp('20130603', tz='CET'),
                                 u'C': Timestamp('20130603', tz='UTC')},
                                index=range(5)))

    with catch_warnings(record=True):
        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                                 u'ItemB': frame[u'int']})
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(float=Panel({u'ItemA': frame[u'float'],
                                  u'ItemB': frame[u'float'] + 1}),
                     dup=Panel(
                         np.arange(30).reshape(3, 5, 2).astype(np.float64),
                         items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    off = {'DateOffset': DateOffset(years=1),
           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
           'MonthBegin': MonthBegin(1),
           'MonthEnd': MonthEnd(1),
           'QuarterBegin': QuarterBegin(1),
           'QuarterEnd': QuarterEnd(1),
           'Day': Day(1),
           'YearBegin': YearBegin(1),
           'YearEnd': YearEnd(1),
           'Week': Week(1),
           'Week_Tues': Week(2, normalize=False, weekday=1),
           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
           'FY5253': FY5253(n=2, weekday=6, startingMonth=7,
                            variation="last"),
           'Easter': Easter(),
           'Hour': Hour(1),
           'Minute': Minute(1)}

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
def test_dense_to_sparse(self):
    wp = Panel.from_dict(self.data_dict)
    dwp = wp.to_sparse()
    tm.assertIsInstance(dwp['ItemA']['A'], SparseSeries)
def WQXtoPandas(xmlLocation,
                charDict,
                outputPath='.',
                fromFile=False,
                outputDirName='Processed-Sites',
                RUN_PHREEQC=False,
                PHREEQC_PATH='/home/mcoving/phreeqc-2.18.0/bin/',
                DATABASE_FILE='/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat',
                LOG_FILE='Result.log',
                START_FILE=None,
                splittag='',
                bracket_charge_balance=False):
    """
    Processes a WQX xml data file and loads data for each site in the WQX
    file into Pandas data objects that are stored in directories for each
    site.

    Parameters
    ----------
    xmlLocation : string
        Content depends on mode in which WQXtoPandas is run. When fromFile
        is set to False (input methods 2 or 3 in excel file) this string
        contains the html for a query to the USGS NWIS database to obtain
        an xml file of the desired data. Alternatively, if fromFile is True
        (input method 1 in excel file) then this string contains the name
        of the xml file from which to read the data.
    charDict : dict
        A dictionary containing information about the characteristics to
        be processed. Keys are EPA SRS characteristic names. Each entry in
        the dictionary is a second dictionary that contains keys
        IsRequired, pcode, fraction, and quality. These entries tell
        WQXtoPandas whether a given characteristic is required in order to
        process a sample, and whether a specific pcode, fraction, or
        quality should be required. See excel example file for more
        details.
    outputPath : string
        path to directory that will contain output directory
    fromFile : boolean
        True if data will be read from an xml file already present on
        computer. False if xml file should be queried from NWIS.
        (Default=False)
    outputDirName : string
        Name of output directory where all site data will be written out.
        (Default='Processed-Sites')
    RUN_PHREEQC : boolean
        Set to true if samples should be processed through PHREEQC.
        (Default=False)
    PHREEQC_PATH : string
        Path to PHREEQC executable (folder only, not executable file name)
    DATABASE_FILE : string
        Path to database file that PHREEQC should use, including database
        file name.
    LOG_FILE : string
        Name of log file that WQXtoPandas will create.
        (Default='Result.log')
    START_FILE : string
        Name of xls start file that was used to run this instance of
        WQXtoPandas. Name will be written out in log file.
    bracket_charge_balance : bool
        If set to true, WQXtoPandas will alternately force charge balance
        on calcium and alkalinity. While the latter is not physically
        meaningful, this provides a useful estimate of uncertainty for
        cases with high charge balance errors. This is most useful for
        water that is very dilute or with high organic content, such that
        titrated alkalinity values are artificially high.

    Returns
    -------
    Returns 0 if execution successful. Returns -1 in case of error.

    Notes
    -----
    Designed to be run through convenience function runWQXtoPandas().
    """
    try:
        #Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print("sitesdir", sitesdir)
        if not (os.path.exists(sitesdir)):
            try:
                os.makedirs(sitesdir)
            except os.error:
                print(("Problem creating output directory. "
                       "Check output path name: " + outputPath))
                return -1
        #create xml tree
        if fromFile:
            #read from file
            wqxtree = etree.ElementTree(file=xmlLocation)
        else:
            #check whether we already have a matching xml file
            xmlSaveFile = LOG_FILE + splittag + '.xml'
            if (os.path.isfile(xmlSaveFile)):
                goodAnswer = False
                while not (goodAnswer):
                    answer = input("An xml file (" + xmlSaveFile +
                                   ") already exists. \n Use this instead"
                                   " of html query (y or n)?")
                    if (answer.startswith('y')):
                        #read from file
                        wqxtree = etree.ElementTree(file=xmlSaveFile)
                        goodAnswer = True
                        queryXML = False
                    elif (answer.startswith('n')):
                        goodAnswer = True
                        queryXML = True
            else:
                queryXML = True
            #If we don't have a matching xml file, or we want to obtain a
            #new one, then get the new xml
            if (queryXML):
                print("Obtaining xml file from USGS NWIS using html "
                      "query...")
                #parse from html query
                print("XML query string: ", xmlLocation)
                r = requests.get(xmlLocation)
                if not r.ok:
                    #There is some problem with the xml query
                    print("Response: ", str(r))
                    print("Reason: ", r.reason)
                    print("Warning: ", r.headers['Warning'])
                #write to xml file
                try:
                    #write xml to file
                    xmlFile = open(xmlSaveFile, 'w')
                    print(r.text, file=xmlFile)
                    xmlFile.close()
                    wqxtree = etree.ElementTree(file=xmlSaveFile)
                except IOError:
                    print(("Problem writing to xml file to store html "
                           "query: " + xmlSaveFile))
                    return -1
        #begin parsing XML tree
        root = wqxtree.getroot()
        #get namespace map
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]
        #iterate over all <Activity> tags within file and process each
        #sample
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        sitesMetaDict = {}
        for activity in wqxtree.getiterator(tag=WQX + "Activity"):
            processThisSample = True
            reason = ''
            description = activity.find(WQX + "ActivityDescription")
            if (description != None):
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if (starttime != None):
                    timetext = starttime.findtext(WQX + "Time")
                    timezone = starttime.findtext(WQX + "TimeZoneCode")
                else:
                    timetext = ''
                    timezone = ''
                location = description.findtext(
                    WQX + "MonitoringLocationIdentifier")
                descriptionDict = {
                    'location': location,
                    'date': datetext,
                    'time': timetext,
                    'timezone': timezone
                }
            else:
                descriptionDict = None
                processThisSample = False
                reason = 'No description'
            print(('Processing sample from ' + location + ' on ' +
                   datetext))
            #create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            #iterate though all results for this activity
            for result in activity.getiterator(tag=WQX + 'Result'):
                if (processThisSample):
                    try:
                        resultdesc = result.find(WQX + "ResultDescription")
                        characteristic = resultdesc.findtext(
                            WQX + "CharacteristicName")
                        if (characteristic in charDict):
                            samplefraction = resultdesc.findtext(
                                WQX + "ResultSampleFractionText")
                            pcode = resultdesc.findtext(WQX + "USGSPCode")
                            quality = resultdesc.findtext(
                                WQX + "ResultStatusIdentifier")
                            measure = resultdesc.find(WQX + "ResultMeasure")
                            count = 1.0
                            if not (measure == None):
                                value = measure.findtext(
                                    WQX + "ResultMeasureValue")
                                units = measure.findtext(
                                    WQX + "MeasureUnitCode")
                                #split pcode into list
                                tempPcodeList = charDict[characteristic][
                                    'pcode'].split(';')
                                # print("tempPcodeList="+str(tempPcodeList))
                                pcodeDict = {}
                                for codePriority, code in enumerate(
                                        tempPcodeList):
                                    code = code.strip()
                                    if code != '':
                                        pcodeDict[code] = codePriority
                                #Check whether characteristic meets criteria
                                #for inclusion, otherwise don't add to
                                #sampleDict
                                addCharacteristic = True
                                if (charDict[characteristic]['fraction'] !=
                                        '0'):
                                    #test for correct fraction
                                    if (charDict[characteristic]['fraction']
                                            != samplefraction):
                                        addCharacteristic = False
                                if (addCharacteristic):
                                    if (charDict[characteristic]['pcode'] !=
                                            '0'):
                                        #test for correct pcode
                                        # print("pcode = "+pcode)
                                        # print("pcodeList = "+str(pcodeList))
                                        # print("pcode in list="+str(pcode in pcodeList))
                                        if not (pcode in pcodeDict):
                                            addCharacteristic = False
                                if (addCharacteristic):
                                    if (charDict[characteristic]['quality']
                                            != '0'):
                                        #test for correct data quality
                                        if (charDict[characteristic][
                                                'quality'] != quality):
                                            addCharacteristic = False
                                #end of characteristic criteria check
                                #Process duplicate characteristics
                                if (addCharacteristic):
                                    if (characteristic in sampleDict):
                                        priorPcode = sampleMetaDict[
                                            characteristic]['pcode']
                                        #if there are already multiple
                                        #pcodes get only first one
                                        priorPcode = priorPcode.split(';')[0]
                                        averageValue = False
                                        if (len(pcodeDict) > 1):
                                            thisPcodePriority = pcodeDict[
                                                pcode]
                                            priorPcodePriority = \
                                                pcodeDict[priorPcode]
                                            if (thisPcodePriority >
                                                    priorPcodePriority):
                                                #previous characteristic
                                                #remains
                                                addCharacteristic = False
                                            elif (thisPcodePriority ==
                                                    priorPcodePriority):
                                                averageValue = True
                                        else:
                                            averageValue = True
                                        if averageValue:
                                            #average this value with
                                            #existing values
                                            count = sampleMetaDict[
                                                characteristic]['count']
                                            count += 1.
                                            oldvalue = float(
                                                sampleDict[characteristic])
                                            newvalue = (oldvalue *
                                                        (count - 1.) +
                                                        float(value)) / count
                                            value = str(newvalue)
                                            pcode = priorPcode + '; ' + pcode
                                            priorUnits = sampleMetaDict[
                                                characteristic]['units']
                                            units = priorUnits + '; ' + units
                                if (addCharacteristic):
                                    sampleDict[characteristic] = value
                                    sampleMetaDict[characteristic] = {
                                        'samplefraction': samplefraction,
                                        'units': units,
                                        'pcode': pcode,
                                        'quality': quality,
                                        'count': count
                                    }
                    #end results loop
                    except etree.XMLSyntaxError as detail:
                        print("File contains invalid XML syntax: ", detail)
                        processThisSample = False
                        reason = "Entry contains invalid XML syntax."
            #check whether sample has all the required constituents
            # print "Checking for requirements."
            if (processThisSample):
                for characteristic in charDict.keys():
                    if (charDict[characteristic]['IsRequired'] != '0'):
                        if not (characteristic in sampleDict):
                            processThisSample = False
                            reason += characteristic + ' not available. '
            if (processThisSample):
                #check to see whether site directory exists, if not,
                #create it
                sampledir = os.path.join(sitesdir, location)
                if not (os.path.exists(sampledir)):
                    try:
                        os.makedirs(sampledir)
                    except os.error:
                        print(("Problem creating location directory: " +
                               sampledir))
                        processThisSample = False
                        reason = ("Problem creating location directory: " +
                                  sampledir)
            if (processThisSample):
                #Pull daily discharge data from USGS website
                good_discharge_value = False
                num_Q_tries = 0
                #Try 5 times to retrieve discharge value
                while (not good_discharge_value) and num_Q_tries <= 5:
                    #currently hard-wired to pcode 00060
                    #(daily discharge, cfs)
                    dischargeDict = GetDailyDischarge(location, datetext)
                    if dischargeDict != -1:
                        good_discharge_value = True
                    else:
                        num_Q_tries += 1
                        dischargeDict = None
                if (dischargeDict != None):
                    sampleDict['Stream flow, mean. daily'] = dischargeDict[
                        'discharge']
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': dischargeDict['quality'],
                        'count': 1,
                        'samplefraction': None
                    }
                    descriptionDict['name'] = dischargeDict['name']
                else:
                    #Possibly allow this sample to be thrown out if no mean
                    #daily discharge, and/or similar for instantaneous
                    #discharge
                    sampleDict['Stream flow, mean. daily'] = None
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': None,
                        'count': 1,
                        'samplefraction': None
                    }
                # Create data frame row for this sample date
                if descriptionDict['time'] != '':
                    rowdate = to_datetime(datetext + ' ' +
                                          descriptionDict['time'])
                else:
                    rowdate = to_datetime(datetext)
                #sampleRow = DataFrame(sampleDict, index=[rowdate],
                #                      dtype='float')
                #Create Panel to contain sample meta data
                samplePanelRow = Panel({
                    'data':
                    DataFrame(sampleDict, index=[rowdate], dtype='float'),
                    'time':
                    DataFrame(descriptionDict['time'],
                              index=[rowdate],
                              columns=list(sampleMetaDict.keys())),
                    'timezone':
                    DataFrame(descriptionDict['timezone'],
                              index=[rowdate],
                              columns=list(sampleMetaDict.keys())),
                    'pcode':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['pcode'])['values']],
                        index=[rowdate],
                        columns=list(sampleMetaDict.keys())),
                    'quality':
                    DataFrame(
                        [extractValues(sampleMetaDict,
                                       ['quality'])['values']],
                        index=[rowdate],
                        columns=list(sampleMetaDict.keys())),
                    'fraction':
                    DataFrame(
                        [extractValues(sampleMetaDict,
                                       ['samplefraction'])['values']],
                        index=[rowdate],
                        columns=list(sampleMetaDict.keys())),
                    'units':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['units'])['values']],
                        index=[rowdate],
                        columns=list(sampleMetaDict.keys())),
                    'count':
                    DataFrame(
                        [extractValues(sampleMetaDict, ['count'])['values']],
                        index=[rowdate],
                        columns=list(sampleMetaDict.keys())),
                })
                #sampleMetaRow = Series(sampleMetaDict,
                #                       index=[to_datetime(datetext)],
                #                       dtype='object')
                #Previous solution was reading/writing from pickle files.
                #New solution will keep all data in memory until end.
                #This could cause memory problems with large data sets.
                #Test whether a df for this location already exists
                if location in sitesDict:
                    # tempDF = sitesDict[location]
                    # sitesDict[location] = tempDF.append(sampleRow)
                    tempPanel = sitesDict[location]
                    sitesDict[location] = concat([tempPanel, samplePanelRow],
                                                 axis=1)
                else:
                    sitesDict[location] = samplePanelRow
            #add one to number of samples processed
            if (processThisSample):
                samples_processed.append(location + ' ' + datetext)
            else:
                samples_not_processed.append(location + ' ' + datetext +
                                             ' - ' + reason)
        print(('Number of Samples Processed = ' +
               str(len(samples_processed))))
        print(('Number of Samples Not Processed = ' +
               str(len(samples_not_processed))))
        #Write out individual site data pickle and csv files in each site
        #directory
        print('Writing out site data files...')
        for location, pnl in sitesDict.items():
            print(location)
            pickleFile = os.path.join(sitesdir, location,
                                      location + '-Panel.pkl')
            pickle.dump(pnl, open(pickleFile, 'wb'))
            pnl.to_excel(pickleFile[:-3] + 'xls')
            #Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + '-Site-Description.pkl')
            pickle.dump(siteDescriptionDataDF,
                        open(siteDescriptionDataFileName, 'wb'))
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] +
                                         'csv')
        #Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemistry data in PHREEQC...")
            for location, pnl in sitesDict.items():
                phreeqc_df = processPanel(pnl,
                                          os.path.join(sitesdir, location),
                                          PHREEQC_PATH, DATABASE_FILE)
                phreeqc_site_file = os.path.join(sitesdir, location,
                                                 location + '-PHREEQC.pkl')
                try:
                    pickle.dump(phreeqc_df, open(phreeqc_site_file, 'wb'))
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + 'csv')
                except IOError:
                    print('Problem writing out PHREEQC data file.')
            if bracket_charge_balance:
                for location, pnl in sitesDict.items():
                    #Force balance on Calcium
                    phreeqc_df_ca = processPanel(pnl,
                                                 os.path.join(sitesdir,
                                                              location),
                                                 PHREEQC_PATH,
                                                 DATABASE_FILE,
                                                 force_balance='Ca')
                    phreeqc_site_file_ca = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Ca.pkl')
                    try:
                        pickle.dump(phreeqc_df_ca,
                                    open(phreeqc_site_file_ca, 'wb'))
                        phreeqc_df_ca.to_csv(phreeqc_site_file_ca[:-3] +
                                             'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Ca data file.')
                    #Force balance on Alkalinity
                    phreeqc_df_alk = processPanel(pnl,
                                                  os.path.join(sitesdir,
                                                               location),
                                                  PHREEQC_PATH,
                                                  DATABASE_FILE,
                                                  force_balance='Alk')
                    phreeqc_site_file_alk = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Alk.pkl')
                    try:
                        pickle.dump(phreeqc_df_alk,
                                    open(phreeqc_site_file_alk, 'wb'))
                        phreeqc_df_alk.to_csv(phreeqc_site_file_alk[:-3] +
                                              'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Alk data file.')
        #Create log file
        print(('Writing log file: ' + LOG_FILE + splittag))
        try:
            log_file = open(LOG_FILE + splittag, 'w')
            print('Start file = ' + START_FILE, file=log_file)
            print('Number of Samples Processed = ' +
                  str(len(samples_processed)), file=log_file)
            print('Number of Samples Not Processed = ' +
                  str(len(samples_not_processed)), file=log_file)
            print("###############", file=log_file)
            print("Characteristics", file=log_file)
            print("###############", file=log_file)
            printColumnNames = True
            for key, flags in charDict.items():
                if (printColumnNames):
                    names = ['characteristic']  # + '\t'
                    for column in flags.keys():
                        names.append(str(column))
                    print(str("\t".join(names)), file=log_file)
                    printColumnNames = False
                columns = [key]
                for column in flags.keys():
                    if isinstance(flags[column], str):
                        columns.append(flags[column])
                print(str("\t".join(columns)), file=log_file)
            print("###############", file=log_file)
            print("Samples processed", file=log_file)
            print("###############", file=log_file)
            for line in samples_processed:
                print(line, file=log_file)
            print("###############", file=log_file)
            print("Samples not processed", file=log_file)
            print("###############", file=log_file)
            for line in samples_not_processed:
                print(line, file=log_file)
        except IOError:
            print(("Problem opening log file: " + LOG_FILE))
            return -1
    #exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
    #Note: can throw this error when discharge values are not read
    #correctly, I should fix this, 6/16/2014
    except etree.XMLSyntaxError as detail:
        print("File contains invalid XML syntax: ", detail)
    except requests.exceptions.RequestException as detail:
        print("Error retrieving data by xml query: ", detail)
    return 0
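# A hypothetical invocation of WQXtoPandas in file mode. The xml file name
# and the single-entry charDict are illustrative only (the real format comes
# from the excel start file), and runWQXtoPandas() remains the intended
# entry point.
charDict = {'Calcium': {'IsRequired': '1', 'pcode': '00915',
                        'fraction': 'Dissolved', 'quality': '0'}}
WQXtoPandas('nwis-query.xml', charDict, outputPath='.', fromFile=True,
            START_FILE='start.xls')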
def make_source(self):
    return Panel(self.raw_data).tz_localize('UTC', axis=1)
def test_partial_setting(self):
    # GH2578, allow ix and friends to partially set

    # series
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()

    def f():
        s.iloc[3] = 5.

    pytest.raises(IndexError, f)

    def f():
        s.iat[3] = 5.

    pytest.raises(IndexError, f)

    # ## frame ##

    df_orig = DataFrame(
        np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64')

    # iloc/iat raise
    df = df_orig.copy()

    def f():
        df.iloc[4, 2] = 5.

    pytest.raises(IndexError, f)

    def f():
        df.iat[4, 2] = 5.

    pytest.raises(IndexError, f)

    # row setting where it exists
    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
    df = df_orig.copy()
    df['B'] = df['B'].astype(np.float64)
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    with catch_warnings(record=True):
        # ## panel ##
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'], dtype='float64')

        # panel setting via item
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'], dtype='float64')
        expected = p_orig.copy()
        expected['Item3'] = expected['Item1']
        p = p_orig.copy()
        p.loc['Item3'] = p['Item1']
        tm.assert_panel_equal(p, expected)

        # panel with aligned series
        expected = p_orig.copy()
        expected = expected.transpose(2, 1, 0)
        expected['C'] = DataFrame({'Item1': [30, 30, 30, 30],
                                   'Item2': [32, 32, 32, 32]},
                                  index=p_orig.major_axis)
        expected = expected.transpose(2, 1, 0)
        p = p_orig.copy()
        p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
        tm.assert_panel_equal(p, expected)

    # GH 8473
    dates = date_range('1/1/2000', periods=8)
    df_orig = DataFrame(np.random.randn(8, 4), index=dates,
                        columns=['A', 'B', 'C', 'D'])

    expected = pd.concat([df_orig,
                          DataFrame({'A': 7}, index=[dates[-1] + 1])],
                         sort=True)
    df = df_orig.copy()
    df.loc[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.at[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.at[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
def test_sample(self):
    # Fixes issue: 2419
    # additional specific object based tests

    # A few dataframe test with degenerate weights.
    easy_weight_list = [0] * 10
    easy_weight_list[5] = 1

    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10,
                       'easyweights': easy_weight_list})
    sample1 = df.sample(n=1, weights='easyweights')
    assert_frame_equal(sample1, df.iloc[5:6])

    # Ensure proper error if string given as weight for Series, panel, or
    # DataFrame with axis = 1.
    s = Series(range(10))
    with pytest.raises(ValueError):
        s.sample(n=3, weights='weight_column')

    with catch_warnings(record=True):
        panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4],
                      minor_axis=[3, 4, 5])
        with pytest.raises(ValueError):
            panel.sample(n=1, weights='weight_column')

    with pytest.raises(ValueError):
        df.sample(n=1, weights='weight_column', axis=1)

    # Check weighting key error
    with pytest.raises(KeyError):
        df.sample(n=3, weights='not_a_real_column_name')

    # Check that re-normalizes weights that don't sum to one.
    weights_less_than_1 = [0] * 10
    weights_less_than_1[0] = 0.5
    tm.assert_frame_equal(
        df.sample(n=1, weights=weights_less_than_1), df.iloc[:1])

    ###
    # Test axis argument
    ###

    df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
    second_column_weight = [0, 1]
    assert_frame_equal(
        df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

    # Different axis arg types
    assert_frame_equal(df.sample(n=1, axis='columns',
                                 weights=second_column_weight),
                       df[['col2']])

    weight = [0] * 10
    weight[5] = 0.5
    assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                       df.iloc[5:6])
    assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                       df.iloc[5:6])

    # Check out of range axis values
    with pytest.raises(ValueError):
        df.sample(n=1, axis=2)

    with pytest.raises(ValueError):
        df.sample(n=1, axis='not_a_name')

    with pytest.raises(ValueError):
        s = pd.Series(range(10))
        s.sample(n=1, axis=1)

    # Test weight length compared to correct axis
    with pytest.raises(ValueError):
        df.sample(n=1, axis=1, weights=[0.5] * 10)

    # Check weights with axis = 1
    easy_weight_list = [0] * 3
    easy_weight_list[2] = 1

    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10})
    sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
    assert_frame_equal(sample1, df[['colString']])

    # Test default axes
    with catch_warnings(record=True):
        p = Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6],
                  minor_axis=[1, 3, 5])
        assert_panel_equal(
            p.sample(n=3, random_state=42),
            p.sample(n=3, axis=1, random_state=42))

    assert_frame_equal(
        df.sample(n=3, random_state=42),
        df.sample(n=3, axis=0, random_state=42))

    # Test that function aligns weights with frame
    df = DataFrame({'col1': [5, 6, 7],
                    'col2': ['a', 'b', 'c']}, index=[9, 5, 3])
    s = Series([1, 0, 0], index=[3, 5, 9])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

    # Weights have index values to be dropped because not in
    # sampled DataFrame
    s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

    # Weights have empty values to be filled with zeros
    s3 = Series([0.01, 0], index=[3, 5])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

    # No overlap in weight and sampled DataFrame indices
    s4 = Series([1, 0], index=[1, 2])
    with pytest.raises(ValueError):
        df.sample(1, weights=s4)
def setUp(self):
    self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=lrange(0, 8, 2),
                                columns=lrange(0, 12, 3))
    with catch_warnings(record=True):
        self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                items=lrange(0, 8, 2),
                                major_axis=lrange(0, 12, 3),
                                minor_axis=lrange(0, 16, 4))

    self.series_uints = Series(np.random.rand(4),
                               index=UInt64Index(lrange(0, 8, 2)))
    self.frame_uints = DataFrame(np.random.randn(4, 4),
                                 index=UInt64Index(lrange(0, 8, 2)),
                                 columns=UInt64Index(lrange(0, 12, 3)))
    with catch_warnings(record=True):
        self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                 items=UInt64Index(lrange(0, 8, 2)),
                                 major_axis=UInt64Index(lrange(0, 12, 3)),
                                 minor_axis=UInt64Index(lrange(0, 16, 4)))

    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4),
                                  index=list('abcd'), columns=list('ABCD'))
    with catch_warnings(record=True):
        self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                  items=list('abcd'),
                                  major_axis=list('ABCD'),
                                  minor_axis=list('ZYXW'))

    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                 index=[2, 4, 'null', 8])
    with catch_warnings(record=True):
        self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                 items=[2, 4, 'null', 8])

    self.series_ts = Series(np.random.randn(4),
                            index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4),
                              index=date_range('20130101', periods=4))
    with catch_warnings(record=True):
        self.panel_ts = Panel(np.random.randn(4, 4, 4),
                              items=date_range('20130101', periods=4))

    dates_rev = (date_range('20130101', periods=4)
                 .sort_values(ascending=False))
    self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
    self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
    with catch_warnings(record=True):
        self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                  items=dates_rev)

    self.frame_empty = DataFrame({})
    self.series_empty = Series({})
    with catch_warnings(record=True):
        self.panel_empty = Panel({})

    # form agglomerates
    for o in self._objs:
        d = dict()
        for t in self._typs:
            d[t] = getattr(self, '%s_%s' % (o, t), None)
        setattr(self, o, d)
def __init__(self, autoAdjust=True):
    self.startDate = (2008, 1, 1)
    self.autoAdjust = autoAdjust
    self.wp = Panel()
def time_from_dict(self):
    with warnings.catch_warnings(record=True):
        Panel.from_dict(self.data_frames)
class HistData(object):
    ''' a class for working with yahoo finance data '''

    def __init__(self, autoAdjust=True):
        self.startDate = (2008, 1, 1)
        self.autoAdjust = autoAdjust
        self.wp = Panel()

    def load(self, dataFile):
        """ load data from HDF """
        if os.path.exists(dataFile):
            store = HDFStore(dataFile)
            symbols = [str(s).strip('/') for s in list(store.keys())]
            data = dict(list(zip(symbols,
                                 [store[symbol] for symbol in symbols])))
            self.wp = Panel(data)
            store.close()
        else:
            raise IOError('Data file does not exist')

    def save(self, dataFile):
        """ save data to HDF """
        print('Saving data to', dataFile)
        store = HDFStore(dataFile)
        for symbol in self.wp.items:
            store[symbol] = self.wp[symbol]
        store.close()

    def downloadData(self, symbols='all'):
        ''' get data from yahoo '''
        if symbols == 'all':
            symbols = self.symbols

        #store = HDFStore(self.dataFile)
        p = ProgressBar(len(symbols))

        for idx, symbol in enumerate(symbols):
            try:
                df = getSymbolData(symbol, sDate=self.startDate,
                                   verbose=False)
                if self.autoAdjust:
                    df = _adjust(df, removeOrig=True)

                if len(self.symbols) == 0:
                    self.wp = Panel({symbol: df})
                else:
                    self.wp[symbol] = df
            except Exception as e:
                print(e)
            p.animate(idx + 1)

    def getDataFrame(self, field='close'):
        ''' return a slice on wide panel for a given field '''
        return self.wp.minor_xs(field)

    @property
    def symbols(self):
        return self.wp.items.tolist()

    def __repr__(self):
        return str(self.wp)
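# A usage sketch for HistData, assuming the helpers it references
# (getSymbolData, _adjust, ProgressBar) are importable and Yahoo data is
# reachable; the symbols and HDF file name are illustrative.
h = HistData(autoAdjust=True)
h.downloadData(['SPY', 'AAPL'])
close = h.getDataFrame('close')  # DataFrame: dates x symbols
h.save('histData.h5')            # one DataFrame per symbol in the HDF store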
def setup_method(self, method):
    self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=lrange(0, 8, 2),
                                columns=lrange(0, 12, 3))
    with catch_warnings(record=True):
        self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                items=lrange(0, 8, 2),
                                major_axis=lrange(0, 12, 3),
                                minor_axis=lrange(0, 16, 4))

    self.series_uints = Series(np.random.rand(4),
                               index=UInt64Index(lrange(0, 8, 2)))
    self.frame_uints = DataFrame(np.random.randn(4, 4),
                                 index=UInt64Index(lrange(0, 8, 2)),
                                 columns=UInt64Index(lrange(0, 12, 3)))
    with catch_warnings(record=True):
        self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                 items=UInt64Index(lrange(0, 8, 2)),
                                 major_axis=UInt64Index(lrange(0, 12, 3)),
                                 minor_axis=UInt64Index(lrange(0, 16, 4)))

    self.series_floats = Series(np.random.rand(4),
                                index=Float64Index(range(0, 8, 2)))
    self.frame_floats = DataFrame(np.random.randn(4, 4),
                                  index=Float64Index(range(0, 8, 2)),
                                  columns=Float64Index(range(0, 12, 3)))
    with catch_warnings(record=True):
        self.panel_floats = Panel(np.random.rand(4, 4, 4),
                                  items=Float64Index(range(0, 8, 2)),
                                  major_axis=Float64Index(range(0, 12, 3)),
                                  minor_axis=Float64Index(range(0, 16, 4)))

    m_idces = [MultiIndex.from_product([[1, 2], [3, 4]]),
               MultiIndex.from_product([[5, 6], [7, 8]]),
               MultiIndex.from_product([[9, 10], [11, 12]])]

    self.series_multi = Series(np.random.rand(4), index=m_idces[0])
    self.frame_multi = DataFrame(np.random.randn(4, 4),
                                 index=m_idces[0], columns=m_idces[1])
    with catch_warnings(record=True):
        self.panel_multi = Panel(np.random.rand(4, 4, 4),
                                 items=m_idces[0],
                                 major_axis=m_idces[1],
                                 minor_axis=m_idces[2])

    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4),
                                  index=list('abcd'), columns=list('ABCD'))
    with catch_warnings(record=True):
        self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                  items=list('abcd'),
                                  major_axis=list('ABCD'),
                                  minor_axis=list('ZYXW'))

    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                 index=[2, 4, 'null', 8])
    with catch_warnings(record=True):
        self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                 items=[2, 4, 'null', 8])

    self.series_ts = Series(np.random.randn(4),
                            index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4),
                              index=date_range('20130101', periods=4))
    with catch_warnings(record=True):
        self.panel_ts = Panel(np.random.randn(4, 4, 4),
                              items=date_range('20130101', periods=4))

    dates_rev = (date_range('20130101', periods=4)
                 .sort_values(ascending=False))
    self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
    self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
    with catch_warnings(record=True):
        self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                  items=dates_rev)

    self.frame_empty = DataFrame({})
    self.series_empty = Series({})
    with catch_warnings(record=True):
        self.panel_empty = Panel({})

    # form agglomerates
    for o in self._objs:
        d = dict()
        for t in self._typs:
            d[t] = getattr(self, '%s_%s' % (o, t), None)
        setattr(self, o, d)
def p_apply(panel, f):
    result = {}
    for item in panel.items:
        result[item] = f(panel[item])
    return Panel(result, items=panel.items)
def test_to_dense(self):
    dwp = self.panel.to_dense()
    dwp2 = Panel.from_dict(self.data_dict)
    tm.assert_panel_equal(dwp, dwp2)