def from_castra(x, columns=None, index=False, npartitions=None):
    """Load a dask Bag from a Castra.

    Parameters
    ----------
    x : filename or Castra
        Anything that is not already a ``Castra`` is handed to the
        ``Castra`` constructor and opened read-only. A ``Castra`` that is
        not read-only is reopened read-only from its ``path``.
    columns : list or string, optional
        The columns to load. Default is all columns.
    index : bool, optional
        If True, the index is included as the first element in each tuple.
        Default is False.
    npartitions : int, optional
        The number of desired partitions. Defaults to the number of
        partitions in the Castra.
    """
    from castra import Castra

    if isinstance(x, Castra):
        if not x._readonly:
            x = Castra(x.path, readonly=True)
    else:
        x = Castra(x, readonly=True)

    if columns is None:
        columns = x.columns
    if npartitions is None:
        npartitions = len(x.partitions)

    # Kept as a lambda so the callable handed to map_partitions is unchanged.
    load_one = lambda part: load_castra_partition(x, part, columns, index)
    partition_bag = from_sequence(x.partitions, npartitions=npartitions)
    loaded = partition_bag.map_partitions(load_one)
    return loaded.map_partitions(reify)
def test_categorize():
    """Check that a column listed in ``categories`` is stored as a categorical.

    The expected category list grows from ``['A', None]`` to
    ``['A', None, 'C']`` after the second extend, and a Castra reopened from
    the same path must return the same frame.
    """
    first = pd.DataFrame(
        {'x': [1, 2, 3], 'y': ['A', None, 'A']},
        columns=['x', 'y'],
        index=[0, 10, 20],
    )
    second = pd.DataFrame(
        {'x': [4, 5, 6], 'y': ['C', None, 'A']},
        columns=['x', 'y'],
        index=[30, 40, 50],
    )

    with Castra(template=first, categories=['y']) as c:
        c.extend(first)
        assert c[:].dtypes['y'] == 'category'
        assert c[:]['y'].cat.codes.dtype == np.dtype('i1')
        assert list(c[:, 'y'].cat.categories) == ['A', None]

        c.extend(second)
        assert list(c[:, 'y'].cat.categories) == ['A', None, 'C']

        first_partition = c.partitions.iloc[0]
        assert c.load_partition(first_partition, 'y').dtype == 'category'

        c.flush()

        # Reopen while the context is still active so the path still exists.
        reopened = Castra(path=c.path)
        tm.assert_frame_equal(c[:], reopened[:])
def test_load(base):
    """Data written through one Castra is readable from a new one at ``base``."""
    with Castra(path=base, template=A) as writer:
        for frame in (A, B):
            writer.extend(frame)

    reader = Castra(path=base)
    expected = pd.concat([A, B])
    tm.assert_frame_equal(expected, reader[:])
def test_get_empty_result(base):
    """The slice ``100:200`` must come back with the same columns as ``A``.

    NOTE(review): the test name implies the range holds no rows of ``A``;
    ``A`` is defined outside this block, so confirm against its index.
    """
    store = Castra(path=base, template=A)
    store.extend(A)

    selected = store[100:200]
    columns_match = selected.columns == A.columns
    assert columns_match.all()
def test_categorical_index():
    """Round-trip frames whose index is, or is stored as, categorical.

    Part one stores a frame that already has an ordered ``CategoricalIndex``
    and expects to read it back unchanged. Part two stores two frames with
    plain string indexes under ``categories=['foo']`` (the index name) and
    expects an ordered ``CategoricalIndex`` on the way out, both for a full
    read through a second Castra and for a label lookup.
    """
    df = pd.DataFrame(
        {'x': [1, 2, 3]},
        index=pd.CategoricalIndex(['a', 'a', 'b'], ordered=True, name='foo'),
    )
    with Castra(template=df, categories=True) as c:
        c.extend(df)
        result = c[:]
        # Compare the frame already loaded instead of reading it a second time.
        tm.assert_frame_equal(result, df)

    A = pd.DataFrame({'x': [1, 2, 3]},
                     index=pd.Index(['a', 'a', 'b'], name='foo'))
    B = pd.DataFrame({'x': [4, 5, 6]},
                     index=pd.Index(['c', 'd', 'd'], name='foo'))

    path = tempfile.mkdtemp(prefix='castra-')
    try:
        with Castra(path=path, template=A, categories=['foo']) as c:
            c.extend(A)
            c.extend(B)

            c2 = Castra(path=path)
            result = c2[:]

            expected = pd.concat([A, B])
            expected.index = pd.CategoricalIndex(
                expected.index, name=expected.index.name, ordered=True)
            tm.assert_frame_equal(result, expected)

            tm.assert_frame_equal(c['a'], expected.loc['a'])
    finally:
        # Always remove the scratch directory, even if an assertion fails.
        shutil.rmtree(path)
def from_castra(x, columns=None, index=False, npartitions=None):
    """Load a dask Bag from a Castra.

    Builds one task per Castra partition; each task calls
    ``load_castra_partition`` for that partition.

    Parameters
    ----------
    x : filename or Castra
        Anything that is not already a ``Castra`` is handed to the
        ``Castra`` constructor and opened read-only. A ``Castra`` that is
        not read-only is reopened read-only from its ``path``.
    columns : list or string, optional
        The columns to load. Default is all columns.
    index : bool, optional
        If True, the index is included as the first element in each tuple.
        Default is False.
    npartitions : int, optional
        Accepted but not read by this implementation: the resulting Bag
        always has as many partitions as the Castra.
    """
    from castra import Castra

    if isinstance(x, Castra):
        if not x._readonly:
            x = Castra(x.path, readonly=True)
    else:
        x = Castra(x, readonly=True)

    if columns is None:
        columns = x.columns

    name = 'from-castra-' + next(tokens)
    graph = {
        (name, position): (load_castra_partition, x, partition, columns, index)
        for position, partition in enumerate(x.partitions)
    }
    return Bag(graph, name, len(x.partitions))
def execute(file_name):
    """Write the data loaded for ``file_name`` into a Castra store.

    The loaded records are split into batches of 200000, each batch is
    converted with ``to_df``, and the frames are appended to
    ``./subreddit_dumps/<file_name>.castra`` with ``freq='3h'``.
    """
    records = load(file_name)
    frame_stream = map(to_df, partition_all(200000, records))
    # The first item from peek serves as the template; the second is fed to
    # extend_sequence (presumably the full frame sequence -- confirm `peek`).
    template, frames = peek(frame_stream)

    target = './subreddit_dumps/' + file_name + '.castra'
    store = Castra(target, template=template,
                   categories=['distinguished', 'removal_reason'])
    store.extend_sequence(frames, freq='3h')
def execute(file_name):
    """Convert the source identified by ``file_name`` into a Castra.

    Records come from ``load``, are chunked 200000 at a time, turned into
    frames by ``to_df``, and written to
    ``./subreddit_dumps/<file_name>.castra`` via ``extend_sequence``.
    """
    categorical_columns = ['distinguished', 'removal_reason']

    chunks = partition_all(200000, load(file_name))
    first_frame, all_frames = peek(map(to_df, chunks))

    destination = './subreddit_dumps/' + file_name + '.castra'
    out = Castra(destination, template=first_frame,
                 categories=categorical_columns)
    out.extend_sequence(all_frames, freq='3h')
def test_pickle_Castra():
    """A Castra survives a pickle round trip and still returns its data."""
    path = tempfile.mkdtemp(prefix='castra-')
    try:
        c = Castra(path=path, template=A)
        c.extend(A)
        c.extend(B)

        dumped = pickle.dumps(c)
        undumped = pickle.loads(dumped)

        tm.assert_frame_equal(pd.concat([A, B]), undumped[:])
    finally:
        # The directory comes from mkdtemp, so nothing else removes it.
        # Clean up here, as the sibling tests do, even on failure.
        shutil.rmtree(path)
def test_reload():
    """A second Castra opened on the same path reports the same metadata."""
    path = tempfile.mkdtemp(prefix='castra-')
    try:
        original = Castra(template=A, path=path)
        original.extend(A)

        reopened = Castra(path=path)

        assert original.columns == reopened.columns
        partitions_equal = original.partitions == reopened.partitions
        assert partitions_equal.all()
        assert original.minimum == reopened.minimum
    finally:
        shutil.rmtree(path)
def from_castra(x, columns=None):
    """Load a dask DataFrame from a Castra.

    Parameters
    ----------
    x : filename or Castra
        A value that is not already a ``Castra`` is opened read-only.
    columns : list or string, optional
        The columns to load. Default is all columns.

    Returns
    -------
    The result of ``Castra.to_dask(columns)``.
    """
    from castra import Castra

    source = x if isinstance(x, Castra) else Castra(x, readonly=True)
    return source.to_dask(columns)
def test_Castra():
    """Extend with ``A`` then ``B`` and check columns plus several index slices."""
    c = Castra(template=A)
    for frame in (A, B):
        c.extend(frame)

    assert c.columns == ['x', 'y']

    # Each key is the slice object that the equivalent c[start:stop] produces.
    cases = [
        (slice(0, 100), pd.concat([A, B])),
        (slice(None, 5), A),
        (slice(5, None), B),
        (slice(2, 5), A[1:]),
        (slice(2, 15), pd.concat([A[1:], B[:1]])),
    ]
    for key, expected in cases:
        tm.assert_frame_equal(c[key], expected)
def test_extend_sequence_single_frame():
    """``extend_sequence`` accepts a sequence containing a single frame.

    With ``freq='d'`` a 100-row hourly frame is expected to produce the five
    partition boundaries listed below; without ``freq`` a small
    default-indexed frame must round-trip unchanged.
    """
    hourly = pd.util.testing.makeTimeDataFrame(100, 'h')
    with Castra(template=hourly) as c:
        c.extend_sequence([hourly], freq='d')
        expected_boundaries = [
            '2000-01-01 23:00:00',
            '2000-01-02 23:00:00',
            '2000-01-03 23:00:00',
            '2000-01-04 23:00:00',
            '2000-01-05 03:00:00',
        ]
        assert (c.partitions.index == expected_boundaries).all()

    plain = pd.DataFrame({'a': range(10), 'b': range(10)})
    with Castra(template=plain) as c:
        c.extend_sequence([plain])
        tm.assert_frame_equal(c[:], plain)
def test_extend_sequence_freq():
    """``extend_sequence`` repartitions ten 100-row chunks by ``freq``.

    With ``freq='h'`` the partition index must equal 16 hourly stamps
    starting at row 59 plus the final row's stamp; with ``freq='d'`` a
    single partition is expected. In both cases the data must round-trip.
    """
    df = pd.util.testing.makeTimeDataFrame(1000, 'min')
    chunks = [df.iloc[start:start + 100] for start in range(0, 1000, 100)]

    with Castra(template=df) as c:
        c.extend_sequence(chunks, freq='h')
        tm.assert_frame_equal(c[:], df)

        hourly = pd.date_range(start=df.index[59], freq='h', periods=16)
        expected_index = hourly.insert(17, df.index[-1])
        tm.assert_index_equal(c.partitions.index, expected_index)

    with Castra(template=df) as c:
        c.extend_sequence(chunks, freq='d')
        tm.assert_frame_equal(c[:], df)
        assert len(c.partitions) == 1
def test_text():
    """A frame with a string column and an integer column round-trips unchanged."""
    people = pd.DataFrame(
        {'name': ['Alice', 'Bob'], 'balance': [100, 200]},
        columns=['name', 'balance'],
    )
    with Castra(template=people) as store:
        store.extend(people)
        tm.assert_frame_equal(store[:], people)
def test_categorical_index_with_dask_dataframe():
    """A Castra with a categorical index loads correctly as a dask DataFrame.

    Skipped when ``dask.dataframe`` is not importable. Checks the divisions
    reported by ``dd.from_castra``, the fully computed frame, and two label
    lookups through ``.loc``.
    """
    pytest.importorskip('dask.dataframe')
    import dask.dataframe as dd
    import dask

    A = pd.DataFrame({'x': [1, 2, 3, 4]},
                     index=pd.Index(['a', 'a', 'b', 'b'], name='foo'))
    B = pd.DataFrame({'x': [4, 5, 6]},
                     index=pd.Index(['c', 'd', 'd'], name='foo'))

    path = tempfile.mkdtemp(prefix='castra-')
    try:
        with Castra(path=path, template=A, categories=['foo']) as c:
            c.extend(A)
            c.extend(B)

            df = dd.from_castra(path)
            assert df.divisions == ('a', 'c', 'd')

            # `dask.async` cannot be written as an attribute on Python >= 3.7
            # because `async` is a reserved keyword. `dask.get` is dask's
            # top-level name for the same synchronous `get_sync` scheduler.
            result = df.compute(get=dask.get)

            expected = pd.concat([A, B])
            expected.index = pd.CategoricalIndex(
                expected.index, name=expected.index.name, ordered=True)
            tm.assert_frame_equal(result, expected)

            tm.assert_frame_equal(df.loc['a'].compute(), expected.loc['a'])
            tm.assert_frame_equal(df.loc['b'].compute(get=dask.get),
                                  expected.loc['b'])
    finally:
        shutil.rmtree(path)
def test_categories_nan():
    """Two frames holding 'A'/NaN and 'B'/NaN must yield three categories."""
    frames = [
        pd.DataFrame({'x': ['A', np.nan]}),
        pd.DataFrame({'x': ['B', np.nan]}),
    ]
    with Castra(template=frames[0], categories=['x']) as c:
        for frame in frames:
            c.extend(frame)
        assert len(c.categories['x']) == 3
def from_castra(x, columns=None):
    """Load a dask DataFrame from a Castra.

    The Castra project has been deprecated. We recommend using Parquet
    instead.

    Parameters
    ----------
    x : filename or Castra
        A value that is not already a ``Castra`` is opened read-only.
    columns : list or string, optional
        The columns to load. Default is all columns.

    Returns
    -------
    The result of ``Castra.to_dask(columns)``.
    """
    from castra import Castra

    if isinstance(x, Castra):
        castra = x
    else:
        castra = Castra(x, readonly=True)
    return castra.to_dask(columns)
def test_do_not_create_dirs_if_template_fails():
    """A constructor call that raises ValueError must leave no ``foo/meta`` tree.

    ``categories=['w']`` names a column the template does not have.
    """
    template = pd.DataFrame(
        {'x': [1, 2] * 3, 'y': [1., 2.] * 3, 'z': list('abcabc')},
        columns=list('xyz'),
    )
    with pytest.raises(ValueError):
        Castra(template=template, path='foo', categories=['w'])

    meta_dir = os.path.join('foo', 'meta')
    assert not os.path.exists(meta_dir)
    assert not os.path.exists(os.path.join(meta_dir, 'categories'))
def test_category_dtype():
    """The Castra reports a categorical dtype for ``z``; the input stays object."""
    frame = pd.DataFrame(
        {'x': [1, 2] * 3, 'y': [1., 2.] * 3, 'z': list('abcabc')},
        columns=list('xyz'),
    )
    with Castra(template=frame, categories=['z']) as c:
        c.extend(frame)
        # The source frame itself must not have been converted.
        assert frame.dtypes['z'] == 'object'
        assert c.dtypes['z'] == pd.core.categorical.CategoricalDtype()
def test_same_categories_when_already_categorized():
    """A column that is already categorical keeps exactly its own categories."""
    frame = pd.DataFrame(
        {
            'x': [1, 2] * 1000,
            'y': [1., 2.] * 1000,
            'z': np.random.choice(list('abc'), size=2000),
        },
        columns=list('xyz'),
    )
    frame['z'] = frame.z.astype('category')

    with Castra(template=frame, categories=['z']) as c:
        c.extend(frame)
        expected = frame.z.cat.categories.tolist()
        assert c.categories['z'] == expected
def test_raise_error_on_equal_index():
    """Extending must raise ValueError when the new frame starts at the stored end.

    The stored frame ends at index 3 and the rejected frame begins at 3.
    """
    stored = pd.DataFrame({'x': [1, 2, 3]}, index=[1, 2, 3])
    overlapping = pd.DataFrame({'x': [4, 5, 6]}, index=[3, 4, 5])

    with Castra(template=stored) as c:
        c.extend(stored)
        with pytest.raises(ValueError):
            c.extend(overlapping)
def test_column_with_period():
    """Extending with a column literally named '.' must not raise."""
    frame = pd.DataFrame(
        {'x': [10, 20], '.': [10., 20.]},
        columns=['x', '.'],
        index=[10, 20],
    )
    with Castra(template=frame) as store:
        store.extend(frame)
def test_extend_sequence_overlap():
    """``extend_sequence`` without ``freq`` keeps one partition per input frame.

    Covers a time-indexed pair whose index ranges overlap (rows 0-14 and
    10-19) and a default-indexed pair; partition boundaries must be the last
    index value of each input frame.
    """
    base = pd.util.testing.makeTimeDataFrame(20, 'min')
    pieces = [base.iloc[:15], base.iloc[10:20]]
    combined = pd.concat(pieces)
    with Castra(template=combined) as c:
        c.extend_sequence(pieces)
        tm.assert_frame_equal(c[:], combined.sort_index())
        last_labels = [piece.index[-1] for piece in pieces]
        assert (c.partitions.index == last_labels).all()

    # Check with trivial index
    head = pd.DataFrame({'a': range(10), 'b': range(10)})
    tail = pd.DataFrame({'a': range(10, 17), 'b': range(10, 17)})
    whole = pd.DataFrame({'a': range(17), 'b': range(17)})
    with Castra(template=whole) as c:
        c.extend_sequence([head, tail])
        tm.assert_frame_equal(c[:], whole)
        assert (c.partitions.index == [9, 16]).all()
def test_column_access():
    """A list column selector returns a frame; a bare name returns a series."""
    with Castra(template=A) as c:
        for frame in (A, B):
            c.extend(frame)

        as_frame = c[:, ['x']]
        expected_frame = pd.concat([A[['x']], B[['x']]])
        tm.assert_frame_equal(as_frame, expected_frame)

        as_series = c[:, 'x']
        expected_series = pd.concat([A.x, B.x])
        tm.assert_series_equal(as_series, expected_series)
def test_timeseries():
    """Store fifteen consecutive weekly-indexed frames and slice by date strings.

    One frame is built per year from 2000 to 2014, each indexed weekly from
    that year to the next. A partial-date slice spanning several of them
    must return more than 100 rows.
    """
    # pd.date_range replaces the start/end/freq form of the DatetimeIndex
    # constructor, which pandas deprecated and later removed.
    indices = [pd.date_range(start=str(i), end=str(i + 1), freq='w')
               for i in range(2000, 2015)]
    dfs = [pd.DataFrame({'x': list(range(len(ind)))}, ind)
           for ind in indices]

    with Castra(template=dfs[0]) as c:
        for df in dfs:
            c.extend(df)
        df = c['2010-05': '2013-02']
        assert len(df) > 100
def test_raise_error_on_mismatched_index():
    """Extending must raise ValueError when a frame starts before the stored end.

    Frames indexed 1-3 and 4-6 are accepted; the frame indexed 5-7 is
    expected to be rejected.
    """
    accepted = [
        pd.DataFrame({'x': [1, 2, 3]}, index=[1, 2, 3]),
        pd.DataFrame({'x': [1, 2, 3]}, index=[4, 5, 6]),
    ]
    rejected = pd.DataFrame({'x': [4, 5, 6]}, index=[5, 6, 7])

    with Castra(template=accepted[0]) as c:
        for frame in accepted:
            c.extend(frame)
        with pytest.raises(ValueError):
            c.extend(rejected)
def test_many_default_indexes():
    """Three default-indexed frames read back as one frame with a 0..8 index."""
    chunks = [
        pd.DataFrame({'x': [1, 2, 3]}),
        pd.DataFrame({'x': [4, 5, 6]}),
        pd.DataFrame({'x': [7, 8, 9]}),
    ]
    expected = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6, 7, 8, 9]})

    with Castra(template=chunks[0]) as store:
        for chunk in chunks:
            store.extend(chunk)
        tm.assert_frame_equal(store[:], expected)
def test_extend_sequence_none():
    """``extend_sequence`` without ``freq`` on frames that share boundary values.

    The expected partition sizes (8, 3, 4) imply that rows repeating the
    previous frame's last index value are stored with the earlier partition.
    """
    values = {'a': range(5), 'b': range(5)}
    index_sets = ([1, 2, 3, 4, 5], [5, 5, 5, 6, 7], [7, 9, 10, 11, 12])
    pieces = [pd.DataFrame(values, index=labels) for labels in index_sets]
    combined = pd.concat(pieces)

    with Castra(template=combined) as c:
        c.extend_sequence(pieces)
        tm.assert_frame_equal(c[:], combined)
        assert len(c.partitions) == 3

        expected_rows = (('1--5', 8), ('6--7', 3), ('9--12', 4))
        for partition_name, row_count in expected_rows:
            loaded = c.load_partition(partition_name, ['a', 'b'])
            assert len(loaded.index) == row_count
def test_first_index_is_timestamp():
    """For a date-indexed frame, ``minimum`` and the first dask division are Timestamps.

    Skipped when ``dask.dataframe`` is not importable.
    """
    pytest.importorskip('dask.dataframe')

    dates = pd.date_range(start='20120101', periods=6)
    frame = pd.DataFrame(
        {'x': [1, 2] * 3, 'y': [1., 2.] * 3, 'z': list('abcabc')},
        columns=list('xyz'),
        index=dates,
    )
    with Castra(template=frame) as c:
        c.extend(frame)

        assert isinstance(c.minimum, pd.Timestamp)
        first_division = c.to_dask().divisions[0]
        assert isinstance(first_division, pd.Timestamp)
def floats_to_castra(input_dir, output_fname, progress=False, **kwargs):
    """Convert MITgcm float data to a Castra store.

    Parameters
    ----------
    input_dir : path
        Where to find the MITgcm output data
    output_fname : path
        Filename of the Castra store. It is passed through
        ``_maybe_add_suffix`` with ``'.castra'`` before use.
    progress : bool, optional
        Forwarded to ``MITgcmFloatData.generator``. Default is False.
    kwargs :
        Extra keyword arguments to pass to
        floater.input_formats.MITgcmFloatData
    """
    import pandas as pd
    from castra import Castra

    output_fname = _maybe_add_suffix(output_fname, '.castra')
    mfd = input.MITgcmFloatData(input_dir, **kwargs)

    c = None
    for block in mfd.generator(progress=progress, return_full_block=True):
        df = pd.DataFrame.from_records(block)
        # Create the store lazily because the first block supplies the
        # template. Test identity with None so the check does not depend on
        # how a Castra object evaluates in a boolean context.
        if c is None:
            c = Castra(output_fname, template=df)
        c.extend(df)
def test_del_with_random_dir(self):
    """A Castra created without a path exists on disk until ``__del__`` runs."""
    store = Castra(template=A)
    assert os.path.exists(store.path)

    # Call the finalizer directly rather than relying on garbage collection.
    store.__del__()
    assert not os.path.exists(store.path)
def test_readonly():
    """Every mutating call on a read-only Castra raises IOError.

    Read access must still report the same columns, partitions and minimum
    as the writable Castra on the same path.
    """
    path = tempfile.mkdtemp(prefix='castra-')
    try:
        writable = Castra(path=path, template=A)
        writable.extend(A)
        frozen = Castra(path=path, readonly=True)

        # Lambdas defer attribute lookup and the call until inside the
        # pytest.raises context, one operation at a time.
        mutations = (
            lambda: frozen.extend(B),
            lambda: frozen.extend_sequence([B]),
            lambda: frozen.flush(),
            lambda: frozen.drop(),
            lambda: frozen.save_partitions(),
            lambda: frozen.flush_meta(),
        )
        for mutate in mutations:
            with pytest.raises(IOError):
                mutate()

        assert writable.columns == frozen.columns
        assert (writable.partitions == frozen.partitions).all()
        assert writable.minimum == frozen.minimum
    finally:
        shutil.rmtree(path)
def test_get_slice(base):
    """Column selectors may be slices: ``:`` means all, ``1:`` leaves only 'y'."""
    store = Castra(path=base, template=A)
    store.extend(A)

    everything = store[:]
    tm.assert_frame_equal(everything, store[:, :])

    from_second_column = store[:, 1:]
    tm.assert_frame_equal(from_second_column, store[:][['y']])
def load(file_name):
    """Open ``./subreddit_dumps/<file_name>.castra/`` and return ``to_dask()`` of it."""
    location = './subreddit_dumps/' + file_name + '.castra/'
    return Castra(path=location).to_dask()