def compare(self, current_data, all_data, vf, version):
    """Unpack ``vf`` and compare every entry against the reference data.

    The unpacked mapping is first passed to ``self.check_min_structure``.
    Each ``data[typ][dt]`` entry is then compared with
    ``current_data[typ][dt]``, using ``self.compare_<typ>_<dt>`` when such
    an attribute exists and ``check_arbitrary`` otherwise.

    Returns the unpacked data.
    """
    # GH12277 encoding default used to be latin-1, now utf-8
    read_kwargs = {"encoding": "latin-1"} if LooseVersion(version) < "0.18.0" else {}
    data = read_msgpack(vf, **read_kwargs)

    self.check_min_structure(data, version)

    for typ, entries in data.items():
        assert typ in all_data, 'unpacked data contains extra key "{0}"'.format(typ)

        for dt, result in entries.items():
            assert dt in current_data[typ], (
                'data["{0}"] contains extra key "{1}"'.format(typ, dt)
            )
            try:
                expected = current_data[typ][dt]
            except KeyError:
                # only reachable when the assertions above are disabled
                continue

            # prefer a dedicated comparator when one is defined on self
            specific = getattr(
                self, "compare_{typ}_{dt}".format(typ=typ, dt=dt), None
            )
            if specific is None:
                check_arbitrary(result, expected)
            else:
                specific(result, expected, typ, version)

    return data
def test_string_io(self):
    """Round-trip a random frame through the in-memory and on-disk read paths.

    Covers bytes returned by ``DataFrame.to_msgpack`` (with and without an
    explicit ``None`` target), a ``BytesIO`` wrapper, the module-level
    ``to_msgpack`` and finally a file written by hand.
    """
    df = DataFrame(np.random.randn(10, 2))

    s = df.to_msgpack(None)
    result = read_msgpack(s)
    tm.assert_frame_equal(result, df)

    s = df.to_msgpack()
    result = read_msgpack(s)
    tm.assert_frame_equal(result, df)

    s = df.to_msgpack()
    result = read_msgpack(BytesIO(s))
    tm.assert_frame_equal(result, df)

    s = to_msgpack(None, df)
    result = read_msgpack(s)
    tm.assert_frame_equal(result, df)

    with ensure_clean(self.path) as p:
        s = df.to_msgpack()
        # context manager so the handle is closed even if the write fails
        with open(p, "wb") as fh:
            fh.write(s)
        result = read_msgpack(p)
        tm.assert_frame_equal(result, df)
def get_dataset(self, name, apply_exclusion=False):
    """Retrieve a dataset.

    ``name`` is first resolved through ``self.dataset_exists``. Depending on
    ``self.data_format`` the dataset is read either from a msgpack member of
    ``self.zf`` or from one or more parquet members (``<name>/0``,
    ``<name>/1``, ... when split, otherwise ``<name>`` itself).

    When ``apply_exclusion`` is true the frame is passed through
    ``self.apply_exclusion``; if that raises ``CantApplyExclusion`` the
    unfiltered frame is returned instead.

    Raises:
        ImportError: msgpack format requested but mbf-pandas-msgpack is
            not importable.
        ValueError: unknown data format, missing parquet engine, or a
            pandas too old to decode categoricals.
        KeyError: any other ``KeyError`` raised while decoding msgpack.
    """
    name = self.dataset_exists(name)

    if self.data_format == "msg_pack":
        try:
            import mbf_pandas_msgpack
        except (ImportError, AttributeError) as e:
            raise ImportError(
                "Please install mbf-pandas-msgpack to read this old school biobank file"
            ) from e
        with self.zf.open(name) as op:
            try:
                df = mbf_pandas_msgpack.read_msgpack(op.read())
            except KeyError as e:
                if "KeyError: u'category'" in str(e):
                    raise ValueError(
                        "Your pandas is too old. You need at least version 0.18"
                    ) from e
                # Previously swallowed, which left ``df`` unbound and failed
                # later with an unrelated UnboundLocalError.
                raise
    elif self.data_format == "parquet":
        # The imports only verify that at least one parquet engine exists.
        try:
            import pyarrow  # noqa: F401
        except ImportError:
            try:
                import fastparquet  # noqa: F401
            except ImportError as e:
                raise ValueError(
                    "marburg_biobank needs either pyarrow or fastparquet"
                ) from e
        members = set(self.zf.namelist())
        parts = []
        sub_name = name + "/" + str(len(parts))
        while sub_name in members:
            parts.append(self.__load_df_from_parquet(sub_name))
            sub_name = name + "/" + str(len(parts))
        if not parts:
            # not actually a unit splitted dataframe - meta?
            df = self.__load_df_from_parquet(name)
        elif len(parts) == 1:
            df = parts[0]
        else:
            # Remember which columns were categorical in any part so they
            # can be rebuilt as categoricals after concatenation.
            categoricals = {
                column
                for part in parts
                for column, dtype in part.dtypes.items()
                if dtype.name == "category"
            }
            df = pd.concat(parts)
            reps = {c: pd.Categorical(df[c]) for c in categoricals}
            if reps:
                df = df.assign(**reps)
    else:
        raise ValueError(
            "Unexpected data format. Do you need to upgrade marburg_biobank?"
        )

    if apply_exclusion:
        try:
            df = self.apply_exclusion(name, df)
        except CantApplyExclusion:
            # best effort: fall through and return the unfiltered frame
            pass
    return df
def test_iterator(self):
    """Pack several objects into one file and compare them while iterating."""
    self.setUp()

    float_frame = self.frame["float"]
    objects = [float_frame, float_frame.A, float_frame.B, None]

    with ensure_clean(self.path) as path:
        to_msgpack(path, *objects)
        unpacked = read_msgpack(path, iterator=True)
        for position, packed in enumerate(unpacked):
            check_arbitrary(packed, objects[position])
def pd_read(fn, index_column=None, **kwargs):
    """Read ``fn`` into a DataFrame, choosing the reader from its extension.

    The extension (case-insensitive) selects ``pandas.read_<ext>``; a
    trailing ``.zip`` is skipped so ``table.csv.zip`` is read with
    ``read_csv``. ``.msgpack`` files are read with
    ``mbf_pandas_msgpack.read_msgpack`` and do not receive ``kwargs``.

    Args:
        fn: path as ``str`` or ``os.PathLike``.
        index_column: if truthy, passed to ``DataFrame.set_index`` on the
            result.
        **kwargs: forwarded to the pandas reader.

    Raises:
        AttributeError: pandas has no ``read_<ext>`` function.
    """
    # fspath lets pathlib.Path inputs work; the original sliced ``fn``
    # directly, which fails for Path objects.
    stem, ext = os.path.splitext(os.fspath(fn))
    ext = ext.lower()
    if ext == ".zip":
        ext = os.path.splitext(stem)[-1].lower()

    if ext == ".msgpack":
        from mbf_pandas_msgpack import read_msgpack

        df = read_msgpack(fn)
    else:
        df = getattr(pd, f"read_{ext[1:]}")(fn, **kwargs)

    if index_column:
        df = df.set_index(index_column)
    return df
def test_invalid_arg(self):
    """``read_msgpack`` must reject inputs that are neither a path nor a buffer."""
    # GH10369
    class A(object):
        def __init__(self):
            # ``read`` exists but is an int, not a method
            self.read = 0

    for bad_source in (None, {}, A()):
        with pytest.raises(ValueError):
            read_msgpack(path_or_buf=bad_source)
def test_1_3(self):
    """The stored 1.3.0 sample file must decode to the ``supposed`` frame."""
    sample_path = "samples/sample_pandas_1.3.0.msgpack"
    loaded = mbf_pandas_msgpack.read_msgpack(sample_path)
    assert_frame_equal(loaded, supposed)
def test_iterator_with_string_io(self):
    """Pack five random frames in memory and compare them while iterating."""
    frames = [DataFrame(np.random.randn(10, 2)) for _ in range(5)]
    packed = to_msgpack(None, *frames)

    unpacked = read_msgpack(packed, iterator=True)
    for position, frame in enumerate(unpacked):
        tm.assert_frame_equal(frame, frames[position])
def encode_decode(self, x, compress=None, **kwargs):
    """Write ``x`` to a temporary msgpack file and return what is read back.

    ``compress`` is passed to ``to_msgpack`` only; ``kwargs`` are passed to
    both ``to_msgpack`` and ``read_msgpack``.
    """
    with ensure_clean(self.path) as tmp_path:
        to_msgpack(tmp_path, x, compress=compress, **kwargs)
        round_tripped = read_msgpack(tmp_path, **kwargs)
        return round_tripped