def test_importer_date(self):
    # Import a CSV containing a date column and verify the values
    # round-trip both through the dataframe API and the raw HDF5 file.
    expected_birthday_date = [
        '1990-01-01', '1980-03-04', '1970-04-05', '1960-04-05',
        '1950-04-05'
    ]
    bio = BytesIO()
    with session.Session() as s:
        importer.import_with_schema(s, bio, self.ds_name, self.schema,
                                    self.files, False, {}, {}, self.ts,
                                    chunk_row_size=self.chunk_row_size)
        ds = s.get_dataset(self.ds_name)
        df = ds.get_dataframe('schema_key')
        # dates are stored as UTC epoch timestamps
        self.assertEqual(df['birthday'].data[:].tolist(), [
            datetime.strptime(x, "%Y-%m-%d").replace(
                tzinfo=timezone.utc).timestamp()
            for x in expected_birthday_date
        ])
    # re-open the written buffer directly with h5py to check on-disk layout
    with h5py.File(bio, 'r') as hf:
        self.assertEqual(
            hf['schema_key']['birthday']['values'][:].tolist(), [
                datetime.strptime(x, "%Y-%m-%d").replace(
                    tzinfo=timezone.utc).timestamp()
                for x in expected_birthday_date
            ])
def test_read_csv_only_datetime_field(self):
    # read_csv restricted via include= to the datetime column; the parser
    # also emits the derived '<name>_day' byte-string date field.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        df = dst.create_dataframe('df')
        parsers.read_csv(self.csv_file_name, df, self.schema_dict,
                         include=['updated_at'])
        expected_updated_at_list = [
            '2020-05-12 07:00:00', '2020-05-13 01:00:00',
            '2020-05-14 03:00:00', '2020-05-15 03:00:00',
            '2020-05-16 03:00:00'
        ]
        expected_updated_at_date_list = [
            b'2020-05-12', b'2020-05-13', b'2020-05-14', b'2020-05-15',
            b'2020-05-16'
        ]
        # timestamps are stored as UTC epoch seconds
        self.assertEqual(df['updated_at'].data[:].tolist(), [
            datetime.strptime(x, "%Y-%m-%d %H:%M:%S").replace(
                tzinfo=timezone.utc).timestamp()
            for x in expected_updated_at_list
        ])
        self.assertEqual(df['updated_at_day'].data[:].tolist(),
                         expected_updated_at_date_list)
def test_ordered_map_valid_stream(self):
    """ordered_map_valid_stream gathers data values through an index
    field, writing 0 wherever the map entry is INVALID_INDEX."""
    s = session.Session()
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        map_vals = np.asarray(
            [0, 0, 0, 1, 1, 3, 3, 3, 3, 5, 5, 5, 5,
             ops.INVALID_INDEX, ops.INVALID_INDEX, 7, 7, 7],
            dtype=np.int64)
        src_vals = np.asarray([-1, -2, -3, -4, -5, -6, -8, -9],
                              dtype=np.int32)
        map_f = s.create_numeric(hf, "map_field", "int64")
        map_f.data.write(map_vals)
        data_f = s.create_numeric(hf, "data_field", "int32")
        data_f.data.write(src_vals)
        # stream with a small chunk size (4) to exercise chunk boundaries
        result = np.zeros(len(map_vals), dtype=np.int32)
        ops.ordered_map_valid_stream(data_f, map_f, result, 4)
        want = np.asarray(
            [-1, -1, -1, -2, -2, -4, -4, -4, -4,
             -6, -6, -6, -6, 0, 0, -9, -9, -9],
            dtype=np.int32)
        self.assertTrue(np.array_equal(result, want))
def test_dataframe_init(self):
    # Exercises DataFrame construction, copy-construction from another
    # dataframe, field add/get/contains, iteration order, and deletion.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        # init
        df = dst.create_dataframe('dst')
        self.assertTrue(isinstance(df, dataframe.DataFrame))
        numf = df.create_numeric('numf', 'uint32')
        # copy-construct a second dataframe from the first
        df2 = dst.create_dataframe('dst2', dataframe=df)
        self.assertTrue(isinstance(df2, dataframe.DataFrame))
        # add & set & contains
        self.assertTrue('numf' in df)
        self.assertTrue('numf' in df2)
        cat = s.create_categorical(df2, 'cat', 'int8', {'a': 1, 'b': 2})
        self.assertFalse('cat' in df)
        self.assertFalse(df.contains_field(cat))
        # item assignment copies the field into df
        df['cat'] = cat
        self.assertTrue('cat' in df)
        # list & get
        self.assertEqual(id(numf), id(df.get_field('numf')))
        self.assertEqual(id(numf), id(df['numf']))
        # list & iter
        dfit = iter(df)
        self.assertEqual('numf', next(dfit))
        self.assertEqual('cat', next(dfit))
        # del & del by field
        del df['numf']
        self.assertFalse('numf' in df)
        # deleting a field owned by a different dataframe must raise
        with self.assertRaises(
                ValueError,
                msg="This field is owned by a different dataframe"):
            df.delete_field(cat)
        self.assertFalse(df.contains_field(cat))
def tests_merge_left_compound_key(self):
    """Left merge on a two-column compound key; every left key has
    exactly one right match, so values come back in left order."""
    l_id_1 = np.asarray([0, 0, 0, 0, 1, 1, 1, 1], dtype='int32')
    l_id_2 = np.asarray([0, 1, 2, 3, 0, 1, 2, 3], dtype='int32')
    r_id_1 = np.asarray([0, 1, 0, 1, 0, 1, 0, 1], dtype='int32')
    r_id_2 = np.asarray([0, 0, 1, 1, 2, 2, 3, 3], dtype='int32')
    l_vals = ['00', '01', '02', '03', '10', '11', '12', '13']
    r_vals = ['00', '10', '01', '11', '02', '12', '03', '13']
    expected = ['00', '01', '02', '03', '10', '11', '12', '13']
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        ldf = dst.create_dataframe('ldf')
        rdf = dst.create_dataframe('rdf')
        ldf.create_numeric('l_id_1', 'int32').data.write(l_id_1)
        ldf.create_numeric('l_id_2', 'int32').data.write(l_id_2)
        ldf.create_indexed_string('l_vals').data.write(l_vals)
        rdf.create_numeric('r_id_1', 'int32').data.write(r_id_1)
        rdf.create_numeric('r_id_2', 'int32').data.write(r_id_2)
        rdf.create_indexed_string('r_vals').data.write(r_vals)
        ddf = dst.create_dataframe('ddf')
        dataframe.merge(ldf, rdf, ddf, ('l_id_1', 'l_id_2'),
                        ('r_id_1', 'r_id_2'), how='left')
        self.assertEqual(expected, ddf['l_vals'].data[:])
        self.assertEqual(expected, ddf['r_vals'].data[:])
        # matched rows must agree on BOTH components of the compound key
        self.assertEqual(ddf['l_id_1'].data[:].tolist(),
                         ddf['r_id_1'].data[:].tolist())
        # bug fix: this previously compared r_id_2 against itself,
        # which is vacuously true; compare the left key column instead
        self.assertEqual(ddf['l_id_2'].data[:].tolist(),
                         ddf['r_id_2'].data[:].tolist())
def test_streaming_sort_merge(self):
    """Sort 105 shuffled values in independent 25-element segments, then
    verify streaming_sort_merge produces a fully sorted value field and
    the matching permutation of source indices."""
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'r+', 'dst')
        hf = dst.create_dataframe('hf')
        rs = np.random.RandomState(12345678)
        length = 105
        segment_length = 25
        chunk_length = 8
        src_values = np.arange(length, dtype=np.int32)
        src_values += 1000
        rs.shuffle(src_values)
        src_v_f = s.create_numeric(hf, 'src_values', 'int32')
        src_v_f.data.write(src_values)
        src_i_f = s.create_numeric(hf, 'src_indices', 'int64')
        src_i_f.data.write(np.arange(length, dtype=np.int64))
        # pre-sort each segment independently, as the merge step expects
        for c in utils.chunks(length, segment_length):
            sorted_index = np.argsort(src_v_f.data[c[0]:c[1]])
            src_v_f.data[c[0]:c[1]] = \
                s.apply_index(sorted_index, src_v_f.data[c[0]:c[1]])
            src_i_f.data[c[0]:c[1]] = \
                s.apply_index(sorted_index, src_i_f.data[c[0]:c[1]])
        # bug fix: the two target fields were created with swapped
        # names/dtypes (index target as 'tgt_values'/int32, value target
        # as 'tgt_indices'/int64); indices are int64, values int32
        tgt_i_f = s.create_numeric(hf, 'tgt_indices', 'int64')
        tgt_v_f = s.create_numeric(hf, 'tgt_values', 'int32')
        ops.streaming_sort_merge(src_i_f, src_v_f, tgt_i_f, tgt_v_f,
                                 segment_length, chunk_length)
        self.assertTrue(
            np.array_equal(tgt_v_f.data[:], np.sort(src_values[:])))
        self.assertTrue(
            np.array_equal(tgt_i_f.data[:], np.argsort(src_values)))
def tests_merge_outer(self):
    # Outer merge keeps every key from both sides; rows with no partner
    # on one side get empty-string values from that side.
    r_id = np.asarray([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32')
    l_id = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
    r_vals = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven'
    ]
    l_vals = [
        'bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '',
        'ccc2'
    ]
    expected_left = [
        'bb1', 'bb2', 'ccc1', 'ccc2', '', '', 'dddd1', 'ggggggg1',
        'ffffff1', '', ''
    ]
    expected_right = [
        'two', 'two', 'three', 'three', 'zero', 'zero', 'four', 'seven',
        'six', 'one', 'five'
    ]
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        ldf = dst.create_dataframe('ldf')
        rdf = dst.create_dataframe('rdf')
        ldf.create_numeric('l_id', 'int32').data.write(l_id)
        ldf.create_indexed_string('l_vals').data.write(l_vals)
        rdf.create_numeric('r_id', 'int32').data.write(r_id)
        rdf.create_indexed_string('r_vals').data.write(r_vals)
        ddf = dst.create_dataframe('ddf')
        dataframe.merge(ldf, rdf, ddf, 'l_id', 'r_id', how='outer')
        self.assertEqual(expected_left, ddf['l_vals'].data[:])
        self.assertEqual(expected_right, ddf['r_vals'].data[:])
def test_ordered_merge_inner(self):
    # Inner merge of sorted byte keys: left side unique, right side has
    # duplicates; each tuple of field sources is gathered into the
    # joined row order.
    l_id = np.asarray([b'a', b'b', b'd', b'f', b'g', b'h'], dtype='S1')
    l_vals = np.asarray([100, 200, 400, 600, 700, 800])
    l_vals_2 = np.asarray([10000, 20000, 40000, 60000, 70000, 80000])
    r_id = np.asarray(
        [b'a', b'c', b'c', b'd', b'd', b'e', b'e', b'f', b'f', b'h',
         b'h'], dtype='S1')
    r_vals = np.asarray(
        [1000, 3000, 3001, 4000, 4001, 5000, 5001, 6000, 6001, 8000,
         8001])
    r_vals_2 = np.asarray([
        100000, 300001, 300000, 400001, 400000, 500001, 50000, 600001,
        600000, 800001, 800000
    ])
    # only keys a, d, f, h appear on both sides; duplicated right keys
    # duplicate the corresponding left values
    l_vals_exp = np.asarray([100, 400, 400, 600, 600, 800, 800],
                            dtype=np.int32)
    l_vals_2_exp = np.asarray(
        [10000, 40000, 40000, 60000, 60000, 80000, 80000],
        dtype=np.int32)
    r_vals_exp = np.asarray([1000, 4000, 4001, 6000, 6001, 8000, 8001],
                            dtype=np.int32)
    r_vals_2_exp = np.asarray(
        [100000, 400001, 400000, 600001, 600000, 800001, 800000],
        dtype=np.int32)
    s = session.Session()
    # actual is ((gathered left fields...), (gathered right fields...))
    actual = s.ordered_merge_inner(l_id, r_id,
                                   left_field_sources=(l_vals, l_vals_2),
                                   right_field_sources=(r_vals, r_vals_2),
                                   left_unique=True,
                                   right_unique=False)
    self.assertTrue(np.array_equal(actual[0][0], l_vals_exp))
    self.assertTrue(np.array_equal(actual[0][1], l_vals_2_exp))
    self.assertTrue(np.array_equal(actual[1][0], r_vals_exp))
    self.assertTrue(np.array_equal(actual[1][1], r_vals_2_exp))
def test_dataset_sort_index_groups(self):
    # dataset_sort_index over the compound key (a, b); the returned
    # permutation is then applied to all three fields in place.
    s = session.Session(10)
    vx = np.asarray([b'a', b'b', b'c', b'd', b'e'], dtype='S1')
    va = np.asarray([1, 2, 2, 1, 1])
    vb = np.asarray([5, 4, 3, 2, 1])
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        s.create_fixed_string(hf, 'x', 1).data.write(vx)
        s.create_numeric(hf, 'a', 'int32').data.write(va)
        s.create_numeric(hf, 'b', 'int32').data.write(vb)
        sindex = s.dataset_sort_index((hf['a'], hf['b']),
                                      np.arange(5, dtype='uint32'))
        # apply the sort permutation to each field
        s.get(hf['a']).writeable().data[:] = s.apply_index(sindex, hf['a'])
        s.get(hf['b']).writeable().data[:] = s.apply_index(sindex, hf['b'])
        s.get(hf['x']).writeable().data[:] = s.apply_index(sindex, hf['x'])
        # sorted by a first, then b within equal a
        self.assertListEqual([1, 1, 1, 2, 2],
                             s.get(hf['a']).data[:].tolist())
        self.assertListEqual([1, 2, 5, 3, 4],
                             s.get(hf['b']).data[:].tolist())
        self.assertListEqual([b'e', b'd', b'a', b'c', b'b'],
                             s.get(hf['x']).data[:].tolist())
def test_categorical_field_importer_with_small_chunk_size(self): chunk_row_size = 20 # chunk_row_size * column_count < total_bytes expected_postcode_value_list = [1, 3, 2, 0, 4] expected_key_names = [b'', b'NW1', b'E1', b'SW1P', b'NW3'] expected_key_values = [0, 1, 2, 3, 4] bio = BytesIO() with session.Session() as s: importer.import_with_schema(s, bio, self.ds_name, self.schema, self.files, True, None, None, self.ts, chunk_row_size=chunk_row_size) ds = s.get_dataset(self.ds_name) df = ds.get_dataframe('schema_key') self.assertEqual(df['postcode'].data[:].tolist(), expected_postcode_value_list) self.assertEqual(list(df['postcode'].keys.values()), expected_key_names) with h5py.File(bio, 'r') as hf: self.assertEqual( hf['schema_key']['postcode']['values'][:].tolist(), expected_postcode_value_list) #self.assertEqual(hf['schema_key']['postcode']['key_names'][:].tolist(), expected_key_names) self.assertEqual( hf['schema_key']['postcode']['key_values'][:].tolist(), expected_key_values)
def test_fixed_string_field_importer(self):
    """Fixed-string column round-trips through the importer, both via
    the dataframe API and the raw HDF5 file."""
    expected_ids = [b'E1', b'E123', b'E234', b'', b'E456']
    bio = BytesIO()
    with session.Session() as s:
        importer.import_with_schema(s, bio, self.ds_name, self.schema,
                                    self.files, False, None, None,
                                    self.ts,
                                    chunk_row_size=self.chunk_row_size)
        frame = s.get_dataset(self.ds_name).get_dataframe('schema_key')
        self.assertEqual(frame['patient_id'].data[:].tolist(),
                         expected_ids)
    # verify the on-disk representation directly
    with h5py.File(bio, 'r') as hf:
        stored = hf['schema_key']['patient_id']['values'][:].tolist()
        self.assertEqual(stored, expected_ids)
def test_indexed_string_importer_with_small_chunk_size(self): chunk_row_size = 20 # chunk_row_size * column_count < total_bytes bio = BytesIO() with session.Session() as s: importer.import_with_schema(s, bio, self.ds_name, self.schema, self.files, False, None, None, self.ts, chunk_row_size=chunk_row_size) ds = s.get_dataset(self.ds_name) df = ds.get_dataframe('schema_key') self.assertEqual(df['name'].data[:], ['a', 'bb', 'ccc', 'dddd', 'eeeee']) with h5py.File(bio, 'r') as hf: indices = hf['schema_key']['name']['index'][:] values = hf['schema_key']['name']['values'][:] self.assertListEqual(list(indices), [0, 1, 3, 6, 10, 15]) self.assertEqual(values[indices[0]:indices[1]].tobytes(), b'a') self.assertEqual(values[indices[3]:indices[4]].tobytes(), b'dddd')
def test_numeric_importer_in_allow_empty_mode(self):
    """Allow-empty import creates '<field>_valid' flag fields; a column
    whose schema disables the flag ('weight_change_valid') must not
    appear in either the dataframe or the HDF5 file."""
    bio = BytesIO()
    with session.Session() as s:
        importer.import_with_schema(s, bio, self.ds_name, self.schema,
                                    self.files, False, {}, {}, self.ts,
                                    chunk_row_size=self.chunk_row_size)
        ds = s.get_dataset(self.ds_name)
        df = ds.get_dataframe('schema_key')
        self.assertEqual(df['age_valid'].data[:].tolist(),
                         [True, True, True, True, True])
        self.assertTrue('weight_change_valid' not in df)
    with h5py.File(bio, 'r') as hf:
        # bug fix: these two checks used assertTrue(a, b), which treats
        # the expected list as a failure *message* and passes for any
        # non-empty actual value; they are now real equality assertions
        self.assertEqual(hf['schema_key']['age']['values'][:].tolist(),
                         [30, 40, 50, 60, 70])
        self.assertEqual(
            hf['schema_key']['age_valid']['values'][:].tolist(),
            [True, True, True, True, True])
        self.assertTrue(
            'weight_change_valid' not in set(hf['schema_key'].keys()))
def test_numeric_importer_with_non_numeric_value_in_strict_mode(self):
    """A non-numeric value ('5@') must abort a 'strict'-mode import
    with a descriptive ValueError."""
    TEST_CSV_CONTENTS_EMPTY_VALUE = '\n'.join(
        ('name, id', 'a, 1', 'c, 5@'))
    fd_csv, csv_file_name = tempfile.mkstemp(suffix='.csv')
    try:
        with open(csv_file_name, 'w') as fcsv:
            fcsv.write(TEST_CSV_CONTENTS_EMPTY_VALUE)
        files = {'schema_key': csv_file_name}
        bio = BytesIO()
        with self.assertRaises(ValueError) as context:
            with session.Session() as s:
                importer.import_with_schema(
                    s, bio, self.ds_name, self.schema, files, False, {},
                    {}, self.ts, chunk_row_size=self.chunk_row_size)
        self.assertEqual(
            str(context.exception),
            "Field 'id' contains values that cannot be converted to float in 'strict' mode"
        )
    finally:
        # bug fix: the mkstemp descriptor was only closed on the success
        # path and the temp file was never deleted; clean up both even
        # when an assertion above fails
        os.close(fd_csv)
        os.remove(csv_file_name)
def test_ordered_map_valid_stream(self):
    """Dataframe-backed variant: ordered_map_valid_stream gathers data
    through an index field, emitting 0 for INVALID_INDEX entries."""
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'r+', 'dst')
        frame = dst.create_dataframe('hf')
        index_values = np.asarray(
            [0, 0, 0, 1, 1, 3, 3, 3, 3, 5, 5, 5, 5,
             ops.INVALID_INDEX, ops.INVALID_INDEX, 7, 7, 7],
            dtype=np.int64)
        source_values = np.asarray([-1, -2, -3, -4, -5, -6, -8, -9],
                                   dtype=np.int32)
        index_f = s.create_numeric(frame, "map_field", "int64")
        index_f.data.write(index_values)
        source_f = s.create_numeric(frame, "data_field", "int32")
        source_f.data.write(source_values)
        # chunk size of 4 forces multiple streaming passes
        out = np.zeros(len(index_values), dtype=np.int32)
        ops.ordered_map_valid_stream(source_f, index_f, out, 4)
        want = np.asarray(
            [-1, -1, -1, -2, -2, -4, -4, -4, -4,
             -6, -6, -6, -6, 0, 0, -9, -9, -9],
            dtype=np.int32)
        self.assertTrue(np.array_equal(out, want))
def test_apply_spans_concat_field(self):
    # apply_spans_concat joins the values within each span into one
    # comma-separated entry, double-quoting values that contain commas.
    idx = np.asarray([0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2],
                     dtype=np.int32)
    vals = [
        'a', "'b'", 'what', 'some, information', 'x', '', 'foo', 'flop',
        "'dun'", "'mun'", "'race, track?'", '', "for, too", 'z', 'now!'
    ]
    # vals = ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b']
    bio = BytesIO()
    with session.Session() as s:
        spans = s.get_spans(idx)
        # results = s.apply_spans_concat(spans, vals)
        # self.assertListEqual([0, 8, 6, 9], results.tolist())
        ds = s.open_dataset(bio, "w", "ds")
        # s.apply_spans_concat(spans, vals, dest=s.create_indexed_string(ds, 'result'))
        # self.assertListEqual([0, 8, 6, 9], s.get(ds['result']).data[:].tolist())
        s.create_indexed_string(ds, 'vals').data.write(vals)
        s.apply_spans_concat(spans, s.get(ds['vals']),
                             dest=s.create_indexed_string(ds, 'result'))
        # empty entries are dropped; comma-containing entries are quoted
        self.assertListEqual([
            'a,\'b\',what,"some, information",x', 'foo,flop',
            '\'dun\',\'mun\',"\'race, track?\'","for, too",z,now!'
        ], s.get(ds['result']).data[:])
def test_ordered_inner_map_left_unique_streamed(self):
    # Streamed inner join between a unique left key field and a
    # non-unique right key field; the results are row-index maps into
    # each side for the matching pairs.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'r+', 'dst')
        hf = dst.create_dataframe('hf')
        a_ids = np.asarray(
            [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18],
            dtype=np.int64)
        b_ids = np.asarray([
            0, 1, 1, 2, 4, 5, 5, 6, 8, 9, 9, 10, 12, 13, 13, 14, 16,
            17, 17, 18
        ], dtype=np.int64)
        a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
        a_ids_f.data.write(a_ids)
        b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
        b_ids_f.data.write(b_ids)
        left_result = s.create_numeric(hf, 'left_result', 'int64')
        right_result = s.create_numeric(hf, 'right_result', 'int64')
        ops.ordered_inner_map_left_unique_streamed(a_ids_f, b_ids_f,
                                                   left_result,
                                                   right_result)
        # duplicated right keys repeat the same left row index
        left_expected = np.asarray(
            [0, 1, 1, 2, 4, 4, 5, 7, 8, 10, 11, 11, 13, 14, 14, 15],
            dtype=np.int32)
        self.assertTrue(np.array_equal(left_result.data[:],
                                       left_expected))
        right_expected = np.asarray(
            [0, 1, 2, 3, 5, 6, 7, 8, 11, 12, 13, 14, 16, 17, 18, 19],
            dtype=np.int32)
        self.assertTrue(
            np.array_equal(right_result.data[:], right_expected))
def test_merge_left_dataset(self):
    """merge_left maps right-hand ('p') values onto the left ('a') key
    order across two h5 files; unmatched left keys (700) yield zeros."""
    bio1 = BytesIO()
    with h5py.File(bio1, 'w') as src:
        s = session.Session()
        p_id = np.array([100, 200, 300, 400, 500, 600, 800, 900])
        p_val = np.array([-1, -2, -3, -4, -5, -6, -8, -9])
        a_pid = np.array([
            100, 100, 100, 200, 200, 400, 400, 400, 400, 600, 600, 600,
            700, 700, 900, 900, 900
        ])
        src.create_group('p')
        s.create_numeric(src['p'], 'id', 'int32').data.write(p_id)
        s.create_numeric(src['p'], 'val', 'int32').data.write(p_val)
        src.create_group('a')
        s.create_numeric(src['a'], 'pid', 'int32').data.write(a_pid)
    bio2 = BytesIO()
    with h5py.File(bio1, 'r') as src:
        with h5py.File(bio2, 'w') as snk:
            s.merge_left(s.get(src['a']['pid']),
                         s.get(src['p']['id']),
                         right_fields=(s.get(src['p']['val']), ),
                         right_writers=(s.create_numeric(
                             snk, 'val', 'int32'), ))
            expected = [
                -1, -1, -1, -2, -2, -4, -4, -4, -4, -6, -6, -6, 0, 0,
                -9, -9, -9
            ]
            actual = s.get(snk['val']).data[:]
            # bug fix: 'actual' is already the materialised ndarray; the
            # old code re-sliced it through the ndarray '.data'
            # memoryview (actual.data[:].tolist()).  Also removed the
            # unused 'a_val' fixture array from the setup above.
            self.assertListEqual(expected, actual.tolist())
def tests_merge_right(self):
    # Right merge: every right key appears in the output; left values
    # for unmatched right keys are empty strings.
    r_id = np.asarray([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32')
    l_id = np.asarray([2, 3, 0, 4, 7, 6, 2, 0, 3], dtype='int32')
    l_vals = [
        'bb1', 'ccc1', '', 'dddd1', 'ggggggg1', 'ffffff1', 'bb2', '',
        'ccc2'
    ]
    expected = [
        '', '', '', 'bb1', 'bb2', 'ccc1', 'ccc2', 'dddd1', '',
        'ffffff1', 'ggggggg1'
    ]
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        ldf = dst.create_dataframe('ldf')
        rdf = dst.create_dataframe('rdf')
        ldf.create_numeric('l_id', 'int32').data.write(l_id)
        ldf.create_indexed_string('l_vals').data.write(l_vals)
        rdf.create_numeric('r_id', 'int32').data.write(r_id)
        ddf = dst.create_dataframe('ddf')
        dataframe.merge(ldf, rdf, ddf, 'l_id', 'r_id', how='right')
        self.assertEqual(expected, ddf['l_vals'].data[:])
        # 'valid_l' flags rows that had a left-side match; wherever the
        # row is valid the joined key columns must agree
        valid_if_equal = (ddf['l_id'].data[:] == ddf['r_id'].data[:]) | \
            np.logical_not(ddf['valid_l'].data[:])
        self.assertTrue(np.all(valid_if_equal))
def test_indexed_string_importer(self):
    # IndexedStringImporter stores strings as a cumulative index array
    # plus a flat UTF-8 byte array; 'ä' encodes to two bytes (195, 164),
    # so entries containing it advance the index by 6 rather than 5.
    s = session.Session()
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        values = [
            '', '', '1.0.0', '', '1.0.ä', '1.0.0', '1.0.0', '1.0.0',
            '', '', '1.0.0', '1.0.0', '', '1.0.0', '1.0.ä', '1.0.0', ''
        ]
        im = fields.IndexedStringImporter(s, hf, 'x')
        im.write(values)
        f = s.get(hf['x'])
        expected = [
            '', '', '1.0.0', '', '1.0.ä', '1.0.0', '1.0.0', '1.0.0',
            '', '', '1.0.0', '1.0.0', '', '1.0.0', '1.0.ä', '1.0.0', ''
        ]
        self.assertListEqual(expected, f.data[:])
        expected = [
            0, 0, 0, 5, 5, 11, 16, 21, 26, 26, 26, 31, 36, 36, 41, 47,
            52, 52
        ]
        self.assertListEqual(expected, f.indices[:].tolist())
        # raw byte stream: 49/46/48 are '1'/'.'/'0'
        expected = [
            49, 46, 48, 46, 48, 49, 46, 48, 46, 195, 164, 49, 46, 48,
            46, 48, 49, 46, 48, 46, 48, 49, 46, 48, 46, 48, 49, 46, 48,
            46, 48, 49, 46, 48, 46, 48, 49, 46, 48, 46, 48, 49, 46, 48,
            46, 195, 164, 49, 46, 48, 46, 48
        ]
        self.assertListEqual(expected, f.values[:].tolist())
def test_dataframe_create_indexed_string(self):
    # Large indexed-string field: the data, indices, and values views
    # must stay consistent after clearing and rewriting the field.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'r+', 'dst')
        hf = dst.create_dataframe('dst')
        np.random.seed(12345678)
        values = np.random.randint(low=0, high=4, size=200000)
        # strings of 0-3 'x' characters
        svalues = [''.join(['x'] * v) for v in values]
        a = hf.create_indexed_string('a', 8)
        a.data.write(svalues)
        total = np.unique(a.data[:])
        self.assertListEqual(['', 'x', 'xx', 'xxx'], total.tolist())
        # append 'y' to every entry and rewrite the field from scratch
        strs = a.data[:]
        strs = [s + 'y' for s in strs]
        a.data.clear()
        a.data.write(strs)
        self.assertListEqual(
            ['xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy',
             'xy', 'y'], strs[:10])
        self.assertListEqual([0, 4, 7, 11, 12, 14, 15, 19, 23, 25],
                             a.indices[:10].tolist())
        # 120 == ord('x'), 121 == ord('y')
        self.assertListEqual(
            [120, 120, 120, 121, 120, 120, 121, 120, 120, 120],
            a.values[:10].tolist())
        self.assertListEqual(
            ['xxxy', 'xxy', 'xxxy', 'y', 'xy', 'y', 'xxxy', 'xxxy',
             'xy', 'y'], a.data[:10])
def test_read_csv_with_fields_out_of_order(self):
    # Schema field order differs from the CSV column order; values and
    # the derived '<field>_valid' flags must still line up correctly.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'w', 'dst')
        df = dst.create_dataframe('df')
        parsers.read_csv(self.csv_file_name, df, self.schema_dict,
                         include=['weight_change', 'height', 'BMI'])
        # 'height_valid_test' appears to be a schema-configured name for
        # height's validity flag — confirm against the schema fixture
        expected_height_list = list(
            np.asarray([170.9, 180.2, 160.5, 160.5, 161.0],
                       dtype=np.float32))
        expected_height_valid_list = [True, True, False, False, True]
        self.assertEqual(list(df['height'].data[:]),
                         expected_height_list)
        self.assertEqual(list(df['height_valid_test'].data[:]),
                         expected_height_valid_list)
        # invalid weight_change entries become the float32 minimum
        expected_weight_change_list = list(
            np.asarray(
                [21.2, utils.get_min_max('float32')[0], -17.5, -17.5,
                 2.5],
                dtype=np.float32))
        self.assertEqual(list(df['weight_change'].data[:]),
                         expected_weight_change_list)
        self.assertTrue('weight_change_valid' not in df)
        expected_BMI_list = list(
            np.asarray([20.5, 25.4, 27.2, 27.2, 20.2],
                       dtype=np.float64))
        expected_BMI_valid_list = [True, True, True, True, True]
        self.assertEqual(list(df['BMI'].data[:]), expected_BMI_list)
        self.assertEqual(list(df['BMI_valid'].data[:]),
                         expected_BMI_valid_list)
def test_dataframe_create_mem_numeric(self):
    # Arithmetic and bitwise operators on numeric fields produce new
    # fields that can be assigned back into the dataframe by key.
    bio = BytesIO()
    with session.Session() as s:
        dst = s.open_dataset(bio, 'r+', 'dst')
        df = dst.create_dataframe('dst')
        num = df.create_numeric('num', 'uint32')
        num.data.write([1, 2, 3, 4])
        self.assertEqual([1, 2, 3, 4], num.data[:].tolist())
        num2 = df.create_numeric('num2', 'uint32')
        num2.data.write([1, 2, 3, 4])
        # field op field
        df['num3'] = num + num2
        self.assertEqual([2, 4, 6, 8], df['num3'].data[:].tolist())
        # field op ndarray
        df['num4'] = num - np.array([1, 2, 3, 4])
        self.assertEqual([0, 0, 0, 0], df['num4'].data[:].tolist())
        df['num5'] = num * np.array([1, 2, 3, 4])
        self.assertEqual([1, 4, 9, 16], df['num5'].data[:].tolist())
        df['num6'] = df['num5'] / np.array([1, 2, 3, 4])
        self.assertEqual([1, 2, 3, 4], df['num6'].data[:].tolist())
        # num and num2 hold identical values, so & and | reproduce the
        # operands while ^ and x % x produce zeros
        df['num7'] = df['num'] & df['num2']
        self.assertEqual([1, 2, 3, 4], df['num7'].data[:].tolist())
        df['num8'] = df['num'] | df['num2']
        self.assertEqual([1, 2, 3, 4], df['num8'].data[:].tolist())
        df['num9'] = df['num'] ^ df['num2']
        self.assertEqual([0, 0, 0, 0], df['num9'].data[:].tolist())
        df['num10'] = df['num'] % df['num2']
        self.assertEqual([0, 0, 0, 0], df['num10'].data[:].tolist())
def test_ordered_map_to_right_left_unique_streamed(self):
    # Maps each left key to a right row index, INVALID_INDEX where the
    # key has no right-hand counterpart.
    # NOTE(review): the test name says 'left_unique' but the call is to
    # ordered_map_to_right_right_unique_streamed — confirm which API
    # name is intended.
    s = session.Session()
    bio = BytesIO()
    with h5py.File(bio, 'w') as hf:
        a_ids = np.asarray(
            [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18],
            dtype=np.int64)
        b_ids = np.asarray([
            0, 1, 1, 2, 4, 5, 5, 6, 8, 9, 9, 10, 12, 13, 13, 14, 16,
            17, 17, 18
        ], dtype=np.int64)
        a_ids_f = s.create_numeric(hf, 'a_ids', 'int64')
        a_ids_f.data.write(a_ids)
        b_ids_f = s.create_numeric(hf, 'b_ids', 'int64')
        b_ids_f.data.write(b_ids)
        left_to_right_result = s.create_numeric(hf, 'left_result',
                                                'int64')
        ops.ordered_map_to_right_right_unique_streamed(
            a_ids_f, b_ids_f, left_to_right_result)
        expected = np.asarray([
            0, 1, 3, ops.INVALID_INDEX, 5, 7, ops.INVALID_INDEX, 8, 11,
            ops.INVALID_INDEX, 12, 13, ops.INVALID_INDEX, 16, 17, 19
        ])
        self.assertTrue(
            np.array_equal(left_to_right_result.data[:], expected))
def test_match_assessment(self):
    # match_assessment pairs test rows with assessment rows per patient
    # using the created_at dates and a day window (here 5); the final
    # boolean argument tightens the match further — confirm its exact
    # semantics against match_assessment's definition.
    bio = BytesIO()
    with esess.Session() as s:
        src = s.open_dataset(bio, 'w', 'src')
        # test df: one test per patient, dates 2020-01-05..2020-01-14
        tests = src.create_dataframe('tests')
        pid = tests.create_numeric('patient_id', 'int32')
        pid.data.write([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        d = tests.create_timestamp('created_at')
        d.data.write(
            [datetime(2020, 1, i).timestamp() for i in range(5, 15)])
        pid = tests.create_numeric('result', 'int32')
        pid.data.write([3, 4, 3, 4, 3, 4, 3, 4, 3, 4])
        # assessment df: same patients, dates descending 16..7
        asmt = src.create_dataframe('assessments')
        pid = asmt.create_numeric('patient_id', 'int32')
        pid.data.write([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        d = asmt.create_timestamp('created_at')
        d.data.write([
            datetime(2020, 1, i).timestamp()
            for i in list(reversed(range(7, 17)))
        ])
        result = src.create_dataframe('result')
        match_assessment(tests, asmt, result, 5)
        self.assertListEqual(result['patient_id_l'].data[:].tolist(),
                             list([7, 8, 9]))
        result = src.create_dataframe('result2')
        match_assessment(tests, asmt, result, 5, True)
        self.assertListEqual(result['patient_id_l'].data[:].tolist(),
                             list([8]))
def test_importer_with_arg_include(self):
    # Only the columns named in 'include' should be imported for the
    # 'schema_key' table.
    include, exclude = {'schema_key': ['id', 'name']}, {}
    bio = BytesIO()
    with session.Session() as s:
        importer.import_with_schema(s, bio, self.ds_name, self.schema,
                                    self.files, False, include, exclude,
                                    self.ts,
                                    chunk_row_size=self.chunk_row_size)
        ds = s.get_dataset(self.ds_name)
        df = ds.get_dataframe('schema_key')
        self.assertEqual(df['id'].data[:].tolist(), [1, 2, 3, 4, 5])
        self.assertEqual(df['name'].data[:],
                         ['a', 'bb', 'ccc', 'dddd', 'eeeee'])
    with h5py.File(bio, 'r') as hf:
        self.assertListEqual(list(hf.keys()), ['schema_key'])
        # superset check: derived fields may sit alongside the
        # requested columns
        self.assertTrue(
            set(hf['schema_key'].keys()) >= set(['id', 'name']))
        self.assertEqual(hf['schema_key']['id']['values'][:].tolist(),
                         [1, 2, 3, 4, 5])
        self.assertEqual(hf['schema_key']['name']['index'][:].tolist(),
                         [0, 1, 3, 6, 10, 15])
def test_dataframe_create_with_dataframe(self):
    # Copy-constructing df2 from df1 must deep-copy every field type:
    # subsequent mutations of df2's fields must never affect df1.
    iscontents1 = ['a', 'bb', 'ccc', 'dddd']
    iscontents2 = ['eeee', 'fff', 'gg', 'h']
    fscontents1 = [s.encode() for s in iscontents1]
    fscontents2 = [s.encode() for s in iscontents2]
    ccontents1 = np.array([1, 2, 2, 1], dtype=np.int8)
    ccontents2 = np.array([2, 1, 1, 2], dtype=np.int8)
    ncontents1 = np.array([1, 2, 3, 4], dtype=np.int32)
    ncontents2 = np.array([5, 6, 7, 8], dtype=np.int32)
    from datetime import datetime as D
    tcontents1 = [D(2020, 1, 1), D(2020, 1, 2), D(2020, 1, 3),
                  D(2020, 1, 4)]
    tcontents1 = np.array([d.timestamp() for d in tcontents1])
    tcontents2 = [D(2021, 1, 1), D(2021, 1, 2), D(2021, 1, 3),
                  D(2021, 1, 4)]
    tcontents2 = np.array([d.timestamp() for d in tcontents2])
    bio = BytesIO()
    with session.Session() as s:
        ds = s.open_dataset(bio, 'w', 'ds')
        df1 = ds.create_dataframe('df1')
        df1.create_indexed_string('is_foo').data.write(iscontents1)
        df1.create_fixed_string('fs_foo', 4).data.write(fscontents1)
        df1.create_categorical('c_foo', 'int8', {
            b'a': 1,
            b'b': 2
        }).data.write(ccontents1)
        df1.create_numeric('n_foo', 'uint32').data.write(ncontents1)
        df1.create_timestamp('t_foo').data.write(tcontents1)
        df2 = ds.create_dataframe('df2', dataframe=df1)
        # indexed string: overwrite df2's copy, df1 stays unchanged
        self.assertListEqual(iscontents1, df1['is_foo'].data[:])
        self.assertListEqual(iscontents1, df2['is_foo'].data[:])
        df2['is_foo'].data.clear()
        df2['is_foo'].data.write(iscontents2)
        self.assertListEqual(iscontents1, df1['is_foo'].data[:])
        self.assertListEqual(iscontents2, df2['is_foo'].data[:])
        # fixed string
        self.assertListEqual(fscontents1, df1['fs_foo'].data[:].tolist())
        self.assertListEqual(fscontents1, df2['fs_foo'].data[:].tolist())
        df2['fs_foo'].data[:] = fscontents2
        self.assertListEqual(fscontents1, df1['fs_foo'].data[:].tolist())
        self.assertListEqual(fscontents2, df2['fs_foo'].data[:].tolist())
        # categorical: both the values and the key mapping are copied
        self.assertListEqual(ccontents1.tolist(),
                             df1['c_foo'].data[:].tolist())
        self.assertListEqual(ccontents1.tolist(),
                             df2['c_foo'].data[:].tolist())
        df2['c_foo'].data[:] = ccontents2
        self.assertListEqual(ccontents1.tolist(),
                             df1['c_foo'].data[:].tolist())
        self.assertListEqual(ccontents2.tolist(),
                             df2['c_foo'].data[:].tolist())
        self.assertDictEqual({1: b'a', 2: b'b'}, df1['c_foo'].keys)
        self.assertDictEqual({1: b'a', 2: b'b'}, df2['c_foo'].keys)
        # numeric
        self.assertListEqual(ncontents1.tolist(),
                             df1['n_foo'].data[:].tolist())
        self.assertListEqual(ncontents1.tolist(),
                             df2['n_foo'].data[:].tolist())
        df2['n_foo'].data[:] = np.array(ncontents2, dtype=np.uint32)
        self.assertListEqual(ncontents1.tolist(),
                             df1['n_foo'].data[:].tolist())
        self.assertListEqual(ncontents2.tolist(),
                             df2['n_foo'].data[:].tolist())
        # timestamp
        self.assertListEqual(tcontents1.tolist(),
                             df1['t_foo'].data[:].tolist())
        self.assertListEqual(tcontents1.tolist(),
                             df2['t_foo'].data[:].tolist())
        df2['t_foo'].data[:] = np.array(tcontents2, dtype=np.float64)
        self.assertListEqual(tcontents1.tolist(),
                             df1['t_foo'].data[:].tolist())
        self.assertListEqual(tcontents2.tolist(),
                             df2['t_foo'].data[:].tolist())
def join_tests():
    """
    Merge test records onto the previously merged (assessments, vaccine)
    dataframe and keep only rows whose test falls within 10 days after
    the vaccine date.

    Note: the filter below *keeps* rows where the test occurred 0-10
    days after vaccination (apply_filter retains rows where the mask is
    true), despite the original description saying "filter out".
    """
    with sess.Session() as s:
        # open related datasets
        src = s.open_dataset(ADATA, 'r', 'asmt')
        tests_src = src['tests']
        dst = s.open_dataset(DSTDATA, 'r+', 'dst')
        vacc = dst['asmt_v']
        tests_m = dst.create_dataframe('tests_m')
        dataframe.merge(vacc, tests_src, tests_m, 'patient_id_l',
                        'patient_id', how='inner')
        # date_taken_specific_l is vaccine date, date_taken_specific_r
        # is the test date
        test_filter = tests_m['date_taken_specific_l'] < tests_m[
            'date_taken_specific_r']  # test after vaccine
        test_filter &= tests_m['date_taken_specific_l'] > (
            tests_m['date_taken_specific_r'] -
            3600 * 24 * 10)  # within 10 days
        tests_m.apply_filter(test_filter)
def test_dataset_init_with_data(self):
    """Open an existing h5 file as a dataset, then copy, delete, and
    set dataframes between two datasets."""
    bio = BytesIO()
    with session.Session() as s:
        # build an existing datafile first; the 'with' block guarantees
        # the file is closed (and flushed) even if a write fails — the
        # previous code used an unguarded open()/close() pair
        with h5py.File(bio, 'w') as h5file:
            hgrp1 = h5file.create_group("grp1")
            num1 = s.create_numeric(hgrp1, 'num1', 'uint32')
            num1.data.write(np.array([0, 1, 2, 3, 4]))
        # read existing datafile
        dst = s.open_dataset(bio, 'r+', 'dst')
        self.assertTrue(isinstance(dst['grp1'], DataFrame))
        self.assertEqual(s.get(dst['grp1']['num1']).data[:].tolist(),
                         [0, 1, 2, 3, 4])
        # add dataframe copied from a second dataset
        bio2 = BytesIO()
        ds2 = s.open_dataset(bio2, 'w', 'ds2')
        df2 = ds2.create_dataframe('df2')
        fs = df2.create_fixed_string('fs', 1)
        fs.data.write([b'a', b'b', b'c', b'd'])
        dst.copy(df2, 'df2')
        self.assertTrue(isinstance(dst['df2'], DataFrame))
        self.assertEqual([b'a', b'b', b'c', b'd'],
                         dst['df2']['fs'].data[:].tolist())
        # delete removes it from both the dataset and the backing file
        del dst['df2']
        self.assertTrue(len(dst.keys()) == 1)
        self.assertTrue(len(dst._file.keys()) == 1)
        # set dataframe (this is a copy between datasets)
        dst['df3'] = df2
        self.assertTrue(isinstance(dst['df3'], DataFrame))
        self.assertEqual([b'a', b'b', b'c', b'd'],
                         dst['df3']['fs'].data[:].tolist())
def asmt_merge_vacc():
    """
    Merge the assessment dataframe with the vaccine dataframe and keep
    assessments made with symptoms within 10 days after the vaccine
    date, after first discarding subjects with a symptom-free
    assessment in the 10 days before vaccination.

    Column naming: 'date_taken_specific' is the vaccine date and
    'updated_at_l' the assessment date (per the inline comments below).
    """
    with sess.Session() as s:
        # open related datasets
        src = s.open_dataset(ADATA, 'r', 'asmt')
        asmt = src['assessments']
        vacc = s.open_dataset(VDATA, 'r', 'vacc')
        dst = s.open_dataset(DSTDATA, 'w', 'dst')
        # filter vaccine type: keep only brands 2 and 3
        vbrand_filter = (vacc['vaccine_doses']['brand'].data[:] == 2) | \
            (vacc['vaccine_doses']['brand'].data[:] == 3)
        dvacc = dst.create_dataframe('vacc')
        vacc['vaccine_doses'].apply_filter(vbrand_filter, ddf=dvacc)
        # join asmt with vaccine using patient_id, write to result
        asmt_v = dst.create_dataframe('asmt_v')
        dataframe.merge(asmt, dvacc, asmt_v, 'patient_id', 'patient_id',
                        how='inner')
        # filter healthy asmt record within 10 days of vaccine date
        symp_list = [
            'persistent_cough', 'fever', 'fatigue', 'delirium',
            'shortness_of_breath', 'diarrhoea', 'abdominal_pain',
            'chest_pain', 'hoarse_voice', 'skipped_meals',
            'loss_of_smell', 'headache', 'sore_throat',
            'chills_or_shivers', 'eye_soreness', 'nausea',
            'blisters_on_feet', 'unusual_muscle_pains', 'runny_nose',
            'red_welts_on_face_or_lips', 'dizzy_light_headed',
            'swollen_glands', 'sneezing', 'skin_burning', 'earache',
            'altered_smell', 'brain_fog', 'irregular_heartbeat'
        ]
        # seed the mask, then OR in every symptom column (the seed
        # column is also in symp_list, so the first iteration is a no-op)
        symp_filter = asmt_v['persistent_cough'].data[:] > 1  # has symptom
        for symptom1 in symp_list:
            symp_filter |= asmt_v[symptom1].data[:] > 1  # has symptom
        symp_filter = ~symp_filter  # has no symptom
        symp_filter &= asmt_v['date_taken_specific'].data[:] > asmt_v[
            'updated_at_l'].data[:]  # asmt before vaccine
        symp_filter &= asmt_v['updated_at_l'].data[:] > asmt_v[
            'date_taken_specific'].data[:] - 3600 * 24 * 10  # 10 days
        asmt_v.apply_filter(symp_filter)
        # has symptom after vaccine
        yes_symp_filter = asmt_v['persistent_cough'].data[:] > 1
        for symptom1 in symp_list:
            yes_symp_filter |= asmt_v[symptom1].data[:] > 1  # has symptom
        yes_symp_filter &= asmt_v['date_taken_specific'].data[:] < asmt_v[
            'updated_at_l'].data[:]  # assessment after vaccine
        yes_symp_filter &= asmt_v[
            'date_taken_specific'].data[:] + 3600 * 24 * 10 > asmt_v[
                'updated_at_l'].data[:]  # assessment within 10 days of vaccine
        asmt_v.apply_filter(yes_symp_filter)
        print("finish asmt join vaccine.")