def testReadAllWithErrorAndWarn(self):
    self._create_file(
        columns=self.dict_single_column,
        prefix_path=f'{IOTest.io_test_dir}/iotest_single_column')
    self._create_file(
        columns=self.dict_single_column,
        prefix_path=f'{IOTest.io_test_dir}/iotest_single_column_dupe')

    # Make sure we can read ok
    dataset = ak.read_all(filenames=[
        f'{IOTest.io_test_dir}/iotest_single_column_LOCALE0000',
        f'{IOTest.io_test_dir}/iotest_single_column_dupe_LOCALE0000'
    ])
    self.assertIsNotNone(dataset, "Expected dataset to be populated")

    # Change the name of the first file so that a missing-file error is raised
    with self.assertRaises(RuntimeError):
        dataset = ak.read_all(filenames=[
            f'{IOTest.io_test_dir}/iotest_MISSING_single_column_LOCALE0000',
            f'{IOTest.io_test_dir}/iotest_single_column_dupe_LOCALE0000'
        ])

    # Run the same missing-file test, but this time with allow_errors=True,
    # which downgrades the read error to a RuntimeWarning
    with pytest.warns(
            RuntimeWarning,
            match=r"There were .* errors reading files on the server.*"):
        dataset = ak.read_all(
            filenames=[
                f'{IOTest.io_test_dir}/iotest_MISSING_single_column_LOCALE0000',
                f'{IOTest.io_test_dir}/iotest_single_column_dupe_LOCALE0000'
            ],
            strictTypes=False,
            allow_errors=True)
    self.assertIsNotNone(dataset, "Expected dataset to be populated")
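# The _create_file helper used throughout this section is not shown here. A
# minimal sketch of what it is assumed to do (write a dict of pdarrays to one
# HDF5 file per locale via ak.save_all) follows; the helper's body is inferred
# from its call sites, not taken from the original source:
def _create_file(self, columns, prefix_path):
    '''
    Persists the supplied dict of pdarrays to HDF5, generating one file
    per locale named <prefix_path>_LOCALE<n>.
    '''
    ak.save_all(columns, prefix_path)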
def testHdfUnsanitizedNames(self):
    # Test when quotes are part of the dataset name
    my_arrays = {'foo"0"': ak.arange(100), 'bar"': ak.arange(100)}
    with tempfile.TemporaryDirectory(dir=IOTest.io_test_dir) as tmp_dirname:
        ak.save_all(my_arrays, f"{tmp_dirname}/bad_dataset_names")
        ak.read_all(f"{tmp_dirname}/bad_dataset_names*")
def testReadAllWithGlobDupe(self):
    '''
    Creates two sets of 2..n files (depending upon the number of
    arkouda_server locales) containing identically named datasets backed by
    the same pdarrays, reads all of the files with the glob feature of the
    read_all method, and confirms the returned datasets and embedded
    pdarrays match the input dataset and pdarrays

    :return: None
    :raise: AssertionError if the input and returned datasets don't match
    '''
    self._create_file(columns=self.dict_columns,
                      prefix_path='{}/iotest_dict_columns'.format(
                          IOTest.io_test_dir))
    self._create_file(columns=self.dict_columns,
                      prefix_path='{}/iotest_dict_columns_dupe'.format(
                          IOTest.io_test_dir))
    dataset = ak.read_all(
        filenames='{}/iotest_dict_columns*'.format(IOTest.io_test_dir))
    self.assertEqual(3, len(list(dataset.keys())))
    self.assertEqual(self.int_tens_pdarray.all(),
                     dataset['int_tens_pdarray'].all())
    self.assertEqual(self.int_hundreds_pdarray.all(),
                     dataset['int_hundreds_pdarray'].all())
    self.assertEqual(self.float_pdarray.all(),
                     dataset['float_pdarray'].all())
def testReadAllWithGlob(self):
    '''
    Creates 2..n files depending upon the number of arkouda_server locales,
    reads the files back with the glob feature of the read_all method, and
    confirms the datasets and embedded pdarrays match the input dataset and
    pdarrays

    :return: None
    :raise: AssertionError if the input and returned datasets don't match
    '''
    self._create_file(columns=self.dict_columns,
                      prefix_path='{}/iotest_dict_columns'.format(
                          IOTest.io_test_dir))
    retrieved_columns = ak.read_all(
        filenames='{}/iotest_dict_columns*'.format(IOTest.io_test_dir))

    itp = self.list_columns[0].to_ndarray()
    itp.sort()
    ritp = retrieved_columns['int_tens_pdarray'].to_ndarray()
    ritp.sort()
    ihp = self.list_columns[1].to_ndarray()
    ihp.sort()
    rihp = retrieved_columns['int_hundreds_pdarray'].to_ndarray()
    rihp.sort()
    fp = self.list_columns[2].to_ndarray()
    fp.sort()
    rfp = retrieved_columns['float_pdarray'].to_ndarray()
    rfp.sort()

    self.assertEqual(4, len(list(retrieved_columns.keys())))
    self.assertTrue((itp == ritp).all())
    self.assertTrue((ihp == rihp).all())
    self.assertTrue((fp == rfp).all())
    self.assertEqual(len(self.bool_pdarray),
                     len(retrieved_columns['bool_pdarray']))
def testStrictTypes(self):
    # Write pairs of integer and float datasets whose precision and
    # signedness vary across files, then confirm read_all raises unless
    # strictTypes=False
    N = 100
    prefix = '{}/strict-type-test'.format(IOTest.io_test_dir)
    inttypes = [np.uint32, np.int64, np.uint16, np.int16]
    floattypes = [np.float32, np.float64, np.float32, np.float64]
    for i, (it, ft) in enumerate(zip(inttypes, floattypes)):
        with h5py.File('{}-{}'.format(prefix, i), 'w') as f:
            idata = np.arange(i * N, (i + 1) * N, dtype=it)
            f.create_dataset('integers', data=idata)
            fdata = np.arange(i * N, (i + 1) * N, dtype=ft)
            f.create_dataset('floats', data=fdata)

    with self.assertRaises(RuntimeError) as cm:
        a = ak.read_all(prefix + '*')
    self.assertTrue('Inconsistent precision or sign' in cm.exception.args[0])

    # With strictTypes=False the mismatched types are read into a common type
    a = ak.read_all(prefix + '*', strictTypes=False)
    self.assertTrue((a['integers'] == ak.arange(len(inttypes) * N)).all())
    self.assertTrue(np.allclose(a['floats'].to_ndarray(),
                                np.arange(len(floattypes) * N,
                                          dtype=np.float64)))
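# (Hypothetical helper, not part of the original tests.) Prints the dtype of
# each dataset written by testStrictTypes, making the uint32/int64/uint16/int16
# and float32/float64 mix that triggers the 'Inconsistent precision or sign'
# error visible on disk:
def _show_strict_type_mix(self):
    for i in range(4):
        with h5py.File('{}/strict-type-test-{}'.format(
                IOTest.io_test_dir, i), 'r') as f:
            print(i, f['integers'].dtype, f['floats'].dtype)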
def testReadAllTiming(self):
    # Renamed from testReadAll to avoid shadowing the explicit-filename test
    # of the same name below; times iterative vs. batch read_all calls
    ak.verbose = False  # client verbose flag
    cwd = os.getcwd()
    allfiles = glob(cwd + '/../converter/netflow_day-*.hdf')
    print(allfiles)

    # Time reading the files one dataset at a time
    start = time.time()
    dictionary1 = ak.read_all(allfiles, iterative=True)
    end = time.time()
    t1 = end - start
    print("read_all(iterative=True) seconds: %.3f" % (t1))
    for key, value in dictionary1.items():
        print(key, type(value), value, len(value))

    # Time reading all of the files in a single batch
    start = time.time()
    dictionary2 = ak.read_all(allfiles)
    end = time.time()
    t2 = end - start
    print("read_all() seconds: %.3f" % (t2))
    for key, value in dictionary2.items():
        print(key, type(value), value, len(value))
def testSaveStringsDataset(self):
    # Create, save, and load Strings dataset
    strings_array = ak.array(
        ['testing string{}'.format(num) for num in list(range(0, 25))])
    strings_array.save('{}/strings-test'.format(IOTest.io_test_dir),
                       dataset='strings')
    r_strings_array = ak.load('{}/strings-test'.format(IOTest.io_test_dir),
                              dataset='strings')

    strings = strings_array.to_ndarray()
    strings.sort()
    r_strings = r_strings_array.to_ndarray()
    r_strings.sort()
    self.assertTrue((strings == r_strings).all())

    # Read a part of a saved Strings dataset from one hdf5 file
    r_strings_subset = ak.read_all(
        filenames='{}/strings-test_LOCALE0000'.format(IOTest.io_test_dir))
    self.assertIsNotNone(r_strings_subset)
    self.assertTrue(isinstance(r_strings_subset[0], str))
    self.assertIsNotNone(
        ak.read_hdf(filenames='{}/strings-test_LOCALE0000'.format(
                        IOTest.io_test_dir),
                    dsetName='strings/values'))
    self.assertIsNotNone(
        ak.read_hdf(filenames='{}/strings-test_LOCALE0000'.format(
                        IOTest.io_test_dir),
                    dsetName='strings/segments'))

    # Repeat the test using the calc_string_offsets=True option to have the
    # server calculate the offsets array
    r_strings_subset = ak.read_all(
        filenames=f'{IOTest.io_test_dir}/strings-test_LOCALE0000',
        calc_string_offsets=True)
    self.assertIsNotNone(r_strings_subset)
    self.assertTrue(isinstance(r_strings_subset[0], str))
    self.assertIsNotNone(
        ak.read_hdf(
            filenames=f'{IOTest.io_test_dir}/strings-test_LOCALE0000',
            dsetName='strings/values',
            calc_string_offsets=True))
    self.assertIsNotNone(
        ak.read_hdf(
            filenames=f'{IOTest.io_test_dir}/strings-test_LOCALE0000',
            dsetName='strings/segments',
            calc_string_offsets=True))
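# The 'strings/values' and 'strings/segments' reads above reflect how arkouda
# persists a Strings object: an HDF5 group holding the raw bytes and the
# per-string offsets. A hypothetical helper (not in the original tests) to
# confirm that layout with h5py:
def _inspect_strings_layout(self):
    with h5py.File('{}/strings-test_LOCALE0000'.format(
            IOTest.io_test_dir), 'r') as f:
        f.visit(print)  # expect: strings, strings/segments, strings/values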
def testReadAll(self):
    '''
    Creates 2..n files depending upon the number of arkouda_server locales,
    reads the files with an explicit list of file names passed to the
    read_all method, and confirms the datasets and embedded pdarrays match
    the input dataset and pdarrays

    :return: None
    :raise: AssertionError if the input and returned datasets don't match
    '''
    self._create_file(columns=self.dict_columns,
                      prefix_path='{}/iotest_dict_columns'.format(
                          IOTest.io_test_dir))
    dataset = ak.read_all(filenames=[
        '{}/iotest_dict_columns_LOCALE0000'.format(IOTest.io_test_dir)])
    self.assertEqual(4, len(list(dataset.keys())))
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: {} <hostname> <port> [<HDF5_filenames>]".format(
            sys.argv[0]))
        sys.exit()
    ak.connect(sys.argv[1], sys.argv[2])
    ak.verbose = False  # client verbose flag
    cwd = os.getcwd()
    allfiles = glob(cwd + '/../converter/netflow_day-*.hdf')
    if len(sys.argv) > 3:
        allfiles = sys.argv[3:]

    start = time.time()
    dictionary1 = ak.read_all(allfiles, iterative=True)
    end = time.time()
    t1 = end - start
    print("read_all(iterative=True) seconds: %.3f" % (t1))
    for key, value in dictionary1.items():
        print(key, type(value), value, len(value))

    start = time.time()
    dictionary2 = ak.read_all(allfiles)
    end = time.time()
    t2 = end - start
    print("read_all() seconds: %.3f" % (t2))
    for key, value in dictionary2.items():
        print(key, type(value), value, len(value))

    ak.disconnect()
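# Example invocation, assuming this script is saved as read_all_test.py and an
# arkouda_server is listening on localhost:5555 (the file paths are
# illustrative):
#
#   python3 read_all_test.py localhost 5555 /data/netflow_day-01.hdf /data/netflow_day-02.hdf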