def test_recordbatch_no_fields():
    batch = pa.record_batch([], [])
    assert len(batch) == 0
    assert batch.num_rows == 0
    assert batch.num_columns == 0
def test_recordbatch_empty_metadata():
    data = [pa.array(range(5)),
            pa.array([-10, -5, 0, 5, 10])]
    batch = pa.record_batch(data, ['c0', 'c1'])
    assert batch.schema.metadata is None
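# Not from the original suite: a minimal sketch of how schema-level metadata
# could be attached, since the default above is None. replace_schema_metadata
# returns a new batch, and keys/values are stored as bytes.
batch_with_md = pa.record_batch([pa.array(range(5))],
                                ['c0']).replace_schema_metadata({'origin': 'example'})
assert batch_with_md.schema.metadata == {b'origin': b'example'}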
def make_batches():
    schema = make_schema()
    return [
        pa.record_batch([[[1], [2, 42]]], schema),
        pa.record_batch([[None, [], [5, 6]]], schema),
    ]
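# make_schema() is defined elsewhere in the original module; a minimal sketch
# consistent with the list-valued column used above and in make_batch() below
# (the field name "values" is an assumption). The variant that batches plain
# integers would use pa.int64() as the field type instead.
def make_schema():
    return pa.schema([pa.field("values", pa.list_(pa.int64()))])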
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([1, 2])
    schema = pa.schema([pa.field('f0', pa.utf8())])
    with pytest.raises(NotImplementedError):
        pa.record_batch([arr], schema=schema)
def make_batches():
    schema = make_schema()
    batch1 = pa.record_batch([[1, 2, 3]], schema=schema)
    batch2 = pa.record_batch([[4, 5]], schema=schema)
    return [batch1, batch2]
def make_batch():
    return pa.record_batch([[[1], [2, 42]]], make_schema())
#!/usr/bin/env python3
import pyarrow as pa
from pyarrow import feather

with open('checksum.feather', 'wb') as out_f:
    property_batch = pa.record_batch([[], []], names=['mtime', 'checksum'])
    property_table = pa.Table.from_batches([property_batch])
    feather.write_feather(property_table, out_f, compression='zstd')
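# Not part of the original script: a minimal read-back check (a sketch,
# assuming it runs in the directory the file was written to). By default
# feather.read_feather returns a pandas DataFrame.
df = feather.read_feather('checksum.feather')
assert list(df.columns) == ['mtime', 'checksum']
assert len(df.index) == 0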
def make_batch(self):
    return pa.record_batch([[[1], [], None, [2, 42]]], self.make_schema())
def _WriteTfRecord(path, records):
  with tf.io.TFRecordWriter(path) as w:
    for r in records:
      w.write(r)


class RecordBasedTfxioTest(parameterized.TestCase):

  def testReadTfRecord(self):
    tmp_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
    file1 = os.path.join(tmp_dir, "tfrecord1")
    file1_records = [b"aa", b"bb"]
    _WriteTfRecord(file1, file1_records)
    file2 = os.path.join(tmp_dir, "tfrecord2")
    file2_records = [b"cc", b"dd"]
    _WriteTfRecord(file2, file2_records)

    def _CheckRecords(actual, expected):
      self.assertEqual(set(actual), set(expected))

    # Test reading multiple file patterns.
    with beam.Pipeline() as p:
      record_pcoll = p | record_based_tfxio.ReadTfRecord(
          [file1 + "*", file2 + "*"])
      beam_test_util.assert_that(
          record_pcoll,
          lambda actual: _CheckRecords(actual, file1_records + file2_records))

  @parameterized.named_parameters(*[
      dict(
          testcase_name="simple",
          input_record_batch=pa.record_batch([pa.array([[1], [2]])],
                                             ["feature1"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array(
              [[b"aa"], [b"bb"]], type=pa.large_list(pa.large_binary()))),
      dict(
          testcase_name="with_record_index",
          input_record_batch=pa.record_batch(
              [pa.array([[1], [2], [3]]),
               pa.array([[0], [1], [1]])], ["feature1", "record_index"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array(
              [[b"aa"], [b"bb"], [b"bb"]],
              type=pa.large_list(pa.large_binary())),
          record_index_column_name="record_index",
      ),
      dict(
          testcase_name="with_record_index_empty_input",
          input_record_batch=pa.record_batch([
              pa.array([], type=pa.list_(pa.int64())),
              pa.array([], type=pa.large_list(pa.int32()))
          ], ["feature1", "record_index"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array(
              [], type=pa.large_list(pa.large_binary())),
          record_index_column_name="record_index",
      )
  ])
  def testAppendRawRecordColumn(self,
                                input_record_batch,
                                raw_records,
                                expected_raw_record_column,
                                record_index_column_name=None):
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch,
        column_name=column_name,
        raw_records=raw_records,
        record_index_column_name=record_index_column_name)
    self.assertEqual(output_record_batch.num_columns,
                     input_record_batch.num_columns + 1)
    for i in range(input_record_batch.num_columns):
      self.assertTrue(
          input_record_batch.column(i).equals(output_record_batch.column(i)))
    self.assertEqual(
        output_record_batch.schema.names[output_record_batch.num_columns - 1],
        column_name)
    self.assertTrue(
        output_record_batch.column(
            output_record_batch.num_columns - 1).equals(
                expected_raw_record_column))

  def testOverridableRecordBasedTFXIO(self):
    tmp_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
    file1 = os.path.join(tmp_dir, "tfrecord1")
    file1_records = [b"aa", b"bb"]
    _WriteTfRecord(file1, file1_records)

    def _CheckRecords(actual, expected):
      for a, e in zip(actual, expected):
        self.assertDictEqual(a.to_pydict(), e)

    @beam.typehints.with_input_types(Any)
    @beam.typehints.with_output_types(bytes)
    def _RawRecordBeamSource(pipeline: Any):
      return pipeline | beam.io.ReadFromTFRecord(file1 + "*")

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _RawRecordsToRecordBatch(pcoll, batch_size):
      batch_size = 1 if not batch_size else batch_size

      class _CreateRBDoFn(beam.DoFn):

        def process(self, examples):
          return [
              pa.RecordBatch.from_arrays([pa.array(examples)],
                                         ["column_name"])
          ]

      return (pcoll
              | beam.BatchElements(batch_size)
              | beam.ParDo(_CreateRBDoFn()))

    tfxio = record_based_tfxio.OverridableRecordBasedTFXIO(
        telemetry_descriptors=None,
        logical_format="tfrecord",
        physical_format="tf_example",
        raw_record_beam_source=beam.ptransform_fn(_RawRecordBeamSource),
        raw_record_to_record_batch=beam.ptransform_fn(_RawRecordsToRecordBatch))
    expected = [{"column_name": [b"aa"]}, {"column_name": [b"bb"]}]
    with beam.Pipeline() as p:
      record_pcoll = p | tfxio.BeamSource()
      beam_test_util.assert_that(
          record_pcoll, lambda actual: _CheckRecords(actual, expected))
import pyarrow as pa
from tqdm import tqdm


def readline(file, terminator=b'\n'):
    bs = b''
    while True:
        chunk = file.read(80)
        if not chunk:
            # EOF: return what we have instead of looping forever.
            return bs
        bs += chunk
        index = bs.find(terminator)
        if index >= 0:  # find() returns -1 when absent; 0 is a valid hit
            # Rewind so the next read starts just after the terminator.
            file.seek(index + 1 - len(bs), 1)
            return bs[:index + 1]


file_path = "corpus.txt"  # assumption: one whitespace-tokenized sentence per line
out_path = "test.arrow"
out = pa.OSFile(out_path, "wb")
schema = pa.schema(fields=[pa.field("sentence", pa.list_(pa.string()))])
writer = pa.ipc.RecordBatchFileWriter(out, schema=schema)
with open(file_path, 'r') as f:
    for line in tqdm(f):
        # One single-row batch per line: a list<string> column of tokens.
        batch = pa.record_batch(data=[pa.array([line.strip().split(" ")])],
                                schema=schema)
        writer.write(batch)
writer.close()
out.close()

out_map = pa.memory_map(out_path, mode='rb')
reader = pa.ipc.open_file(out_map)
pa_table = reader.read_all()
print(pa_table[0][24])  # 25th row of the "sentence" column
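# An alternative read path for the file written above (a sketch): iterate the
# IPC file batch by batch instead of materializing the whole table at once.
with pa.memory_map(out_path, mode='rb') as source:
    reader = pa.ipc.open_file(source)
    for i in range(reader.num_record_batches):
        sentence_batch = reader.get_batch(i)  # one single-row batch per line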
def make_extension_batch():
    schema = make_extension_schema()
    ext_col = schema[0].type.wrap_array(
        pa.array([b"foo", b"bar"], type=pa.binary(3)))
    return pa.record_batch([ext_col], schema)
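# make_extension_schema() is defined elsewhere in the original suite; a
# minimal sketch consistent with the binary(3) storage used above (the class
# name, extension name, and field name are assumptions):
class Binary3Type(pa.ExtensionType):

    def __init__(self):
        super().__init__(pa.binary(3), "example.binary3")

    def __arrow_ext_serialize__(self):
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()


def make_extension_schema():
    return pa.schema([pa.field("ext", Binary3Type())])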
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df,
                     conf_synop_staion_df, conf_temp_pilot_staion_df, debug):
    warno = 189
    cccc_set = set([
        re.sub('^.*/', '', re.sub('/alphanumeric/.*$', '', in_file))
        for in_file in in_file_list
    ])
    cat_subcat_set = set([
        re.search(r'^[^/]*/[^/]*/',
                  re.sub('^.*/alphanumeric/', '', in_file)).group().rstrip('/')
        for in_file in in_file_list
    ])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            cat = re.sub('/.*$', '', cat_subcat)
            subcat = re.sub('^.*/', '', cat_subcat)
            out_cat_subcat_df = conf_df[(conf_df['input_category'] == cat) &
                                        (conf_df['input_subcategory'] == subcat)]
            location_type_output_cat_subcat_set = set([
                str(location_type) + '/' + output_cat + '/' + output_subcat
                for output_index, location_type, output_cat, output_subcat
                in list(out_cat_subcat_df[[
                    'location_type', 'output_category', 'output_subcategory'
                ]].itertuples())
            ])
            for location_type_output_cat_subcat in location_type_output_cat_subcat_set:
                property_dict = {}
                datatype_dict = {}
                location_type_output_cat_subcat_list = \
                    location_type_output_cat_subcat.split('/')
                location_type = int(location_type_output_cat_subcat_list[0])
                output_cat = location_type_output_cat_subcat_list[1]
                output_subcat = location_type_output_cat_subcat_list[2]
                for in_file in in_file_list:
                    match = re.search(
                        r'^.*/' + cccc + '/alphanumeric/' + cat_subcat + '/.*$',
                        in_file)
                    if not match:
                        continue
                    if not os.access(in_file, os.F_OK):
                        print('Warning', warno, ':', in_file, 'does not exist.',
                              file=sys.stderr)
                        continue
                    elif not os.path.isfile(in_file):
                        print('Warning', warno, ':', in_file, 'is not a file.',
                              file=sys.stderr)
                        continue
                    elif not os.access(in_file, os.R_OK):
                        print('Warning', warno, ':', in_file, 'is not readable.',
                              file=sys.stderr)
                        continue
                    message = bytearray()
                    with open(in_file, 'rb') as in_file_stream:
                        if debug:
                            print('Debug', ':', in_file, file=sys.stderr)
                        message = in_file_stream.read()
                    dt_str = re.sub('/.*$', '',
                                    re.sub('^.*/' + cat_subcat + '/', '', in_file))
                    an_dict, datatype_dict = parse(
                        cccc, cat, subcat, output_cat, output_subcat, in_file,
                        message, dt_str, conf_synop_staion_df,
                        conf_temp_pilot_staion_df, debug)
                    for key in an_dict:
                        message_np = an_dict[key]
                        if key in property_dict:
                            property_dict[key] = np.concatenate(
                                [property_dict[key], message_np])
                        else:
                            property_dict[key] = message_np
                if (datetime_name in property_dict and
                        location_name in property_dict and
                        location_name in datatype_dict):
                    name_list = []
                    data_list = []
                    name_list.append(location_name)
                    data_list.append(pa.array(property_dict[location_name],
                                              datatype_dict[location_name]))
                    datatype_dict.pop(location_name)
                    name_list.append(latitude_name)
                    data_list.append(pa.array(property_dict[latitude_name],
                                              datatype_dict[latitude_name]))
                    datatype_dict.pop(latitude_name)
                    name_list.append(longitude_name)
                    data_list.append(pa.array(property_dict[longitude_name],
                                              datatype_dict[longitude_name]))
                    datatype_dict.pop(longitude_name)
                    if subcat == 'synop' or subcat == 'synop_mobil':
                        name_list.append(
                            height_of_station_ground_above_mean_sea_level_name)
                        data_list.append(pa.array(
                            property_dict[height_of_station_ground_above_mean_sea_level_name],
                            datatype_dict[height_of_station_ground_above_mean_sea_level_name]))
                        datatype_dict.pop(
                            height_of_station_ground_above_mean_sea_level_name)
                    name_list.append(datetime_name)
                    data_list.append(pa.array(property_dict[datetime_name],
                                              pa.timestamp('ms', tz='utc')))
                    for datatype_key in datatype_dict.keys():
                        if datatype_key in property_dict:
                            # Skip columns that are entirely None.
                            if any(value is not None
                                   for value in property_dict[datatype_key]):
                                name_list.append(datatype_key)
                                data_list.append(pa.array(
                                    property_dict[datatype_key],
                                    datatype_dict[datatype_key]))
                    out_directory_list = [out_dir, cccc, 'alphanumeric_to_arrow',
                                          output_cat, output_subcat]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    now = datetime.utcnow()
                    out_file_list = [out_directory, '/', 'C_', my_cccc, '_',
                                     str(now.year).zfill(4),
                                     str(now.month).zfill(2),
                                     str(now.day).zfill(2),
                                     str(now.hour).zfill(2),
                                     str(now.minute).zfill(2),
                                     str(now.second).zfill(2), '.feather']
                    out_file = ''.join(out_file_list)
                    with open(out_file, 'wb') as out_f:
                        batch = pa.record_batch(data_list, names=name_list)
                        table = pa.Table.from_batches([batch])
                        feather.write_feather(table, out_f, compression='zstd')
                    print(out_file, file=out_list_file)
def test_factory_functions_invalid_input():
    with pytest.raises(TypeError, match="Expected pandas DataFrame, python"):
        pa.table("invalid input")
    with pytest.raises(TypeError, match="Expected pandas DataFrame"):
        pa.record_batch("invalid input")
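# For contrast with the invalid inputs above, a sketch of inputs the factory
# functions do accept (the column name 'a' is arbitrary):
tbl = pa.table({'a': [1, 2, 3]})
rb = pa.record_batch([pa.array([1, 2, 3])], names=['a'])
assert tbl.num_rows == rb.num_rows == 3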
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df,
                     write_location, debug):
    warno = 189
    now = datetime.utcnow()
    create_datetime_list = ['C_', my_cccc, '_',
                            str(now.year).zfill(4), str(now.month).zfill(2),
                            str(now.day).zfill(2), str(now.hour).zfill(2),
                            str(now.minute).zfill(2), str(now.second).zfill(2)]
    create_datetime = ''.join(create_datetime_list)
    cccc_set = set([
        re.sub('^.*/', '', re.sub('/grib/.*$', '', in_file))
        for in_file in in_file_list
    ])
    cat_subcat_set = set([
        re.search(r'^[^/]*/[^/]*/',
                  re.sub('^.*/grib/', '', in_file)).group().rstrip('/')
        for in_file in in_file_list
    ])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            keys = ['stepRange', 'typeOfLevel', 'level', 'shortName']
            missingValue = -3.402823e+38
            for in_file in in_file_list:
                property_dict = {}
                ft_list = []
                match = re.search(
                    r'^.*/' + cccc + '/grib/' + cat_subcat + '/.*$', in_file)
                if not match:
                    continue
                if not os.access(in_file, os.F_OK):
                    print('Warning', warno, ':', in_file, 'does not exist.',
                          file=sys.stderr)
                    continue
                elif not os.path.isfile(in_file):
                    print('Warning', warno, ':', in_file, 'is not a file.',
                          file=sys.stderr)
                    continue
                elif not os.access(in_file, os.R_OK):
                    print('Warning', warno, ':', in_file, 'is not readable.',
                          file=sys.stderr)
                    continue
                dt_str = re.sub('/.*$', '',
                                re.sub('^.*/' + cccc + '/grib/' + cat_subcat + '/',
                                       '', in_file))
                with open(in_file, 'r') as in_file_stream:
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    try:
                        codes_grib_multi_support_on()
                        iid = codes_index_new_from_file(in_file, keys)
                        key_values_list = []
                        for key in keys:
                            key_values = codes_index_get(iid, key)
                            key_values_list.append(key_values)
                        # Cartesian product of all key values.
                        products = [[]]
                        for key_values in key_values_list:
                            products = [x + [y] for x in products for y in key_values]
                        for product in products:
                            for key_count in range(len(keys)):
                                codes_index_select(iid, keys[key_count],
                                                   product[key_count])
                            while True:
                                gid = codes_new_from_index(iid)
                                if gid is None:
                                    break
                                codes_set(gid, 'missingValue', missingValue)
                                iterid = codes_keys_iterator_new(gid, 'ls')
                                cat = re.sub('/.*$', '', cat_subcat)
                                subcat = re.sub('^.*/', '', cat_subcat)
                                target_conf_df = conf_df[
                                    (conf_df['category'] == cat) &
                                    (conf_df['subcategory'] == subcat)]
                                while codes_keys_iterator_next(iterid):
                                    key = codes_keys_iterator_get_name(iterid)
                                    if key in keys:
                                        value = codes_get_string(gid, key)
                                        if key == 'stepRange' or key == 'level':
                                            target_conf_df = target_conf_df[
                                                target_conf_df[key] == int(value)]
                                        else:
                                            target_conf_df = target_conf_df[
                                                target_conf_df[key] == value]
                                codes_keys_iterator_delete(iterid)
                                for conf_row in target_conf_df.itertuples():
                                    ft = codes_get(gid, 'stepRange')
                                    if ft not in ft_list:
                                        ft_list.append(ft)
                                    property_dict[(conf_row.category,
                                                   conf_row.subcategory,
                                                   conf_row.stepRange,
                                                   conf_row.typeOfLevel,
                                                   conf_row.level,
                                                   conf_row.shortName,
                                                   ft)] = np.array(codes_get_values(gid))
                                    if write_location:
                                        iterid = codes_grib_iterator_new(gid, 0)
                                        lat_list = []
                                        lon_list = []
                                        while True:
                                            latitude_longitude_value = \
                                                codes_grib_iterator_next(iterid)
                                            if not latitude_longitude_value:
                                                break
                                            lat_list.append(latitude_longitude_value[0])
                                            # Normalize longitudes to [-180, 180).
                                            if latitude_longitude_value[1] < 180.0:
                                                lon_list.append(latitude_longitude_value[1])
                                            else:
                                                lon_list.append(
                                                    latitude_longitude_value[1] - 360.0)
                                        codes_grib_iterator_delete(iterid)
                                        out_directory = '/'.join(
                                            [out_dir, cccc, 'grib_to_arrow',
                                             conf_row.category, conf_row.subcategory])
                                        os.makedirs(out_directory, exist_ok=True)
                                        out_file = out_directory + '/location.feather'
                                        with open(out_file, 'wb') as out_f:
                                            location_batch = pa.record_batch(
                                                [pa.array(lat_list, 'float32'),
                                                 pa.array(lon_list, 'float32')],
                                                names=['latitude [degree]',
                                                       'longitude [degree]'])
                                            location_table = pa.Table.from_batches(
                                                [location_batch])
                                            feather.write_feather(location_table, out_f,
                                                                  compression='zstd')
                                codes_release(gid)
                    except Exception:
                        print('Warning', warno, ':', in_file, 'is invalid grib.',
                              file=sys.stderr)
                if len(property_dict) > 0:
                    # conf_row, cat, and subcat carry over from the loops above.
                    out_directory = '/'.join([out_dir, cccc, 'grib_to_arrow',
                                              conf_row.category, conf_row.subcategory])
                    os.makedirs(out_directory, exist_ok=True)
                    location_df = feather.read_feather(
                        out_directory + '/location.feather')
                    dt = datetime(int(dt_str[0:4]), int(dt_str[4:6]),
                                  int(dt_str[6:8]), int(dt_str[8:10]), 0, 0, 0,
                                  tzinfo=timezone.utc)
                    dt_list = [dt for i in range(0, len(location_df.index))]
                    for ft in ft_list:
                        name_list = ['latitude [degree]', 'longitude [degree]',
                                     'datetime']
                        data_list = [
                            pa.array(location_df['latitude [degree]'].values.tolist(),
                                     'float32'),
                            pa.array(location_df['longitude [degree]'].values.tolist(),
                                     'float32'),
                            pa.array(dt_list, pa.timestamp('ms', tz='utc'))
                        ]
                        for conf_row in conf_df[(conf_df['category'] == cat) &
                                                (conf_df['subcategory'] == subcat)
                                                ].itertuples():
                            if len(property_dict[(conf_row.category,
                                                  conf_row.subcategory,
                                                  conf_row.stepRange,
                                                  conf_row.typeOfLevel,
                                                  conf_row.level,
                                                  conf_row.shortName, ft)]) > 0:
                                if re.match(r'^.*U wind component.*$', conf_row.name):
                                    # Derive wind speed and direction from the
                                    # U/V component grids.
                                    u_value_np = property_dict[
                                        (conf_row.category, conf_row.subcategory,
                                         conf_row.stepRange, conf_row.typeOfLevel,
                                         conf_row.level, conf_row.shortName, ft)]
                                    v_value_np = property_dict[
                                        (conf_row.category, conf_row.subcategory,
                                         conf_row.stepRange, conf_row.typeOfLevel,
                                         conf_row.level,
                                         conf_row.shortName.replace('u', 'v'), ft)]
                                    wind_speed_np = np.sqrt(
                                        np.power(u_value_np, 2) +
                                        np.power(v_value_np, 2))
                                    wind_direction_np = np.degrees(
                                        np.arctan2(v_value_np, u_value_np))
                                    wind_direction_np = np.array(
                                        [value + 360.0 if value < 0 else value
                                         for value in wind_direction_np])
                                    name_list.append(ft + '/' + re.sub(
                                        r'U wind component', 'wind speed [m/s]',
                                        conf_row.name))
                                    data_list.append(pa.array(
                                        np.array(wind_speed_np,
                                                 dtype=conf_row.datatype)))
                                    name_list.append(ft + '/' + re.sub(
                                        r'U wind component',
                                        'wind direction [degree]', conf_row.name))
                                    data_list.append(pa.array(
                                        np.array(wind_direction_np,
                                                 dtype=conf_row.datatype)))
                                elif not re.match(r'^.*V wind component.*$',
                                                  conf_row.name):
                                    value_list = property_dict[
                                        (conf_row.category, conf_row.subcategory,
                                         conf_row.stepRange, conf_row.typeOfLevel,
                                         conf_row.level, conf_row.shortName, ft)]
                                    name_list.append(ft + '/' + conf_row.name)
                                    data_list.append(pa.array(
                                        np.array(value_list,
                                                 dtype=conf_row.datatype)))
                        out_directory = '/'.join([out_dir, cccc, 'grib_to_arrow',
                                                  conf_row.category,
                                                  conf_row.subcategory])
                        os.makedirs(out_directory, exist_ok=True)
                        out_file = ''.join([out_directory, '/', dt_str, '_',
                                            create_datetime, '.feather'])
                        with open(out_file, 'wb') as out_f:
                            property_batch = pa.record_batch(data_list,
                                                             names=name_list)
                            property_table = pa.Table.from_batches([property_batch])
                            feather.write_feather(property_table, out_f,
                                                  compression='zstd')
                        print(out_file, file=out_list_file)
def test_recordbatch_column_sets_private_name():
    # ARROW-6429
    rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
    assert rb[0]._name == 'a0'
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df,
                     debug):
    warno = 189
    now = datetime.utcnow()
    create_datetime_directory_list = [
        'C_', my_cccc, '_',
        str(now.year).zfill(4), str(now.month).zfill(2),
        str(now.day).zfill(2), str(now.hour).zfill(2),
        str(now.minute).zfill(2), str(now.second).zfill(2)
    ]
    create_datetime_directory = ''.join(create_datetime_directory_list)
    cccc_set = set([
        re.sub('^.*/', '', re.sub('/bufr/.*$', '', in_file))
        for in_file in in_file_list
    ])
    cat_subcat_set = set([
        re.search(r'^[^/]*/[^/]*/',
                  re.sub('^.*/bufr/', '', in_file)).group().rstrip('/')
        for in_file in in_file_list
    ])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            cat = re.sub('/.*$', '', cat_subcat)
            subcat = re.sub('^.*/', '', cat_subcat)
            out_cat_subcat_df = conf_df[(conf_df['input_category'] == cat) &
                                        (conf_df['input_subcategory'] == subcat)]
            location_type_output_cat_subcat_set = set([
                str(location_type) + '/' + output_cat + '/' + output_subcat
                for output_index, location_type, output_cat, output_subcat
                in list(out_cat_subcat_df[[
                    'location_type', 'output_category', 'output_subcategory'
                ]].itertuples())
            ])
            for location_type_output_cat_subcat in location_type_output_cat_subcat_set:
                datatype_dict = {}
                property_dict = {}
                location_type_output_cat_subcat_list = \
                    location_type_output_cat_subcat.split('/')
                location_type = int(location_type_output_cat_subcat_list[0])
                output_cat = location_type_output_cat_subcat_list[1]
                output_subcat = location_type_output_cat_subcat_list[2]
                for in_file in in_file_list:
                    match = re.search(
                        r'^.*/' + cccc + '/bufr/' + cat_subcat + '/.*$', in_file)
                    if not match:
                        continue
                    if not os.access(in_file, os.F_OK):
                        print('Warning', warno, ':', in_file, 'does not exist.',
                              file=sys.stderr)
                        continue
                    elif not os.path.isfile(in_file):
                        print('Warning', warno, ':', in_file, 'is not a file.',
                              file=sys.stderr)
                        continue
                    elif not os.access(in_file, os.R_OK):
                        print('Warning', warno, ':', in_file, 'is not readable.',
                              file=sys.stderr)
                        continue
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    with open(in_file, 'rb') as in_file_stream:
                        while True:
                            bufr = None
                            unexpanded_descriptors = []
                            try:
                                bufr = codes_bufr_new_from_file(in_file_stream)
                                if bufr is None:
                                    break
                                unexpanded_descriptors = codes_get_array(
                                    bufr, 'unexpandedDescriptors')
                            except Exception:
                                break
                            descriptor_conf_df = pd.DataFrame(
                                index=[], columns=['descriptor', 'descriptor_2'])
                            for bufr_descriptor in unexpanded_descriptors:
                                descriptor_conf_df = conf_df[
                                    (conf_df['input_category'] == cat) &
                                    (conf_df['input_subcategory'] == subcat) &
                                    (conf_df['location_type'] == location_type) &
                                    (conf_df['output_category'] == output_cat) &
                                    (conf_df['output_subcategory'] == output_subcat) &
                                    (conf_df['descriptor'] == bufr_descriptor)]
                                if len(descriptor_conf_df.index) > 0:
                                    descriptor_2_list = list(
                                        set(descriptor_conf_df[
                                            ['descriptor_2']].values.flatten()))
                                    if (len(descriptor_2_list) > 0 and
                                            not np.isnan(descriptor_2_list[0])):
                                        is_descriptor_2 = False
                                        for descriptor_2 in descriptor_2_list:
                                            if descriptor_2 in unexpanded_descriptors:
                                                descriptor_conf_df = descriptor_conf_df[
                                                    descriptor_conf_df['descriptor_2'] ==
                                                    descriptor_2]
                                                is_descriptor_2 = True
                                                break
                                        if not is_descriptor_2:
                                            descriptor_conf_df = pd.DataFrame(
                                                index=[],
                                                columns=['descriptor', 'descriptor_2'])
                                    break
                            if len(descriptor_conf_df.index) == 0:
                                print('Info', ':', 'descriptor not found.',
                                      unexpanded_descriptors, in_file,
                                      file=sys.stderr)
                                break
                            number_of_subsets = codes_get(bufr, 'numberOfSubsets')
                            if number_of_subsets == 0:
                                break
                            try:
                                codes_set(bufr, 'unpack', 1)
                            except Exception:
                                break
                            bufr_dict = {}
                            none_np = np.array([])
                            if descriptor_conf_df['get_type'].values.flatten()[0] == 'subset':
                                for subset_num in range(1, number_of_subsets + 1):
                                    number_of_array = 0
                                    for conf_row in descriptor_conf_df.itertuples():
                                        array = getArray(bufr, subset_num,
                                                         number_of_subsets,
                                                         conf_row, in_file)
                                        if number_of_array == 0:
                                            if len(array) > 0:
                                                number_of_array = len(array)
                                            else:
                                                break
                                        if (conf_row.convert_type == 'to_value' or
                                                conf_row.convert_type == 'to_value_to_array'):
                                            if len(array) > conf_row.array_index:
                                                value = array[int(conf_row.array_index)]
                                                if conf_row.convert_type == 'to_value_to_array':
                                                    array = np.array(
                                                        [value for i in range(0, number_of_array)],
                                                        dtype=object)
                                                else:
                                                    array = np.array([value], dtype=object)
                                            elif len(array) == 0:
                                                array = np.array(
                                                    [None for i in range(0, number_of_array)],
                                                    dtype=object)
                                            else:
                                                print('Warning', warno, ':',
                                                      'len(array) is not more than conf_row.array_index.',
                                                      'subset', 'key:', conf_row.key,
                                                      'array length:', len(array),
                                                      'number of array:', number_of_array,
                                                      'file:', in_file, file=sys.stderr)
                                                array = np.array(
                                                    [None for i in range(0, number_of_array)],
                                                    dtype=object)
                                        if len(array) < number_of_array:
                                            # Pad short arrays with None up to the subset width.
                                            for padding_count in range(len(array),
                                                                       number_of_array):
                                                array = np.append(array, None)
                                        elif len(array) > number_of_array:
                                            print('Warning', warno, ':',
                                                  'len(array) is more than number_of_array.',
                                                  'subset', 'key:', conf_row.key,
                                                  'array length:', len(array),
                                                  'number of array:', number_of_array,
                                                  'file:', in_file, file=sys.stderr)
                                            array = np.array(
                                                [None for i in range(0, number_of_array)],
                                                dtype=object)
                                            break
                                        if conf_row.key in bufr_dict:
                                            bufr_dict[conf_row.key] = np.concatenate(
                                                [bufr_dict[conf_row.key], array])
                                        else:
                                            bufr_dict[conf_row.key] = array
                            else:
                                number_of_array = 0
                                for conf_row in descriptor_conf_df.itertuples():
                                    array = getArray(bufr, 0, 0, conf_row, in_file)
                                    if number_of_array == 0:
                                        if len(array) == 0:
                                            print('Warning', warno, ':',
                                                  'len(array) is 0.', '',
                                                  'key:', conf_row.key,
                                                  'array length:', len(array),
                                                  'number of array:', number_of_array,
                                                  'file:', in_file, file=sys.stderr)
                                            break
                                        else:
                                            number_of_array = len(array)
                                    elif len(array) != number_of_array:
                                        if len(array) == 1:
                                            # Broadcast a scalar value across all rows.
                                            value = array[0]
                                            array = np.array(
                                                [value for i in range(0, number_of_array)],
                                                dtype=object)
                                        else:
                                            print('Warning', warno, ':',
                                                  'len(array) does not equal number_of_array.',
                                                  '', 'key:', conf_row.key,
                                                  'array length:', len(array),
                                                  'number of array:', number_of_array,
                                                  'file:', in_file, file=sys.stderr)
                                            array = np.array(
                                                [None for i in range(0, number_of_array)],
                                                dtype=object)
                                            break
                                    bufr_dict[conf_row.key] = array
                            for conf_row in descriptor_conf_df.itertuples():
                                if (conf_row.output == 'location_datetime' and
                                        conf_row.key in bufr_dict):
                                    tmp_none_np = np.array(
                                        [value is not None
                                         for value in bufr_dict[conf_row.key]])
                                    if len(none_np) > 0:
                                        # Element-wise AND: keep only rows where every
                                        # location/datetime key is present.
                                        none_np = none_np * tmp_none_np
                                    else:
                                        none_np = tmp_none_np
                            codes_release(bufr)
                            if len(bufr_dict) == 0:
                                break
                            bufr_dict['none'] = none_np
                            location_datetime_index_np = np.array([
                                index for index, value in enumerate(bufr_dict['none'])
                                if value
                            ])
                            if len(location_datetime_index_np) > 0:
                                message_np = np.array([])
                                pre_conf_row_name = ''
                                for conf_row in descriptor_conf_df.itertuples():
                                    if conf_row.name != pre_conf_row_name:
                                        datatype_dict[conf_row.name] = conf_row.datatype
                                        if len(message_np) > 0 and len(pre_conf_row_name) > 0:
                                            if pre_conf_row_name in property_dict:
                                                property_dict[pre_conf_row_name] = np.concatenate(
                                                    [property_dict[pre_conf_row_name],
                                                     message_np])
                                            else:
                                                property_dict[pre_conf_row_name] = message_np
                                        message_np = np.array([])
                                    if conf_row.key in bufr_dict:
                                        tmp_message_np = bufr_dict[conf_row.key]
                                        if max(location_datetime_index_np) < len(tmp_message_np):
                                            tmp_message_np = tmp_message_np[
                                                location_datetime_index_np]
                                            if len(tmp_message_np) > 0:
                                                if len(message_np) > 0:
                                                    if conf_row.multiply != 0:
                                                        message_np = (message_np +
                                                                      conf_row.multiply *
                                                                      tmp_message_np)
                                                    else:
                                                        message_np = message_np + tmp_message_np
                                                else:
                                                    if conf_row.multiply != 0:
                                                        message_np = (conf_row.multiply *
                                                                      tmp_message_np)
                                                    else:
                                                        message_np = tmp_message_np
                                        else:
                                            print('Info', 'unexpanded_descriptors :',
                                                  unexpanded_descriptors,
                                                  ': condition of', conf_row.key,
                                                  max(location_datetime_index_np),
                                                  len(tmp_message_np), in_file,
                                                  file=sys.stderr)
                                    pre_conf_row_name = conf_row.name
                                if len(message_np) > 0 and len(pre_conf_row_name) > 0:
                                    if pre_conf_row_name in property_dict:
                                        property_dict[pre_conf_row_name] = np.concatenate(
                                            [property_dict[pre_conf_row_name], message_np])
                                    else:
                                        property_dict[pre_conf_row_name] = message_np
                if datetime_name in property_dict:
                    name_list = []
                    data_list = []
                    del_key_list = []
                    cat_subcat_conf_df = conf_df[
                        (conf_df['input_category'] == cat) &
                        (conf_df['input_subcategory'] == subcat) &
                        (conf_df['location_type'] == location_type) &
                        (conf_df['output_category'] == output_cat) &
                        (conf_df['output_subcategory'] == output_subcat)]
                    # The last 'datetime' key decides the precision of the
                    # timestamp string (millisecond, second, ..., year).
                    datetime_tail = cat_subcat_conf_df[
                        cat_subcat_conf_df['name'] == 'datetime'
                    ]['key'].values.flatten()[-1]
                    for conf_row_name in set(cat_subcat_conf_df[
                            cat_subcat_conf_df['output'] == 'location_datetime'
                    ]['name'].values.flatten()):
                        if conf_row_name == 'datetime':
                            plus_second_list = [
                                0 for dt in range(0, len(property_dict[conf_row_name]))
                            ]
                            if 'time period [s]' in property_dict:
                                plus_second_list = property_dict['time period [s]']
                                del_key_list.append('time period [s]')
                            datetime_list = []
                            for i, dt_str in enumerate(property_dict[conf_row_name]):
                                try:
                                    if datetime_tail == 'millisecond':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            int(dt_str[6:8]), int(dt_str[8:10]),
                                            int(dt_str[10:12]), int(dt_str[12:14]),
                                            int(dt_str[15:]), tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'second':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            int(dt_str[6:8]), int(dt_str[8:10]),
                                            int(dt_str[10:12]), int(dt_str[12:14]),
                                            0, tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'minute':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            int(dt_str[6:8]), int(dt_str[8:10]),
                                            int(dt_str[10:12]), 0, 0,
                                            tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'hour':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            int(dt_str[6:8]), int(dt_str[8:10]),
                                            0, 0, 0, tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'day':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            int(dt_str[6:8]), 0, 0, 0, 0,
                                            tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'month':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), int(dt_str[4:6]),
                                            0, 0, 0, 0, 0, tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    elif datetime_tail == 'year':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]), 0, 0, 0, 0, 0, 0,
                                            tzinfo=timezone.utc
                                        ) + timedelta(seconds=plus_second_list[i])
                                    datetime_list.append(obs_datetime)
                                except Exception:
                                    # Drop the row whose datetime could not be parsed.
                                    for property_dict_key in property_dict.keys():
                                        property_dict[property_dict_key] = np.delete(
                                            property_dict[property_dict_key], i)
                            data_list.append(
                                pa.array(datetime_list, pa.timestamp('ms', tz='utc')))
                            name_list.append(conf_row_name)
                            datatype_dict.pop(conf_row_name)
                        elif conf_row_name != 'time period [s]':
                            if conf_row_name in property_dict:
                                data_list.append(
                                    pa.array(property_dict[conf_row_name],
                                             datatype_dict[conf_row_name]))
                                name_list.append(conf_row_name)
                                datatype_dict.pop(conf_row_name)
                    for datatype_key in datatype_dict.keys():
                        if datatype_key in property_dict:
                            # Skip columns that are entirely None.
                            if any(value is not None
                                   for value in property_dict[datatype_key]):
                                name_list.append(datatype_key)
                                data_list.append(
                                    pa.array(property_dict[datatype_key],
                                             datatype_dict[datatype_key]))
                    out_directory_list = [
                        out_dir, cccc, 'bufr_to_arrow', output_cat, output_subcat
                    ]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    now = datetime.utcnow()
                    out_file_list = [
                        out_directory, '/', 'C_', my_cccc, '_',
                        str(now.year).zfill(4), str(now.month).zfill(2),
                        str(now.day).zfill(2), str(now.hour).zfill(2),
                        str(now.minute).zfill(2), str(now.second).zfill(2),
                        '.feather'
                    ]
                    out_file = ''.join(out_file_list)
                    with open(out_file, 'wb') as out_f:
                        batch = pa.record_batch(data_list, names=name_list)
                        table = pa.Table.from_batches([batch])
                        feather.write_feather(table, out_f, compression='zstd')
                    print(out_file, file=out_list_file)