Example #1
def test_recordbatch_no_fields():
    batch = pa.record_batch([], [])

    assert len(batch) == 0
    assert batch.num_rows == 0
    assert batch.num_columns == 0
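pa.record_batch is a thin convenience wrapper over the class method; a minimal sketch of the equivalent call (assuming a recent pyarrow):

import pyarrow as pa

batch = pa.RecordBatch.from_arrays([], names=[])
assert batch.num_rows == 0 and batch.num_columns == 0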
Example #2
def test_recordbatch_empty_metadata():
    data = [pa.array(range(5)), pa.array([-10, -5, 0, 5, 10])]

    batch = pa.record_batch(data, ['c0', 'c1'])
    assert batch.schema.metadata is None
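Metadata is only present when attached explicitly; a quick sketch continuing the snippet above (the key/value pair is hypothetical):

batch2 = batch.replace_schema_metadata({'origin': 'example'})
assert batch2.schema.metadata == {b'origin': b'example'}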
Example #3
def make_batches():
    schema = make_schema()
    return [
        pa.record_batch([[[1], [2, 42]]], schema),
        pa.record_batch([[None, [], [5, 6]]], schema),
    ]
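make_schema() is not shown in this snippet; a minimal sketch consistent with the nested-list column above (the field name is an assumption):

import pyarrow as pa

def make_schema():
    # one list<int64> column, matching data like [[1], [2, 42]]
    return pa.schema([pa.field("ints", pa.list_(pa.int64()))])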
Example #4
def test_recordbatch_from_arrays_validate_schema():
    # ARROW-6263
    arr = pa.array([1, 2])
    schema = pa.schema([pa.field('f0', pa.utf8())])
    with pytest.raises(NotImplementedError):
        pa.record_batch([arr], schema=schema)
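With a schema whose types match the arrays, the same call succeeds; a short sketch continuing the snippet above:

schema_ok = pa.schema([pa.field('f0', pa.int64())])
batch = pa.record_batch([arr], schema=schema_ok)
assert batch.schema == schema_ok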
Example #5
def make_batches():
    schema = make_schema()
    batch1 = pa.record_batch([[1, 2, 3]], schema=schema)
    batch2 = pa.record_batch([[4, 5]], schema=schema)
    return [batch1, batch2]
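Because both batches share one schema, they can be stitched into a single table; sketch:

table = pa.Table.from_batches(make_batches())
assert table.num_rows == 5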
Example #6
def make_batch():
    return pa.record_batch([[[1], [2, 42]]], make_schema())
Example #7
#!/usr/bin/env python3
import pyarrow as pa
from pyarrow import feather

with open('checksum.feather', 'bw') as out_f:
    property_batch = pa.record_batch([[], []], names=['mtime', 'checksum'])
    property_table = pa.Table.from_batches([property_batch])
    feather.write_feather(property_table, out_f, compression='zstd')
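Reading the file back (sketch; feather.read_feather returns a pandas DataFrame by default):

df = feather.read_feather('checksum.feather')
assert list(df.columns) == ['mtime', 'checksum']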
Example #8
def make_batch(self):
    return pa.record_batch([[[1], [], None, [2, 42]]], self.make_schema())
Example #9
class RecordBasedTfxioTest(parameterized.TestCase):

  def testReadTfRecord(self):
    tmp_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
    file1 = os.path.join(tmp_dir, "tfrecord1")
    file1_records = [b"aa", b"bb"]
    _WriteTfRecord(file1, file1_records)
    file2 = os.path.join(tmp_dir, "tfrecord2")
    file2_records = [b"cc", b"dd"]
    _WriteTfRecord(file2, file2_records)

    def _CheckRecords(actual, expected):
      self.assertEqual(set(actual), set(expected))

    # Test reading multiple file patterns.
    with beam.Pipeline() as p:
      record_pcoll = p | record_based_tfxio.ReadTfRecord(
          [file1 + "*", file2 + "*"])
      beam_test_util.assert_that(
          record_pcoll,
          lambda actual: _CheckRecords(actual, file1_records + file2_records))

  @parameterized.named_parameters(*[
      dict(
          testcase_name="simple",
          input_record_batch=pa.record_batch([pa.array([[1], [2]])],
                                             ["feature1"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array(
              [[b"aa"], [b"bb"]], type=pa.large_list(pa.large_binary()))),
      dict(
          testcase_name="with_record_index",
          input_record_batch=pa.record_batch(
              [pa.array([[1], [2], [3]]),
               pa.array([[0], [1], [1]])], ["feature1", "record_index"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array([[b"aa"], [b"bb"], [b"bb"]],
                                              type=pa.large_list(
                                                  pa.large_binary())),
          record_index_column_name="record_index",
      ),
      dict(
          testcase_name="with_record_index_empty_input",
          input_record_batch=pa.record_batch([
              pa.array([], type=pa.list_(pa.int64())),
              pa.array([], type=pa.large_list(pa.int32()))
          ], ["feature1", "record_index"]),
          raw_records=[b"aa", b"bb"],
          expected_raw_record_column=pa.array(
              [], type=pa.large_list(pa.large_binary())),
          record_index_column_name="record_index",
      )
  ])
  def testAppendRawRecordColumn(
      self, input_record_batch,
      raw_records,
      expected_raw_record_column,
      record_index_column_name=None):
    column_name = "raw_record"
    output_record_batch = record_based_tfxio.AppendRawRecordColumn(
        record_batch=input_record_batch, column_name=column_name,
        raw_records=raw_records,
        record_index_column_name=record_index_column_name)
    self.assertEqual(
        output_record_batch.num_columns,
        input_record_batch.num_columns + 1)
    for i in range(input_record_batch.num_columns):
      self.assertTrue(
          input_record_batch.column(i).equals(output_record_batch.column(i)))

    self.assertEqual(
        output_record_batch.schema.names[output_record_batch.num_columns - 1],
        column_name)
    self.assertTrue(
        output_record_batch.column(output_record_batch.num_columns - 1)
        .equals(expected_raw_record_column))

  def testOverridableRecordBasedTFXIO(self):
    tmp_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
    file1 = os.path.join(tmp_dir, "tfrecord1")
    file1_records = [b"aa", b"bb"]
    _WriteTfRecord(file1, file1_records)

    def _CheckRecords(actual, expected):
      for a, e in zip(actual, expected):
        self.assertDictEqual(a.to_pydict(), e)

    @beam.typehints.with_input_types(Any)
    @beam.typehints.with_output_types(bytes)
    def _RawRecordBeamSource(pipeline: Any):
      return pipeline | beam.io.ReadFromTFRecord(file1 + "*")

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _RawRecordsToRecordBatch(pcoll, batch_size):
      batch_size = 1 if not batch_size else batch_size

      class _CreateRBDoFn(beam.DoFn):

        def process(self, examples):
          return [
              pa.RecordBatch.from_arrays([pa.array(examples)], ["column_name"])
          ]

      return (pcoll | beam.BatchElements(batch_size)
              | beam.ParDo(_CreateRBDoFn()))

    tfxio = record_based_tfxio.OverridableRecordBasedTFXIO(
        telemetry_descriptors=None,
        logical_format="tfrecord",
        physical_format="tf_example",
        raw_record_beam_source=beam.ptransform_fn(_RawRecordBeamSource),
        raw_record_to_record_batch=beam.ptransform_fn(_RawRecordsToRecordBatch))

    expected = [{"column_name": [b"aa"]}, {"column_name": [b"bb"]}]
    with beam.Pipeline() as p:
      record_pcoll = p | tfxio.BeamSource()
      beam_test_util.assert_that(
          record_pcoll,
          lambda actual: _CheckRecords(actual, expected))
Example #10
class RecordBasedTfxioTest(parameterized.TestCase):
    def testReadTfRecord(self):
        tmp_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)

        def _WriteTfRecord(path, records):
            with tf.io.TFRecordWriter(path) as w:
                for r in records:
                    w.write(r)

        file1 = os.path.join(tmp_dir, "tfrecord1")
        file1_records = [b"aa", b"bb"]
        _WriteTfRecord(file1, file1_records)
        file2 = os.path.join(tmp_dir, "tfrecord2")
        file2_records = [b"cc", b"dd"]
        _WriteTfRecord(file2, file2_records)

        def _CheckRecords(actual, expected):
            self.assertEqual(set(actual), set(expected))

        # Test reading multiple file patterns.
        with beam.Pipeline() as p:
            record_pcoll = p | record_based_tfxio.ReadTfRecord(
                [file1 + "*", file2 + "*"])
            beam_test_util.assert_that(
                record_pcoll, lambda actual: _CheckRecords(
                    actual, file1_records + file2_records))

    @parameterized.named_parameters(*[
        dict(testcase_name="simple",
             input_record_batch=pa.record_batch([pa.array([[1], [2]])],
                                                ["feature1"]),
             raw_records=[b"aa", b"bb"],
             expected_raw_record_column=pa.array([[b"aa"], [b"bb"]],
                                                 type=pa.large_list(
                                                     pa.large_binary()))),
        dict(
            testcase_name="with_record_index",
            input_record_batch=pa.record_batch(
                [pa.array([[1], [2], [3]]),
                 pa.array([[0], [1], [1]])], ["feature1", "record_index"]),
            raw_records=[b"aa", b"bb"],
            expected_raw_record_column=pa.array([[b"aa"], [b"bb"], [b"bb"]],
                                                type=pa.large_list(
                                                    pa.large_binary())),
            record_index_column_name="record_index",
        ),
        dict(
            testcase_name="with_record_index_empty_input",
            input_record_batch=pa.record_batch([
                pa.array([], type=pa.list_(pa.int64())),
                pa.array([], type=pa.large_list(pa.int32()))
            ], ["feature1", "record_index"]),
            raw_records=[b"aa", b"bb"],
            expected_raw_record_column=pa.array([],
                                                type=pa.large_list(
                                                    pa.large_binary())),
            record_index_column_name="record_index",
        )
    ])
    def testAppendRawRecordColumn(self,
                                  input_record_batch,
                                  raw_records,
                                  expected_raw_record_column,
                                  record_index_column_name=None):
        column_name = "raw_record"
        output_record_batch = record_based_tfxio.AppendRawRecordColumn(
            record_batch=input_record_batch,
            column_name=column_name,
            raw_records=raw_records,
            record_index_column_name=record_index_column_name)
        self.assertEqual(output_record_batch.num_columns,
                         input_record_batch.num_columns + 1)
        for i in range(input_record_batch.num_columns):
            self.assertTrue(
                input_record_batch.column(i).equals(
                    output_record_batch.column(i)))

        self.assertEqual(
            output_record_batch.schema.names[output_record_batch.num_columns -
                                             1], column_name)
        self.assertTrue(
            output_record_batch.column(output_record_batch.num_columns -
                                       1).equals(expected_raw_record_column))
Example #11
import pyarrow as pa
from tqdm import tqdm


def readline(file, terminator=b'\n'):
    # Read fixed-size chunks until the terminator appears, then rewind the
    # file position to just past it and return the line, terminator included.
    start = file.tell()
    bs = b''
    while True:
        chunk = file.read(80)
        if not chunk:  # EOF: return whatever was accumulated
            return bs
        bs += chunk
        index = bs.find(terminator)
        if index >= 0:  # >= 0, so a terminator at offset 0 is also found
            file.seek(start + index + 1)
            return bs[:index + 1]


out_path = "test.arrow"
out = pa.OSFile(out_path, "wb")
schema = pa.schema(fields=[pa.field("sentence", pa.list_(pa.string()))])
# print(schema)
# sink = pa.BufferOutputStream()
writer = pa.ipc.RecordBatchFileWriter(out, schema=schema)
with open(file_path, 'r') as f:  # file_path: input text path, defined elsewhere
    for line in tqdm(f):
        batch = pa.record_batch(data=[pa.array([line.strip().split(" ")])],
                                schema=schema)
        writer.write(batch)

writer.close()
out.close()

out_map = pa.memory_map(out_path, mode='rb')
reader = pa.ipc.open_file(out_map)
pa_table = reader.read_all()
print(pa_table[0][24])  # column 0 ("sentence"), row 24
Example #12
def make_extension_batch():
    schema = make_extension_schema()
    ext_col = schema[0].type.wrap_array(pa.array([b"foo", b"bar"],
                                                 type=pa.binary(3)))
    return pa.record_batch([ext_col], schema)
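make_extension_schema() is not shown; a minimal sketch of a compatible extension type over binary(3) (the type name and field name are assumptions):

import pyarrow as pa

class Binary3Type(pa.ExtensionType):
    def __init__(self):
        super().__init__(pa.binary(3), "example.binary3")

    def __arrow_ext_serialize__(self):
        return b""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        return cls()

def make_extension_schema():
    return pa.schema([pa.field("ext", Binary3Type())])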
Example #13
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df, conf_synop_station_df, conf_temp_pilot_station_df, debug):
    warno = 189
    out_arrows = []
    cccc_set = set([re.sub('^.*/', '', re.sub('/alphanumeric/.*$', '', in_file)) for in_file in in_file_list])
    cat_subcat_set = set([re.search(r'^[^/]*/[^/]*/', re.sub('^.*/alphanumeric/', '', in_file)).group().rstrip('/') for in_file in in_file_list])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            cat = re.sub('/.*$', '', cat_subcat)
            subcat = re.sub('^.*/', '', cat_subcat)
            out_cat_subcat_df = conf_df[(conf_df['input_category'] == cat) & (conf_df['input_subcategory'] == subcat)]
            location_type_output_cat_subcat_set = set([str(location_type) + '/' + output_cat + '/' + output_subcat for output_index, location_type, output_cat, output_subcat in list(out_cat_subcat_df[['location_type','output_category','output_subcategory']].itertuples())])
            for location_type_output_cat_subcat in location_type_output_cat_subcat_set:
                property_dict = {}
                datatype_dict = {}
                location_type_output_cat_subcat_list = location_type_output_cat_subcat.split('/')
                location_type = int(location_type_output_cat_subcat_list[0])
                output_cat = location_type_output_cat_subcat_list[1]
                output_subcat = location_type_output_cat_subcat_list[2]
                for in_file in in_file_list:
                    match = re.search(r'^.*/' + cccc + '/alphanumeric/' + cat_subcat + '/.*$', in_file)
                    if not match:
                        continue
                    if not os.access(in_file, os.F_OK):
                        print('Warning', warno, ':', in_file, 'does not exist.', file=sys.stderr)
                        continue
                    elif not os.path.isfile(in_file):
                        print('Warning', warno, ':', in_file, 'is not a file.', file=sys.stderr)
                        continue
                    elif not os.access(in_file, os.R_OK):
                        print('Warning', warno, ':', in_file, 'is not readable.', file=sys.stderr)
                        continue
                    message = bytearray()
                    with open(in_file, 'rb') as in_file_stream:
                        if debug:
                            print('Debug', ':', in_file, file=sys.stderr)
                        message = in_file_stream.read()
                    dt_str = re.sub('/.*$', '',  re.sub('^.*/' + cat_subcat + '/', '', in_file))
                    an_dict, datatype_dict = parse(cccc, cat, subcat, output_cat, output_subcat, in_file, message, dt_str, conf_synop_station_df, conf_temp_pilot_station_df, debug)
                    for key in an_dict:
                        message_np = an_dict[key]
                        if key in property_dict:
                            property_dict[key] = np.concatenate([property_dict[key], message_np])
                        else:
                            property_dict[key] = message_np
                if datetime_name in property_dict and location_name in property_dict and location_name in datatype_dict:
                    name_list = []
                    data_list = []
                    name_list.append(location_name)
                    data_list.append(pa.array(property_dict[location_name], datatype_dict[location_name]))
                    datatype_dict.pop(location_name)
                    name_list.append(latitude_name)
                    data_list.append(pa.array(property_dict[latitude_name], datatype_dict[latitude_name]))
                    datatype_dict.pop(latitude_name)
                    name_list.append(longitude_name)
                    data_list.append(pa.array(property_dict[longitude_name], datatype_dict[longitude_name]))
                    datatype_dict.pop(longitude_name)
                    if subcat == 'synop' or subcat == 'synop_mobil':
                        name_list.append(height_of_station_ground_above_mean_sea_level_name)
                        data_list.append(pa.array(property_dict[height_of_station_ground_above_mean_sea_level_name], datatype_dict[height_of_station_ground_above_mean_sea_level_name]))
                        datatype_dict.pop(height_of_station_ground_above_mean_sea_level_name)
                    name_list.append(datetime_name)
                    data_list.append(pa.array(property_dict[datetime_name], pa.timestamp('ms', tz='utc')))
                    for datatype_key in datatype_dict.keys():
                        if datatype_key in property_dict:
                            if any(value is not None for value in property_dict[datatype_key]):
                                name_list.append(datatype_key)
                                data_list.append(pa.array(property_dict[datatype_key], datatype_dict[datatype_key]))
                    out_directory_list = [out_dir, cccc, 'alphanumeric_to_arrow', output_cat, output_subcat]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    now = datetime.utcnow()
                    out_file_list = [out_directory, '/', 'C_', my_cccc, '_', str(now.year).zfill(4), str(now.month).zfill(2), str(now.day).zfill(2), str(now.hour).zfill(2), str(now.minute).zfill(2), str(now.second).zfill(2), '.feather']
                    out_file = ''.join(out_file_list)
                    with open(out_file, 'bw') as out_f:
                        batch = pa.record_batch(data_list, names=name_list)
                        table = pa.Table.from_batches([batch])
                        feather.write_feather(table, out_f, compression='zstd')
                        print(out_file, file=out_list_file)
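The feather-writing pattern used above, distilled into a standalone sketch (column names and values are hypothetical):

import pyarrow as pa
from pyarrow import feather

batch = pa.record_batch(
    [pa.array([35.7, 36.1], 'float32'), pa.array([139.7, 140.1], 'float32')],
    names=['latitude [degree]', 'longitude [degree]'])
table = pa.Table.from_batches([batch])
feather.write_feather(table, 'observations.feather', compression='zstd')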
Example #14
def test_factory_functions_invalid_input():
    with pytest.raises(TypeError, match="Expected pandas DataFrame, python"):
        pa.table("invalid input")

    with pytest.raises(TypeError, match="Expected pandas DataFrame"):
        pa.record_batch("invalid input")
Example #15
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df, write_location, debug):
    warno = 189
    out_arrows = []
    now = datetime.utcnow()
    create_datetime_list = ['C_', my_cccc, '_', str(now.year).zfill(4), str(now.month).zfill(2), str(now.day).zfill(2), str(now.hour).zfill(2), str(now.minute).zfill(2), str(now.second).zfill(2)]
    create_datetime = ''.join(create_datetime_list)
    cccc_set = set([re.sub('^.*/', '', re.sub('/grib/.*$', '', in_file)) for in_file in in_file_list])
    cat_subcat_set = set([re.search(r'^[^/]*/[^/]*/', re.sub('^.*/grib/', '', in_file)).group().rstrip('/') for in_file in in_file_list])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            keys = ['stepRange', 'typeOfLevel', 'level', 'shortName']
            missingValue = -3.402823e+38
            for in_file in in_file_list:
                property_dict = {}
                ft_list = []
                match = re.search(r'^.*/' + cccc + '/grib/' + cat_subcat + '/.*$', in_file)
                if not match:
                    continue
                if not os.access(in_file, os.F_OK):
                    print('Warning', warno, ':', in_file, 'does not exist.', file=sys.stderr)
                    continue
                elif not os.path.isfile(in_file):
                    print('Warning', warno, ':', in_file, 'is not a file.', file=sys.stderr)
                    continue
                elif not os.access(in_file, os.R_OK):
                    print('Warning', warno, ':', in_file, 'is not readable.', file=sys.stderr)
                    continue
                dt_str = re.sub('/.*$', '', re.sub('^.*/' + cccc + '/grib/' + cat_subcat + '/', '', in_file))
                with open(in_file, 'rb') as in_file_stream:
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    try:
                        codes_grib_multi_support_on()
                        iid = codes_index_new_from_file(in_file, keys)
                        key_values_list = []
                        for key in keys:
                            key_values = codes_index_get(iid, key)
                            key_values_list.append(key_values)
                        products = [[]]
                        for key_values in key_values_list:
                            products = [x + [y] for x in products for y in key_values]
                        for product in products:
                            for key_count in range(len(keys)):
                                codes_index_select(iid, keys[key_count], product[key_count])
                            while True:
                                gid = codes_new_from_index(iid)
                                if gid is None:
                                    break
                                codes_set(gid, 'missingValue', missingValue)
                                iterid = codes_keys_iterator_new(gid, 'ls')
                                step_range = None
                                type_of_level = None
                                level = None
                                short_name = None
                                cat = re.sub('/.*$', '', cat_subcat)
                                subcat = re.sub('^.*/', '', cat_subcat)
                                target_conf_df = conf_df[(conf_df['category'] == cat) & (conf_df['subcategory'] == subcat)]
                                while codes_keys_iterator_next(iterid):
                                    key = codes_keys_iterator_get_name(iterid)
                                    if key in keys:
                                        value = codes_get_string(gid, key)
                                        if key == 'stepRange' or key == 'level':
                                            target_conf_df = target_conf_df[(target_conf_df[key] == int(value))]
                                        else:
                                            target_conf_df = target_conf_df[(target_conf_df[key] == value)]
                                codes_keys_iterator_delete(iterid)
                                message_np = np.array([])
                                for conf_row in target_conf_df.itertuples():
                                    ft = codes_get(gid, 'stepRange')
                                    if not ft in ft_list:
                                        ft_list.append(ft)
                                    property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)] = np.array(codes_get_values(gid))
                                if write_location:
                                    iterid = codes_grib_iterator_new(gid, 0)
                                    lat_list = []
                                    lon_list = []
                                    while True:
                                        latitude_longitude_value = codes_grib_iterator_next(iterid)
                                        if not latitude_longitude_value:
                                            break
                                        else:
                                            lat_list.append(latitude_longitude_value[0])
                                            if latitude_longitude_value[1] < 180.0:
                                                lon_list.append(latitude_longitude_value[1])
                                            else:
                                                lon_list.append(latitude_longitude_value[1] - 360.0)
                                    codes_grib_iterator_delete(iterid)
                                    out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                                    out_directory = '/'.join(out_directory_list)
                                    os.makedirs(out_directory, exist_ok=True)
                                    out_file_list = [out_directory, '/location.feather']
                                    out_file = ''.join(out_file_list)
                                    with open(out_file, 'bw') as out_f:
                                        location_batch = pa.record_batch([pa.array(lat_list, 'float32'), pa.array(lon_list, 'float32')], names=['latitude [degree]', 'longitude [degree]'])
                                        location_table = pa.Table.from_batches([location_batch])
                                        feather.write_feather(location_table, out_f, compression='zstd')
                                codes_release(gid)
                    except Exception:
                        print('Warning', warno, ':', in_file, 'is invalid grib.', file=sys.stderr)
                if len(property_dict) > 0:
                    out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    out_file_list = [out_directory, '/location.feather']
                    out_file = ''.join(out_file_list)
                    location_df = feather.read_feather(out_file)
                    dt = datetime(int(dt_str[0:4]), int(dt_str[4:6]), int(dt_str[6:8]), int(dt_str[8:10]), 0, 0, 0, tzinfo=timezone.utc)
                    dt_list = [dt for i in range(0, len(location_df.index))]
                    for ft in ft_list:
                        name_list = ['latitude [degree]', 'longitude [degree]', 'datetime']
                        data_list = [pa.array(location_df['latitude [degree]'].values.tolist(), 'float32'), pa.array(location_df['longitude [degree]'].values.tolist(), 'float32')]
                        data_list.append(pa.array(dt_list, pa.timestamp('ms', tz='utc')))
                        for conf_row in conf_df[(conf_df['category'] == cat) & (conf_df['subcategory'] == subcat)].itertuples():
                            if len(property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]) > 0:
                                if re.match(r'^.*U wind component.*$', conf_row.name):
                                    u_value_np = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]
                                    v_value_np = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName.replace('u', 'v'), ft)]
                                    wind_speed_np = np.sqrt(np.power(u_value_np, 2) + np.power(v_value_np, 2))
                                    wind_direction_np = np.degrees(np.arctan2(v_value_np, u_value_np))
                                    wind_direction_np = np.array([value + 360.0 if value < 0 else value for value in wind_direction_np])
                                    name_list.append(ft + '/' + re.sub(r'U wind component', 'wind speed [m/s]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_speed_np, dtype=conf_row.datatype)))
                                    name_list.append(ft + '/' + re.sub(r'U wind component', 'wind direction [degree]', conf_row.name))
                                    data_list.append(pa.array(np.array(wind_direction_np, dtype=conf_row.datatype)))
                                elif not re.match(r'^.*V wind component.*$', conf_row.name):
                                    value_list = property_dict[(conf_row.category, conf_row.subcategory, conf_row.stepRange, conf_row.typeOfLevel, conf_row.level, conf_row.shortName, ft)]
                                    name_list.append(ft + '/' + conf_row.name)
                                    data_list.append(pa.array(np.array(value_list, dtype=conf_row.datatype)))
                        out_directory_list = [out_dir, cccc, 'grib_to_arrow', conf_row.category, conf_row.subcategory]
                        out_directory = '/'.join(out_directory_list)
                        os.makedirs(out_directory, exist_ok=True)
                        out_file_list = [out_directory, '/', dt_str, '_', create_datetime, '.feather']
                        out_file = ''.join(out_file_list)
                        with open(out_file, 'bw') as out_f:
                            property_batch = pa.record_batch(data_list, names=name_list)
                            property_table = pa.Table.from_batches([property_batch])
                            feather.write_feather(property_table, out_f, compression='zstd')
                            print(out_file, file=out_list_file)
Example #16
def test_recordbatch_column_sets_private_name():
    # ARROW-6429
    rb = pa.record_batch([pa.array([1, 2, 3, 4])], names=['a0'])
    assert rb[0]._name == 'a0'
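The public way to read the same name goes through the schema; sketch continuing the snippet above:

assert rb.schema.names[0] == 'a0'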
Example #17
def convert_to_arrow(my_cccc, in_file_list, out_dir, out_list_file, conf_df,
                     debug):
    warno = 189
    out_arrows = []
    now = datetime.utcnow()
    create_datetime_directory_list = [
        'C_', my_cccc, '_',
        str(now.year).zfill(4),
        str(now.month).zfill(2),
        str(now.day).zfill(2),
        str(now.hour).zfill(2),
        str(now.minute).zfill(2),
        str(now.second).zfill(2)
    ]
    create_datetime_directory = ''.join(create_datetime_directory_list)
    cccc_set = set([
        re.sub('^.*/', '', re.sub('/bufr/.*$', '', in_file))
        for in_file in in_file_list
    ])
    cat_subcat_set = set([
        re.search(r'^[^/]*/[^/]*/', re.sub('^.*/bufr/', '',
                                           in_file)).group().rstrip('/')
        for in_file in in_file_list
    ])
    for cccc in cccc_set:
        for cat_subcat in cat_subcat_set:
            cat = re.sub('/.*$', '', cat_subcat)
            subcat = re.sub('^.*/', '', cat_subcat)
            out_cat_subcat_df = conf_df[(conf_df['input_category'] == cat) & (
                conf_df['input_subcategory'] == subcat)]
            location_type_output_cat_subcat_set = set([
                str(location_type) + '/' + output_cat + '/' + output_subcat
                for output_index, location_type, output_cat, output_subcat in
                list(out_cat_subcat_df[[
                    'location_type', 'output_category', 'output_subcategory'
                ]].itertuples())
            ])
            for location_type_output_cat_subcat in location_type_output_cat_subcat_set:
                datatype_dict = {}
                property_dict = {}
                location_type_output_cat_subcat_list = location_type_output_cat_subcat.split(
                    '/')
                location_type = int(location_type_output_cat_subcat_list[0])
                output_cat = location_type_output_cat_subcat_list[1]
                output_subcat = location_type_output_cat_subcat_list[2]
                for in_file in in_file_list:
                    match = re.search(
                        r'^.*/' + cccc + '/bufr/' + cat_subcat + '/.*$',
                        in_file)
                    if not match:
                        continue
                    if not os.access(in_file, os.F_OK):
                        print('Warning',
                              warno,
                              ':',
                              in_file,
                              'does not exist.',
                              file=sys.stderr)
                        continue
                    elif not os.path.isfile(in_file):
                        print('Warning',
                              warno,
                              ':',
                              in_file,
                              'is not a file.',
                              file=sys.stderr)
                        continue
                    elif not os.access(in_file, os.R_OK):
                        print('Warning',
                              warno,
                              ':',
                              in_file,
                              'is not readable.',
                              file=sys.stderr)
                        continue
                    if debug:
                        print('Debug', ':', in_file, file=sys.stderr)
                    with open(in_file, 'rb') as in_file_stream:
                        while True:
                            bufr = None
                            unexpanded_descriptors = []
                            try:
                                bufr = codes_bufr_new_from_file(in_file_stream)
                                if bufr is None:
                                    break
                                unexpanded_descriptors = codes_get_array(
                                    bufr, 'unexpandedDescriptors')
                            except Exception:
                                break
                            descriptor_conf_df = pd.DataFrame(
                                index=[],
                                columns=['descriptor', 'descriptor_2'])
                            for bufr_descriptor in unexpanded_descriptors:
                                descriptor_conf_df = conf_df[
                                    (conf_df['input_category'] == cat)
                                    & (conf_df['input_subcategory'] == subcat)
                                    &
                                    (conf_df['location_type'] == location_type)
                                    &
                                    (conf_df['output_category'] == output_cat)
                                    & (conf_df['output_subcategory']
                                       == output_subcat) &
                                    (conf_df['descriptor'] == bufr_descriptor)]
                                if len(descriptor_conf_df.index) > 0:
                                    descriptor_2_list = list(
                                        set(descriptor_conf_df[[
                                            'descriptor_2'
                                        ]].values.flatten()))
                                    if len(descriptor_2_list
                                           ) > 0 and not np.isnan(
                                               descriptor_2_list[0]):
                                        is_descriptor_2 = False
                                        for descriptor_2 in descriptor_2_list:
                                            if descriptor_2 in unexpanded_descriptors:
                                                descriptor_conf_df = descriptor_conf_df[
                                                    descriptor_conf_df[
                                                        'descriptor_2'] ==
                                                    descriptor_2]
                                                is_descriptor_2 = True
                                                break
                                        if not is_descriptor_2:
                                            descriptor_conf_df = pd.DataFrame(
                                                index=[],
                                                columns=[
                                                    'descriptor',
                                                    'descriptor_2'
                                                ])
                                    break
                            if len(descriptor_conf_df.index) == 0:
                                print('Info',
                                      ':',
                                      'descriptor not found.',
                                      unexpanded_descriptors,
                                      in_file,
                                      file=sys.stderr)
                                break
                            number_of_subsets = codes_get(
                                bufr, 'numberOfSubsets')
                            if number_of_subsets == 0:
                                break
                            try:
                                codes_set(bufr, 'unpack', 1)
                            except Exception:
                                break
                            bufr_dict = {}
                            none_np = np.array([])
                            if descriptor_conf_df['get_type'].values.flatten(
                            )[0] == 'subset':
                                for subset_num in range(
                                        1, number_of_subsets + 1):
                                    number_of_array = 0
                                    for conf_row in descriptor_conf_df.itertuples(
                                    ):
                                        array = getArray(
                                            bufr, subset_num,
                                            number_of_subsets, conf_row,
                                            in_file)
                                        if number_of_array == 0:
                                            if len(array) > 0:
                                                number_of_array = len(array)
                                            else:
                                                break
                                        if conf_row.convert_type == 'to_value' or conf_row.convert_type == 'to_value_to_array':
                                            if len(array
                                                   ) > conf_row.array_index:
                                                value = array[int(
                                                    conf_row.array_index)]
                                                if conf_row.convert_type == 'to_value_to_array':
                                                    array = np.array(
                                                        [
                                                            value
                                                            for i in range(
                                                                0,
                                                                number_of_array
                                                            )
                                                        ],
                                                        dtype=object)
                                                else:
                                                    array = np.array(
                                                        [value], dtype=object)
                                            elif len(array) == 0:
                                                array = np.array([
                                                    None for i in range(
                                                        0, number_of_array)
                                                ],
                                                                 dtype=object)
                                            else:
                                                print(
                                                    'Warning',
                                                    warno,
                                                    ':',
                                                    'len(array) is not more than conf_row.array_index.',
                                                    'subset',
                                                    'key:',
                                                    conf_row.key,
                                                    'array length:',
                                                    len(array),
                                                    'number of array:',
                                                    number_of_array,
                                                    'file:',
                                                    in_file,
                                                    file=sys.stderr)
                                                array = np.array([
                                                    None for i in range(
                                                        0, number_of_array)
                                                ],
                                                                 dtype=object)
                                        if len(array) < number_of_array:
                                            for padding_count in range(
                                                    len(array),
                                                    number_of_array):
                                                array = np.append(array, None)
                                        elif len(array) > number_of_array:
                                            print(
                                                'Warning',
                                                warno,
                                                ':',
                                                'len(array) is more than number_of_array.',
                                                'subset',
                                                'key:',
                                                conf_row.key,
                                                'array length:',
                                                len(array),
                                                'number of array:',
                                                number_of_array,
                                                'file:',
                                                in_file,
                                                file=sys.stderr)
                                            array = np.array([
                                                None for i in range(
                                                    0, number_of_array)
                                            ],
                                                             dtype=object)
                                            break
                                        if conf_row.key in bufr_dict:
                                            bufr_dict[
                                                conf_row.key] = np.concatenate(
                                                    [
                                                        bufr_dict[
                                                            conf_row.key],
                                                        array
                                                    ])
                                        else:
                                            bufr_dict[conf_row.key] = array
                            else:
                                number_of_array = 0
                                for conf_row in descriptor_conf_df.itertuples(
                                ):
                                    array = getArray(bufr, 0, 0, conf_row,
                                                     in_file)
                                    if number_of_array == 0:
                                        if len(array) == 0:
                                            print('Warning',
                                                  warno,
                                                  ':',
                                                  'len(array) is 0.',
                                                  '',
                                                  'key:',
                                                  conf_row.key,
                                                  'array length:',
                                                  len(array),
                                                  'number of array:',
                                                  number_of_array,
                                                  'file:',
                                                  in_file,
                                                  file=sys.stderr)
                                            break
                                        else:
                                            number_of_array = len(array)
                                    elif len(array) != number_of_array:
                                        if len(array) == 1:
                                            value = array[0]
                                            array = np.array([
                                                value for i in range(
                                                    0, number_of_array)
                                            ],
                                                             dtype=object)
                                        else:
                                            print(
                                                'Warning',
                                                warno,
                                                ':',
                                                'len(array) is not equals to number_of_array.',
                                                '',
                                                'key:',
                                                conf_row.key,
                                                'array length:',
                                                len(array),
                                                'number of array:',
                                                number_of_array,
                                                'file:',
                                                in_file,
                                                file=sys.stderr)
                                            array = np.array([
                                                None for i in range(
                                                    0, number_of_array)
                                            ],
                                                             dtype=object)
                                            break
                                    bufr_dict[conf_row.key] = array
                            for conf_row in descriptor_conf_df.itertuples():
                                if conf_row.output == 'location_datetime' and conf_row.key in bufr_dict:
                                    tmp_none_np = np.array([
                                        value is not None
                                        for value in bufr_dict[conf_row.key]
                                    ])
                                    if len(none_np) > 0:
                                        none_np = none_np * tmp_none_np
                                    else:
                                        none_np = tmp_none_np
                            codes_release(bufr)
                            if len(bufr_dict) == 0:
                                break
                            bufr_dict['none'] = none_np
                            location_datetime_index_np = np.array([
                                index for index, value in enumerate(
                                    bufr_dict['none']) if value
                            ])
                            if len(location_datetime_index_np) > 0:
                                message_np = np.array([])
                                pre_conf_row_name = ''
                                for conf_row in descriptor_conf_df.itertuples(
                                ):
                                    if conf_row.name != pre_conf_row_name:
                                        datatype_dict[
                                            conf_row.name] = conf_row.datatype
                                        if len(message_np) > 0 and len(
                                                pre_conf_row_name) > 0:
                                            if pre_conf_row_name in property_dict:
                                                property_dict[
                                                    pre_conf_row_name] = np.concatenate(
                                                        [
                                                            property_dict[
                                                                pre_conf_row_name],
                                                            message_np
                                                        ])
                                            else:
                                                property_dict[
                                                    pre_conf_row_name] = message_np
                                            message_np = np.array([])
                                    if conf_row.key in bufr_dict:
                                        tmp_message_np = bufr_dict[
                                            conf_row.key]
                                        if max(location_datetime_index_np
                                               ) < len(tmp_message_np):
                                            tmp_message_np = tmp_message_np[
                                                location_datetime_index_np]
                                            if len(tmp_message_np) > 0:
                                                if len(message_np) > 0:
                                                    if conf_row.multiply != 0:
                                                        message_np = message_np + conf_row.multiply * tmp_message_np
                                                    else:
                                                        message_np = message_np + tmp_message_np
                                                else:
                                                    if conf_row.multiply != 0:
                                                        message_np = conf_row.multiply * tmp_message_np
                                                    else:
                                                        message_np = tmp_message_np
                                        else:
                                            print(
                                                'Info',
                                                'unexpanded_descriptors :',
                                                unexpanded_descriptors,
                                                ': condition of',
                                                conf_row.key,
                                                max(location_datetime_index_np
                                                    ),
                                                len(tmp_message_np),
                                                in_file,
                                                file=sys.stderr)
                                    pre_conf_row_name = conf_row.name
                                if len(message_np) > 0 and len(
                                        pre_conf_row_name) > 0:
                                    if pre_conf_row_name in property_dict:
                                        property_dict[
                                            pre_conf_row_name] = np.concatenate(
                                                [
                                                    property_dict[
                                                        pre_conf_row_name],
                                                    message_np
                                                ])
                                    else:
                                        property_dict[
                                            pre_conf_row_name] = message_np
                if datetime_name in property_dict:
                    name_list = []
                    data_list = []
                    del_key_list = []
                    cat_subcat_conf_df = conf_df[
                        (conf_df['input_category'] == cat)
                        & (conf_df['input_subcategory'] == subcat) &
                        (conf_df['location_type'] == location_type) &
                        (conf_df['output_category'] == output_cat) &
                        (conf_df['output_subcategory'] == output_subcat)]
                    datetime_tail = cat_subcat_conf_df[(
                        cat_subcat_conf_df['name'] == 'datetime'
                    )]['key'].values.flatten()[-1]
                    for conf_row_name in set(cat_subcat_conf_df[(
                            cat_subcat_conf_df['output'] == 'location_datetime'
                    )]['name'].values.flatten()):
                        if conf_row_name == 'datetime':
                            plus_second_list = [
                                0 for dt in range(
                                    0, len(property_dict[conf_row_name]))
                            ]
                            if 'time period [s]' in property_dict:
                                plus_second_list = property_dict[
                                    'time period [s]']
                                del_key_list.append('time period [s]')
                            datetime_list = []
                            for i, dt_str in enumerate(
                                    property_dict[conf_row_name]):
                                try:
                                    if datetime_tail == 'millisecond':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            int(dt_str[6:8]),
                                            int(dt_str[8:10]),
                                            int(dt_str[10:12]),
                                            int(dt_str[12:14]),
                                            int(dt_str[15:]),
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'second':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            int(dt_str[6:8]),
                                            int(dt_str[8:10]),
                                            int(dt_str[10:12]),
                                            int(dt_str[12:14]),
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'minute':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            int(dt_str[6:8]),
                                            int(dt_str[8:10]),
                                            int(dt_str[10:12]),
                                            0,
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'hour':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            int(dt_str[6:8]),
                                            int(dt_str[8:10]),
                                            0,
                                            0,
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'day':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            int(dt_str[6:8]),
                                            0,
                                            0,
                                            0,
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'month':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            int(dt_str[4:6]),
                                            0,
                                            0,
                                            0,
                                            0,
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    elif datetime_tail == 'year':
                                        obs_datetime = datetime(
                                            int(dt_str[0:4]),
                                            0,
                                            0,
                                            0,
                                            0,
                                            0,
                                            0,
                                            tzinfo=timezone.utc) + timedelta(
                                                seconds=plus_second_list[i])
                                    datetime_list.append(obs_datetime)
                                except Exception:
                                    # np.ndarray has no pop(); drop row i with np.delete instead
                                    for property_dict_key in property_dict:
                                        property_dict[property_dict_key] = np.delete(
                                            property_dict[property_dict_key], i)
                            data_list.append(
                                pa.array(datetime_list,
                                         pa.timestamp('ms', tz='utc')))
                            name_list.append(conf_row_name)
                            datatype_dict.pop(conf_row_name)
                        elif conf_row_name != 'time period [s]':
                            if conf_row_name in property_dict:
                                data_list.append(
                                    pa.array(property_dict[conf_row_name],
                                             datatype_dict[conf_row_name]))
                                name_list.append(conf_row_name)
                                datatype_dict.pop(conf_row_name)
                    for datatype_key in datatype_dict.keys():
                        if datatype_key in property_dict:
                            if any(value is not None
                                   for value in property_dict[datatype_key]):
                                name_list.append(datatype_key)
                                data_list.append(
                                    pa.array(property_dict[datatype_key],
                                             datatype_dict[datatype_key]))
                    out_directory_list = [
                        out_dir, cccc, 'bufr_to_arrow', output_cat,
                        output_subcat
                    ]
                    out_directory = '/'.join(out_directory_list)
                    os.makedirs(out_directory, exist_ok=True)
                    now = datetime.utcnow()
                    out_file_list = [
                        out_directory, '/', 'C_', my_cccc, '_',
                        str(now.year).zfill(4),
                        str(now.month).zfill(2),
                        str(now.day).zfill(2),
                        str(now.hour).zfill(2),
                        str(now.minute).zfill(2),
                        str(now.second).zfill(2), '.feather'
                    ]
                    out_file = ''.join(out_file_list)
                    with open(out_file, 'bw') as out_f:
                        batch = pa.record_batch(data_list, names=name_list)
                        table = pa.Table.from_batches([batch])
                        feather.write_feather(table, out_f, compression='zstd')
                        print(out_file, file=out_list_file)