def cast_arrow_table_to_schema(
    tab: pa.Table,
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
):
    """Cast an arrow table to a new (full or partial) schema.

    Args:
        tab (pa.Table): An arrow table.
        schema (Union[pa.Schema, None], optional): The schema to cast the
            table to. Defaults to None.
        expect_full_schema (bool, optional): If True, the schema is expected
            to have a field for every column of the input table and the
            table is cast to it directly. If False, only the columns listed
            in ``schema`` are cast; all other columns keep their default
            type from the read.

    Returns:
        pa.Table: The input table cast to the resulting schema.
    """
    if expect_full_schema:
        update_schema = schema
    else:
        # Merge the requested fields into the table's existing schema so
        # columns not listed in `schema` keep their current types.
        update_schema = update_existing_schema(tab.schema, schema)
    return tab.cast(update_schema)
def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
    """Write an arrow Table to the underlying file as record batches.

    Args:
        pa_table (pa.Table): The table to write.
        writer_batch_size (Optional[int], optional): Maximum number of rows
            per record batch. Falls back to ``self.writer_batch_size`` when
            None.
    """
    if writer_batch_size is None:
        writer_batch_size = self.writer_batch_size
    # Lazily create the writer from the first table's schema.
    if self.pa_writer is None:
        self._build_writer(inferred_schema=pa_table.schema)
    pa_table = pa_table.cast(self._schema)
    batches: List[pa.RecordBatch] = pa_table.to_batches(max_chunksize=writer_batch_size)
    # Track totals before handing the batches to the writer.
    self._num_bytes += sum(batch.nbytes for batch in batches)
    self._num_examples += pa_table.num_rows
    for batch in batches:
        self.pa_writer.write_batch(batch)
def _add_mock_smry_meta_to_table(table: pa.Table) -> pa.Table:
    """Attach mock summary-vector metadata to rate/total columns.

    Columns whose names contain ``"_r"`` (rate) or ``"_t"`` (total) get a
    mock metadata dict on their schema field; all other columns are left
    untouched. The table is cast to the updated schema and returned.

    Args:
        table (pa.Table): The input arrow table.

    Returns:
        pa.Table: The table cast to the schema with metadata attached.
    """
    schema = table.schema
    for colname in schema.names:
        # `in` already yields a bool; no explicit bool() cast needed.
        is_rate = "_r" in colname
        is_total = "_t" in colname
        if not (is_rate or is_total):
            continue
        metadata = {
            b"unit": b"N/A",
            b"is_rate": b"True" if is_rate else b"False",
            b"is_total": b"True" if is_total else b"False",
            b"is_historical": b"False",
            b"keyword": b"UNKNOWN",
        }
        idx = schema.get_field_index(colname)
        schema = schema.set(idx, schema.field(idx).with_metadata(metadata))
    return table.cast(schema)