def cast_arrow_table_to_schema(
    tab: pa.Table,
    schema: Union[pa.Schema, None] = None,
    expect_full_schema: bool = True,
):
    """Casts an arrow table to a new or partial schema

    Args:
        tab (pa.Table): An arrow table
        schema (Union[pa.Schema, None], optional): The schema to cast the
            table to. If None there is nothing to cast to and the table is
            returned unchanged. Defaults to None.
        expect_full_schema (bool, optional): if True, the schema is handed
            to ``tab.cast`` as is, so it is expected to have fields for
            every col in the table. If False, then will only cast columns
            that are listed in the schema, leaving all other columns to
            their existing type (the merge is delegated to
            ``update_existing_schema``).

    Returns:
        pa.Table: the cast table, or ``tab`` itself when ``schema`` is None.
    """

    # The default call (no schema) used to pass None straight into
    # ``tab.cast``, which needs a concrete target schema. Treat it as a no-op.
    if schema is None:
        return tab

    if expect_full_schema:
        update_schema = schema
    else:
        update_schema = update_existing_schema(tab.schema, schema)

    return tab.cast(update_schema)
Beispiel #2
0
    def write_table(self, pa_table: pa.Table, writer_batch_size: Optional[int] = None):
        """Cast a table to the writer's schema and write it out as record batches.

        Args:
            pa_table: the table to write.
            writer_batch_size: maximum chunk size handed to ``to_batches``;
                when ``None`` the instance's ``writer_batch_size`` is used.
        """
        chunk_size = self.writer_batch_size if writer_batch_size is None else writer_batch_size

        # No writer yet: request one, passing along the incoming table's schema.
        if self.pa_writer is None:
            self._build_writer(inferred_schema=pa_table.schema)

        casted = pa_table.cast(self._schema)
        record_batches: List[pa.RecordBatch] = casted.to_batches(max_chunksize=chunk_size)

        # Both counters are bumped before the first batch is handed to the writer.
        self._num_bytes += sum(rb.nbytes for rb in record_batches)
        self._num_examples += casted.num_rows

        for rb in record_batches:
            self.pa_writer.write_batch(rb)
def _add_mock_smry_meta_to_table(table: pa.Table) -> pa.Table:
    """Attach placeholder field metadata to rate/total columns of a table.

    A column is tagged when its name contains "_r" (flagged as rate) and/or
    "_t" (flagged as total); all other columns are left as they are. The
    table is then cast to the resulting schema and returned.
    """
    annotated_schema = table.schema
    for column in annotated_schema.names:
        rate_flag = "_r" in column
        total_flag = "_t" in column
        if not (rate_flag or total_flag):
            continue

        mock_meta = {
            b"unit": b"N/A",
            b"is_rate": b"True" if rate_flag else b"False",
            b"is_total": b"True" if total_flag else b"False",
            b"is_historical": b"False",
            b"keyword": b"UNKNOWN",
        }

        # Swap the field for a copy carrying the metadata; each step yields a
        # new schema object, so the running schema is rebound every time.
        position = annotated_schema.get_field_index(column)
        tagged_field = annotated_schema.field(position).with_metadata(mock_meta)
        annotated_schema = annotated_schema.set(position, tagged_field)

    return table.cast(annotated_schema)