def _nonnull_coder_from_type(field_type):
  """Return a coder for the given non-nullable schema FieldType.

  Args:
    field_type: a schema_pb2.FieldType proto.

  Raises:
    ValueError: for field types RowCoder does not yet support.
  """
  type_info = field_type.WhichOneof("type_info")
  if type_info == "atomic_type":
    # Dispatch table from atomic type enum to the matching coder class.
    atomic_coders = {
        schema_pb2.INT32: VarIntCoder,
        schema_pb2.INT64: VarIntCoder,
        schema_pb2.DOUBLE: FloatCoder,
        schema_pb2.STRING: StrUtf8Coder,
        schema_pb2.BOOLEAN: BooleanCoder,
        schema_pb2.BYTES: BytesCoder,
    }
    coder_cls = atomic_coders.get(field_type.atomic_type)
    if coder_cls is not None:
      return coder_cls()
  elif type_info == "array_type":
    return IterableCoder(_coder_from_type(field_type.array_type.element_type))
  elif type_info == "map_type":
    map_type = field_type.map_type
    return MapCoder(
        _coder_from_type(map_type.key_type),
        _coder_from_type(map_type.value_type))
  elif type_info == "row_type":
    return RowCoder(field_type.row_type.schema)
  # The Java SDK supports several more types, but the coders are not yet
  # standard, and are not implemented in Python.
  raise ValueError(
      "Encountered a type that is not currently supported by RowCoder: %s" %
      field_type)
def coder_from_type(field_type):
  """Return a coder for the given schema FieldType.

  Args:
    field_type: a schema_pb2.FieldType proto.

  Raises:
    ValueError: for field types RowCoder does not yet support.
  """
  type_info = field_type.WhichOneof("type_info")
  if type_info == "atomic_type":
    atomic = field_type.atomic_type
    # Both 32- and 64-bit integers share the varint encoding.
    if atomic in (schema_pb2.INT32, schema_pb2.INT64):
      return VarIntCoder()
    if atomic == schema_pb2.DOUBLE:
      return FloatCoder()
    if atomic == schema_pb2.STRING:
      return StrUtf8Coder()
  elif type_info == "array_type":
    return IterableCoder(
        RowCoder.coder_from_type(field_type.array_type.element_type))
  # The Java SDK supports several more types, but the coders are not yet
  # standard, and are not implemented in Python.
  raise ValueError(
      "Encountered a type that is not currently supported by RowCoder: %s" %
      field_type)
class CollectingFn(beam.DoFn):
  """Stateful DoFn that buffers integer values per key.

  Once NUM_RECORDS elements have been observed for a key, emits the sum of
  the buffered values and resets both pieces of state.
  """
  BUFFER_STATE = BagStateSpec('buffer', VarIntCoder())
  COUNT_STATE = CombiningValueStateSpec('count', sum)

  def process(
      self,
      element,
      buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
      count_state=beam.DoFn.StateParam(COUNT_STATE)):
    # element is a (key, value) pair whose value is a utf-8 encoded integer.
    parsed = int(element[1].decode())
    buffer_state.add(parsed)

    count_state.add(1)
    seen_so_far = count_state.read()

    if seen_so_far >= NUM_RECORDS:
      # Emit the aggregate and reset state so the next batch starts fresh.
      yield sum(buffer_state.read())
      count_state.clear()
      buffer_state.clear()
def _nonnull_coder_from_type(field_type):
  """Return a coder for the given non-nullable schema FieldType.

  Handles atomic, array, map, logical, and row field types.

  Args:
    field_type: a schema_pb2.FieldType proto.

  Raises:
    ValueError: for field types RowCoder does not yet support.
  """
  type_info = field_type.WhichOneof("type_info")
  if type_info == "atomic_type":
    atomic = field_type.atomic_type
    # Both 32- and 64-bit integers share the varint encoding.
    if atomic in (schema_pb2.INT32, schema_pb2.INT64):
      return VarIntCoder()
    if atomic == schema_pb2.DOUBLE:
      return FloatCoder()
    if atomic == schema_pb2.STRING:
      return StrUtf8Coder()
    if atomic == schema_pb2.BOOLEAN:
      return BooleanCoder()
    if atomic == schema_pb2.BYTES:
      return BytesCoder()
  elif type_info == "array_type":
    return IterableCoder(
        _coder_from_type(field_type.array_type.element_type))
  elif type_info == "map_type":
    map_type = field_type.map_type
    return MapCoder(
        _coder_from_type(map_type.key_type),
        _coder_from_type(map_type.value_type))
  elif type_info == "logical_type":
    # Special case for the Any logical type. Just use the default coder for an
    # unknown Python object.
    if field_type.logical_type.urn == PYTHON_ANY_URN:
      return typecoders.registry.get_coder(object)
    logical_type = LogicalType.from_runner_api(field_type.logical_type)
    return LogicalTypeCoder(
        logical_type,
        _coder_from_type(field_type.logical_type.representation))
  elif type_info == "row_type":
    return RowCoder(field_type.row_type.schema)
  # The Java SDK supports several more types, but the coders are not yet
  # standard, and are not implemented in Python.
  raise ValueError(
      "Encountered a type that is not currently supported by RowCoder: %s" %
      field_type)
class RowCoderImpl(StreamCoderImpl):
  """For internal use only; no backwards-compatibility guarantees.

  Encodes/decodes schema rows as: a varint field count, a (possibly empty)
  null-marker bitmap, then each non-null field encoded with its component
  coder in schema order.
  """
  SIZE_CODER = VarIntCoder().get_impl()
  NULL_MARKER_CODER = BytesCoder().get_impl()

  def __init__(self, schema, components):
    # schema: the row's schema proto; components: one Coder per field.
    self.schema = schema
    self.constructor = named_tuple_from_schema(schema)
    self.components = list(c.get_impl() for c in components)
    self.has_nullable_fields = any(
        field.type.nullable for field in self.schema.fields)

  def encode_to_stream(self, value, out, nested):
    nvals = len(self.schema.fields)
    self.SIZE_CODER.encode_to_stream(nvals, out, True)
    attrs = [getattr(value, f.name) for f in self.schema.fields]

    # Build the null bitmap: one bit per field, little-endian within a byte.
    # It stays empty when no field is null (or none are nullable).
    words = array('B')
    if self.has_nullable_fields:
      nulls = list(attr is None for attr in attrs)
      if any(nulls):
        words = array('B', itertools.repeat(0, (nvals + 7) // 8))
        for i, is_null in enumerate(nulls):
          words[i // 8] |= is_null << (i % 8)

    # FIX: array.tostring() was removed in Python 3.9; tobytes() is the
    # equivalent (available since Python 3.2).
    self.NULL_MARKER_CODER.encode_to_stream(words.tobytes(), out, True)

    for c, field, attr in zip(self.components, self.schema.fields, attrs):
      if attr is None:
        if not field.type.nullable:
          raise ValueError(
              "Attempted to encode null for non-nullable field \"{}\".".format(
                  field.name))
        # Null fields are represented only in the bitmap; nothing is written.
        continue
      c.encode_to_stream(attr, out, True)

  def decode_from_stream(self, in_stream, nested):
    nvals = self.SIZE_CODER.decode_from_stream(in_stream, True)
    words = array('B')
    # FIX: array.fromstring() was removed in Python 3.9; frombytes() is the
    # equivalent (available since Python 3.2).
    words.frombytes(self.NULL_MARKER_CODER.decode_from_stream(in_stream, True))

    if words:
      nulls = ((words[i // 8] >> (i % 8)) & 0x01 for i in range(nvals))
    else:
      # An empty bitmap means no field was null.
      nulls = itertools.repeat(False, nvals)

    # If this coder's schema has more attributes than the encoded value, then
    # the schema must have changed. Populate the unencoded fields with nulls.
    if len(self.components) > nvals:
      nulls = itertools.chain(
          nulls, itertools.repeat(True, len(self.components) - nvals))

    # Note that if this coder's schema has *fewer* attributes than the encoded
    # value, we just need to ignore the additional values, which will occur
    # here because we only decode as many values as we have coders for.
    return self.constructor(
        *(
            None if is_null else c.decode_from_stream(in_stream, True)
            for c, is_null in zip(self.components, nulls)))

  def _make_value_coder(self, nulls=itertools.repeat(False)):
    # Build a TupleCoder over only the non-null components. The default
    # (an endless stream of False) keeps every component.
    components = [
        component for component, is_null in zip(self.components, nulls)
        if not is_null
    ] if self.has_nullable_fields else self.components
    return TupleCoder(components).get_impl()