Exemple #1
0
def _nonnull_coder_from_type(field_type):
  """Pick the coder matching the populated ``type_info`` of ``field_type``.

  Args:
    field_type: object exposing ``WhichOneof("type_info")`` plus the
      ``atomic_type`` / ``array_type`` / ``map_type`` / ``row_type`` members
      (presumably a schema FieldType proto).

  Returns:
    A coder instance: a primitive coder for atomic types, an IterableCoder
    for arrays, a MapCoder for maps, or a RowCoder for nested rows.

  Raises:
    ValueError: if the type (or atomic type) has no coder here.
  """
  kind = field_type.WhichOneof("type_info")

  if kind == "row_type":
    return RowCoder(field_type.row_type.schema)

  if kind == "map_type":
    map_type = field_type.map_type
    key_coder = _coder_from_type(map_type.key_type)
    value_coder = _coder_from_type(map_type.value_type)
    return MapCoder(key_coder, value_coder)

  if kind == "array_type":
    element_coder = _coder_from_type(field_type.array_type.element_type)
    return IterableCoder(element_coder)

  if kind == "atomic_type":
    # Atomic type -> coder class; anything missing falls through to the
    # ValueError below.
    atomic_coder_classes = {
        schema_pb2.INT32: VarIntCoder,
        schema_pb2.INT64: VarIntCoder,
        schema_pb2.DOUBLE: FloatCoder,
        schema_pb2.STRING: StrUtf8Coder,
        schema_pb2.BOOLEAN: BooleanCoder,
        schema_pb2.BYTES: BytesCoder,
    }
    coder_class = atomic_coder_classes.get(field_type.atomic_type)
    if coder_class is not None:
      return coder_class()

  # The Java SDK supports several more types, but the coders are not yet
  # standard, and are not implemented in Python.
  raise ValueError(
      "Encountered a type that is not currently supported by RowCoder: %s" %
      field_type)
Exemple #2
0
    def coder_from_type(field_type):
        """Return the coder for ``field_type`` based on its ``type_info``.

        Handles the INT32/INT64, DOUBLE and STRING atomic types, and arrays
        (whose element coder is resolved recursively through
        ``RowCoder.coder_from_type``).

        Raises:
            ValueError: for every other type.
        """
        kind = field_type.WhichOneof("type_info")

        if kind == "array_type":
            element_type = field_type.array_type.element_type
            element_coder = RowCoder.coder_from_type(element_type)
            return IterableCoder(element_coder)

        if kind == "atomic_type":
            atomic = field_type.atomic_type
            if atomic in (schema_pb2.INT32, schema_pb2.INT64):
                return VarIntCoder()
            if atomic == schema_pb2.DOUBLE:
                return FloatCoder()
            if atomic == schema_pb2.STRING:
                return StrUtf8Coder()

        # The Java SDK supports several more types, but the coders are not yet
        # standard, and are not implemented in Python.
        raise ValueError(
            "Encountered a type that is not currently supported by RowCoder: %s"
            % field_type)
class CollectingFn(beam.DoFn):
    """DoFn that buffers integer values in state and emits their sum.

    For each element, ``element[1]`` is decoded and parsed as an int
    (presumably the element is a ``(key, bytes)`` pair), appended to the
    buffer state, and the counter state is incremented. Once the counter
    reaches ``NUM_RECORDS`` the sum of the buffered values is emitted and
    both states are cleared.
    """
    BUFFER_STATE = BagStateSpec('buffer', VarIntCoder())
    COUNT_STATE = CombiningValueStateSpec('count', sum)

    def process(self,
                element,
                buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
                count_state=beam.DoFn.StateParam(COUNT_STATE)):
        """Record one element; yield the buffered sum once enough arrived."""
        buffer_state.add(int(element[1].decode()))
        count_state.add(1)

        reached_limit = count_state.read() >= NUM_RECORDS
        if not reached_limit:
            return

        yield sum(buffer_state.read())
        # Reset so the next batch starts from an empty buffer and zero count.
        count_state.clear()
        buffer_state.clear()
Exemple #4
0
def _nonnull_coder_from_type(field_type):
    """Resolve the coder for ``field_type`` from its ``type_info`` member.

    Args:
        field_type: object exposing ``WhichOneof("type_info")`` and the
            corresponding ``atomic_type`` / ``array_type`` / ``map_type`` /
            ``logical_type`` / ``row_type`` members (presumably a schema
            FieldType proto).

    Returns:
        A coder instance for the described type.

    Raises:
        ValueError: if no branch below produces a coder for the type.
    """
    kind = field_type.WhichOneof("type_info")

    if kind == "atomic_type":
        atomic = field_type.atomic_type
        if atomic in (schema_pb2.INT32, schema_pb2.INT64):
            return VarIntCoder()
        if atomic == schema_pb2.DOUBLE:
            return FloatCoder()
        if atomic == schema_pb2.STRING:
            return StrUtf8Coder()
        if atomic == schema_pb2.BOOLEAN:
            return BooleanCoder()
        if atomic == schema_pb2.BYTES:
            return BytesCoder()
        # Unknown atomic types fall through to the ValueError at the bottom.

    if kind == "array_type":
        element_type = field_type.array_type.element_type
        return IterableCoder(_coder_from_type(element_type))

    if kind == "map_type":
        map_type = field_type.map_type
        key_coder = _coder_from_type(map_type.key_type)
        value_coder = _coder_from_type(map_type.value_type)
        return MapCoder(key_coder, value_coder)

    if kind == "logical_type":
        logical_proto = field_type.logical_type

        # Special case for the Any logical type. Just use the default coder
        # for an unknown Python object.
        if logical_proto.urn == PYTHON_ANY_URN:
            return typecoders.registry.get_coder(object)

        logical_type = LogicalType.from_runner_api(logical_proto)
        representation_coder = _coder_from_type(logical_proto.representation)
        return LogicalTypeCoder(logical_type, representation_coder)

    if kind == "row_type":
        return RowCoder(field_type.row_type.schema)

    # The Java SDK supports several more types, but the coders are not yet
    # standard, and are not implemented in Python.
    raise ValueError(
        "Encountered a type that is not currently supported by RowCoder: %s" %
        field_type)
Exemple #5
0
class RowCoderImpl(StreamCoderImpl):
  """For internal use only; no backwards-compatibility guarantees.

  Stream layout written by ``encode_to_stream``:
    1. the number of schema fields, via ``SIZE_CODER``;
    2. a byte string holding the null bitmap (one bit per field, least
       significant bit first within each byte), via ``NULL_MARKER_CODER``;
       the byte string is empty when no attribute is None;
    3. each non-null attribute, in schema order, via its component coder.
  """
  SIZE_CODER = VarIntCoder().get_impl()
  NULL_MARKER_CODER = BytesCoder().get_impl()

  def __init__(self, schema, components):
    """Store the schema and build the per-field coder implementations.

    Args:
      schema: schema object exposing ``fields`` (each with ``name`` and
        ``type.nullable``).
      components: iterable of coders, one per schema field, each exposing
        ``get_impl()``.
    """
    self.schema = schema
    self.constructor = named_tuple_from_schema(schema)
    self.components = [c.get_impl() for c in components]
    self.has_nullable_fields = any(
        field.type.nullable for field in self.schema.fields)

  def encode_to_stream(self, value, out, nested):
    """Write ``value`` to ``out`` using the layout described on the class.

    Args:
      value: object with one attribute per schema field name.
      out: output stream handed to the component coder implementations.
      nested: unused; every component is always encoded as nested.

    Raises:
      ValueError: if an attribute is None but its field is not nullable.
    """
    nvals = len(self.schema.fields)
    self.SIZE_CODER.encode_to_stream(nvals, out, True)
    attrs = [getattr(value, f.name) for f in self.schema.fields]

    # The bitmap stays empty unless at least one attribute is actually None.
    words = array('B')
    if self.has_nullable_fields:
      nulls = [attr is None for attr in attrs]
      if any(nulls):
        words = array('B', itertools.repeat(0, (nvals + 7) // 8))
        for i, is_null in enumerate(nulls):
          words[i // 8] |= is_null << (i % 8)

    # array.tostring() was removed in Python 3.9; tobytes() is the
    # equivalent API.
    self.NULL_MARKER_CODER.encode_to_stream(words.tobytes(), out, True)

    for c, field, attr in zip(self.components, self.schema.fields, attrs):
      if attr is None:
        if not field.type.nullable:
          raise ValueError(
              "Attempted to encode null for non-nullable field \"{}\".".format(
                  field.name))
        # Null values are represented only by their bit in the bitmap.
        continue
      c.encode_to_stream(attr, out, True)

  def decode_from_stream(self, in_stream, nested):
    """Read one row from ``in_stream`` and build it with ``self.constructor``.

    Args:
      in_stream: input stream handed to the component coder implementations.
      nested: unused; every component is always decoded as nested.

    Returns:
      The result of calling ``self.constructor`` with one positional value
      per component coder (None for fields flagged null).
    """
    nvals = self.SIZE_CODER.decode_from_stream(in_stream, True)
    words = array('B')
    # array.fromstring() was removed in Python 3.9; frombytes() is the
    # equivalent API.
    words.frombytes(self.NULL_MARKER_CODER.decode_from_stream(in_stream, True))

    if words:
      nulls = ((words[i // 8] >> (i % 8)) & 0x01 for i in range(nvals))
    else:
      # An empty bitmap means no field was null.
      nulls = itertools.repeat(False, nvals)

    # If this coder's schema has more attributes than the encoded value, then
    # the schema must have changed. Populate the unencoded fields with nulls.
    if len(self.components) > nvals:
      nulls = itertools.chain(
          nulls, itertools.repeat(True, len(self.components) - nvals))

    # Note that if this coder's schema has *fewer* attributes than the encoded
    # value, we just need to ignore the additional values, which will occur
    # here because we only decode as many values as we have coders for.
    return self.constructor(
        *(
            None if is_null else c.decode_from_stream(in_stream, True)
            for c, is_null in zip(self.components, nulls)))

  def _make_value_coder(self, nulls=itertools.repeat(False)):
    """Return a TupleCoder impl over the components not flagged in ``nulls``.

    Args:
      nulls: iterable of truthy/falsy flags aligned with ``self.components``;
        defaults to an endless stream of False (keep every component).
    """
    # NOTE(review): self.components already holds coder *impls*, yet they are
    # passed to TupleCoder here; confirm this is intended.
    components = [
        component
        for component, is_null in zip(self.components, nulls)
        if not is_null
    ] if self.has_nullable_fields else self.components
    return TupleCoder(components).get_impl()