def _unbatch_transform(proxy, include_indexes):
    """Build a PTransform that explodes a batched pandas object into elements.

    Args:
        proxy: a pandas object describing the batch type. Must be a
            ``pd.DataFrame`` or ``pd.Series``; anything else raises.
        include_indexes: for DataFrames, whether the index values should be
            carried into the produced elements.

    Returns:
        A Beam PTransform (``ParDo`` for DataFrames, ``FlatMap`` for Series).

    Raises:
        TypeError: if ``proxy`` is neither a DataFrame nor a Series (also
            propagated from ``_dtype_to_fieldtype`` for unknown dtypes).
    """
    if isinstance(proxy, pd.DataFrame):
        element_ctor = element_type_from_dataframe(
            proxy, include_indexes=include_indexes)
        dofn = (_UnbatchWithIndex(element_ctor)
                if include_indexes else _UnbatchNoIndex(element_ctor))
        return beam.ParDo(dofn)

    if isinstance(proxy, pd.Series):
        # _dtype_to_fieldtype raises TypeError for unknown dtypes.
        output_type = _dtype_to_fieldtype(proxy.dtype)
        # TODO: Should the index ever be included for a Series?
        if _match_is_optional(output_type):
            # Nullable element type: translate pandas nulls to Python None.
            def unbatch(series):
                for null_flag, value in zip(pd.isnull(series), series):
                    yield None if null_flag else value
        else:

            def unbatch(series):
                for value in series:
                    yield value

        return beam.FlatMap(unbatch).with_output_types(output_type)

    # TODO: What about scalar inputs?
    raise TypeError("Proxy '%s' has unsupported type '%s'" %
                    (proxy, type(proxy)))
# Example #2
def typing_to_runner_api(type_):
  """Translate a Python typing type into a ``schema_pb2.FieldType`` proto.

  NamedTuple subclasses become row types (registered once in
  SCHEMA_REGISTRY and tagged with a schema id attribute); known primitives
  become atomic types; Mapping/Sequence/Optional are handled structurally;
  anything else is attempted as a logical type, falling back to the
  "Python Any" logical type URN.
  """
  if match_is_named_tuple(type_):
    schema = None
    # Reuse a previously-registered schema if this type was already tagged.
    if hasattr(type_, _BEAM_SCHEMA_ID):
      schema = SCHEMA_REGISTRY.get_schema_by_id(getattr(type_, _BEAM_SCHEMA_ID))
    if schema is None:
      # First encounter: build the schema, tag the type, and register it.
      fields = []
      for field_name in type_._fields:
        fields.append(
            schema_pb2.Field(
                name=field_name,
                type=typing_to_runner_api(type_._field_types[field_name])))
      new_id = str(uuid4())
      schema = schema_pb2.Schema(fields=fields, id=new_id)
      setattr(type_, _BEAM_SCHEMA_ID, new_id)
      SCHEMA_REGISTRY.add(type_, schema)

    return schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=schema))

  # All concrete types (other than NamedTuple sub-classes) should map to
  # a supported primitive type. This check must precede the Sequence check
  # below, since e.g. str is also a Sequence.
  if type_ in PRIMITIVE_TO_ATOMIC_TYPE:
    return schema_pb2.FieldType(atomic_type=PRIMITIVE_TO_ATOMIC_TYPE[type_])

  if _match_is_exactly_mapping(type_):
    key_arg, value_arg = _get_args(type_)
    return schema_pb2.FieldType(
        map_type=schema_pb2.MapType(
            key_type=typing_to_runner_api(key_arg),
            value_type=typing_to_runner_api(value_arg)))

  if _match_is_optional(type_):
    # It's possible that a user passes us Optional[Optional[T]], but in python
    # typing this is indistinguishable from Optional[T] - both resolve to
    # Union[T, None] - so there's no need to check for that case here.
    inner = typing_to_runner_api(extract_optional_type(type_))
    inner.nullable = True
    return inner

  if _safe_issubclass(type_, Sequence):
    return schema_pb2.FieldType(
        array_type=schema_pb2.ArrayType(
            element_type=typing_to_runner_api(_get_args(type_)[0])))

  if _safe_issubclass(type_, Mapping):
    key_arg, value_arg = _get_args(type_)
    return schema_pb2.FieldType(
        map_type=schema_pb2.MapType(
            key_type=typing_to_runner_api(key_arg),
            value_type=typing_to_runner_api(value_arg)))

  try:
    logical_type = LogicalType.from_typing(type_)
  except ValueError:
    # Unknown type, just treat it like Any
    return schema_pb2.FieldType(
        logical_type=schema_pb2.LogicalType(urn=PYTHON_ANY_URN))
  else:
    # TODO(bhulette): Add support for logical types that require arguments
    return schema_pb2.FieldType(
        logical_type=schema_pb2.LogicalType(
            urn=logical_type.urn(),
            representation=typing_to_runner_api(
                logical_type.representation_type())))
# Example #3
def typing_to_runner_api(type_):
    """Translate a Python typing type into a ``schema_pb2.FieldType`` proto.

    NamedTuple subclasses become row types (registered once in
    SCHEMA_REGISTRY and tagged with a schema id attribute); known primitives
    become atomic types; Mapping/Sequence/Optional are handled structurally.

    Raises:
        ValueError: for ``str`` under python 2 (ambiguous encoding), or for
            any type with no schema mapping.
    """
    if _match_is_named_tuple(type_):
        schema = None
        # Reuse a previously-registered schema if this type was already
        # tagged with an id.
        if hasattr(type_, _BEAM_SCHEMA_ID):
            schema = SCHEMA_REGISTRY.get_schema_by_id(
                getattr(type_, _BEAM_SCHEMA_ID))
        if schema is None:
            # First encounter: build the schema, tag the type, register it.
            fields = []
            for field_name in type_._fields:
                fields.append(
                    schema_pb2.Field(
                        name=field_name,
                        type=typing_to_runner_api(
                            type_._field_types[field_name])))
            new_id = str(uuid4())
            schema = schema_pb2.Schema(fields=fields, id=new_id)
            setattr(type_, _BEAM_SCHEMA_ID, new_id)
            SCHEMA_REGISTRY.add(type_, schema)

        return schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=schema))

    # All concrete types (other than NamedTuple sub-classes) should map to
    # a supported primitive type. This check must precede the Sequence check
    # below, since e.g. str is also a Sequence.
    if type_ in PRIMITIVE_TO_ATOMIC_TYPE:
        return schema_pb2.FieldType(
            atomic_type=PRIMITIVE_TO_ATOMIC_TYPE[type_])

    if sys.version_info.major == 2 and type_ == str:
        raise ValueError(
            "type 'str' is not supported in python 2. Please use 'unicode' or "
            "'typing.ByteString' instead to unambiguously indicate if this is a "
            "UTF-8 string or a byte array.")

    if _match_is_exactly_mapping(type_):
        key_arg, value_arg = _get_args(type_)
        return schema_pb2.FieldType(map_type=schema_pb2.MapType(
            key_type=typing_to_runner_api(key_arg),
            value_type=typing_to_runner_api(value_arg)))

    if _match_is_optional(type_):
        # It's possible that a user passes us Optional[Optional[T]], but in python
        # typing this is indistinguishable from Optional[T] - both resolve to
        # Union[T, None] - so there's no need to check for that case here.
        inner = typing_to_runner_api(extract_optional_type(type_))
        inner.nullable = True
        return inner

    if _safe_issubclass(type_, Sequence):
        return schema_pb2.FieldType(array_type=schema_pb2.ArrayType(
            element_type=typing_to_runner_api(_get_args(type_)[0])))

    if _safe_issubclass(type_, Mapping):
        key_arg, value_arg = _get_args(type_)
        return schema_pb2.FieldType(map_type=schema_pb2.MapType(
            key_type=typing_to_runner_api(key_arg),
            value_type=typing_to_runner_api(value_arg)))

    raise ValueError("Unsupported type: %s" % type_)