def _get_coder(self, typehint, window_coder):
    """Look up the registered coder for a typehint, optionally windowed.

    Args:
      typehint: The typehint object handed to ``coders.registry.get_coder``.
      window_coder: When truthy, the coder to use for windows; the element
        coder is then wrapped in a ``WindowedValueCoder`` together with a
        ``TimestampCoder``. When falsy, no wrapping takes place.

    Returns:
      The coder registered for ``typehint``, or a ``WindowedValueCoder``
      around it when ``window_coder`` is supplied.
    """
    if not window_coder:
        return coders.registry.get_coder(typehint)

    element_coder = coders.registry.get_coder(typehint)
    return coders.WindowedValueCoder(
        element_coder, coders.TimestampCoder(), window_coder)
Example #2
0
def get_coder_from_spec(coder_spec, kv_pair=False):
    """Build a coder instance out of a coder spec.

    Args:
      coder_spec: A dict whose '@type' entry is a serialized coder (of the
        form "<coder_name>$<pickled_data>"). Specs for wrapper, windowed and
        tuple coders also carry a 'component_encodings' list of nested specs.
      kv_pair: True if a 2-tuple of coders (key and value) must be returned.

    Returns:
      A coder instance (has encode/decode methods), or, when ``kv_pair`` is
      set, a 2-tuple of (key coder, value coder). Shuffle sources and sinks
      can take such a 2-tuple of coders as parameter.

    Raises:
      ValueError: if a KV coder was requested but the spec does not describe
        a KV coder.
    """
    assert coder_spec is not None

    spec_type = coder_spec['@type']

    # Wrapper encodings carry exactly one nested spec and are skipped over:
    # the result is whatever the wrapped spec decodes to.
    if spec_type in (
            'kind:stream',
            'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder'):
        wrapped_specs = coder_spec['component_encodings']
        assert len(wrapped_specs) == 1
        return get_coder_from_spec(wrapped_specs[0], kv_pair=kv_pair)

    # Coders are passed in the form "<coder_name>$<pickled_data>" so that the
    # job description JSON stays readable.
    coder = coders.deserialize_coder(spec_type)

    # The service may have modified the components of windowed and tuple
    # coders, so those are rebuilt from the component specs rather than taken
    # from the deserialized instance.
    #
    # TODO(ccy): This is necessary since the service may move around the
    # wrapped types of WindowedValueCoders and TupleCoders.  We should refactor
    # coder serialization so these special cases are not necessary.
    is_windowed = isinstance(coder, coders.WindowedValueCoder)
    if is_windowed or isinstance(coder, coders.TupleCoder):
        components = [
            get_coder_from_spec(component_spec)
            for component_spec in coder_spec['component_encodings']
        ]
        if is_windowed:
            # A windowed spec must hold exactly value, timestamp and window.
            value_coder, timestamp_coder, window_coder = components
            coder = coders.WindowedValueCoder(
                value_coder, timestamp_coder, window_coder)
        else:
            coder = coders.TupleCoder(components)

    if not kv_pair:
        return coder
    if not coder.is_kv_coder():
        raise ValueError('Coder is not a KV coder: %s.' % coder)
    return coder.key_coder(), coder.value_coder()
 def run_Create(self, transform_node):
     """Add a CREATE_PCOLLECTION step for a Create transform node.

     The transform's values are pickled and base64-encoded into the step's
     ELEMENT property, the step encoding is set to the cloud encoding of a
     WindowedValueCoder around the pickle coder, and a single OUTPUT_INFO
     entry describing the output is attached.

     Args:
       transform_node: The node being translated; its ``transform.value``
         supplies the elements and its ``full_label`` names the step.
     """
     create_transform = transform_node.transform
     create_step = self._add_step(TransformNames.CREATE_PCOLLECTION,
                                  transform_node.full_label, transform_node)

     # TODO(silviuc): Eventually use a coder based on typecoders.
     pickler = coders.PickleCoder()
     # Values are base64-encoded so that the service will accept them.
     encoded_elements = [
         base64.b64encode(pickler.encode(value))
         for value in create_transform.value
     ]
     create_step.add_property(PropertyNames.ELEMENT, encoded_elements)

     # The service expects a WindowedValueCoder here, so the actual encoding
     # is wrapped in one before being converted to its cloud form.
     windowed_pickler = coders.WindowedValueCoder(pickler)
     create_step.encoding = self._get_cloud_encoding(windowed_pickler)

     output_user_name = '%s.%s' % (transform_node.full_label,
                                   PropertyNames.OUT)
     output_info = {
         PropertyNames.USER_NAME: output_user_name,
         PropertyNames.ENCODING: create_step.encoding,
         PropertyNames.OUTPUT_NAME: PropertyNames.OUT,
     }
     create_step.add_property(PropertyNames.OUTPUT_INFO, [output_info])
        },
        'encoding': {
            'component_encodings': [{
                '@type': 'notused'
            }, {
                '@type': 'notused'
            }],
            '@type': coders.serialize_coder(coders.PickleCoder())
        }
    }],
    '@type':
    'ConcatSource'
}

# Element coder shared by the specs below: a PickleCoder instance.
CODER = coders.PickleCoder()
# The same element coder wrapped in a WindowedValueCoder.
WINDOWED_CODER = coders.WindowedValueCoder(CODER)

# Cloud-object forms of the two coders, as returned by as_cloud_object().
# CODER_SPEC is iterated as key/value pairs by add_source_codec_spec.
CODER_SPEC = CODER.as_cloud_object()
WINDOWED_CODER_SPEC = WINDOWED_CODER.as_cloud_object()


def add_source_codec_spec(target):
    """Install the module-level CODER_SPEC as the codec of ``target.source``.

    ``target.source.codec`` is replaced with a fresh
    ``dataflow.Source.CodecValue`` and one ``AdditionalProperty`` is appended
    to it per CODER_SPEC entry, with each value converted by
    ``to_json_value``.

    Args:
      target: Object whose ``source.codec`` attribute is overwritten in
        place.
    """
    target.source.codec = dataflow.Source.CodecValue()
    # items() rather than iteritems(): the latter does not exist on Python 3
    # dicts, while items() yields the same pairs on both Python 2 and 3.
    for key, value in CODER_SPEC.items():
        target.source.codec.additionalProperties.append(
            dataflow.Source.CodecValue.AdditionalProperty(
                key=key, value=to_json_value(value)))


def add_source_windowed_codec_spec(target):
    target.source.codec = dataflow.Source.CodecValue()