コード例 #1
0
ファイル: operation_specs.py プロジェクト: zoltan-kski/beam
def get_coder_from_spec(coder_spec):
  """Return a coder instance from a coder spec.

  Args:
    coder_spec: A dict whose '@type' value is either a well known coder kind
      (e.g. 'kind:pair') or a serialized Coder instance. Specs for composite
      coders list their nested specs under 'component_encodings'.

  Returns:
    A coder instance (has encode/decode methods).
  """
  assert coder_spec is not None

  # Ignore the wrappers in these encodings.
  # The trailing comma makes this a one-element tuple; without it the
  # parentheses are plain grouping and `in` would do a substring match
  # against the class name instead of an exact comparison.
  ignored_wrappers = (
      'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder',)
  if coder_spec['@type'] in ignored_wrappers:
    assert len(coder_spec['component_encodings']) == 1
    coder_spec = coder_spec['component_encodings'][0]
    return get_coder_from_spec(coder_spec)

  coder_type = coder_spec['@type']

  # Handle a few well known types of coders.
  if coder_type == 'kind:pair':
    assert len(coder_spec['component_encodings']) == 2
    component_coders = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.TupleCoder(component_coders)
  elif coder_type == 'kind:stream':
    assert len(coder_spec['component_encodings']) == 1
    return coders.IterableCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_type == 'kind:windowed_value':
    assert len(coder_spec['component_encodings']) == 2
    value_coder, window_coder = [
        get_coder_from_spec(c) for c in coder_spec['component_encodings']]
    return coders.coders.WindowedValueCoder(
        value_coder, window_coder=window_coder)
  elif coder_type == 'kind:interval_window':
    # Leaf coders must not carry any component encodings.
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.IntervalWindowCoder()
  elif coder_type == 'kind:global_window':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.GlobalWindowCoder()
  elif coder_type == 'kind:varint':
    # Previously `len(x == 0)`, which raised TypeError whenever the key was
    # present; an empty (or missing) component list is what is required.
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.coders.VarIntCoder()
  elif coder_type == 'kind:length_prefix':
    assert len(coder_spec['component_encodings']) == 1
    return coders.coders.LengthPrefixCoder(
        get_coder_from_spec(coder_spec['component_encodings'][0]))
  elif coder_type == 'kind:bytes':
    assert ('component_encodings' not in coder_spec
            or not coder_spec['component_encodings'])
    return coders.BytesCoder()

  # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
  # description JSON more readable.
  return coders.coders.deserialize_coder(coder_type)
コード例 #2
0
def get_coder_from_spec(coder_spec):
    """Return a coder instance from a coder spec.

    Args:
      coder_spec: A dict whose '@type' value is either a well known coder
        kind (e.g. 'kind:pair') or a serialized Coder instance. Specs for
        composite coders list their nested specs under 'component_encodings'.

    Returns:
      A coder instance (has encode/decode methods).
    """
    assert coder_spec is not None

    # Ignore the wrappers in these encodings.
    # TODO(silviuc): Make sure with all the renamings that names below are ok.
    # NOTE(review): `ignored_wrappers` is not defined in this function; it is
    # expected to come from an enclosing scope. Confirm it is a real
    # collection (tuple/set) and not a bare string, otherwise `in` performs a
    # substring match.
    if coder_spec['@type'] in ignored_wrappers:
        assert len(coder_spec['component_encodings']) == 1
        coder_spec = coder_spec['component_encodings'][0]
        return get_coder_from_spec(coder_spec)

    coder_type = coder_spec['@type']

    # Handle a few well known types of coders.
    if coder_type == 'kind:pair':
        assert len(coder_spec['component_encodings']) == 2
        component_coders = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']
        ]
        return coders.TupleCoder(component_coders)
    elif coder_type == 'kind:stream':
        assert len(coder_spec['component_encodings']) == 1
        return coders.IterableCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))
    elif coder_type == 'kind:windowed_value':
        assert len(coder_spec['component_encodings']) == 2
        value_coder, window_coder = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']
        ]
        return coders.WindowedValueCoder(value_coder,
                                         window_coder=window_coder)
    elif coder_type == 'kind:interval_window':
        # Previously `len(x == 0)`, which raised TypeError whenever the key
        # was present; leaf coders just need an empty or missing list.
        assert ('component_encodings' not in coder_spec
                or not coder_spec['component_encodings'])
        return coders.IntervalWindowCoder()
    elif coder_type == 'kind:global_window':
        assert ('component_encodings' not in coder_spec
                or not coder_spec['component_encodings'])
        return coders.GlobalWindowCoder()
    elif coder_type == 'kind:length_prefix':
        assert len(coder_spec['component_encodings']) == 1
        return coders.LengthPrefixCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))

    # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
    # description JSON more readable.
    return coders.deserialize_coder(coder_type)
コード例 #3
0
ファイル: dataflow_runner.py プロジェクト: wangjie05/beam
 def __init__(self, iterable_side_input):
   """Re-express an iterable side input as a multimap side input.

   The resulting view function applies the original view function to the
   values stored under the empty-string key of the multimap. The coder pairs
   a BytesCoder key with the original wrapped value coder, inside a
   WindowedValueCoder that reuses the original window coder.

   Args:
     iterable_side_input: Side input whose side input data must have the
       access pattern common_urns.ITERABLE_SIDE_INPUT.
   """
   # pylint: disable=protected-access
   original = iterable_side_input._side_input_data()
   assert original.access_pattern == common_urns.ITERABLE_SIDE_INPUT
   view_fn = original.view_fn
   original_coder = original.coder

   # Elements are (key, value) pairs; the key is coded as bytes.
   keyed_value_coder = coders.TupleCoder(
       (coders.BytesCoder(), original_coder.wrapped_value_coder))
   multimap_coder = coders.WindowedValueCoder(
       keyed_value_coder, original_coder.window_coder)

   self._data = beam.pvalue.SideInputData(
       self.DATAFLOW_MULTIMAP_URN,
       original.window_mapping_fn,
       lambda keyed_values: view_fn(keyed_values['']),
       multimap_coder)
コード例 #4
0
 def coder_id_from_element_type(
     self, element_type, requires_deterministic_key_coder=None):
   # type: (Any, Optional[str]) -> str
   """Return the id of the coder to use for `element_type`.

   When `self.use_fake_coders` is set, no coder is looked up: the pickled
   element type, decoded as ASCII, is returned instead.

   Args:
     element_type: Type to look up in the coder registry.
     requires_deterministic_key_coder: If truthy, the registered coder is
       replaced by a TupleCoder whose key coder has been made deterministic;
       the value is forwarded to `as_deterministic_coder` (presumably a
       label identifying the requesting step -- confirm against callers).

   Returns:
     The id assigned to the coder by `self.coders.get_id`, or the pickled
     element type when fake coders are in use.
   """
   if self.use_fake_coders:
     return pickler.dumps(element_type).decode('ascii')

   element_coder = coders.registry.get_coder(element_type)
   if requires_deterministic_key_coder:
     deterministic_key = element_coder.key_coder().as_deterministic_coder(
         requires_deterministic_key_coder)
     element_coder = coders.TupleCoder(
         [deterministic_key, element_coder.value_coder()])
   return self.coders.get_id(element_coder)
コード例 #5
0
ファイル: standard_coders_test.py プロジェクト: wikier/beam
class StandardCodersTest(unittest.TestCase):
  """Checks coders against encoding examples loaded from YAML specs.

  Test methods are generated by `_create_tests`, one per YAML document in the
  spec file. Each spec names a coder by URN (with optional 'components') and
  maps expected encoded strings to the JSON form of the value they encode.
  """

  # Maps a coder URN to a callable that builds the coder from its already
  # constructed component coders.
  _urn_to_coder_class = {
      'beam:coders:bytes:0.1': coders.BytesCoder,
      'beam:coders:varint:0.1': coders.VarIntCoder,
      'beam:coders:kv:0.1': lambda k, v: coders.TupleCoder((k, v))
  }

  # Maps a coder URN to a callable turning the JSON form of an example into
  # the Python value; component parsers are passed as extra arguments.
  _urn_to_json_value_parser = {
      'beam:coders:bytes:0.1': lambda x: x,
      'beam:coders:varint:0.1': lambda x: x,
      'beam:coders:kv:0.1':
          lambda x, key_parser, value_parser: (key_parser(x['key']),
                                               value_parser(x['value']))
  }

  # We must prepend an underscore to this name so that the open-source unittest
  # runner does not execute this method directly as a test.
  @classmethod
  def _create_test(cls, spec):
    """Attach a uniquely named test method for `spec` to this class.

    The name comes from spec['name'] when present, otherwise from the
    second-to-last ':'-separated segment of the coder URN; a numeric suffix
    is appended until the name does not clash with an existing attribute.
    """
    counter = 0
    name = spec.get('name', spec['coder']['urn'].split(':')[-2])
    unique_name = 'test_' + name
    while hasattr(cls, unique_name):
      counter += 1
      unique_name = 'test_%s_%d' % (name, counter)
    setattr(cls, unique_name, lambda self: self._run_coder_test(spec))

  # We must prepend an underscore to this name so that the open-source unittest
  # runner does not execute this method directly as a test.
  @classmethod
  def _create_tests(cls, coder_test_specs):
    """Generate one test per YAML document in the file `coder_test_specs`.

    Each spec is tagged with its document index so that `tearDownClass` can
    locate the document again when rewriting expectations.
    """
    # The safe loader only builds plain Python objects and does not need an
    # explicit Loader argument; the file is closed once all documents have
    # been consumed by the loop.
    with open(coder_test_specs) as spec_file:
      for ix, spec in enumerate(yaml.safe_load_all(spec_file)):
        spec['index'] = ix
        cls._create_test(spec)

  def _run_coder_test(self, spec):
    """Check encoding and decoding of every example in `spec`.

    Both nested and unnested contexts are exercised unless the spec pins one
    with a 'nested' key. With `fix` enabled, encoding mismatches are recorded
    in `to_fix` instead of failing the test.
    """
    coder = self.parse_coder(spec['coder'])
    parse_value = self.json_value_parser(spec['coder'])
    nested_list = [spec['nested']] if 'nested' in spec else [True, False]
    for nested in nested_list:
      for expected_encoded, json_value in spec['examples'].items():
        value = parse_value(json_value)
        # Expected encodings are stored as text with one character per byte.
        expected_encoded = expected_encoded.encode('latin1')
        actual_encoded = encode_nested(coder, value, nested)
        if self.fix and actual_encoded != expected_encoded:
          self.to_fix[spec['index'], expected_encoded] = actual_encoded
        else:
          self.assertEqual(decode_nested(coder, expected_encoded, nested),
                           value)
          self.assertEqual(expected_encoded, actual_encoded)

  def parse_coder(self, spec):
    """Recursively build the coder described by `spec` ('urn', 'components')."""
    return self._urn_to_coder_class[spec['urn']](
        *[self.parse_coder(c) for c in spec.get('components', ())])

  def json_value_parser(self, coder_spec):
    """Return a one-argument parser for JSON examples of `coder_spec`."""
    component_parsers = [
        self.json_value_parser(c) for c in coder_spec.get('components', ())]
    return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
        x, *component_parsers)

  # Used when --fix is passed.

  fix = False
  # Maps (document index, expected encoding) to the actual encoding produced.
  to_fix = {}

  @classmethod
  def tearDownClass(cls):
    """Rewrite the YAML expectations for every mismatch recorded in `to_fix`."""
    if cls.fix and cls.to_fix:
      # Single-argument print calls behave the same as a statement and as a
      # function.
      print("FIXING %d TESTS" % len(cls.to_fix))
      doc_sep = '\n---\n'
      with open(STANDARD_CODERS_YAML) as yaml_file:
        docs = yaml_file.read().split(doc_sep)

      def quote(s):
        # Render bytes the way they appear as keys in the YAML file.
        return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')
      for (doc_ix, expected_encoded), actual_encoded in cls.to_fix.items():
        print("%s -> %s" % (quote(expected_encoded), quote(actual_encoded)))
        docs[doc_ix] = docs[doc_ix].replace(
            quote(expected_encoded) + ':', quote(actual_encoded) + ':')
      with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
        yaml_file.write(doc_sep.join(docs))
コード例 #6
0
class StandardCodersTest(unittest.TestCase):
    """Checks coders against the examples in STANDARD_CODERS_YAML.

    Each spec names a coder by URN (with optional 'components') and maps
    expected encoded strings to the JSON form of the value they encode.
    """

    # Maps a coder URN to a callable that builds the coder from its already
    # constructed component coders.
    _urn_to_coder_class = {
        'urn:beam:coders:bytes:0.1':
        coders.BytesCoder,
        'urn:beam:coders:varint:0.1':
        coders.VarIntCoder,
        'urn:beam:coders:kv:0.1':
        lambda k, v: coders.TupleCoder((k, v)),
        'urn:beam:coders:interval_window:0.1':
        coders.IntervalWindowCoder,
        'urn:beam:coders:stream:0.1':
        coders.IterableCoder,
        'urn:beam:coders:global_window:0.1':
        coders.GlobalWindowCoder,
        'urn:beam:coders:windowed_value:0.1':
        coders.WindowedValueCoder
    }

    # Maps a coder URN to a callable turning the JSON form of an example into
    # the Python value; component parsers are passed as extra arguments.
    _urn_to_json_value_parser = {
        'urn:beam:coders:bytes:0.1':
        lambda x: x,
        'urn:beam:coders:varint:0.1':
        lambda x: x,
        'urn:beam:coders:kv:0.1':
        lambda x, key_parser, value_parser:
        (key_parser(x['key']), value_parser(x['value'])),
        # 'end' and 'span' are scaled by 1000 to obtain microseconds.
        'urn:beam:coders:interval_window:0.1':
        lambda x: IntervalWindow(start=Timestamp(micros=(x['end'] - x['span'])
                                                 * 1000),
                                 end=Timestamp(micros=x['end'] * 1000)),
        # Build a real list: a lazy map object would never compare equal to
        # the decoded value.
        'urn:beam:coders:stream:0.1':
        lambda x, parser: [parser(item) for item in x],
        'urn:beam:coders:global_window:0.1':
        lambda x: window.GlobalWindow(),
        'urn:beam:coders:windowed_value:0.1':
        lambda x, value_parser, window_parser: windowed_value.create(
            value_parser(x['value']), x['timestamp'] * 1000,
            tuple([window_parser(w) for w in x['windows']]))
    }

    def test_standard_coders(self):
        """Run every test case loaded from STANDARD_CODERS_YAML."""
        for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
            logging.info('Executing %s test.', name)
            self._run_standard_coder(name, spec)

    def _run_standard_coder(self, name, spec):
        """Check encoding and decoding of every example in `spec`.

        Both nested and unnested contexts are exercised unless the spec pins
        one with a 'nested' key. Coders flagged 'non_deterministic' are only
        checked for decoding. With `fix` enabled, encoding mismatches are
        recorded in `to_fix` instead of failing the test.
        """
        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                # Expected encodings are stored as text, one char per byte.
                expected_encoded = expected_encoded.encode('latin1')
                if not spec['coder'].get('non_deterministic', False):
                    actual_encoded = encode_nested(coder, value, nested)
                    if self.fix and actual_encoded != expected_encoded:
                        # NOTE(review): spec['index'] is not set anywhere in
                        # this class; presumably _load_test_cases adds it --
                        # confirm.
                        self.to_fix[spec['index'],
                                    expected_encoded] = actual_encoded
                    else:
                        self.assertEqual(expected_encoded, actual_encoded)
                        self.assertEqual(
                            decode_nested(coder, expected_encoded, nested),
                            value)
                else:
                    # Only verify decoding for a non-deterministic coder
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)

    def parse_coder(self, spec):
        """Recursively build the coder described by `spec`."""
        return self._urn_to_coder_class[spec['urn']](
            *[self.parse_coder(c) for c in spec.get('components', ())])

    def json_value_parser(self, coder_spec):
        """Return a one-argument parser for JSON examples of `coder_spec`."""
        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())
        ]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    fix = False
    # Maps (document index, expected encoding) to the actual encoding.
    to_fix = {}

    @classmethod
    def tearDownClass(cls):
        """Rewrite the YAML expectations for every mismatch in `to_fix`."""
        if cls.fix and cls.to_fix:
            # Single-argument print calls behave the same as a statement and
            # as a function.
            print("FIXING %d TESTS" % len(cls.to_fix))
            doc_sep = '\n---\n'
            with open(STANDARD_CODERS_YAML) as yaml_file:
                docs = yaml_file.read().split(doc_sep)

            def quote(s):
                # Render bytes the way they appear as keys in the YAML file.
                return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

            for (doc_ix,
                 expected_encoded), actual_encoded in cls.to_fix.items():
                print("%s -> %s" % (quote(expected_encoded),
                                    quote(actual_encoded)))
                docs[doc_ix] = docs[doc_ix].replace(
                    quote(expected_encoded) + ':',
                    quote(actual_encoded) + ':')
            with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
                yaml_file.write(doc_sep.join(docs))