def get_coder_from_spec(coder_spec):
    """Return a coder instance from a coder spec.

    Args:
      coder_spec: A dict whose '@type' value is either one of the well-known
        'kind:*' names handled below, an ignored wrapper type, or a
        serialized coder understood by coders.coders.deserialize_coder.
        Composite kinds carry their component specs under the
        'component_encodings' key.

    Returns:
      A coder instance (has encode/decode methods).

    Raises:
      AssertionError: If coder_spec is None or the number of component
        encodings does not match what the kind requires.
    """
    assert coder_spec is not None

    spec_type = coder_spec['@type']

    # Ignore the wrappers in these encodings.
    # The trailing comma is required: without it this is a plain string and
    # the membership test below degrades to a substring match.
    ignored_wrappers = (
        'com.google.cloud.dataflow.sdk.util.TimerOrElement$TimerOrElementCoder',
    )
    if spec_type in ignored_wrappers:
        assert len(coder_spec['component_encodings']) == 1
        coder_spec = coder_spec['component_encodings'][0]
        return get_coder_from_spec(coder_spec)

    # Handle a few well known types of coders.
    if spec_type == 'kind:pair':
        assert len(coder_spec['component_encodings']) == 2
        component_coders = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']]
        return coders.TupleCoder(component_coders)
    elif spec_type == 'kind:stream':
        assert len(coder_spec['component_encodings']) == 1
        return coders.IterableCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))
    elif spec_type == 'kind:windowed_value':
        assert len(coder_spec['component_encodings']) == 2
        value_coder, window_coder = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']]
        return coders.coders.WindowedValueCoder(
            value_coder, window_coder=window_coder)
    elif spec_type == 'kind:interval_window':
        # Leaf kinds must have no components (key absent or empty).
        assert not coder_spec.get('component_encodings')
        return coders.coders.IntervalWindowCoder()
    elif spec_type == 'kind:global_window':
        assert not coder_spec.get('component_encodings')
        return coders.coders.GlobalWindowCoder()
    elif spec_type == 'kind:varint':
        # Previously written as len(x == 0), which calls len() on a bool and
        # raises TypeError instead of validating the component list.
        assert not coder_spec.get('component_encodings')
        return coders.coders.VarIntCoder()
    elif spec_type == 'kind:length_prefix':
        assert len(coder_spec['component_encodings']) == 1
        return coders.coders.LengthPrefixCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))
    elif spec_type == 'kind:bytes':
        # Same len(x == 0) fix as for 'kind:varint'.
        assert not coder_spec.get('component_encodings')
        return coders.BytesCoder()

    # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
    # description JSON more readable.
    return coders.coders.deserialize_coder(spec_type)
def get_coder_from_spec(coder_spec):
    """Return a coder instance from a coder spec.

    Args:
      coder_spec: A dict whose '@type' value is either one of the well-known
        'kind:*' names handled below, an ignored wrapper type, or a
        serialized coder understood by coders.deserialize_coder. Composite
        kinds carry their component specs under 'component_encodings'.

    Returns:
      A coder instance (has encode/decode methods).

    Raises:
      AssertionError: If coder_spec is None or the number of component
        encodings does not match what the kind requires.
    """
    assert coder_spec is not None

    spec_type = coder_spec['@type']

    # Ignore the wrappers in these encodings.
    # TODO(silviuc): Make sure with all the renamings that names below are ok.
    # NOTE(review): `ignored_wrappers` is not defined in this function and is
    # read from the enclosing scope. Confirm it is a tuple/set of names; if it
    # is a plain string, this becomes a substring match.
    if spec_type in ignored_wrappers:
        assert len(coder_spec['component_encodings']) == 1
        coder_spec = coder_spec['component_encodings'][0]
        return get_coder_from_spec(coder_spec)

    # Handle a few well known types of coders.
    if spec_type == 'kind:pair':
        assert len(coder_spec['component_encodings']) == 2
        component_coders = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']
        ]
        return coders.TupleCoder(component_coders)
    elif spec_type == 'kind:stream':
        assert len(coder_spec['component_encodings']) == 1
        return coders.IterableCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))
    elif spec_type == 'kind:windowed_value':
        assert len(coder_spec['component_encodings']) == 2
        value_coder, window_coder = [
            get_coder_from_spec(c) for c in coder_spec['component_encodings']
        ]
        return coders.WindowedValueCoder(
            value_coder, window_coder=window_coder)
    elif spec_type == 'kind:interval_window':
        # Previously written as len(x == 0), which calls len() on a bool and
        # raises TypeError whenever 'component_encodings' is present. Leaf
        # kinds must have no components (key absent or empty).
        assert not coder_spec.get('component_encodings')
        return coders.IntervalWindowCoder()
    elif spec_type == 'kind:global_window':
        assert not coder_spec.get('component_encodings')
        return coders.GlobalWindowCoder()
    elif spec_type == 'kind:length_prefix':
        assert len(coder_spec['component_encodings']) == 1
        return coders.LengthPrefixCoder(
            get_coder_from_spec(coder_spec['component_encodings'][0]))

    # We pass coders in the form "<coder_name>$<pickled_data>" to make the job
    # description JSON more readable.
    return coders.deserialize_coder(spec_type)
def __init__(self, iterable_side_input):
    """Build multimap-shaped side input data from an iterable side input.

    The wrapped side input must use the ITERABLE_SIDE_INPUT access pattern.
    The resulting data is tagged with DATAFLOW_MULTIMAP_URN, and its view
    function applies the original view function to the multimap entry
    stored under the empty key.

    Args:
      iterable_side_input: Side input object exposing `_side_input_data()`.
    """
    # pylint: disable=protected-access
    source_data = iterable_side_input._side_input_data()
    assert source_data.access_pattern == common_urns.ITERABLE_SIDE_INPUT
    original_view_fn = source_data.view_fn

    # Elements are coded as (bytes key, original value) pairs, windowed with
    # the original window coder.
    keyed_value_coder = coders.TupleCoder(
        (coders.BytesCoder(), source_data.coder.wrapped_value_coder))
    multimap_coder = coders.WindowedValueCoder(
        keyed_value_coder, source_data.coder.window_coder)

    self._data = beam.pvalue.SideInputData(
        self.DATAFLOW_MULTIMAP_URN,
        source_data.window_mapping_fn,
        lambda multimap: original_view_fn(multimap['']),
        multimap_coder)
def coder_id_from_element_type(
    self, element_type, requires_deterministic_key_coder=None):
    # type: (Any, Optional[str]) -> str
    """Return the coder id to use for elements of the given type.

    Args:
      element_type: The element type to look up a coder for.
      requires_deterministic_key_coder: When truthy, the registered coder is
        replaced by a tuple coder whose key coder has been passed through
        `as_deterministic_coder` with this value as its argument.

    Returns:
      With `use_fake_coders` set, the ASCII-decoded pickle of `element_type`;
      otherwise the id `self.coders` assigns to the resolved coder.
    """
    if self.use_fake_coders:
        return pickler.dumps(element_type).decode('ascii')

    element_coder = coders.registry.get_coder(element_type)
    if requires_deterministic_key_coder:
        deterministic_key_coder = (
            element_coder.key_coder().as_deterministic_coder(
                requires_deterministic_key_coder))
        element_coder = coders.TupleCoder(
            [deterministic_key_coder, element_coder.value_coder()])
    return self.coders.get_id(element_coder)
class StandardCodersTest(unittest.TestCase):
    """Checks coders against encoded examples loaded from a YAML spec file.

    Each YAML document names a coder by URN (with optional component coders)
    and maps expected encodings to JSON values. One `test_*` method is
    attached to the class per document by `_create_tests`.

    When `fix` is set, encoding mismatches are collected in `to_fix` instead
    of failing, and `tearDownClass` rewrites STANDARD_CODERS_YAML with the
    actual encodings.
    """

    # URN -> callable building a coder from its already-built components.
    _urn_to_coder_class = {
        'beam:coders:bytes:0.1': coders.BytesCoder,
        'beam:coders:varint:0.1': coders.VarIntCoder,
        'beam:coders:kv:0.1': lambda k, v: coders.TupleCoder((k, v)),
    }

    # URN -> callable turning a JSON value (plus component parsers) into the
    # Python value the coder is expected to produce.
    _urn_to_json_value_parser = {
        'beam:coders:bytes:0.1': lambda x: x,
        'beam:coders:varint:0.1': lambda x: x,
        'beam:coders:kv:0.1': lambda x, key_parser, value_parser: (
            key_parser(x['key']), value_parser(x['value'])),
    }

    # We must prepend an underscore to this name so that the open-source
    # unittest runner does not execute this method directly as a test.
    @classmethod
    def _create_test(cls, spec):
        """Attach a uniquely named `test_*` method that runs `spec`."""
        counter = 0
        name = spec.get('name', spec['coder']['urn'].split(':')[-2])
        unique_name = 'test_' + name
        # Append a numeric suffix until the name does not clash with an
        # attribute already present on the class.
        while hasattr(cls, unique_name):
            counter += 1
            unique_name = 'test_%s_%d' % (name, counter)
        setattr(cls, unique_name, lambda self: self._run_coder_test(spec))

    # We must prepend an underscore to this name so that the open-source
    # unittest runner does not execute this method directly as a test.
    @classmethod
    def _create_tests(cls, coder_test_specs):
        """Create one test per YAML document in the file `coder_test_specs`."""
        # The file stays open for the whole loop because load_all is lazy;
        # the context manager guarantees it is closed afterwards.
        # NOTE(review): yaml.load_all can construct arbitrary Python objects;
        # only use it on trusted spec files (consider yaml.safe_load_all).
        with open(coder_test_specs) as spec_file:
            for ix, spec in enumerate(yaml.load_all(spec_file)):
                # The document index is needed to patch the file in --fix mode.
                spec['index'] = ix
                cls._create_test(spec)

    def _run_coder_test(self, spec):
        """Check every example in `spec` for both encoding and decoding."""
        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = expected_encoded.encode('latin1')
                actual_encoded = encode_nested(coder, value, nested)
                if self.fix and actual_encoded != expected_encoded:
                    # Record the mismatch for tearDownClass instead of failing.
                    self.to_fix[spec['index'], expected_encoded] = (
                        actual_encoded)
                else:
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)
                    self.assertEqual(expected_encoded, actual_encoded)

    def parse_coder(self, spec):
        """Recursively build the coder described by `spec`."""
        return self._urn_to_coder_class[spec['urn']](
            *[self.parse_coder(c) for c in spec.get('components', ())])

    def json_value_parser(self, coder_spec):
        """Return a function parsing a JSON example value for `coder_spec`."""
        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    fix = False
    # Maps (document index, expected encoding) -> actual encoding.
    to_fix = {}

    @classmethod
    def tearDownClass(cls):
        """In fix mode, rewrite the YAML file with the recorded encodings."""
        if cls.fix and cls.to_fix:
            # Single-argument print produces the same text on Python 2 and 3.
            print("FIXING %d TESTS" % len(cls.to_fix))
            doc_sep = '\n---\n'
            with open(STANDARD_CODERS_YAML) as yaml_file:
                docs = yaml_file.read().split(doc_sep)

            def quote(s):
                return json.dumps(s.decode('latin1')).replace(
                    r'\u0000', r'\0')

            for (doc_ix, expected_encoded), actual_encoded in (
                    cls.to_fix.items()):
                print("%s -> %s" % (
                    quote(expected_encoded), quote(actual_encoded)))
                # Examples are mapping keys, hence the trailing ':'.
                docs[doc_ix] = docs[doc_ix].replace(
                    quote(expected_encoded) + ':',
                    quote(actual_encoded) + ':')
            with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
                yaml_file.write(doc_sep.join(docs))
class StandardCodersTest(unittest.TestCase):
    """Checks coders against the examples loaded from STANDARD_CODERS_YAML.

    Each test case names a coder by URN (with optional component coders) and
    maps expected encodings to JSON values. Coders whose spec is marked
    'non_deterministic' are only checked for decoding.

    When `fix` is set, encoding mismatches are collected in `to_fix` instead
    of failing, and `tearDownClass` rewrites STANDARD_CODERS_YAML with the
    actual encodings.
    """

    # URN -> callable building a coder from its already-built components.
    _urn_to_coder_class = {
        'urn:beam:coders:bytes:0.1': coders.BytesCoder,
        'urn:beam:coders:varint:0.1': coders.VarIntCoder,
        'urn:beam:coders:kv:0.1': lambda k, v: coders.TupleCoder((k, v)),
        'urn:beam:coders:interval_window:0.1': coders.IntervalWindowCoder,
        # The classes are called positionally with the component coders, so
        # no wrapper lambda is needed for these two.
        'urn:beam:coders:stream:0.1': coders.IterableCoder,
        'urn:beam:coders:global_window:0.1': coders.GlobalWindowCoder,
        'urn:beam:coders:windowed_value:0.1': coders.WindowedValueCoder,
    }

    # URN -> callable turning a JSON value (plus component parsers) into the
    # Python value the coder is expected to produce.
    _urn_to_json_value_parser = {
        'urn:beam:coders:bytes:0.1': lambda x: x,
        'urn:beam:coders:varint:0.1': lambda x: x,
        'urn:beam:coders:kv:0.1': lambda x, key_parser, value_parser: (
            key_parser(x['key']), value_parser(x['value'])),
        # The JSON 'end' and 'span' values are multiplied by 1000 to obtain
        # the micros passed to Timestamp.
        'urn:beam:coders:interval_window:0.1': lambda x: IntervalWindow(
            start=Timestamp(micros=(x['end'] - x['span']) * 1000),
            end=Timestamp(micros=x['end'] * 1000)),
        # A real list (not a lazy map object) so that equality against the
        # decoded value behaves the same on Python 2 and Python 3.
        'urn:beam:coders:stream:0.1': lambda x, parser: [
            parser(element) for element in x],
        'urn:beam:coders:global_window:0.1': lambda x: window.GlobalWindow(),
        'urn:beam:coders:windowed_value:0.1':
            lambda x, value_parser, window_parser: windowed_value.create(
                value_parser(x['value']),
                x['timestamp'] * 1000,
                tuple(window_parser(w) for w in x['windows'])),
    }

    def test_standard_coders(self):
        """Run every test case found in STANDARD_CODERS_YAML."""
        for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
            logging.info('Executing %s test.', name)
            self._run_standard_coder(name, spec)

    def _run_standard_coder(self, name, spec):
        """Check every example in `spec`; `name` is accepted but unused."""
        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = expected_encoded.encode('latin1')
                if not spec['coder'].get('non_deterministic', False):
                    actual_encoded = encode_nested(coder, value, nested)
                    if self.fix and actual_encoded != expected_encoded:
                        # Record the mismatch for tearDownClass instead of
                        # failing the test.
                        self.to_fix[spec['index'], expected_encoded] = (
                            actual_encoded)
                    else:
                        self.assertEqual(expected_encoded, actual_encoded)
                        self.assertEqual(
                            decode_nested(coder, expected_encoded, nested),
                            value)
                else:
                    # Only verify decoding for a non-deterministic coder
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested),
                        value)

    def parse_coder(self, spec):
        """Recursively build the coder described by `spec`."""
        return self._urn_to_coder_class[spec['urn']](
            *[self.parse_coder(c) for c in spec.get('components', ())])

    def json_value_parser(self, coder_spec):
        """Return a function parsing a JSON example value for `coder_spec`."""
        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())
        ]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    fix = False
    # Maps (document index, expected encoding) -> actual encoding.
    to_fix = {}

    @classmethod
    def tearDownClass(cls):
        """In fix mode, rewrite the YAML file with the recorded encodings."""
        if cls.fix and cls.to_fix:
            # Single-argument print produces the same text on Python 2 and 3.
            print("FIXING %d TESTS" % len(cls.to_fix))
            doc_sep = '\n---\n'
            with open(STANDARD_CODERS_YAML) as yaml_file:
                docs = yaml_file.read().split(doc_sep)

            def quote(s):
                return json.dumps(s.decode('latin1')).replace(
                    r'\u0000', r'\0')

            for (doc_ix, expected_encoded), actual_encoded in (
                    cls.to_fix.items()):
                print("%s -> %s" % (
                    quote(expected_encoded), quote(actual_encoded)))
                # Examples are mapping keys, hence the trailing ':'.
                docs[doc_ix] = docs[doc_ix].replace(
                    quote(expected_encoded) + ':',
                    quote(actual_encoded) + ':')
            with open(STANDARD_CODERS_YAML, 'w') as yaml_file:
                yaml_file.write(doc_sep.join(docs))