def get_write_work_item(work, env, context):
  """Parses a write parallel instruction into the appropriate Worker* object."""
  specs = {p.key: from_json_value(p.value)
           for p in work.write.sink.spec.additionalProperties}
  # Only sinks for which a custom coder can be specified have the
  # codec property (e.g. TextSink).
  codec_specs = None
  if work.write.sink.codec:
    codec_specs = {
        p.key: from_json_value(p.value)
        for p in work.write.sink.codec.additionalProperties}
  sink = env.parse_sink(specs, codec_specs, context)
  if sink:
    write_coder = get_coder_from_spec(codec_specs)
    # All Worker items have an "output_coders", even if they have no
    # output, so that the executor can estimate bytes in a uniform way.
    return WorkerWrite(sink, input=get_input_spec(work.write.input),
                       output_coders=(write_coder,))
  if specs['@type'] == 'ShuffleSink':
    coder = get_coder_from_spec(codec_specs)
    # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with
    # custom coders so this special case won't be necessary.
    if isinstance(coder, coders.WindowedValueCoder):
      coder = coder.wrapped_value_coder
    return WorkerShuffleWrite(
        shuffle_kind=specs['shuffle_kind']['value'],
        shuffle_writer_config=specs['shuffle_writer_config']['value'],
        input=get_input_spec(work.write.input),
        output_coders=(coder,))
  else:
    raise NotImplementedError('Unknown sink type: %r' % specs)

def get_read_work_item(work, env, context):
  """Parses a read parallel instruction into the appropriate Worker* object."""
  specs = {p.key: from_json_value(p.value)
           for p in work.read.source.spec.additionalProperties}
  # Only sources for which a custom coder can be specified have the
  # codec property (e.g. TextSource).
  codec_specs = None
  if work.read.source.codec:
    codec_specs = {
        p.key: from_json_value(p.value)
        for p in work.read.source.codec.additionalProperties}
  source = env.parse_source(specs, codec_specs, context)
  if source:
    return WorkerRead(source, tag=None)
  coder = get_coder_from_spec(codec_specs)
  # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with custom
  # coders so this special case won't be necessary.
  if isinstance(coder, coders.WindowedValueCoder):
    coder = coder.wrapped_value_coder
  if specs['@type'] == 'GroupingShuffleSource':
    return WorkerGroupingShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder)
  elif specs['@type'] == 'UngroupedShuffleSource':
    return WorkerUngroupedShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder)
  else:
    raise NotImplementedError('Unknown source type: %r' % specs)

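# A purely illustrative shape for a decoded shuffle-source spec, inferred from
# the dictionary lookups in get_read_work_item above. The keys are the ones
# the code reads; the values here are made up, not real service output.
example_read_specs = {
    '@type': 'GroupingShuffleSource',
    'start_shuffle_position': {'value': '<opaque start position>'},
    'end_shuffle_position': {'value': '<opaque end position>'},
    'shuffle_reader_config': {'value': '<opaque reader config>'},
}
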
def get_side_input_sources(side_inputs_spec, env, context):
  """Returns a list of Worker...Read objects for the side sources specified."""
  side_inputs = []
  for side_spec in side_inputs_spec:
    assert side_spec.tag  # All side input sources have tags.
    # Make sure we got a side input type we understand.
    specs = {p.key: from_json_value(p.value)
             for p in side_spec.kind.additionalProperties}
    assert specs['@type'] == 'collection'
    for source in side_spec.sources:
      source_spec = {
          p.key: from_json_value(p.value)
          for p in source.spec.additionalProperties}
      # Only sources for which a custom coder can be specified have the
      # codec property (e.g. TextSource).
      source_codec_spec = None
      if source.codec:
        source_codec_spec = {
            p.key: from_json_value(p.value)
            for p in source.codec.additionalProperties}
      parsed_source = env.parse_source(source_spec, source_codec_spec, context)
      if parsed_source:
        side_inputs.append(WorkerSideInputSource(parsed_source, side_spec.tag))
      else:
        raise NotImplementedError(
            'Unknown side input source type: %r' % source_spec)
  return side_inputs

def convert_row_to_dict(self, row, schema):
  """Converts a TableRow instance using the schema to a Python dict."""
  result = {}
  for index, field in enumerate(schema.fields):
    cell = row.f[index]
    if cell.v is None:
      continue  # Field not present in the row.
    # The JSON values returned by BigQuery for table fields in a row always
    # have the string_value attribute set, which means the value below will
    # be a string. Converting to the appropriate type is not tricky except
    # for boolean values. For such values the string values are 'true' or
    # 'false', which cannot be converted by simply calling bool() (it will
    # return True for both!).
    value = from_json_value(cell.v)
    if field.type == 'STRING':
      value = value
    elif field.type == 'BOOLEAN':
      value = value == 'true'
    elif field.type == 'INTEGER':
      value = int(value)
    elif field.type == 'FLOAT':
      value = float(value)
    elif field.type == 'TIMESTAMP':
      value = float(value)
    elif field.type == 'BYTES':
      value = value
    else:
      # Note that a schema field object also supports a RECORD type. However
      # when querying, the repeated and/or record fields always come
      # flattened. For more details please read:
      # https://cloud.google.com/bigquery/docs/data
      raise RuntimeError('Unexpected field type: %s' % field.type)
    result[field.name] = value
  return result

def convert_row_to_dict(self, row, schema):
  """Converts a TableRow instance using the schema to a Python dict."""
  result = {}
  for index, field in enumerate(schema.fields):
    cell = row.f[index]
    if cell.v is None:
      continue  # Field not present in the row.
    # The JSON values returned by BigQuery for table fields in a row always
    # have the string_value attribute set, which means the value below will
    # be a string. Converting to the appropriate type is not tricky except
    # for boolean values. For such values the string values are 'true' or
    # 'false', which cannot be converted by simply calling bool() (it will
    # return True for both!).
    value = from_json_value(cell.v)
    if field.type == 'STRING':
      value = value
    elif field.type == 'BOOLEAN':
      value = value == 'true'
    elif field.type == 'INTEGER':
      value = int(value)
    elif field.type == 'FLOAT':
      value = float(value)
    elif field.type == 'TIMESTAMP':
      value = float(value)
    else:
      # Note that a schema field object also supports a RECORD type. However
      # when querying, the repeated and/or record fields always come
      # flattened. For more details please read:
      # https://cloud.google.com/bigquery/docs/data
      raise RuntimeError('Unexpected field type: %s' % field.type)
    result[field.name] = value
  return result

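# The boolean caveat in the comments above is easy to trip over: bool('false')
# is True because any non-empty string is truthy. A minimal standalone sketch
# of the same conversion rules, using plain strings as BigQuery returns them
# (no TableRow/TableCell protos involved; field types and sample values are
# illustrative only):
def convert_cell_value(field_type, value):
  if field_type in ('STRING', 'BYTES'):
    return value
  elif field_type == 'BOOLEAN':
    # bool(value) would return True for both 'true' and 'false', so compare
    # against the literal string instead.
    return value == 'true'
  elif field_type == 'INTEGER':
    return int(value)
  elif field_type in ('FLOAT', 'TIMESTAMP'):
    return float(value)
  else:
    raise RuntimeError('Unexpected field type: %s' % field_type)

assert convert_cell_value('BOOLEAN', 'false') is False
assert convert_cell_value('INTEGER', '42') == 42
assert convert_cell_value('TIMESTAMP', '1388534400.0') == 1388534400.0
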
def encode(self, table_row):
  if self.table_schema is None:
    raise AttributeError(
        'The TableRowJsonCoder requires a table schema for '
        'encoding operations. Please specify a table_schema argument.')
  return json.dumps(
      collections.OrderedDict(
          zip(self.field_names,
              [from_json_value(f.v) for f in table_row.f])))

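# A sketch of what encode() produces, using hypothetical namedtuple stand-ins
# for the generated TableRow/TableCell messages (the real classes come from
# the BigQuery client library), and assuming to_json_value/from_json_value are
# imported from the SDK's json_value module as elsewhere in this file:
import collections
import json

Cell = collections.namedtuple('Cell', ['v'])  # stand-in for TableCell
Row = collections.namedtuple('Row', ['f'])    # stand-in for TableRow

row = Row(f=[Cell(v=to_json_value('Alice')), Cell(v=to_json_value(31))])
field_names = ['name', 'age']
# The body of encode(), inlined:
encoded = json.dumps(collections.OrderedDict(
    zip(field_names, [from_json_value(f.v) for f in row.f])))
assert encoded == '{"name": "Alice", "age": 31}'
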
def get_output_coders(work):
  """Return a list of coder instances for the output(s) of this work item.

  Args:
    work: a ParallelInstruction protobuf

  Returns:
    A list of coders.
  """
  return [get_coder_from_spec({p.key: from_json_value(p.value)
                               for p in output.codec.additionalProperties})
          for output in work.outputs]

def get_read_work_item(work, env, context):
  """Parses a read parallel instruction into the appropriate Worker* object."""
  specs = {p.key: from_json_value(p.value)
           for p in work.read.source.spec.additionalProperties}
  # Only sources for which a custom coder can be specified have the
  # codec property (e.g. TextSource).
  codec_specs = None
  if work.read.source.codec:
    codec_specs = {
        p.key: from_json_value(p.value)
        for p in work.read.source.codec.additionalProperties}
  source = env.parse_source(specs, codec_specs, context)
  if source:
    return WorkerRead(source, output_coders=get_output_coders(work))
  coder = get_coder_from_spec(codec_specs)
  # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with custom
  # coders so this special case won't be necessary.
  if isinstance(coder, coders.WindowedValueCoder):
    coder = coder.wrapped_value_coder
  if specs['@type'] == 'GroupingShuffleSource':
    return WorkerGroupingShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder,
        output_coders=get_output_coders(work))
  elif specs['@type'] == 'UngroupedShuffleSource':
    return WorkerUngroupedShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder,
        output_coders=get_output_coders(work))
  else:
    raise NotImplementedError('Unknown source type: %r' % specs)

def get_do_work_item(work, env, context):
  """Parses a do parallel instruction into the appropriate Worker* object."""
  # Get side inputs if any.
  side_inputs = []
  if hasattr(work.parDo, 'sideInputs'):
    side_inputs = get_side_input_sources(work.parDo.sideInputs, env, context)
  specs = {p.key: from_json_value(p.value)
           for p in work.parDo.userFn.additionalProperties}
  if specs['@type'] == 'DoFn':
    return WorkerDoFn(
        serialized_fn=specs['serialized_fn']['value'],
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input),
        side_inputs=side_inputs)
  elif specs['@type'] == 'CombineValuesFn':
    # Note: CombineFn's do not take side inputs like DoFn's so far.
    return WorkerCombineFn(
        serialized_fn=specs['serialized_fn']['value'],
        phase=specs['phase']['value'],  # 'add' is one possible value.
        input=get_input_spec(work.parDo.input),
        output_coders=get_output_coders(work))
  elif specs['@type'] == 'ReifyTimestampAndWindowsDoFn':
    return WorkerReifyTimestampAndWindows(
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input))
  elif specs['@type'] == 'MergeBucketsDoFn':
    return WorkerMergeWindows(
        window_fn=specs['serialized_fn']['value'],
        combine_fn=specs.get('combine_fn', {}).get('value', None),
        phase=specs.get('phase', {}).get('value', None),
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input),
        coders=None,
        context=context)
  # AssignBucketsDoFn is intentionally unimplemented. The implementation of
  # WindowInto in transforms/core.py does not use a service primitive.
  else:
    raise NotImplementedError('Unknown ParDo type: %r' % specs)

def __init__(self, source_operation_split_proto):
  source_spec = {p.key: from_json_value(p.value)
                 for p in (source_operation_split_proto.source.spec
                           .additionalProperties)}
  if names.SERIALIZED_SOURCE_KEY not in source_spec:
    raise ValueError(
        'Source split spec must contain a serialized source. Received: %r'
        % source_operation_split_proto)
  self.source = pickler.loads(
      source_spec[names.SERIALIZED_SOURCE_KEY]['value'])
  assert self.source is not None
  assert isinstance(self.source, iobase.BoundedSource)
  desired_bundle_size_bytes = (
      source_operation_split_proto.options.desiredBundleSizeBytes)
  if not desired_bundle_size_bytes:
    self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
  else:
    self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)

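# For context, a hypothetical sketch of the producing side: the serialized
# source that __init__ unpickles above is embedded in the source spec as a
# {'value': ...} entry under names.SERIALIZED_SOURCE_KEY, assuming
# pickler.dumps is the inverse of the pickler.loads call above.
# my_bounded_source is a hypothetical iobase.BoundedSource instance.
example_source_spec = {
    names.SERIALIZED_SOURCE_KEY: {'value': pickler.dumps(my_bounded_source)},
}
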
def get_partial_gbk_work_item(instruction, unused_env, unused_context):
  """Parses a partial GBK instruction into the appropriate Worker* object.

  Args:
    instruction: a ParallelInstruction protobuf with a
      PartialGroupByKeyInstruction in it.

  Returns:
    A WorkerPartialGroupByKey object.
  """
  combine_fn = None
  if instruction.partialGroupByKey.valueCombiningFn:
    combine_fn_specs = {
        p.key: from_json_value(p.value)
        for p in (instruction.partialGroupByKey.valueCombiningFn
                  .additionalProperties)}
    combine_fn = combine_fn_specs.get('serialized_fn', {}).get('value', None)
  return WorkerPartialGroupByKey(
      combine_fn=combine_fn,
      input=get_input_spec(instruction.partialGroupByKey.input))

def get_partial_gbk_work_item(instruction, unused_env, unused_context):
  """Parses a partial GBK instruction into the appropriate Worker* object.

  Args:
    instruction: a ParallelInstruction protobuf with a
      PartialGroupByKeyInstruction in it.

  Returns:
    A WorkerPartialGroupByKey object.
  """
  combine_fn = None
  if instruction.partialGroupByKey.valueCombiningFn:
    combine_fn_specs = {
        p.key: from_json_value(p.value)
        for p in (instruction.partialGroupByKey.valueCombiningFn
                  .additionalProperties)}
    combine_fn = combine_fn_specs.get('serialized_fn', {}).get('value', None)
  return WorkerPartialGroupByKey(
      combine_fn=combine_fn,
      input=get_input_spec(instruction.partialGroupByKey.input),
      output_coders=get_output_coders(instruction))

def test_with_type(self):
  rt = from_json_value(to_json_value('abcd', with_type=True))
  self.assertEquals('http://schema.org/Text', rt['@type'])
  self.assertEquals('abcd', rt['value'])

def test_float_from(self):
  self.assertEquals(4.5, from_json_value(to_json_value(4.5)))

def test_int_from(self):
  self.assertEquals(-27, from_json_value(to_json_value(-27)))

def test_false_from(self):
  self.assertEquals(False, from_json_value(to_json_value(False)))

def test_true_from(self):
  self.assertEquals(True, from_json_value(to_json_value(True)))

def test_string_from(self):
  self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
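
# A compact summary of the contract the tests above check, assuming
# to_json_value/from_json_value are imported from the SDK's json_value module:
for value in (4.5, -27, False, True, 'WXYZ'):
  assert from_json_value(to_json_value(value)) == value

# With with_type=True the value round-trips as a typed dict.
typed = from_json_value(to_json_value('abcd', with_type=True))
assert typed['@type'] == 'http://schema.org/Text'
assert typed['value'] == 'abcd'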