Example #1
def get_write_work_item(work, env, context):
  """Parses a write parallel instruction into the appropriate Worker* object."""
  specs = {p.key: from_json_value(p.value)
           for p in work.write.sink.spec.additionalProperties}
  # Only sinks for which a custom coder can be specified have the
  # codec property (e.g. TextSink).
  codec_specs = None
  if work.write.sink.codec:
    codec_specs = {
        p.key: from_json_value(p.value)
        for p in work.write.sink.codec.additionalProperties}

  sink = env.parse_sink(specs, codec_specs, context)
  if sink:
    write_coder = get_coder_from_spec(codec_specs)
    # All Worker items have an "output_coders", even if they have no
    # output, so that the executor can estimate bytes in a uniform way.
    return WorkerWrite(sink, input=get_input_spec(work.write.input),
                       output_coders=(write_coder,))
  if specs['@type'] == 'ShuffleSink':
    coder = get_coder_from_spec(codec_specs)
    # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with custom
    # coders so this special case won't be necessary.
    if isinstance(coder, coders.WindowedValueCoder):
      coder = coder.wrapped_value_coder
    return WorkerShuffleWrite(
        shuffle_kind=specs['shuffle_kind']['value'],
        shuffle_writer_config=specs['shuffle_writer_config']['value'],
        input=get_input_spec(work.write.input),
        output_coders=(coder,))
  else:
    raise NotImplementedError('Unknown sink type: %r' % specs)
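Every parser above starts by flattening the protobuf's additionalProperties list into a plain Python dict via from_json_value. A minimal sketch of that pattern, using hypothetical stand-ins (FakeProperty, FakeJsonValue, fake_from_json_value) instead of the real Dataflow message and JSON-value types:

from collections import namedtuple

# Hypothetical stand-ins for the apitools message pieces used above.
FakeJsonValue = namedtuple('FakeJsonValue', ['string_value'])
FakeProperty = namedtuple('FakeProperty', ['key', 'value'])

def fake_from_json_value(v):
  # Stand-in for from_json_value: just unwrap the string payload.
  return v.string_value

additional_properties = [
    FakeProperty('@type', FakeJsonValue('ShuffleSink')),
    FakeProperty('shuffle_kind', FakeJsonValue('group_keys')),
]

# Same dict-comprehension shape as in get_write_work_item above.
specs = {p.key: fake_from_json_value(p.value) for p in additional_properties}
assert specs['@type'] == 'ShuffleSink'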
Example #2
def get_read_work_item(work, env, context):
  """Parses a read parallel instruction into the appropriate Worker* object."""
  specs = {p.key: from_json_value(p.value)
           for p in work.read.source.spec.additionalProperties}
  # Only sources for which a custom coder can be specified have the
  # codec property (e.g. TextSource).
  codec_specs = None
  if work.read.source.codec:
    codec_specs = {
        p.key: from_json_value(p.value)
        for p in work.read.source.codec.additionalProperties}

  source = env.parse_source(specs, codec_specs, context)
  if source:
    return WorkerRead(source, tag=None)

  coder = get_coder_from_spec(codec_specs)
  # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with custom
  # coders so this special case won't be necessary.
  if isinstance(coder, coders.WindowedValueCoder):
    coder = coder.wrapped_value_coder
  if specs['@type'] == 'GroupingShuffleSource':
    return WorkerGroupingShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder)
  elif specs['@type'] == 'UngroupedShuffleSource':
    return WorkerUngroupedShuffleRead(
        start_shuffle_position=specs['start_shuffle_position']['value'],
        end_shuffle_position=specs['end_shuffle_position']['value'],
        shuffle_reader_config=specs['shuffle_reader_config']['value'],
        coder=coder)
  else:
    raise NotImplementedError('Unknown source type: %r' % specs)
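After decoding the spec, the read parser dispatches on the '@type' value. Below is a stripped-down sketch of that dispatch, using made-up stand-in classes (FakeGroupingShuffleRead, FakeUngroupedShuffleRead) rather than the real Worker* objects.

class FakeGroupingShuffleRead(object):
  def __init__(self, **kwargs):
    self.kwargs = kwargs

class FakeUngroupedShuffleRead(object):
  def __init__(self, **kwargs):
    self.kwargs = kwargs

def dispatch_read(specs):
  # specs is the plain dict built from spec.additionalProperties.
  if specs['@type'] == 'GroupingShuffleSource':
    return FakeGroupingShuffleRead(
        start=specs['start_shuffle_position']['value'],
        end=specs['end_shuffle_position']['value'])
  elif specs['@type'] == 'UngroupedShuffleSource':
    return FakeUngroupedShuffleRead(
        start=specs['start_shuffle_position']['value'],
        end=specs['end_shuffle_position']['value'])
  raise NotImplementedError('Unknown source type: %r' % specs)

work_item = dispatch_read({
    '@type': 'GroupingShuffleSource',
    'start_shuffle_position': {'value': 'AAAA'},
    'end_shuffle_position': {'value': 'ZZZZ'},
})
assert isinstance(work_item, FakeGroupingShuffleRead)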
Example #3
def get_side_input_sources(side_inputs_spec, env, context):
  """Returns a list of Worker...Read objects for the side sources specified."""
  side_inputs = []
  for side_spec in side_inputs_spec:
    assert side_spec.tag  # All side input sources have tags.
    # Make sure we got a side input type we understand.
    specs = {p.key: from_json_value(p.value)
             for p in side_spec.kind.additionalProperties}
    assert specs['@type'] == 'collection'
    for source in side_spec.sources:
      source_spec = {
          p.key: from_json_value(p.value)
          for p in source.spec.additionalProperties}
      # Only sources for which a custom coder can be specified have the
      # codec property (e.g. TextSource).
      source_codec_spec = None
      if source.codec:
        source_codec_spec = {
            p.key: from_json_value(p.value)
            for p in source.codec.additionalProperties}

      parsed_source = env.parse_source(source_spec, source_codec_spec, context)
      if parsed_source:
        side_inputs.append(WorkerSideInputSource(parsed_source, side_spec.tag))
      else:
        raise NotImplementedError(
            'Unknown side input source type: %r' % source_spec)
  return side_inputs
Example #4
 def convert_row_to_dict(self, row, schema):
   """Converts a TableRow instance to a Python dict using the schema."""
   result = {}
   for index, field in enumerate(schema.fields):
     cell = row.f[index]
     if cell.v is None:
       continue  # Field not present in the row.
     # The JSON values returned by BigQuery for table fields in a row always
     # have the string_value attribute set, which means the value below will
     # be a string. Converting to the appropriate type is straightforward
     # except for booleans: the string value is 'true' or 'false', and it
     # cannot be converted by simply calling bool() (bool() returns True for
     # both!).
     value = from_json_value(cell.v)
     if field.type == 'STRING':
       value = value
     elif field.type == 'BOOLEAN':
       value = value == 'true'
     elif field.type == 'INTEGER':
       value = int(value)
     elif field.type == 'FLOAT':
       value = float(value)
     elif field.type == 'TIMESTAMP':
       value = float(value)
     elif field.type == 'BYTES':
       value = value
     else:
       # Note that a schema field object supports also a RECORD type. However
       # when querying, the repeated and/or record fields always come
       # flattened.  For more details please read:
       # https://cloud.google.com/bigquery/docs/data
       raise RuntimeError('Unexpected field type: %s' % field.type)
     result[field.name] = value
   return result
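The explicit string comparison in the BOOLEAN branch above is needed because bool() cannot do this conversion, as the comment notes:

# bool() treats any non-empty string as True, so it would mangle 'false'.
assert bool('false') is True
# Comparing against the literal string gives the intended result.
assert ('true' == 'true') is True
assert ('false' == 'true') is False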
Example #5
 def convert_row_to_dict(self, row, schema):
   """Converts a TableRow instance to a Python dict using the schema."""
   result = {}
   for index, field in enumerate(schema.fields):
     cell = row.f[index]
     if cell.v is None:
       continue  # Field not present in the row.
     # The JSON values returned by BigQuery for table fields in a row always
     # have the string_value attribute set, which means the value below will
     # be a string. Converting to the appropriate type is straightforward
     # except for booleans: the string value is 'true' or 'false', and it
     # cannot be converted by simply calling bool() (bool() returns True for
     # both!).
     value = from_json_value(cell.v)
     if field.type == 'STRING':
       value = value
     elif field.type == 'BOOLEAN':
       value = value == 'true'
     elif field.type == 'INTEGER':
       value = int(value)
     elif field.type == 'FLOAT':
       value = float(value)
     elif field.type == 'TIMESTAMP':
       value = float(value)
     else:
       # Note that a schema field object supports also a RECORD type. However
       # when querying, the repeated and/or record fields always come
       # flattened.  For more details please read:
       # https://cloud.google.com/bigquery/docs/data
       raise RuntimeError('Unexpected field type: %s' % field.type)
     result[field.name] = value
   return result
Example #6
 def encode(self, table_row):
   if self.table_schema is None:
     raise AttributeError(
         'The TableRowJsonCoder requires a table schema for '
         'encoding operations. Please specify a table_schema argument.')
   return json.dumps(
       collections.OrderedDict(
           zip(self.field_names,
               [from_json_value(f.v) for f in table_row.f])))
Example #7
 def encode(self, table_row):
   if self.table_schema is None:
     raise AttributeError(
         'The TableRowJsonCoder requires a table schema for '
         'encoding operations. Please specify a table_schema argument.')
   return json.dumps(
       collections.OrderedDict(
           zip(self.field_names,
               [from_json_value(f.v) for f in table_row.f])))
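The OrderedDict(zip(...)) idiom in encode keeps the JSON keys in schema order. A tiny self-contained illustration with made-up field names and values:

import collections
import json

field_names = ['word', 'corpus', 'count']   # hypothetical schema order
values = ['king', 'hamlet', 45]             # hypothetical row values

print(json.dumps(collections.OrderedDict(zip(field_names, values))))
# prints: {"word": "king", "corpus": "hamlet", "count": 45}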
Example #8
def get_output_coders(work):
  """Return a list of coder instances for the output(s) of this work item.

  Args:
    work: a ParallelInstruction protobuf

  Returns:
    A list of coders.
  """
  return [get_coder_from_spec({p.key: from_json_value(p.value)
                               for p in output.codec.additionalProperties})
          for output in work.outputs]
Example #9
def get_read_work_item(work, env, context):
    """Parses a read parallel instruction into the appropriate Worker* object."""
    specs = {
        p.key: from_json_value(p.value)
        for p in work.read.source.spec.additionalProperties
    }
    # Only sources for which a custom coder can be specified have the
    # codec property (e.g. TextSource).
    codec_specs = None
    if work.read.source.codec:
        codec_specs = {
            p.key: from_json_value(p.value)
            for p in work.read.source.codec.additionalProperties
        }

    source = env.parse_source(specs, codec_specs, context)
    if source:
        return WorkerRead(source, output_coders=get_output_coders(work))

    coder = get_coder_from_spec(codec_specs)
    # TODO(ccy): Reconcile WindowedValueCoder wrappings for sources with custom
    # coders so this special case won't be necessary.
    if isinstance(coder, coders.WindowedValueCoder):
        coder = coder.wrapped_value_coder
    if specs['@type'] == 'GroupingShuffleSource':
        return WorkerGroupingShuffleRead(
            start_shuffle_position=specs['start_shuffle_position']['value'],
            end_shuffle_position=specs['end_shuffle_position']['value'],
            shuffle_reader_config=specs['shuffle_reader_config']['value'],
            coder=coder,
            output_coders=get_output_coders(work))
    elif specs['@type'] == 'UngroupedShuffleSource':
        return WorkerUngroupedShuffleRead(
            start_shuffle_position=specs['start_shuffle_position']['value'],
            end_shuffle_position=specs['end_shuffle_position']['value'],
            shuffle_reader_config=specs['shuffle_reader_config']['value'],
            coder=coder,
            output_coders=get_output_coders(work))
    else:
        raise NotImplementedError('Unknown source type: %r' % specs)
Example #10
def get_do_work_item(work, env, context):
    """Parses a do parallel instruction into the appropriate Worker* object."""
    # Get side inputs if any.
    side_inputs = []
    if hasattr(work.parDo, 'sideInputs'):
        side_inputs = get_side_input_sources(work.parDo.sideInputs, env,
                                             context)
    specs = {
        p.key: from_json_value(p.value)
        for p in work.parDo.userFn.additionalProperties
    }
    if specs['@type'] == 'DoFn':
        return WorkerDoFn(
            serialized_fn=specs['serialized_fn']['value'],
            output_tags=[o.tag for o in work.parDo.multiOutputInfos],
            output_coders=get_output_coders(work),
            input=get_input_spec(work.parDo.input),
            side_inputs=side_inputs)
    elif specs['@type'] == 'CombineValuesFn':
        # Note: so far, CombineFns do not take side inputs the way DoFns do.
        return WorkerCombineFn(
            serialized_fn=specs['serialized_fn']['value'],
            phase=specs['phase']['value'],  # 'add' is one possible value.
            input=get_input_spec(work.parDo.input),
            output_coders=get_output_coders(work))
    elif specs['@type'] == 'ReifyTimestampAndWindowsDoFn':
        return WorkerReifyTimestampAndWindows(
            output_tags=[o.tag for o in work.parDo.multiOutputInfos],
            output_coders=get_output_coders(work),
            input=get_input_spec(work.parDo.input))
    elif specs['@type'] == 'MergeBucketsDoFn':
        return WorkerMergeWindows(
            window_fn=specs['serialized_fn']['value'],
            combine_fn=specs.get('combine_fn', {}).get('value', None),
            phase=specs.get('phase', {}).get('value', None),
            output_tags=[o.tag for o in work.parDo.multiOutputInfos],
            output_coders=get_output_coders(work),
            input=get_input_spec(work.parDo.input),
            coders=None,
            context=context)
    # AssignBucketsDoFn is intentionally unimplemented.  The implementation of
    # WindowInto in transforms/core.py does not use a service primitive.
    else:
        raise NotImplementedError('Unknown ParDo type: %r' % specs)
Example #11
  def __init__(self, source_operation_split_proto):
    source_spec = {p.key: from_json_value(p.value) for p in
                   source_operation_split_proto.source.spec
                   .additionalProperties}
    if not source_spec.has_key(names.SERIALIZED_SOURCE_KEY):
      raise ValueError(
          'Source split spec must contain a serialized source. Received: %r',
          source_operation_split_proto)
    self.source = pickler.loads(
        source_spec[names.SERIALIZED_SOURCE_KEY]['value'])

    assert self.source is not None
    assert isinstance(self.source, iobase.BoundedSource)

    desired_bundle_size_bytes = (
        source_operation_split_proto.options.desiredBundleSizeBytes)
    if not desired_bundle_size_bytes:
      self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
    else:
      self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)
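For reference, here is a rough, self-contained sketch of how a pickled source could round-trip through a spec dict of the shape consumed above. The real SDK uses its own pickler module and a names.SERIALIZED_SOURCE_KEY constant; plain pickle plus base64 and the key string below are only stand-ins.

import base64
import pickle

SERIALIZED_SOURCE_KEY = 'serialized_source'  # assumed key name

class ToySource(object):
  def __init__(self, file_pattern):
    self.file_pattern = file_pattern

payload = base64.b64encode(pickle.dumps(ToySource('gs://bucket/shard-*')))
source_spec = {SERIALIZED_SOURCE_KEY: {'value': payload}}

restored = pickle.loads(
    base64.b64decode(source_spec[SERIALIZED_SOURCE_KEY]['value']))
assert restored.file_pattern == 'gs://bucket/shard-*'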
Example #12
def get_partial_gbk_work_item(instruction, unused_env, unused_context):
  """Parses a partial GBK instruction into the appropriate Worker* object.

  Args:
    instruction: a ParallelInstruction protobuf with a
                 PartialGroupByKeyInstruction in it.

  Returns:
    A WorkerPartialGroupByKey object.
  """
  combine_fn = None
  if instruction.partialGroupByKey.valueCombiningFn:
    combine_fn_specs = {
        p.key: from_json_value(p.value)
        for p in (instruction.partialGroupByKey.valueCombiningFn
                  .additionalProperties)}
    combine_fn = combine_fn_specs.get('serialized_fn', {}).get('value', None)
  return WorkerPartialGroupByKey(
      combine_fn=combine_fn,
      input=get_input_spec(instruction.partialGroupByKey.input))
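The combine_fn lookup above uses chained .get() calls with empty-dict defaults so that a missing 'serialized_fn' entry yields None instead of a KeyError. A quick illustration with made-up spec dicts:

with_fn = {'serialized_fn': {'value': 'opaque-serialized-bytes'}}
without_fn = {}

assert with_fn.get('serialized_fn', {}).get('value', None) == 'opaque-serialized-bytes'
assert without_fn.get('serialized_fn', {}).get('value', None) is None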
Example #13
def get_do_work_item(work, env, context):
  """Parses a do parallel instruction into the appropriate Worker* object."""
  # Get side inputs if any.
  side_inputs = []
  if hasattr(work.parDo, 'sideInputs'):
    side_inputs = get_side_input_sources(work.parDo.sideInputs, env, context)
  specs = {p.key: from_json_value(p.value)
           for p in work.parDo.userFn.additionalProperties}
  if specs['@type'] == 'DoFn':
    return WorkerDoFn(
        serialized_fn=specs['serialized_fn']['value'],
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input),
        side_inputs=side_inputs)
  elif specs['@type'] == 'CombineValuesFn':
    # Note: so far, CombineFns do not take side inputs the way DoFns do.
    return WorkerCombineFn(
        serialized_fn=specs['serialized_fn']['value'],
        phase=specs['phase']['value'],  # 'add' is one possible value.
        input=get_input_spec(work.parDo.input),
        output_coders=get_output_coders(work))
  elif specs['@type'] == 'ReifyTimestampAndWindowsDoFn':
    return WorkerReifyTimestampAndWindows(
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input))
  elif specs['@type'] == 'MergeBucketsDoFn':
    return WorkerMergeWindows(
        window_fn=specs['serialized_fn']['value'],
        combine_fn=specs.get('combine_fn', {}).get('value', None),
        phase=specs.get('phase', {}).get('value', None),
        output_tags=[o.tag for o in work.parDo.multiOutputInfos],
        output_coders=get_output_coders(work),
        input=get_input_spec(work.parDo.input),
        coders=None,
        context=context)
  # AssignBucketsDoFn is intentionally unimplemented.  The implementation of
  # WindowInto in transforms/core.py does not use a service primitive.
  else:
    raise NotImplementedError('Unknown ParDo type: %r' % specs)
Example #14
def get_partial_gbk_work_item(instruction, unused_env, unused_context):
  """Parses a partial GBK instruction into the appropriate Worker* object.

  Args:
    instruction: a ParallelInstruction protobuf with a
                 PartialGroupByKeyInstruction in it.

  Returns:
    A WorkerPartialGroupByKey object.
  """
  combine_fn = None
  if instruction.partialGroupByKey.valueCombiningFn:
    combine_fn_specs = {
        p.key: from_json_value(p.value)
        for p in (instruction.partialGroupByKey.valueCombiningFn
                  .additionalProperties)}
    combine_fn = combine_fn_specs.get('serialized_fn', {}).get('value', None)
  return WorkerPartialGroupByKey(
      combine_fn=combine_fn,
      input=get_input_spec(instruction.partialGroupByKey.input),
      output_coders=get_output_coders(instruction))
Example #15
    def __init__(self, source_operation_split_proto):
        source_spec = {
            p.key: from_json_value(p.value)
            for p in
            source_operation_split_proto.source.spec.additionalProperties
        }
        if not source_spec.has_key(names.SERIALIZED_SOURCE_KEY):
            raise ValueError(
                'Source split spec must contain a serialized source. Received: %r',
                source_operation_split_proto)
        self.source = pickler.loads(
            source_spec[names.SERIALIZED_SOURCE_KEY]['value'])

        assert self.source is not None
        assert isinstance(self.source, iobase.BoundedSource)

        desired_bundle_size_bytes = (
            source_operation_split_proto.options.desiredBundleSizeBytes)
        if not desired_bundle_size_bytes:
            self.desired_bundle_size_bytes = DEFAULT_DESIRED_BUNDLE_SIZE
        else:
            self.desired_bundle_size_bytes = long(desired_bundle_size_bytes)
 def test_with_type(self):
     rt = from_json_value(to_json_value('abcd', with_type=True))
     self.assertEquals('http://schema.org/Text', rt['@type'])
     self.assertEquals('abcd', rt['value'])
 def test_float_from(self):
     self.assertEquals(4.5, from_json_value(to_json_value(4.5)))
 def test_int_from(self):
     self.assertEquals(-27, from_json_value(to_json_value(-27)))
 def test_false_from(self):
     self.assertEquals(False, from_json_value(to_json_value(False)))
 def test_true_from(self):
     self.assertEquals(True, from_json_value(to_json_value(True)))
 def test_string_from(self):
     self.assertEquals('WXYZ', from_json_value(to_json_value('WXYZ')))
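The tests above imply a simple round-trip contract for these helpers. A short usage sketch, assuming the helpers can be imported from the SDK's json_value module (the exact import path varies between the old Dataflow SDK and Apache Beam, so treat it as an assumption):

from apache_beam.internal.json_value import from_json_value, to_json_value

assert from_json_value(to_json_value(-27)) == -27
assert from_json_value(to_json_value(4.5)) == 4.5

typed = from_json_value(to_json_value('abcd', with_type=True))
assert typed['@type'] == 'http://schema.org/Text'
assert typed['value'] == 'abcd'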