def get_shuffle_source_to_text_sink_message(shuffle_source_spec):
  """Builds a work item for reading a shuffle source and writing to a text sink."""
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in shuffle_source_spec.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(rsi)

  wi = dataflow.WriteInstruction()
  wi.input = dataflow.InstructionInput()
  wi.sink = dataflow.Sink()
  wi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in TEXT_SINK_SPEC.iteritems():
    wi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wi)

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=rsi))
  mt.instructions.append(dataflow.ParallelInstruction(write=wi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
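# Illustrative usage sketch (hypothetical, not part of the original helpers):
# the returned LeaseWorkItemResponse wraps a single WorkItem whose MapTask
# holds a shuffle-source read instruction followed by a text-sink write
# instruction. The SHUFFLE_SOURCE_SPEC name below is assumed for illustration.
#
#   response = get_shuffle_source_to_text_sink_message(SHUFFLE_SOURCE_SPEC)
#   work_item = response.workItems[0]
#   assert work_item.id == 1234
#   assert len(work_item.mapTask.instructions) == 2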
def get_in_memory_source_to_flatten_message():
  """Builds a work item for reading an in-memory source into a flatten."""
  rsi = dataflow.ReadInstruction()
  rsi.source = dataflow.Source()
  add_source_codec_spec(rsi)
  rsi.source.spec = dataflow.Source.SpecValue()
  for k, v in IN_MEMORY_SOURCE_SPEC.iteritems():
    rsi.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  # Note that the in-memory source spec requires a windowed coder.
  add_source_windowed_codec_spec(rsi)

  fi = dataflow.FlattenInstruction()
  fi.inputs = [dataflow.InstructionInput()]

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=rsi))
  mt.instructions.append(get_instruction_with_outputs(flatten=fi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
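# Illustrative usage sketch (hypothetical, not part of the original helpers):
# here the MapTask pairs the windowed in-memory read with a flatten
# instruction, so the second instruction carries a FlattenInstruction rather
# than a write. Field access below assumes get_instruction_with_outputs
# forwards its keyword argument onto a dataflow.ParallelInstruction.
#
#   response = get_in_memory_source_to_flatten_message()
#   map_task = response.workItems[0].mapTask
#   assert len(map_task.instructions) == 2
#   assert map_task.instructions[1].flatten is not None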
def get_text_source_to_shuffle_sink_message():
  """Builds a work item for reading a text source, running a ParDo, and
  writing to a shuffle sink."""
  ri = dataflow.ReadInstruction()
  ri.source = dataflow.Source()
  ri.source.spec = dataflow.Source.SpecValue()
  for k, v in TEXT_SOURCE_SPEC.iteritems():
    ri.source.spec.additionalProperties.append(
        dataflow.Source.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_source_codec_spec(ri)

  di = dataflow.ParDoInstruction()
  di.input = dataflow.InstructionInput()
  di.input.producerInstructionIndex = 1
  di.multiOutputInfos = [dataflow.MultiOutputInfo(tag='out')]
  di.userFn = dataflow.ParDoInstruction.UserFnValue()
  for k, v in PARDO_DOFN_SPEC.iteritems():
    di.userFn.additionalProperties.append(
        dataflow.ParDoInstruction.UserFnValue.AdditionalProperty(
            key=k, value=to_json_value(v)))

  wsi = dataflow.WriteInstruction()
  wsi.input = dataflow.InstructionInput()
  wsi.input.producerInstructionIndex = 1
  di.input.outputNum = 0
  wsi.sink = dataflow.Sink()
  wsi.sink.spec = dataflow.Sink.SpecValue()
  for k, v in SHUFFLE_SINK_SPEC.iteritems():
    wsi.sink.spec.additionalProperties.append(
        dataflow.Sink.SpecValue.AdditionalProperty(
            key=k, value=to_json_value(v)))
  add_sink_codec_spec(wsi)

  mt = dataflow.MapTask()
  mt.instructions.append(get_instruction_with_outputs(read=ri))
  mt.instructions.append(get_instruction_with_outputs(parDo=di))
  mt.instructions.append(dataflow.ParallelInstruction(write=wsi))

  wi = dataflow.WorkItem()
  wi.id = 1234
  wi.projectId = 'project'
  wi.jobId = 'job'
  wi.mapTask = mt

  m = dataflow.LeaseWorkItemResponse()
  m.workItems.append(wi)
  return m
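# Illustrative usage sketch (hypothetical, not part of the original helpers):
# this message chains three instructions (read -> ParDo -> write), with the
# ParDo's single output tagged 'out' and the final write targeting the
# shuffle sink.
#
#   response = get_text_source_to_shuffle_sink_message()
#   map_task = response.workItems[0].mapTask
#   assert len(map_task.instructions) == 3
#   assert map_task.instructions[2].write is not None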