def run_side_write(self, pcoll, label):
    """Append an in-memory write of `pcoll` to the map task that produces it.

    Looks up where `pcoll` is produced, attaches a WorkerInMemoryWrite step
    (labelled `label`) to that map task, and returns the OutputBuffer the
    step writes into so the caller can later read from it (e.g. as a
    side-input source).
    """
    task_ix, producer_ix, out_ix = self.outputs[pcoll]
    coder = self._get_coder(pcoll)
    buffer = OutputBuffer(coder)
    write_op = operation_specs.WorkerInMemoryWrite(
        output_buffer=buffer,
        write_windowed_values=True,
        input=(producer_ix, out_ix),
        output_coders=[coder])
    self.map_tasks[task_ix].append((label, write_op))
    return buffer
def run_Flatten(self, transform_node):
    """Implement Flatten: funnel every input PCollection into one buffer.

    A single OutputBuffer collects the windowed elements of all inputs; a
    new map task (created via `_run_read_from`) reads the flattened result
    back out.  Each producing map task gets a WorkerInMemoryWrite appended,
    and the reading task is recorded as depending on every producer.
    """
    output_buffer = OutputBuffer(
        self._get_coder(transform_node.outputs[None]))
    output_map_task = self._run_read_from(
        transform_node, output_buffer.source())
    # Loop variable renamed from `input` so the Python builtin isn't shadowed.
    for input_pcoll in transform_node.inputs:
        map_task_index, producer_index, output_index = self.outputs[
            input_pcoll]
        element_coder = self._get_coder(input_pcoll)
        flatten_write = operation_specs.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            write_windowed_values=True,
            input=(producer_index, output_index),
            output_coders=[element_coder])
        self.map_tasks[map_task_index].append(
            (transform_node.full_label + '/Write', flatten_write))
        self.dependencies[output_map_task].add(map_task_index)
def run__GroupByKeyOnly(self, transform_node):
    """Implement GroupByKeyOnly with an in-memory grouping shuffle.

    The producing map task writes its windowed key/value pairs into a
    GroupingOutputBuffer; a fresh map task reads the grouped result back
    out and is recorded as depending on the producer.
    """
    input_pcoll = transform_node.inputs[0]
    producer_task, producer_index, output_index = self.outputs[input_pcoll]
    # The grouped output is read unwindowed; the shuffle-write side keeps
    # the windowed coder of the ungrouped input.
    grouping_buffer = GroupingOutputBuffer(
        self._get_coder(transform_node.outputs[None], windowed=False))
    write_op = operation_specs.WorkerInMemoryWrite(
        output_buffer=grouping_buffer,
        write_windowed_values=False,
        input=(producer_index, output_index),
        output_coders=[self._get_coder(input_pcoll)])
    self.map_tasks[producer_task].append(
        (transform_node.full_label + '/Write', write_op))
    reader_task = self._run_read_from(
        transform_node, grouping_buffer.source())
    self.dependencies[reader_task].add(producer_task)
def run_ParDo(self, transform_node):
    """Implement ParDo: append a WorkerDoFn step, breaking fusion if needed.

    Normally the DoFn is appended to the map task that produces this
    transform's input.  But if any side input of this ParDo (transitively)
    depends on an output of that same map task, the task cannot keep
    growing: a fusion break is inserted (write the input to a buffer, start
    a new map task that reads it back), and the DoFn goes into the new task.
    """
    transform = transform_node.transform
    output = transform_node.outputs[None]
    element_coder = self._get_coder(output)
    map_task_index, producer_index, output_index = self.outputs[
        transform_node.inputs[0]]

    # If any of this ParDo's side inputs depend on outputs from this map_task,
    # we can't continue growing this map task.
    def is_reachable(leaf, root):
        # True when `root` is reachable from `leaf` via the (acyclic)
        # map-task dependency edges.  Recursive DFS; no memoization, so
        # worst case revisits nodes — acceptable at pipeline-construction
        # scale.
        if leaf == root:
            return True
        else:
            return any(
                is_reachable(x, root) for x in self.dependencies[leaf])

    if any(
            is_reachable(self.outputs[side_input.pvalue][0], map_task_index)
            for side_input in transform_node.side_inputs):
        # Start a new map tasks.
        # Fusion break: persist the input of this ParDo to a buffer...
        input_element_coder = self._get_coder(transform_node.inputs[0])
        output_buffer = OutputBuffer(input_element_coder)
        fusion_break_write = operation_specs.WorkerInMemoryWrite(
            output_buffer=output_buffer,
            write_windowed_values=True,
            input=(producer_index, output_index),
            output_coders=[input_element_coder])
        self.map_tasks[map_task_index].append(
            (transform_node.full_label + '/Write', fusion_break_write))
        original_map_task_index = map_task_index
        # ...then rebind the producer coordinates to a brand-new map task
        # whose first (index 0) operation is a read of that buffer.
        map_task_index, producer_index, output_index = len(
            self.map_tasks), 0, 0
        fusion_break_read = operation_specs.WorkerRead(
            output_buffer.source_bundle(),
            output_coders=[input_element_coder])
        self.map_tasks.append([(transform_node.full_label + '/Read',
                                fusion_break_read)])
        self.dependencies[map_task_index].add(original_map_task_index)

    def create_side_read(side_input):
        # Side effect: run_side_write appends a write step to the map task
        # producing side_input.pvalue; we then read from its buffer.
        label = self.side_input_labels[side_input]
        output_buffer = self.run_side_write(
            side_input.pvalue, '%s/%s' % (transform_node.full_label, label))
        return operation_specs.WorkerSideInputSource(
            output_buffer.source(), label)

    do_op = operation_specs.WorkerDoFn(
        serialized_fn=pickler.dumps(
            DataflowRunner._pardo_fn_data(
                transform_node,
                lambda side_input: self.side_input_labels[side_input])),
        # Main output first, then one tag per declared additional output.
        output_tags=[PropertyNames.OUT] + [
            '%s_%s' % (PropertyNames.OUT, tag)
            for tag in transform.output_tags
        ],
        # Same assumption that DataflowRunner has about coders being
        # compatible across outputs.
        output_coders=[element_coder] * (len(transform.output_tags) + 1),
        input=(producer_index, output_index),
        side_inputs=[
            create_side_read(side_input)
            for side_input in transform_node.side_inputs
        ])

    # Record where each of this transform's outputs will live: the DoFn is
    # about to be appended, so its producer index is the current task length.
    producer_index = len(self.map_tasks[map_task_index])
    self.outputs[transform_node.outputs[None]] = (map_task_index,
                                                  producer_index, 0)
    # Tagged outputs occupy slots 1..n after the main output (slot 0).
    for ix, tag in enumerate(transform.output_tags):
        self.outputs[transform_node.
                     outputs[tag]] = map_task_index, producer_index, ix + 1
    self.map_tasks[map_task_index].append(
        (transform_node.full_label, do_op))
    # The DoFn's task depends on every map task producing one of its side
    # inputs (run_side_write above attached the corresponding writes).
    for side_input in transform_node.side_inputs:
        self.dependencies[map_task_index].add(
            self.outputs[side_input.pvalue][0])