def test_pcoll_by_name(self):
  p = beam.Pipeline()
  pcoll = p | beam.Create([1])
  ib.watch({'p': p, 'pcoll': pcoll})

  name_to_pcoll = utils.pcoll_by_name()
  self.assertIn('pcoll', name_to_pcoll)
def __init__(
    self,
    user_pipeline,  # type: beam.Pipeline
    pcolls,  # type: List[beam.pvalue.PCollection]
    result,  # type: beam.runner.PipelineResult
    max_n,  # type: int
    max_duration_secs,  # type: float
):
  self._user_pipeline = user_pipeline
  self._result = result
  self._result_lock = threading.Lock()
  self._pcolls = pcolls

  # Reverse-map each PCollection to the notebook variable name it was
  # watched under (None if it is anonymous).
  pcoll_var = lambda pcoll: {
      v: k
      for k, v in utils.pcoll_by_name().items()
  }.get(pcoll, None)

  self._streams = {
      pcoll: ElementStream(
          pcoll,
          pcoll_var(pcoll),
          CacheKey.from_pcoll(pcoll_var(pcoll), pcoll).to_str(),
          max_n,
          max_duration_secs)
      for pcoll in pcolls
  }

  self._start = time.time()
  self._duration_secs = max_duration_secs
  self._set_computed = bcj.is_cache_complete(str(id(user_pipeline)))

  # Run a separate thread for marking the PCollections done. This is because
  # the pipeline run may be asynchronous.
  self._mark_computed = threading.Thread(target=self._mark_all_computed)
  self._mark_computed.daemon = True
  self._mark_computed.start()
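The pcoll_var lambda above re-inverts the name-to-PCollection map once per lookup, so building self._streams is quadratic in the number of watched PCollections. A minimal sketch of the same lookup with the inversion hoisted out; the sample dict is a hypothetical stand-in for utils.pcoll_by_name():

# Stand-in mapping; real values would be beam.pvalue.PCollection objects.
name_to_pcoll = {'words': object(), 'counts': object()}

# Invert once up front instead of on every call.
pcoll_to_name = {pcoll: name for name, pcoll in name_to_pcoll.items()}

def pcoll_var(pcoll):
  """Returns the watched variable name for pcoll, or None if anonymous."""
  return pcoll_to_name.get(pcoll, None)

assert pcoll_var(name_to_pcoll['words']) == 'words'
assert pcoll_var(object()) is None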
def beam_sql(self, line: str, cell: str) -> Union[None, PValue]:
  """The beam_sql cell magic that executes a Beam SQL.

  Args:
    line: (optional) the string on the same line after the beam_sql magic.
        Used as the output variable name in the __main__ module.
    cell: everything else in the same notebook cell as a string. Used as a
        Beam SQL query.

  Returns None if running into an error, otherwise a PValue as if a
  SqlTransform is applied.
  """
  if line and not line.strip().isidentifier() or keyword.iskeyword(
      line.strip()):
    on_error(
        'The output_name "%s" is not a valid identifier. Please supply a '
        'valid identifier that is not a Python keyword.',
        line)
    return
  if not cell or cell.isspace():
    on_error('Please supply the sql to be executed.')
    return

  found = find_pcolls(cell, pcoll_by_name())
  for _, pcoll in found.items():
    if not is_namedtuple(pcoll.element_type):
      on_error(
          'PCollection %s of type %s is not a NamedTuple. See '
          'https://beam.apache.org/documentation/programming-guide/#schemas '
          'for more details.',
          pcoll,
          pcoll.element_type)
      return
    register_coder_for_schema(pcoll.element_type)

  output_name, output = apply_sql(cell, line, found)
  cache_output(output_name, output)
  return output
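The magic rejects any referenced PCollection whose element_type is not a NamedTuple. A minimal setup sketch for a notebook cell that satisfies that is_namedtuple() check, assuming the magics extension has been loaded in the kernel (e.g. via %load_ext apache_beam.runners.interactive.sql.beam_sql_magics); Word and words are illustrative names:

from typing import NamedTuple

import apache_beam as beam
from apache_beam.runners.interactive import interactive_beam as ib

class Word(NamedTuple):
  text: str

p = beam.Pipeline()
# Creating from NamedTuple instances gives the PCollection a schema-aware
# element_type, which is what is_namedtuple() checks for.
words = p | beam.Create([Word(text='hello'), Word(text='world')])
ib.watch(locals())  # makes words discoverable via pcoll_by_name()

# In a later cell, the magic can then reference words by name:
#   %%beam_sql out
#   SELECT text FROM words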
def to_pipeline(self, pipeline: Optional[beam.Pipeline]) -> beam.Pipeline:
  """Converts the chain into an executable pipeline."""
  if pipeline not in self.evaluated:
    # The whole chain should form a single pipeline.
    source = self.source
    if isinstance(self.source, beam.Pipeline):
      if pipeline:  # use the known pipeline
        source = pipeline
      else:  # use the source pipeline
        pipeline = self.source
    else:
      name_to_pcoll = pcoll_by_name()
      if len(self.source) == 1:
        source = name_to_pcoll.get(next(iter(self.source)))
      else:
        source = {s: name_to_pcoll.get(s) for s in self.source}

    if isinstance(source, beam.Pipeline):
      output = source | 'beam_sql_{}_{}'.format(
          self.output_name, self.execution_count) >> SqlTransform(self.query)
    else:
      output = source | 'schema_loaded_beam_sql_{}_{}'.format(
          self.output_name,
          self.execution_count) >> SchemaLoadedSqlTransform(
              self.output_name, self.query, self.schemas, self.execution_count)
    _ = create_var_in_main(self.output_name, output)
    self.evaluated.add(pipeline)

  if self.next:
    return self.next.to_pipeline(pipeline)
  else:
    return pipeline
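to_pipeline threads one shared pipeline through a linked list of SQL nodes, applying each node at most once and then recursing into self.next. A stripped-down sketch of that traversal pattern; the Node class and its list-based "pipeline" are illustrative stand-ins, not Beam types:

from typing import Optional

class Node:
  def __init__(self, label: str):
    self.label = label
    self.next: Optional['Node'] = None
    self.evaluated = set()

  def to_pipeline(self, pipeline: Optional[list]) -> list:
    if pipeline is None:
      pipeline = []  # stand-in for "use the source pipeline"
    if id(pipeline) not in self.evaluated:
      pipeline.append(self.label)  # stand-in for applying SqlTransform
      self.evaluated.add(id(pipeline))
    # Delegate to the rest of the chain so all nodes land in one pipeline.
    return self.next.to_pipeline(pipeline) if self.next else pipeline

first, second = Node('beam_sql_out_1'), Node('beam_sql_out_2')
first.next = second
assert first.to_pipeline(None) == ['beam_sql_out_1', 'beam_sql_out_2']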
def beam_sql(self, line: str, cell: Optional[str] = None) -> Optional[PValue]:
  """The beam_sql line/cell magic that executes a Beam SQL.

  Args:
    line: the string on the same line after the beam_sql magic.
    cell: everything else in the same notebook cell as a string. If None,
      beam_sql is used as line magic. Otherwise, cell magic.

  Returns None if running into an error, otherwise a PValue as if a
  SqlTransform is applied.
  """
  input_str = line
  if cell:
    input_str += ' ' + cell
  parsed = self._parser.parse(input_str.strip().split())
  if not parsed:
    # Failed to parse inputs, let the parser handle the exit.
    return
  output_name = parsed.output_name
  verbose = parsed.verbose
  query = parsed.query

  if output_name and not output_name.isidentifier() or keyword.iskeyword(
      output_name):
    on_error(
        'The output_name "%s" is not a valid identifier. Please supply a '
        'valid identifier that is not a Python keyword.',
        output_name)
    return
  if not query:
    on_error('Please supply the SQL query to be executed.')
    return
  query = ' '.join(query)

  found = find_pcolls(query, pcoll_by_name(), verbose=verbose)
  for _, pcoll in found.items():
    if not is_namedtuple(pcoll.element_type):
      on_error(
          'PCollection %s of type %s is not a NamedTuple. See '
          'https://beam.apache.org/documentation/programming-guide/#schemas '
          'for more details.',
          pcoll,
          pcoll.element_type)
      return
    register_coder_for_schema(pcoll.element_type, verbose=verbose)

  output_name, output = apply_sql(query, output_name, found)
  cache_output(output_name, output)
  return output
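Because this revision works as both a line and a cell magic, line and cell are concatenated before parsing. Hypothetical invocations, assuming the parser accepts a positional output_name and a verbose flag (inferred from parsed.output_name and parsed.verbose; the actual _parser definition is not shown here):

# Line magic: everything, including the query, on one line.
#   %beam_sql out SELECT text FROM words
#
# Cell magic: options on the magic line, the query in the cell body.
#   %%beam_sql out --verbose
#   SELECT text FROM words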
def beam_sql(self, line: str, cell: Optional[str] = None) -> Optional[PValue]:
  """The beam_sql line/cell magic that executes a Beam SQL.

  Args:
    line: the string on the same line after the beam_sql magic.
    cell: everything else in the same notebook cell as a string. If None,
      beam_sql is used as line magic. Otherwise, cell magic.

  Returns None if running into an error or waiting for user input (running
  on a selected runner remotely), otherwise a PValue as if a SqlTransform is
  applied.
  """
  input_str = line
  if cell:
    input_str += ' ' + cell
  parsed = self._parser.parse(input_str.strip().split())
  if not parsed:
    # Failed to parse inputs, let the parser handle the exit.
    return
  output_name = parsed.output_name
  verbose = parsed.verbose
  query = parsed.query
  runner = parsed.runner

  if output_name and not output_name.isidentifier() or keyword.iskeyword(
      output_name):
    on_error(
        'The output_name "%s" is not a valid identifier. Please supply a '
        'valid identifier that is not a Python keyword.',
        output_name)
    return
  if not query:
    on_error('Please supply the SQL query to be executed.')
    return
  if runner and runner not in _SUPPORTED_RUNNERS:
    on_error(
        'Runner "%s" is not supported. Supported runners are %s.',
        runner,
        _SUPPORTED_RUNNERS)
    return
  query = ' '.join(query)

  found = find_pcolls(query, pcoll_by_name(), verbose=verbose)
  schemas = set()
  main_session = importlib.import_module('__main__')
  for _, pcoll in found.items():
    if not match_is_named_tuple(pcoll.element_type):
      on_error(
          'PCollection %s of type %s is not a NamedTuple. See '
          'https://beam.apache.org/documentation/programming-guide/#schemas '
          'for more details.',
          pcoll,
          pcoll.element_type)
      return
    register_coder_for_schema(pcoll.element_type, verbose=verbose)
    # Only care about schemas defined by the user in the main module.
    if hasattr(main_session, pcoll.element_type.__name__):
      schemas.add(pcoll.element_type)

  if runner in ('DirectRunner', None):
    collect_data_for_local_run(query, found)
    output_name, output, chain = apply_sql(query, output_name, found)
    chain.current.schemas = schemas
    cache_output(output_name, output)
    return output

  output_name, current_node, chain = apply_sql(
      query, output_name, found, False)
  current_node.schemas = schemas
  # TODO(BEAM-10708): Move the options setup and result handling to a
  # separate module when more runners are supported.
  if runner == 'DataflowRunner':
    _ = chain.to_pipeline()
    _ = DataflowOptionsForm(
        output_name, pcoll_by_name()[output_name],
        verbose).display_for_input()
    return None
  else:
    raise ValueError('Unsupported runner %s.' % runner)
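This revision forks on parsed.runner: DirectRunner (or no runner) keeps the local path from before, while DataflowRunner materializes the whole chain into a pipeline and defers execution to a DataflowOptionsForm that waits for user input. A hypothetical invocation, assuming a runner flag backs parsed.runner (the _parser definition is not shown here):

#   %%beam_sql out --runner DataflowRunner
#   SELECT text FROM words
#
# The magic returns None on this path: nothing runs yet, because
# DataflowOptionsForm first collects the remaining Dataflow options
# (project, region, staging location, and so on) from the user.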