Code example #1
# Imports as used by apache_beam/runners/interactive/sql/beam_sql_magics.py;
# the exact module paths are assumptions and may vary across Beam versions.
import logging
import traceback

from apache_beam.pvalue import PValue
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.caching.cacheable import CacheKey
from apache_beam.runners.interactive.caching.reify import reify_to_cache
from apache_beam.runners.interactive.display.pcoll_visualization import (
    visualize_computed_pcoll)

_LOGGER = logging.getLogger(__name__)


def cache_output(output_name: str, output: PValue) -> None:
  # Resolve the user pipeline that produced this output; without one there
  # is nothing to cache against.
  user_pipeline = ie.current_env().user_pipeline(output.pipeline)
  if user_pipeline:
    cache_manager = ie.current_env().get_cache_manager(
        user_pipeline, create_if_absent=True)
  else:
    _LOGGER.warning(
        'Something is wrong with %s. Cannot introspect its data.', output)
    return
  # Write the output PCollection to the cache under a deterministic key.
  key = CacheKey.from_pcoll(output_name, output).to_str()
  _ = reify_to_cache(pcoll=output, cache_key=key, cache_manager=cache_manager)
  try:
    output.pipeline.run().wait_until_finish()
  except (KeyboardInterrupt, SystemExit):
    raise
  except:  # pylint: disable=bare-except
    # _NOT_SUPPORTED_MSG is a module-level message constant defined
    # alongside this function.
    _LOGGER.warning(
        _NOT_SUPPORTED_MSG, traceback.format_exc(), output.pipeline.runner)
    return
  # Mark the PCollection as computed and render it in the notebook.
  ie.current_env().mark_pcollection_computed([output])
  visualize_computed_pcoll(
      output_name, output, max_n=float('inf'), max_duration_secs=float('inf'))
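
A hedged usage sketch (not from the source): how cache_output might be
invoked from an interactive notebook session. The pipeline setup and
variable names below are illustrative assumptions.

import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import (
    InteractiveRunner)

p = beam.Pipeline(InteractiveRunner())
words = p | 'Create' >> beam.Create(['cat', 'dog', 'cat'])

# Reifies `words` to the cache, runs the pipeline, marks the PCollection
# as computed, and renders it in the notebook.
cache_output('words', words)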
Code example #2
def cache_key_of(self, name, pcoll):
  # Derives the deterministic cache key string for `pcoll` under the
  # user-visible variable name `name`.
  return CacheKey.from_pcoll(name, pcoll).to_str()
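
Since cache_key_of is a one-line wrapper, calling CacheKey directly is
equivalent. A minimal sketch, assuming CacheKey lives in
apache_beam.runners.interactive.caching.cacheable (as in recent Beam
versions); the pipeline and names are illustrative.

import apache_beam as beam
from apache_beam.runners.interactive.caching.cacheable import CacheKey
from apache_beam.runners.interactive.interactive_runner import (
    InteractiveRunner)

p = beam.Pipeline(InteractiveRunner())
squares = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)

# Equivalent to self.cache_key_of('squares', squares): a deterministic
# string key derived from the variable name and the PCollection.
key = CacheKey.from_pcoll('squares', squares).to_str()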
Code example #3
from typing import Dict
from typing import Tuple
from typing import Union

import apache_beam as beam

# Other names used below (ie, CacheKey, unreify_from_cache,
# has_source_to_cache, pcolls_from_streaming_cache,
# replace_single_pcoll_token, SqlNode, SqlChain) come from the surrounding
# beam_sql_magics module and its imports.


def _build_query_components(
    query: str,
    found: Dict[str, beam.PCollection],
    output_name: str,
    run: bool = True
) -> Tuple[str,
           Union[Dict[str, beam.PCollection], beam.PCollection,
                 beam.Pipeline],
           SqlChain]:
  """Builds the components needed to apply the SqlTransform.

  Args:
    query: The SQL query to be executed by the magic.
    found: A dict mapping variable names referenced in the query to their
      PCollections.
    output_name: The output variable name in the __main__ module.
    run: Whether to prepare the components for a local run.

  Returns:
    The processed query to be executed by the magic; the source to apply
    the SqlTransform to (a dictionary of tagged PCollections, a single
    PCollection, or the pipeline to execute the query); and the chain of
    applied beam_sql magics this one belongs to.
  """
  if found:
    # All found PCollections belong to the same user pipeline; derive a new
    # pipeline from it that reuses the user pipeline's options.
    user_pipeline = ie.current_env().user_pipeline(
        next(iter(found.values())).pipeline)
    sql_pipeline = beam.Pipeline(options=user_pipeline._options)
    ie.current_env().add_derived_pipeline(user_pipeline, sql_pipeline)
    sql_source = {}
    if run:
      if has_source_to_cache(user_pipeline):
        sql_source = pcolls_from_streaming_cache(
            user_pipeline, sql_pipeline, found)
      else:
        # Rebuild each input PCollection in the derived pipeline by
        # reading it back from the cache.
        cache_manager = ie.current_env().get_cache_manager(
            user_pipeline, create_if_absent=True)
        for pcoll_name, pcoll in found.items():
          cache_key = CacheKey.from_pcoll(pcoll_name, pcoll).to_str()
          sql_source[pcoll_name] = unreify_from_cache(
              pipeline=sql_pipeline,
              cache_key=cache_key,
              cache_manager=cache_manager,
              element_type=pcoll.element_type)
    else:
      sql_source = found
    if len(sql_source) == 1:
      # With a single input, rewrite its name in the query and unwrap the
      # dict to the PCollection itself.
      query = replace_single_pcoll_token(
          query, next(iter(sql_source.keys())))
      sql_source = next(iter(sql_source.values()))

    node = SqlNode(
        output_name=output_name, source=set(found.keys()), query=query)
    chain = ie.current_env().get_sql_chain(
        user_pipeline, set_user_pipeline=True).append(node)
  else:  # does not query any existing PCollection
    sql_source = beam.Pipeline()
    ie.current_env().add_user_pipeline(sql_source)

    # The node should be the root node of the chain created below.
    node = SqlNode(output_name=output_name, source=sql_source, query=query)
    chain = ie.current_env().get_sql_chain(sql_source).append(node)
  return query, sql_source, chain
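
A hedged sketch of how a %%beam_sql magic might call this helper; the
query, variable names, and the assumption that `words` is an existing
PCollection are illustrative, not from the source.

# With run=False, the found PCollections are passed through as the
# SqlTransform source instead of being read back from the cache.
query, sql_source, chain = _build_query_components(
    query='SELECT word, COUNT(*) AS cnt FROM words GROUP BY word',
    found={'words': words},
    output_name='word_counts',
    run=False)
# Since exactly one PCollection was found, its name in the query is
# rewritten by replace_single_pcoll_token and sql_source is the single
# PCollection itself rather than a dict.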