def test_get_val(self, cell):
        # Verifies that get_val maps an obfuscated identifier back to the very
        # same object for a named pipeline, a PCollection, and a pipeline that
        # lost its variable name when the second cell re-bound `pipeline`.
        with cell:  # Cell 1
            pipeline = beam.Pipeline(ir.InteractiveRunner())
            # pylint: disable=range-builtin-not-iterating
            pcoll = pipeline | 'Create' >> beam.Create(range(10))

        with cell:  # Cell 2
            # Running the pipeline-creating line again rebinds `pipeline`. The
            # first pipeline is then only reachable through `pcoll`, which
            # keeps it inspectable as an anonymous pipeline.
            pipeline = beam.Pipeline(ir.InteractiveRunner())

        # The variable names above matter: locals are watched by name.
        ib.watch(locals())
        env_inspector = inspector.InteractiveEnvironmentInspector()
        # Called for its effect on the inspector; the listing is not needed.
        env_inspector.list_inspectables()

        def assert_resolves(var_name, value):
            # The identifier is the obfuscated metadata of (name, value).
            identifier = obfuscate(inspector.meta(var_name, value))
            self.assertIs(env_inspector.get_val(identifier), value)

        assert_resolves('pipeline', pipeline)
        assert_resolves('pcoll', pcoll)
        assert_resolves(
            inspector.synthesize_pipeline_name(pcoll.pipeline),
            pcoll.pipeline)
Beispiel #2
0
  def test_inspect_pipelines(self, cell):
    # Verifies that every pipeline reported by inspect_pipelines() and every
    # entry reported by inspect() belongs to the expected set built from the
    # variables bound at the time the locals are watched.
    with cell:  # Cell 1
      pipeline_1 = beam.Pipeline(ir.InteractiveRunner())
      pipeline_2 = beam.Pipeline(ir.InteractiveRunner())

    with cell:  # Cell 2
      # Re-executes the line that created pipeline_1, so the original
      # pipeline_1 is no longer inspectable.
      pipeline_1 = beam.Pipeline(ir.InteractiveRunner())

    # The variable names above matter: locals are watched by name.
    ib.watch(locals())
    expected_inspectable_pipelines = {
        pipeline_1: 'pipeline_1', pipeline_2: 'pipeline_2'
    }
    for inspectable_pipeline in inspector.inspect_pipelines().items():
      # assertIn (instead of assertTrue(x in y)) makes a failure report the
      # unexpected item together with the expected collection.
      self.assertIn(
          inspectable_pipeline, expected_inspectable_pipelines.items())

    pipeline_1_metadata = inspector.meta('pipeline_1', pipeline_1)
    pipeline_2_metadata = inspector.meta('pipeline_2', pipeline_2)
    expected_inspectables = {
        obfuscate(pipeline_2_metadata): {
            'metadata': pipeline_2_metadata, 'value': pipeline_2
        },
        obfuscate(pipeline_1_metadata): {
            'metadata': pipeline_1_metadata, 'value': pipeline_1
        }
    }
    for inspectable in inspector.inspect().items():
      self.assertIn(inspectable, expected_inspectables.items())
Beispiel #3
0
  def test_inspect(self, cell):
    # Verifies that every entry reported by inspect() is one of the expected
    # inspectables (the watched pipeline and the latest `pcoll` binding).
    with cell:  # Cell 1
      pipeline = beam.Pipeline(ir.InteractiveRunner())
      # Early watch the pipeline so that cell re-execution can be handled.
      ib.watch({'pipeline': pipeline})
      # pylint: disable=bad-option-value
      pcoll = pipeline | 'Create' >> beam.Create(range(10))

    with cell:  # Cell 2
      # Re-executes the line that created the pcoll, so the original pcoll is
      # no longer inspectable.
      # pylint: disable=bad-option-value
      pcoll = pipeline | 'Create' >> beam.Create(range(10))

    # The variable names above matter: locals are watched by name.
    ib.watch(locals())
    pipeline_metadata = inspector.meta('pipeline', pipeline)
    pcoll_metadata = inspector.meta('pcoll', pcoll)
    expected_inspectables = {
        obfuscate(pipeline_metadata): {
            'metadata': pipeline_metadata, 'value': pipeline
        },
        obfuscate(pcoll_metadata): {
            'metadata': pcoll_metadata, 'value': pcoll
        }
    }
    for inspectable in inspector.inspect().items():
      # assertIn (instead of assertTrue(x in y)) makes a failure report the
      # unexpected item together with the expected collection.
      self.assertIn(inspectable, expected_inspectables.items())
Beispiel #4
0
 def __init__(self, pcoll, include_window_info=False, display_facets=False):
     """Prepares the state needed to visualize a single PCollection.

     Args:
       pcoll: the PCollection to visualize; asserted to be a
         pvalue.PCollection.
       include_window_info: stored on the instance as _include_window_info.
       display_facets: stored on the instance as _display_facets.

     Raises:
       AssertionError: if the visualization dependencies are not ready or
         pcoll is not a pvalue.PCollection.
     """
     assert _pcoll_visualization_ready, (
         'Dependencies for PCollection visualization are not available. Please '
         'use `pip install apache-beam[interactive]` to install necessary '
         'dependencies and make sure that you are executing code in an '
         'interactive environment such as a Jupyter notebook.')
     assert isinstance(pcoll, pvalue.PCollection), (
         'pcoll should be apache_beam.pvalue.PCollection')

     # Plain state that does not depend on the pipeline instrument.
     self._pcoll = pcoll
     self._include_window_info = include_window_info
     self._display_facets = display_facets
     self._is_datatable_empty = True

     # The instrument exposes the cache key and other pipeline metadata, so the
     # PCollection variable is the only input this class needs from the user.
     # Per the original note, constructing PipelineInstrument alone does not
     # trigger any interactivity related pre-processing or instrumentation.
     self._pin = instr.PipelineInstrument(pcoll.pipeline)

     # Title of the element-value column in the rendered data table: the
     # variable name when one is known, otherwise the generic 'Value'.
     pcoll_id = self._pin.pcolls_to_pcoll_id.get(str(pcoll), None)
     self._pcoll_var = self._pin.cacheable_var_by_pcoll_id(pcoll_id) or 'Value'

     self._cache_key = self._pin.cache_key(self._pcoll)
     # All display ids share one suffix derived from the cache key and this
     # instance's id().
     obfuscated_id = obfuscate(self._cache_key, id(self))
     self._dive_display_id = 'facets_dive_{}'.format(obfuscated_id)
     self._overview_display_id = 'facets_overview_{}'.format(obfuscated_id)
     self._df_display_id = 'df_{}'.format(obfuscated_id)
Beispiel #5
0
    def execute(self):
        """Runs every notebook in self._paths and saves its outputs as HTML.

        Each notebook is read as nbformat version 4 and executed with an
        ExecutePreprocessor (kernel 'test', timeout -1, errors allowed) using
        the notebook's directory as the execution path. The outputs of its code
        cells are then passed to _extract_html and written to
        '<execution_id>.html' under self._output_html_dir, where execution_id
        is the obfuscated notebook path. The file is opened in 'a+' mode, so
        content is appended if the file already exists.

        Records the results in self._output_html_paths and
        self._notebook_path_to_execution_id.
        """
        html_opening = ('<html>\n', '<head>\n', '</head>\n', '<body>\n')
        html_closing = ('</body>\n', '</html>\n')

        for path in self._paths:
            with open(path, 'r') as notebook_file:
                nb = nbformat.read(notebook_file, as_version=4)
                executor = ExecutePreprocessor(
                    timeout=-1, allow_errors=True, kernel_name='test')
                resources = {'metadata': {'path': os.path.dirname(path)}}
                executor.preprocess(nb, resources)

            execution_id = obfuscate(path)
            html_file_name = execution_id + '.html'
            output_html_path = os.path.join(self._output_html_dir,
                                            html_file_name)
            with open(output_html_path, 'a+') as sink:
                for fragment in html_opening:
                    sink.write(fragment)
                for cell in nb['cells']:
                    # Only code cells carry outputs to extract.
                    if cell['cell_type'] != 'code':
                        continue
                    for output in cell['outputs']:
                        _extract_html(output, sink)
                for fragment in html_closing:
                    sink.write(fragment)

            self._output_html_paths[execution_id] = output_html_path
            self._notebook_path_to_execution_id[path] = execution_id
Beispiel #6
0
 def test_list_clusters(self):
     # Registers one cluster in the current environment and checks that
     # list_clusters() reports it under its obfuscated identifier.
     project = 'test-project'
     region = 'test-region'
     cluster_name = 'test-cluster'
     master_url = 'test-url'
     dashboard = 'test-dashboard'
     pipelines = ['pid']

     clusters = ie.current_env().clusters
     clusters.master_urls[master_url] = MasterURLIdentifier(
         project, region, cluster_name)
     clusters.master_urls_to_pipelines[master_url] = pipelines
     clusters.master_urls_to_dashboards[master_url] = dashboard

     ins = inspector.InteractiveEnvironmentInspector()
     expected_clusters = {
         obfuscate(project, region, cluster_name): {
             'cluster_name': cluster_name,
             'project': project,
             'region': region,
             'master_url': master_url,
             'dashboard': dashboard,
             'pipelines': pipelines
         }
     }
     self.assertEqual(expected_clusters, json.loads(ins.list_clusters()))
Beispiel #7
0
    def list_clusters(self):
        """Collects information about every cluster in the current environment.

        Builds a mapping from an obfuscated cluster identifier (derived from
        project, region and cluster name) to that cluster's cluster_name,
        project, region, master_url, dashboard and pipelines. The mapping is
        also stored on self._clusters.

        Returns:
          The mapping described above. The original documentation describes
          the result as JSON; no serialization happens inside this method.
        """
        from apache_beam.runners.interactive import interactive_environment as ie
        clusters = ie.current_env().clusters
        all_cluster_data = {}
        for master_url, cluster_metadata in clusters.master_urls.items():
            project = cluster_metadata.project_id
            region = cluster_metadata.region
            name = cluster_metadata.cluster_name
            cluster_id = obfuscate(project, region, name)
            all_cluster_data[cluster_id] = {
                'cluster_name': name,
                'project': project,
                'region': region,
                'master_url': master_url,
                'dashboard': clusters.master_urls_to_dashboards[master_url],
                'pipelines': clusters.master_urls_to_pipelines[master_url]
            }
        self._clusters = all_cluster_data
        return all_cluster_data
Beispiel #8
0
    def test_set_default_cluster(self):
        # Seeds the inspector with one known cluster, then checks that setting
        # the default cluster through the master_url resolved from its
        # obfuscated id yields the matching cluster metadata.
        clusters = ie.current_env().clusters
        master_url = 'test-url'
        project = 'test-project'
        region = 'test-region'
        cluster_name = 'test-cluster'

        cluster_id = obfuscate(project, region, cluster_name)
        env_inspector = ie.current_env().inspector
        env_inspector._clusters = {
            cluster_id: {
                'cluster_name': cluster_name,
                'project': project,
                'region': region,
                'master_url': master_url,
                'dashboard': 'test-dashboard',
                'pipelines': ['pid']
            }
        }
        clusters.master_urls[master_url] = MasterURLIdentifier(
            project, region, cluster_name)

        resolved_master_url = env_inspector.get_cluster_master_url(cluster_id)
        clusters.set_default_cluster(resolved_master_url)

        # A separately constructed identifier keeps this an equality check
        # rather than an identity check.
        self.assertEqual(MasterURLIdentifier(project, region, cluster_name),
                         clusters.default_cluster_metadata)
Beispiel #9
0
    def test_delete_cluster(self):
        # Registers a cluster with one tracked pipeline and a stub cluster
        # manager, deletes it through the master_url resolved from its
        # obfuscated id, and checks that the cluster bookkeeping is empty.
        clusters = ie.current_env().clusters

        class MockClusterManager:
            # Stand-in for a cluster manager; cleanup is a no-op.
            master_url = 'test-url'

            def cleanup(self):
                pass

        master_url = 'test-url'
        project = 'test-project'
        region = 'test-region'
        cluster_name = 'test-cluster'
        metadata = MasterURLIdentifier(project, region, cluster_name)

        user_pipeline = beam.Pipeline(ir.InteractiveRunner())
        pipeline_id = str(id(user_pipeline))
        ie.current_env()._tracked_user_pipelines.add_user_pipeline(
            user_pipeline)
        clusters.master_urls[master_url] = metadata
        clusters.master_urls_to_dashboards[master_url] = 'test-dashboard'
        clusters.dataproc_cluster_managers[pipeline_id] = MockClusterManager()
        clusters.master_urls_to_pipelines[master_url] = [pipeline_id]

        cluster_id = obfuscate(project, region, cluster_name)
        env_inspector = ie.current_env().inspector
        # A separate list from the one stored on `clusters` above.
        env_inspector._clusters[cluster_id] = {
            'master_url': master_url,
            'pipelines': [pipeline_id]
        }

        clusters.delete_cluster(
            env_inspector.get_cluster_master_url(cluster_id))
        self.assertEqual(clusters.master_urls, {})
        self.assertEqual(clusters.master_urls_to_pipelines, {})
Beispiel #10
0
 def __init__(self, var, version, producer_version, pipeline_id):
     """Stores the key components, keeping only an obfuscated prefix of var.

     Args:
       var: variable name; replaced by the first 10 characters of its
         obfuscated form so the stored value has a bounded length.
       version: stored unchanged.
       producer_version: stored unchanged.
       pipeline_id: stored unchanged.
     """
     obfuscated_var = obfuscate(var)
     self.var = obfuscated_var[:10]
     self.version, self.producer_version = version, producer_version
     self.pipeline_id = pipeline_id
    def list_inspectables(self):
        """Lists inspectables, grouping pcollections by their pipeline.

        Every pipeline returned by inspect_pipelines() gets an entry keyed by
        the obfuscated form of its metadata. Each inspectable whose metadata
        type is 'pcollection' is then attached to its pipeline's entry. If a
        pcollection's pipeline is not among the named pipelines, the pipeline
        is given a synthesized name (described in the original documentation
        as 'anonymous_pipeline[id:$inMemoryId]'), is recorded in
        self._anonymous, and gets its own entry in the listing.

        The listing holds metadata only, not the pipeline or pcollection
        objects; the obfuscated identifiers can be used to trace back to
        those values in the kernel.

        Side effects: adds entries to self._anonymous and replaces
        self._inspectable_pipelines with a mapping from str(id(pipeline)) to
        pipeline for all pipelines seen, anonymous ones included.

        Returns:
          A dict mapping pipeline identifier to
          {'metadata': ..., 'pcolls': {pcoll identifier: pcoll metadata}}.
          The original documentation describes the result as JSON; no
          serialization happens inside this method.
        """
        listing = {}
        pipelines = inspect_pipelines()
        for pipeline, name in pipelines.items():
            metadata = meta(name, pipeline)
            listing[obfuscate(metadata)] = {'metadata': metadata, 'pcolls': {}}

        for identifier, inspectable in self.inspectables.items():
            if inspectable['metadata']['type'] != 'pcollection':
                continue
            pipeline = inspectable['value'].pipeline

            # Direct dict membership: no throwaway list of keys is built and
            # scanned for every pcollection.
            if pipeline in pipelines:
                pipeline_identifier = obfuscate(
                    meta(pipelines[pipeline], pipeline))
                listing[pipeline_identifier]['pcolls'][
                    identifier] = inspectable['metadata']
                continue

            # The pipeline has no variable name of its own: register it as an
            # anonymous pipeline so that later pcollections of the same
            # pipeline take the branch above.
            pipeline_name = synthesize_pipeline_name(pipeline)
            pipelines[pipeline] = pipeline_name
            pipeline_metadata = meta(pipeline_name, pipeline)
            pipeline_identifier = obfuscate(pipeline_metadata)
            self._anonymous[pipeline_identifier] = {
                'metadata': pipeline_metadata,
                'value': pipeline
            }
            listing[pipeline_identifier] = {
                'metadata': pipeline_metadata,
                'pcolls': {
                    identifier: inspectable['metadata']
                }
            }

        self._inspectable_pipelines = {
            str(id(pipeline)): pipeline
            for pipeline in pipelines
        }
        return listing
Beispiel #12
0
def _generate_output_name(output_name: Optional[str], query: str,
                          found: Dict[str, beam.PCollection]) -> str:
    """Generates a unique output name if None is provided.

  Otherwise, returns the given output name directly.
  The generated output name is sql_output_{uuid} where uuid is an obfuscated
  value from the query and PCollections found to be used in the query.
  """
    if not output_name:
        execution_id = obfuscate(query, found)[:12]
        output_name = 'sql_output_' + execution_id
    return output_name
    def test_list_inspectables(self, cell):
        # Verifies the listing for one named pipeline without pcollections
        # and one anonymous pipeline that owns two watched pcollections.
        with cell:  # Cell 1
            pipeline = beam.Pipeline(ir.InteractiveRunner())
            # pylint: disable=range-builtin-not-iterating
            pcoll_1 = pipeline | 'Create' >> beam.Create(range(10))
            pcoll_2 = pcoll_1 | 'Square' >> beam.Map(lambda x: x * x)

        with cell:  # Cell 2
            # Running the pipeline-creating line again rebinds `pipeline`.
            # The first pipeline becomes anonymous but stays inspectable
            # because pcoll_1 and pcoll_2 still refer to it.
            pipeline = beam.Pipeline(ir.InteractiveRunner())

        # The variable names above matter: locals are watched by name.
        ib.watch(locals())
        anonymous_pipeline_metadata = inspector.meta(
            inspector.synthesize_pipeline_name(pcoll_1.pipeline),
            pcoll_1.pipeline)
        pipeline_metadata = inspector.meta('pipeline', pipeline)
        pcoll_metadata_in_order = [
            inspector.meta('pcoll_1', pcoll_1),
            inspector.meta('pcoll_2', pcoll_2),
        ]
        # Insertion order is significant: the comparison below is made on
        # the serialized JSON string.
        expected_inspectable_list = {
            obfuscate(pipeline_metadata): {
                'metadata': pipeline_metadata,
                'pcolls': {}
            },
            obfuscate(anonymous_pipeline_metadata): {
                'metadata': anonymous_pipeline_metadata,
                'pcolls': {
                    obfuscate(pcoll_metadata): pcoll_metadata
                    for pcoll_metadata in pcoll_metadata_in_order
                }
            }
        }
        env_inspector = inspector.InteractiveEnvironmentInspector()
        self.assertEqual(env_inspector.list_inspectables(),
                         json.dumps(expected_inspectable_list))
 def test_list_clusters(self):
     # Creates one cluster with one pipeline attached and checks that
     # list_clusters() reports it under the obfuscated cluster metadata.
     cluster_metadata = ClusterMetadata(project_id='project')
     clusters = self.current_env.clusters
     manager = clusters.create(cluster_metadata)
     pipeline = beam.Pipeline()
     manager.pipelines.add(pipeline)
     clusters.pipelines[pipeline] = manager

     # Built before list_clusters() is called, as in the original ordering.
     expected_clusters = {
         obfuscate(cluster_metadata): {
             'cluster_name': cluster_metadata.cluster_name,
             'project': cluster_metadata.project_id,
             'region': cluster_metadata.region,
             'master_url': cluster_metadata.master_url,
             'dashboard': cluster_metadata.dashboard,
             'pipelines': [str(id(tracked)) for tracked in manager.pipelines]
         }
     }
     actual_clusters = json.loads(self.current_env.inspector.list_clusters())
     self.assertEqual(expected_clusters, actual_clusters)
Beispiel #15
0
  def test_get_pcoll_data(self):
    # Verifies that get_pcoll_data returns the same table-oriented JSON as
    # collecting the PCollection directly, without and with window info.
    pipeline = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=range-builtin-not-iterating
    pcoll = pipeline | 'Create' >> beam.Create(range(10))
    counts = pcoll | beam.combiners.Count.PerElement()

    # The variable names above matter: locals are watched by name.
    ib.watch(locals())
    counts_identifier = obfuscate(inspector.meta('counts', counts))
    env_inspector = inspector.InteractiveEnvironmentInspector()
    # Called for its effect on the inspector; the listing is not needed.
    env_inspector.list_inspectables()

    # In each check the inspector is queried before the data is collected.
    self.assertEqual(
        env_inspector.get_pcoll_data(counts_identifier),
        ib.collect(counts).to_json(orient='table'))

    self.assertEqual(
        env_inspector.get_pcoll_data(counts_identifier, True),
        ib.collect(counts, True).to_json(orient='table'))
    def list_clusters(self):
        """Collects information about every cluster in the current environment.

        Builds a mapping from the obfuscated cluster metadata to that
        cluster's cluster_name, project, region, master_url, dashboard and
        the id() strings of the pipelines held by its cluster manager. The
        mapping is also stored on self._clusters.

        Returns:
          The mapping described above. The original documentation describes
          the result as JSON; no serialization happens inside this method.
        """
        from apache_beam.runners.interactive import interactive_environment as ie

        managers_by_metadata = ie.current_env().clusters.dataproc_cluster_managers
        all_cluster_data = {
            obfuscate(cluster_meta): {
                'cluster_name': cluster_meta.cluster_name,
                'project': cluster_meta.project_id,
                'region': cluster_meta.region,
                'master_url': cluster_meta.master_url,
                'dashboard': cluster_meta.dashboard,
                'pipelines': [
                    str(id(pipeline)) for pipeline in manager.pipelines
                ]
            }
            for cluster_meta, manager in managers_by_metadata.items()
        }
        self._clusters = all_cluster_data
        return all_cluster_data
Beispiel #17
0
  def test_get_pcoll_data(self):
    # Verifies that get_pcoll_data returns the same table-oriented JSON as
    # collecting the PCollection directly, without and with window info.
    pipeline = beam.Pipeline(ir.InteractiveRunner())
    # pylint: disable=bad-option-value
    pcoll = pipeline | 'Create' >> beam.Create(list(range(10)))
    counts = pcoll | beam.combiners.Count.PerElement()

    # The variable names above matter: locals are watched by name.
    ib.watch(locals())
    ie.current_env().track_user_pipelines()
    counts_identifier = obfuscate(inspector.meta('counts', counts))
    env_inspector = inspector.InteractiveEnvironmentInspector()
    # Called for its effect on the inspector; the listing is not needed.
    env_inspector.list_inspectables()

    # In each check the inspector is queried before the data is collected.
    self.assertEqual(
        env_inspector.get_pcoll_data(counts_identifier),
        ib.collect(counts, n=10).to_json(orient='table'))

    self.assertEqual(
        env_inspector.get_pcoll_data(counts_identifier, True),
        ib.collect(counts, include_window_info=True).to_json(orient='table'))
def inspect(ignore_synthetic=True):
    """Collects the pipelines and pcollections watched by the environment.

    Walks every (name, value) pair the current interactive environment is
    watching and keeps the values that are beam Pipelines or PCollections.
    Each kept value is stored under the obfuscated form of its metadata.

    Args:
      ignore_synthetic: when true, variables whose name starts with
        'synthetic_var_' (created by Interactive Beam itself) are skipped.

    Returns:
      A dict mapping identifier to {'metadata': ..., 'value': ...}.
    """
    from apache_beam.runners.interactive import interactive_environment as ie

    inspectable_types = (beam.pipeline.Pipeline, beam.pvalue.PCollection)
    inspectables = {}
    for watching in ie.current_env().watching():
        for name, value in watching:
            # Ignore synthetic vars created by Interactive Beam itself.
            if ignore_synthetic and name.startswith('synthetic_var_'):
                continue
            # Check the type first so that metadata and its obfuscated
            # identifier are only computed for values that are kept.
            # NOTE(review): assumes meta() and obfuscate() have no side
            # effects; confirm against their definitions.
            if not isinstance(value, inspectable_types):
                continue
            metadata = meta(name, value)
            inspectables[obfuscate(metadata)] = {
                'metadata': metadata,
                'value': value
            }
    return inspectables
  def display_id(self, suffix):
    # type: (str) -> str

    """Returns a unique id able to be displayed in a web browser.

    The id is the obfuscated combination of this object's cache key and the
    given suffix.
    """
    cache_key = self._cache_key
    return utils.obfuscate(cache_key, suffix)
Beispiel #20
0
 def __post_init__(self):
     """Replaces self.var with a 10-character prefix of its obfuscated form."""
     from apache_beam.runners.interactive.utils import obfuscate

     # An arbitrary variable name is normalized to a fixed-length string
     # (a hex string, per the original note).
     obfuscated_var = obfuscate(self.var)
     self.var = obfuscated_var[:10]
Beispiel #21
0
 def __post_init__(self):
     """Replaces self.var with a 10-character prefix of its obfuscated form."""
     # An arbitrary variable name is normalized to a fixed-length string
     # (a hex string, per the original note).
     obfuscated_var = obfuscate(self.var)
     self.var = obfuscated_var[:10]