Example No. 1
    def __init__(self,
                 underlying_runner=None,
                 render_option=None,
                 skip_display=True,
                 force_compute=True,
                 blocking=True):
        """Constructor of InteractiveRunner.

        Args:
          underlying_runner: (runner.PipelineRunner)
          render_option: (str) this parameter decides how the pipeline graph is
              rendered. See display.pipeline_graph_renderer for available options.
          skip_display: (bool) whether to skip display operations when running the
              pipeline. Useful if running large pipelines when display is not
              needed.
          force_compute: (bool) whether sequential pipeline runs can use cached data
              of PCollections computed from the previous runs including show API
              invocation from interactive_beam module. If True, always run the whole
              pipeline and compute data for PCollections forcefully. If False, use
              available data and run minimum pipeline fragment to only compute data
              not available.
          blocking: (bool) whether the pipeline run should be blocking or not.
        """
        self._underlying_runner = (underlying_runner
                                   or direct_runner.DirectRunner())
        self._render_option = render_option
        self._in_session = False
        self._skip_display = skip_display
        self._force_compute = force_compute
        self._blocking = blocking
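
For context, here is a minimal sketch of how this constructor might be used, assuming the standard apache_beam import paths for the direct and interactive runner modules; the keyword arguments shown are the ones documented in the docstring above.

import apache_beam as beam
from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import interactive_runner

# Wrap a DirectRunner; blocking=True keeps each run synchronous and
# force_compute=False lets later runs reuse cached PCollection data.
runner = interactive_runner.InteractiveRunner(
    underlying_runner=direct_runner.DirectRunner(),
    force_compute=False,
    blocking=True)

p = beam.Pipeline(runner=runner)
_ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * x)
p.run().wait_until_finish()
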
Example No. 2
    def test_base_model_validator_ptransform(self):
        with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as p:
            invalid_id = MockModel(id='123@?!*',
                                   deleted=False,
                                   created_on=self.year_ago,
                                   last_updated=self.now)
            invalid_timestamp = MockModel(id='124',
                                          deleted=False,
                                          created_on=self.now,
                                          last_updated=self.year_later)
            expired_model = MockModel(id='125',
                                      deleted=True,
                                      created_on=self.year_ago,
                                      last_updated=self.year_ago)
            valid_model = MockModel(id='126',
                                    deleted=False,
                                    created_on=self.year_ago,
                                    last_updated=self.now)
            pcoll = (p
                     | beam.Create([
                         invalid_id, invalid_timestamp, expired_model,
                         valid_model
                     ]))

            output = pcoll | base_model_validator.BaseModelValidator()

            beam_testing_util.assert_that(
                output,
                beam_testing_util.equal_to([
                    errors.ModelInvalidIdError(invalid_id),
                    errors.ModelMutatedDuringJobError(invalid_timestamp),
                    errors.ModelExpiredError(expired_model)
                ]))
Example No. 3
    def test_dataframes_with_multi_index_get_result(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        data = [
            Record('a', 20, 170),
            Record('a', 30, 170),
            Record('b', 22, 180),
            Record('c', 18, 150)
        ]

        aggregate = lambda df: df.groupby(['name', 'height']).mean()['age']

        deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
        df_expected = aggregate(pd.DataFrame(data))

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        pd.testing.assert_series_equal(df_expected,
                                       ib.collect(deferred_df, n=10))
Example No. 4
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        result = p.run()
        result.wait_until_finish()

        actual = dict(result.get(counts))
        self.assertDictEqual(
            actual, {
                'to': 2,
                'be': 2,
                'or': 1,
                'not': 1,
                'that': 1,
                'is': 1,
                'the': 1,
                'question': 1
            })
Example No. 5
    def __init__(self,
                 underlying_runner=None,
                 cache_dir=None,
                 cache_format='text',
                 render_option=None,
                 skip_display=False):
        """Constructor of InteractiveRunner.

        Args:
          underlying_runner: (runner.PipelineRunner)
          cache_dir: (str) the directory where PCollection caches are kept
          cache_format: (str) the file format that should be used for saving
              PCollection caches. Available options are 'text' and 'tfrecord'.
          render_option: (str) this parameter decides how the pipeline graph is
              rendered. See display.pipeline_graph_renderer for available options.
          skip_display: (bool) whether to skip display operations when running the
              pipeline. Useful if running large pipelines when display is not
              needed.
        """
        self._underlying_runner = (underlying_runner
                                   or direct_runner.DirectRunner())
        self._cache_manager = cache.FileBasedCacheManager(
            cache_dir, cache_format)
        self._renderer = pipeline_graph_renderer.get_renderer(render_option)
        self._in_session = False
        self._skip_display = skip_display
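
This variant also exposes the cache location and format described in its docstring; a short sketch of passing them explicitly, assuming the same apache_beam modules as above (the cache directory is only a placeholder):

from apache_beam.runners.direct import direct_runner
from apache_beam.runners.interactive import interactive_runner

# 'tfrecord' is the alternative to the default 'text' cache format; the
# cache directory below is illustrative, not a required path.
runner = interactive_runner.InteractiveRunner(
    underlying_runner=direct_runner.DirectRunner(),
    cache_dir='/tmp/interactive_beam_cache',
    cache_format='tfrecord')
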
Example No. 6
    def test_basic(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        p.run().wait_until_finish()
        pc0 = (p | 'read' >> beam.Create([1, 2, 3])
               | 'Print1.1' >> beam.Map(print_with_message('Run1.1')))
        pc = pc0 | 'Print1.2' >> beam.Map(print_with_message('Run1.2'))
        p.run().wait_until_finish()
        _ = pc | 'Print2' >> beam.Map(print_with_message('Run2'))
        p.run().wait_until_finish()
        _ = pc0 | 'Print3' >> beam.Map(print_with_message('Run3'))
        p.run().wait_until_finish()
Example No. 7
    def run(self, result=None):
        """Run the test within the context of a test pipeline.

        https://docs.python.org/3/library/unittest.html#unittest.TestCase.run

        Args:
            result: TestResult | None. Holds onto the results of each test. If
                None, a temporary result object is created (by calling the
                defaultTestResult() method) and used instead.
        """
        runner = direct_runner.DirectRunner()
        with test_pipeline.TestPipeline(runner=runner) as p:
            self.pipeline = p
            super(BeamTestBase, self).run(result=result)
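
Concrete test cases then reach the pipeline through self.pipeline; the following is only a hypothetical subclass sketch, since BeamTestBase and its imports come from the surrounding project.

import apache_beam as beam
from apache_beam.testing import util as beam_testing_util


class DoubleFnTest(BeamTestBase):
    """Hypothetical test case built on the base class above."""

    def test_doubles_elements(self):
        # self.pipeline is the TestPipeline installed by BeamTestBase.run();
        # assert_that is verified when that pipeline context exits.
        output = (self.pipeline
                  | beam.Create([1, 2, 3])
                  | beam.Map(lambda x: x * 2))
        beam_testing_util.assert_that(
            output, beam_testing_util.equal_to([2, 4, 6]))
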
Example No. 8
    def test_process_reports_model_mutated_during_job_error(self):
        with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as p:
            invalid_timestamp = MockModel(id='124',
                                          created_on=self.now,
                                          last_updated=self.year_later)
            pcoll = p | beam.Create([invalid_timestamp])

            output = (pcoll
                      | beam.ParDo(
                          base_model_validator.ValidateModelTimeFields()))

            beam_testing_util.assert_that(
                output,
                beam_testing_util.equal_to(
                    [errors.ModelMutatedDuringJobError(invalid_timestamp)]))
Example No. 9
    def test_process_reports_error_for_old_deleted_model(self):
        with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as p:
            expired_model = MockModel(id='123',
                                      deleted=True,
                                      created_on=self.year_ago,
                                      last_updated=self.year_ago)
            pcoll = p | beam.Create([expired_model])

            output = (pcoll
                      | beam.ParDo(base_model_validator.ValidateDeleted()))

            beam_testing_util.assert_that(
                output,
                beam_testing_util.equal_to(
                    [errors.ModelExpiredError(expired_model)]))
Example No. 10
  def __init__(self, underlying_runner=None, cache_dir=None,
               render_option=None):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner)
      cache_dir: (str) the directory where PCollection caches are kept
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
    """
    self._underlying_runner = (underlying_runner
                               or direct_runner.DirectRunner())
    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._renderer = pipeline_graph_renderer.get_renderer(render_option)
    self._in_session = False
Example No. 11
    def test_validate_model_id(self):
        with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as p:
            invalid_id_model = MockModel(id='123@?!*',
                                         created_on=self.year_ago,
                                         last_updated=self.now)
            pcoll = p | beam.Create([invalid_id_model])

            output = (pcoll
                      | beam.ParDo(
                          base_model_validator.ValidateModelIdWithRegex(),
                          '^[A-Za-z0-9-_]{1,%s}$' % base_models.ID_LENGTH))

            beam_testing_util.assert_that(
                output,
                beam_testing_util.equal_to(
                    [errors.ModelInvalidIdError(invalid_id_model)]))
Example No. 12
def preprocess(*, common_config, scrape_type, data_dir, download_dir):
    """Preprocess the fever data to the TFDS Fever data.

  Args:
    common_config: Common configuration from config.Config
    scrape_type: Which scrape to use, drqa/lucene/ukp, in training
    data_dir: Where to write data to
    download_dir: Where to download data to, unused but required by TFDS API
  """
    logging.info('Creating fever dataset builder')
    text_matcher_params_path = common_config.text_matcher_params
    fever_train_path = common_config.fever_train
    fever_dev_path = common_config.fever_dev
    fever_test_path = common_config.fever_test

    ukp_docs_train = common_config.ukp_docs_train
    ukp_docs_dev = common_config.ukp_docs_dev
    ukp_docs_test = common_config.ukp_docs_test
    builder = fever_tfds.FeverEvidence(
        wiki_db_path=common_config.wikipedia_db_path,
        text_matcher_params_path=text_matcher_params_path,
        fever_train_path=fever_train_path,
        fever_dev_path=fever_dev_path,
        fever_test_path=fever_test_path,
        drqa_db_path=common_config.drqa_scrape_db_path,
        lucene_db_path=common_config.lucene_scrape_db_path,
        data_dir=data_dir,
        n_similar_negatives=common_config.n_similar_negatives,
        n_background_negatives=common_config.n_background_negatives,
        ukp_docs_train=ukp_docs_train,
        ukp_docs_dev=ukp_docs_dev,
        ukp_docs_test=ukp_docs_test,
        train_scrape_type=scrape_type,
        title_in_scoring=common_config.title_in_scoring,
        n_inference_candidates=common_config.n_inference_candidates,
        include_not_enough_info=common_config.include_not_enough_info,
        n_inference_documents=common_config.n_inference_documents,
        max_inference_sentence_id=common_config.max_inference_sentence_id,
    )

    logging.info('Preparing fever evidence dataset')
    beam_runner = direct_runner.DirectRunner()
    download_config = tfds.download.DownloadConfig(beam_runner=beam_runner)
    builder.download_and_prepare(download_dir=download_dir,
                                 download_config=download_config)
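
The closing lines show the general pattern of handing a Beam runner to a TFDS builder through DownloadConfig. Below is a minimal sketch of that pattern, assuming tensorflow_datasets is installed; the builder name and directories are placeholders, and any Beam-backed builder works the same way.

import tensorflow_datasets as tfds
from apache_beam.runners.direct import direct_runner

# Any Beam-based TFDS builder accepts a runner via DownloadConfig.
builder = tfds.builder('wikipedia/20201201.en', data_dir='/tmp/tfds_data')
download_config = tfds.download.DownloadConfig(
    beam_runner=direct_runner.DirectRunner())
builder.download_and_prepare(download_dir='/tmp/tfds_downloads',
                             download_config=download_config)
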
Example No. 13
    def testWriteSplitCounter(self):
        count = 10

        def Pipeline(root):
            data = [tf.train.Example()] * count
            _ = (root
                 | beam.Create(data)
                 | base_example_gen_executor._WriteSplit(
                     self._output_data_dir))

        run_result = direct_runner.DirectRunner().run(Pipeline)
        run_result.wait_until_finish()

        num_instances = run_result.metrics().query(
            MetricsFilter().with_name('num_instances'))
        self.assertTrue(num_instances['counters'])
        self.assertEqual(len(num_instances['counters']), 1)
        self.assertEqual(num_instances['counters'][0].result, count)
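
The 'num_instances' counter queried above is incremented inside the transform under test; a generic sketch of how such a Beam counter is typically declared and bumped in a DoFn (not the actual _WriteSplit implementation, and the namespace is made up):

import apache_beam as beam
from apache_beam.metrics import Metrics


class CountingDoFn(beam.DoFn):
    """Illustrative DoFn that counts the elements it processes."""

    def __init__(self):
        # The namespace is a placeholder; the tested transform defines its own.
        self._num_instances = Metrics.counter('example_namespace',
                                              'num_instances')

    def process(self, element):
        self._num_instances.inc()
        yield element
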
Example No. 14
  def test_dataframes(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    pd.testing.assert_frame_equal(
        df_expected, ib.collect(df, n=10).reset_index(drop=True))
Example No. 15
def run_type(pipeline, runner_type):
    """Executes pipeline with certain runner type."""
    if runner_type == RunnerType.DIRECT:
        print(
            "Running pipeline with direct runner; this might take a long time!")
        return direct_runner.DirectRunner().run(pipeline)
    if runner_type == RunnerType.DATAFLOW:
        options = pipeline_options.PipelineOptions()
        gc_options = options.view_as(pipeline_options.GoogleCloudOptions)
        gc_options.project = FLAGS.gc_project
        gc_options.region = FLAGS.gc_region
        gc_options.job_name = FLAGS.gc_job_name
        gc_options.staging_location = FLAGS.gc_staging_location
        gc_options.temp_location = FLAGS.gc_temp_location
        setup = options.view_as(pipeline_options.SetupOptions)
        setup.extra_packages = FLAGS.extra_packages
        return runners.DataflowRunner().run(pipeline, options=options)
    raise ValueError(f"Unsupported runner type: {runner_type}")
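
RunnerType and the FLAGS referenced above come from the surrounding module. The sketch below exercises the direct branch by passing a pipeline-building callable, the form DirectRunner.run() accepts in the _WriteSplit tests earlier; the import path is a hypothetical stand-in and the real call site may differ.

import apache_beam as beam

# Hypothetical module name; run_type and RunnerType live wherever the
# function above is defined.
from pipeline_module import run_type, RunnerType


def build_pipeline(root):
    # Attach transforms to the root that the runner provides.
    _ = root | beam.Create(['a', 'b']) | beam.Map(str.upper)


result = run_type(build_pipeline, RunnerType.DIRECT)
result.wait_until_finish()
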
Example No. 16
    def test_validate_post_commit_is_private_when_status_is_private(self):
        with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as p:
            invalid_commit_status = base_models.BaseCommitLogEntryModel(
                id='123',
                created_on=self.year_ago,
                last_updated=self.now,
                commit_type='invalid-type',
                user_id='',
                post_commit_status='private',
                post_commit_is_private=False,
                commit_cmds=[])
            pcoll = p | beam.Create([invalid_commit_status])

            output = (pcoll
                      | beam.ParDo(
                          base_model_validator.ValidatePostCommitIsPrivate()))
            beam_testing_util.assert_that(
                output,
                beam_testing_util.equal_to([
                    errors.ModelInvalidCommitStatusError(invalid_commit_status)
                ]))
Example No. 17
  def __init__(
      self,
      underlying_runner=None,
      cache_dir=None,
      cache_format='text',
      render_option=None,
      skip_display=True,
      force_compute=True,
      blocking=True):
    """Constructor of InteractiveRunner.

    Args:
      underlying_runner: (runner.PipelineRunner)
      cache_dir: (str) the directory where PCollection caches are kept
      cache_format: (str) the file format that should be used for saving
          PCollection caches. Available options are 'text' and 'tfrecord'.
      render_option: (str) this parameter decides how the pipeline graph is
          rendered. See display.pipeline_graph_renderer for available options.
      skip_display: (bool) whether to skip display operations when running the
          pipeline. Useful if running large pipelines when display is not
          needed.
      force_compute: (bool) whether sequential pipeline runs can use cached data
          of PCollections computed from the previous runs including show API
          invocation from interactive_beam module. If True, always run the whole
          pipeline and compute data for PCollections forcefully. If False, use
          available data and run minimum pipeline fragment to only compute data
          not available.
      blocking: (bool) whether the pipeline run should be blocking or not.
    """
    self._underlying_runner = (
        underlying_runner or direct_runner.DirectRunner())
    if not ie.current_env().cache_manager():
      ie.current_env().set_cache_manager(
          cache.FileBasedCacheManager(cache_dir, cache_format))
    self._cache_manager = ie.current_env().cache_manager()
    self._render_option = render_option
    self._in_session = False
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking
Example No. 18
    def testWriteSplitCounter_WithTFRECORDS_GZIP(self):
        count = 10

        def Pipeline(root):
            data = [tf.train.Example()] * count
            _ = (root
                 | beam.Create(data)
                 | write_split.WriteSplit(
                     self._output_data_dir,
                     example_gen_pb2.FORMAT_TFRECORDS_GZIP))

        run_result = direct_runner.DirectRunner().run(Pipeline)
        run_result.wait_until_finish()

        num_instances = run_result.metrics().query(
            MetricsFilter().with_name('num_instances'))

        self.assertTrue(
            fileio.exists(
                os.path.join(self._output_data_dir,
                             'data_tfrecord-00000-of-00001.gz')))
        self.assertTrue(num_instances['counters'])
        self.assertEqual(len(num_instances['counters']), 1)
        self.assertEqual(num_instances['counters'][0].result, count)
Example No. 19
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
Example No. 20
  def __init__(self, underlying_runner=None, cache_dir=None):
    self._underlying_runner = (underlying_runner
                               or direct_runner.DirectRunner())
    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._in_session = False
Example No. 21
    def setUp(self):
        self.runner = direct_runner.DirectRunner()
        self.cache_manager = cache.FileBasedCacheManager()
Example No. 22
    def setUp(self):
        self.runner = direct_runner.DirectRunner()