def __init__(
        self,
        underlying_runner=None,
        render_option=None,
        skip_display=True,
        force_compute=True,
        blocking=True):
    """Set up an InteractiveRunner.

    Args:
        underlying_runner: (runner.PipelineRunner) runner stored as the
            underlying runner. When a falsy value is given, a new
            direct_runner.DirectRunner is created instead.
        render_option: (str) decides how the pipeline graph is rendered.
            See display.pipeline_graph_renderer for available options.
        skip_display: (bool) whether display operations are skipped when
            running the pipeline. Useful for large pipelines when display
            is not needed.
        force_compute: (bool) whether sequential pipeline runs may reuse
            cached PCollection data computed by previous runs (including
            show API invocations from the interactive_beam module). If
            True, the whole pipeline is always run and data is recomputed.
            If False, available data is used and only the minimum pipeline
            fragment needed for the missing data is run.
        blocking: (bool) whether the pipeline run should be blocking.
    """
    # Fall back to a DirectRunner when the caller supplied no runner.
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    # Options are stored verbatim on the instance.
    self._render_option = render_option
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking

    # A freshly constructed runner is not inside a session.
    self._in_session = False
def test_base_model_validator_ptransform(self):
    """BaseModelValidator reports one error per invalid model.

    Four models are fed in; the expected output holds an error for the
    invalid id, the inconsistent timestamps and the expired deleted
    model, and nothing for the remaining model.
    """
    with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as pipe:
        bad_id_model = MockModel(
            id='123@?!*',
            deleted=False,
            created_on=self.year_ago,
            last_updated=self.now)
        bad_timestamp_model = MockModel(
            id='124',
            deleted=False,
            created_on=self.now,
            last_updated=self.year_later)
        stale_deleted_model = MockModel(
            id='125',
            deleted=True,
            created_on=self.year_ago,
            last_updated=self.year_ago)
        good_model = MockModel(
            id='126',
            deleted=False,
            created_on=self.year_ago,
            last_updated=self.now)

        input_models = [
            bad_id_model,
            bad_timestamp_model,
            stale_deleted_model,
            good_model,
        ]
        validated = (
            pipe
            | beam.Create(input_models)
            | base_model_validator.BaseModelValidator())

        expected_errors = [
            errors.ModelInvalidIdError(bad_id_model),
            errors.ModelMutatedDuringJobError(bad_timestamp_model),
            errors.ModelExpiredError(stale_deleted_model),
        ]
        # The assertion is attached inside the context so it is part of
        # the pipeline before the context manager exits.
        beam_testing_util.assert_that(
            validated, beam_testing_util.equal_to(expected_errors))
def test_dataframes_with_multi_index_get_result(self):
    """Collecting a deferred multi-index aggregation matches pandas.

    The same groupby/mean aggregation is applied to a deferred dataframe
    built from the pipeline and to a plain pandas DataFrame; the collected
    result is compared with the pandas one.
    """
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = [
        Record('a', 20, 170),
        Record('a', 30, 170),
        Record('b', 22, 180),
        Record('c', 18, 150),
    ]

    def aggregate(df):
        # Mean age per (name, height) group.
        return df.groupby(['name', 'height']).mean()['age']

    deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
    df_expected = aggregate(pd.DataFrame(data))

    # Watch the local scope for Interactive Beam so that values will be
    # cached. Local names above are deliberately left as they were, since
    # this call captures the local namespace.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this
    # here.
    ie.current_env().track_user_pipelines()

    collected = ib.collect(deferred_df, n=10)
    pd.testing.assert_series_equal(df_expected, collected)
def test_wordcount(self):
    """Word counts read back from the interactive run match expectations."""

    class WordExtractingDoFn(beam.DoFn):
        """Splits a stripped line of text on whitespace."""

        def process(self, element):
            return element.strip().split()

    runner = interactive_runner.InteractiveRunner(
        direct_runner.DirectRunner())
    p = beam.Pipeline(runner=runner)

    # Count the occurrences of each word.
    lines = p | beam.Create(['to be or not to be that is the question'])
    words = lines | 'split' >> beam.ParDo(WordExtractingDoFn())
    ones = words | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
    grouped = ones | 'group' >> beam.GroupByKey()
    counts = grouped | 'count' >> beam.Map(
        lambda word_ones: (word_ones[0], sum(word_ones[1])))

    result = p.run()
    result.wait_until_finish()

    expected_counts = {
        'to': 2,
        'be': 2,
        'or': 1,
        'not': 1,
        'that': 1,
        'is': 1,
        'the': 1,
        'question': 1,
    }
    actual_counts = dict(result.get(counts))
    self.assertDictEqual(actual_counts, expected_counts)
def __init__(
        self,
        underlying_runner=None,
        cache_dir=None,
        cache_format='text',
        render_option=None,
        skip_display=False):
    """Set up an InteractiveRunner.

    Args:
        underlying_runner: (runner.PipelineRunner) runner stored as the
            underlying runner. When a falsy value is given, a new
            direct_runner.DirectRunner is created instead.
        cache_dir: (str) the directory where PCollection caches are kept.
        cache_format: (str) the file format used for saving PCollection
            caches. Available options are 'text' and 'tfrecord'.
        render_option: (str) decides how the pipeline graph is rendered.
            See display.pipeline_graph_renderer for available options.
        skip_display: (bool) whether display operations are skipped when
            running the pipeline. Useful for large pipelines when display
            is not needed.
    """
    # Fall back to a DirectRunner when the caller supplied no runner.
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    # Collaborators built from the constructor arguments.
    self._cache_manager = cache.FileBasedCacheManager(
        cache_dir, cache_format)
    self._renderer = pipeline_graph_renderer.get_renderer(render_option)

    self._skip_display = skip_display
    # A freshly constructed runner is not inside a session.
    self._in_session = False
def test_basic(self):
    """Run one pipeline repeatedly while it grows between runs.

    The pipeline is run once empty, then again after each batch of
    printing transforms is added. The test makes no assertions; it only
    requires that every run finishes.
    """
    runner = interactive_runner.InteractiveRunner(
        direct_runner.DirectRunner())
    p = beam.Pipeline(runner=runner)

    # Run 0: nothing has been applied yet.
    p.run().wait_until_finish()

    # Run 1: a source followed by two chained printing steps.
    numbers = p | 'read' >> beam.Create([1, 2, 3])
    first_print = numbers | 'Print1.1' >> beam.Map(
        print_with_message('Run1.1'))
    second_print = first_print | 'Print1.2' >> beam.Map(
        print_with_message('Run1.2'))
    p.run().wait_until_finish()

    # Run 2: extend the end of the chain.
    _ = second_print | 'Print2' >> beam.Map(print_with_message('Run2'))
    p.run().wait_until_finish()

    # Run 3: branch off the first printing step.
    _ = first_print | 'Print3' >> beam.Map(print_with_message('Run3'))
    p.run().wait_until_finish()
def run(self, result=None):
    """Run the test within the context of a test pipeline.

    https://docs.python.org/3/library/unittest.html#unittest.TestCase.run

    A TestPipeline backed by a DirectRunner is opened and exposed as
    `self.pipeline` while the parent class's run() executes.

    Args:
        result: TestResult | None. Holds onto the results of each test. If
            None, a temporary result object is created (by calling the
            defaultTestResult() method) and used instead.

    Returns:
        *. Whatever the parent class's run() returns. For
        unittest.TestCase.run this is the result object, which the
        unittest documentation says is handed back to the caller.
    """
    runner = direct_runner.DirectRunner()
    with test_pipeline.TestPipeline(runner=runner) as p:
        self.pipeline = p
        # Propagate the parent's return value instead of dropping it, so
        # the override honours the run() contract. The context manager
        # still exits normally when returning from inside the block.
        return super(BeamTestBase, self).run(result=result)
def test_process_reports_model_mutated_during_job_error(self):
    """ValidateModelTimeFields flags a model whose last_updated is a year
    later than its created_on of `self.now`.
    """
    with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as pipe:
        mutated_model = MockModel(
            id='124',
            created_on=self.now,
            last_updated=self.year_later)

        validator = beam.ParDo(
            base_model_validator.ValidateModelTimeFields())
        reported = pipe | beam.Create([mutated_model]) | validator

        expected_errors = [
            errors.ModelMutatedDuringJobError(mutated_model),
        ]
        beam_testing_util.assert_that(
            reported, beam_testing_util.equal_to(expected_errors))
def test_process_reports_error_for_old_deleted_model(self):
    """ValidateDeleted flags a deleted model last updated a year ago."""
    with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as pipe:
        stale_deleted_model = MockModel(
            id='123',
            deleted=True,
            created_on=self.year_ago,
            last_updated=self.year_ago)

        validator = beam.ParDo(base_model_validator.ValidateDeleted())
        reported = pipe | beam.Create([stale_deleted_model]) | validator

        expected_errors = [errors.ModelExpiredError(stale_deleted_model)]
        beam_testing_util.assert_that(
            reported, beam_testing_util.equal_to(expected_errors))
def __init__(self, underlying_runner=None, cache_dir=None, render_option=None):
    """Set up an InteractiveRunner.

    Args:
        underlying_runner: (runner.PipelineRunner) runner stored as the
            underlying runner. When a falsy value is given, a new
            direct_runner.DirectRunner is created instead.
        cache_dir: (str) the directory where PCollection caches are kept.
        render_option: (str) decides how the pipeline graph is rendered.
            See display.pipeline_graph_renderer for available options.
    """
    # Fall back to a DirectRunner when the caller supplied no runner.
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    # Collaborators built from the constructor arguments.
    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    self._renderer = pipeline_graph_renderer.get_renderer(render_option)

    # A freshly constructed runner is not inside a session.
    self._in_session = False
def test_validate_model_id(self):
    """ValidateModelIdWithRegex flags an id that has disallowed characters."""
    with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as pipe:
        bad_id_model = MockModel(
            id='123@?!*',
            created_on=self.year_ago,
            last_updated=self.now)

        # Pattern handed to the DoFn as an extra ParDo argument.
        id_regex = '^[A-Za-z0-9-_]{1,%s}$' % base_models.ID_LENGTH
        validator = beam.ParDo(
            base_model_validator.ValidateModelIdWithRegex(), id_regex)
        reported = pipe | beam.Create([bad_id_model]) | validator

        expected_errors = [errors.ModelInvalidIdError(bad_id_model)]
        beam_testing_util.assert_that(
            reported, beam_testing_util.equal_to(expected_errors))
def preprocess(*, common_config, scrape_type, data_dir, download_dir):
    """Preprocess the fever data to the TFDS Fever data.

    Builds a fever_tfds.FeverEvidence builder from the configuration and
    runs its download_and_prepare step with a Beam DirectRunner.

    Args:
        common_config: Common configuration from config.Config
        scrape_type: Which scrape to use, drqa/lucene/ukp, in training
        data_dir: Where to write data to
        download_dir: Where to download data to, unused but required by
            TFDS API; it is forwarded to download_and_prepare.
    """
    logging.info('Creating fever dataset builder')
    builder = fever_tfds.FeverEvidence(
        # Input corpora and databases.
        wiki_db_path=common_config.wikipedia_db_path,
        text_matcher_params_path=common_config.text_matcher_params,
        fever_train_path=common_config.fever_train,
        fever_dev_path=common_config.fever_dev,
        fever_test_path=common_config.fever_test,
        drqa_db_path=common_config.drqa_scrape_db_path,
        lucene_db_path=common_config.lucene_scrape_db_path,
        data_dir=data_dir,
        # Negative sampling.
        n_similar_negatives=common_config.n_similar_negatives,
        n_background_negatives=common_config.n_background_negatives,
        # UKP documents per split.
        ukp_docs_train=common_config.ukp_docs_train,
        ukp_docs_dev=common_config.ukp_docs_dev,
        ukp_docs_test=common_config.ukp_docs_test,
        train_scrape_type=scrape_type,
        # Scoring and inference settings.
        title_in_scoring=common_config.title_in_scoring,
        n_inference_candidates=common_config.n_inference_candidates,
        include_not_enough_info=common_config.include_not_enough_info,
        n_inference_documents=common_config.n_inference_documents,
        max_inference_sentence_id=common_config.max_inference_sentence_id,
    )

    logging.info('Preparing fever evidence dataset')
    download_config = tfds.download.DownloadConfig(
        beam_runner=direct_runner.DirectRunner())
    builder.download_and_prepare(
        download_dir=download_dir, download_config=download_config)
def testWriteSplitCounter(self):
    """_WriteSplit reports one 'num_instances' counter equal to the
    number of examples written.
    """
    count = 10

    def build_pipeline(root):
        # `count` references to one empty tf.train.Example.
        examples = [tf.train.Example()] * count
        _ = (
            root
            | beam.Create(examples)
            | base_example_gen_executor._WriteSplit(self._output_data_dir))

    run_result = direct_runner.DirectRunner().run(build_pipeline)
    run_result.wait_until_finish()

    name_filter = MetricsFilter().with_name('num_instances')
    num_instances = run_result.metrics().query(name_filter)

    counters = num_instances['counters']
    self.assertTrue(counters)
    self.assertEqual(len(counters), 1)
    self.assertEqual(counters[0].result, count)
def test_dataframes(self):
    """Collecting a deferred dataframe yields the expected square/cube rows."""
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = (
        p
        | beam.Create([1, 2, 3])
        | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x)))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be
    # cached. Local names above are deliberately left as they were, since
    # this call captures the local namespace.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this
    # here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    # The collected frame's index is dropped before comparing.
    collected = ib.collect(df, n=10).reset_index(drop=True)
    pd.testing.assert_frame_equal(df_expected, collected)
def run_type(pipeline, runner_type):
    """Execute `pipeline` with the runner selected by `runner_type`.

    Args:
        pipeline: object handed to the chosen runner's run() method.
        runner_type: RunnerType.DIRECT or RunnerType.DATAFLOW.

    Returns:
        Whatever the selected runner's run() call returns.

    Raises:
        ValueError: if `runner_type` is neither DIRECT nor DATAFLOW.
    """
    if runner_type == RunnerType.DIRECT:
        print(
            "Running pipeline with direct runner this might take a long time!")
        return direct_runner.DirectRunner().run(pipeline)

    if runner_type != RunnerType.DATAFLOW:
        raise ValueError(f"Unsupported runner type: {runner_type}")

    # Dataflow: fill in the Google Cloud and setup options from FLAGS.
    options = pipeline_options.PipelineOptions()

    cloud = options.view_as(pipeline_options.GoogleCloudOptions)
    cloud.project = FLAGS.gc_project
    cloud.region = FLAGS.gc_region
    cloud.job_name = FLAGS.gc_job_name
    cloud.staging_location = FLAGS.gc_staging_location
    cloud.temp_location = FLAGS.gc_temp_location

    setup = options.view_as(pipeline_options.SetupOptions)
    setup.extra_packages = FLAGS.extra_packages

    return runners.DataflowRunner().run(pipeline, options=options)
def test_validate_post_commit_is_private_when_status_is_private(self):
    """ValidatePostCommitIsPrivate flags a commit whose status is
    'private' while post_commit_is_private is False.
    """
    with pipeline.TestPipeline(runner=direct_runner.DirectRunner()) as pipe:
        mismatched_commit = base_models.BaseCommitLogEntryModel(
            id='123',
            created_on=self.year_ago,
            last_updated=self.now,
            commit_type='invalid-type',
            user_id='',
            post_commit_status='private',
            post_commit_is_private=False,
            commit_cmds=[])

        validator = beam.ParDo(
            base_model_validator.ValidatePostCommitIsPrivate())
        reported = pipe | beam.Create([mismatched_commit]) | validator

        expected_errors = [
            errors.ModelInvalidCommitStatusError(mismatched_commit),
        ]
        beam_testing_util.assert_that(
            reported, beam_testing_util.equal_to(expected_errors))
def __init__(
        self,
        underlying_runner=None,
        cache_dir=None,
        cache_format='text',
        render_option=None,
        skip_display=True,
        force_compute=True,
        blocking=True):
    """Set up an InteractiveRunner.

    Args:
        underlying_runner: (runner.PipelineRunner) runner stored as the
            underlying runner. When a falsy value is given, a new
            direct_runner.DirectRunner is created instead.
        cache_dir: (str) the directory where PCollection caches are kept.
        cache_format: (str) the file format used for saving PCollection
            caches. Available options are 'text' and 'tfrecord'.
        render_option: (str) decides how the pipeline graph is rendered.
            See display.pipeline_graph_renderer for available options.
        skip_display: (bool) whether display operations are skipped when
            running the pipeline. Useful for large pipelines when display
            is not needed.
        force_compute: (bool) whether sequential pipeline runs may reuse
            cached PCollection data computed by previous runs (including
            show API invocations from the interactive_beam module). If
            True, the whole pipeline is always run and data is recomputed.
            If False, available data is used and only the minimum pipeline
            fragment needed for the missing data is run.
        blocking: (bool) whether the pipeline run should be blocking.
    """
    # Fall back to a DirectRunner when the caller supplied no runner.
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    # A cache manager is only created when the current environment does
    # not already provide a truthy one; cache_dir and cache_format are
    # ignored otherwise.
    if not ie.current_env().cache_manager():
        ie.current_env().set_cache_manager(
            cache.FileBasedCacheManager(cache_dir, cache_format))
    self._cache_manager = ie.current_env().cache_manager()

    # Options are stored verbatim on the instance.
    self._render_option = render_option
    self._skip_display = skip_display
    self._force_compute = force_compute
    self._blocking = blocking

    # A freshly constructed runner is not inside a session.
    self._in_session = False
def testWriteSplitCounter_WithTFRECORDS_GZIP(self):
    """WriteSplit with FORMAT_TFRECORDS_GZIP writes the expected .gz file
    and reports one 'num_instances' counter equal to the example count.
    """
    count = 10

    def build_pipeline(root):
        # `count` references to one empty tf.train.Example.
        examples = [tf.train.Example()] * count
        _ = (
            root
            | beam.Create(examples)
            | write_split.WriteSplit(
                self._output_data_dir,
                example_gen_pb2.FORMAT_TFRECORDS_GZIP))

    run_result = direct_runner.DirectRunner().run(build_pipeline)
    run_result.wait_until_finish()

    name_filter = MetricsFilter().with_name('num_instances')
    num_instances = run_result.metrics().query(name_filter)

    # The output file is checked first, then the counter.
    expected_file = os.path.join(
        self._output_data_dir, 'data_tfrecord-00000-of-00001.gz')
    self.assertTrue(fileio.exists(expected_file))

    counters = num_instances['counters']
    self.assertTrue(counters)
    self.assertEqual(len(counters), 1)
    self.assertEqual(counters[0].result, count)
def test_wordcount(self):
    """Word counts are checked three ways.

    The test compares the plain result values, the dataframe from
    ib.collect with window info, and the windowed values from result.get.
    """

    class WordExtractingDoFn(beam.DoFn):
        """Splits a stripped line of text on whitespace."""

        def process(self, element):
            return element.strip().split()

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be
    # cached. Local names above are deliberately left as they were, since
    # this call captures the local namespace.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    expected_pairs = {
        ('or', 1),
        ('that', 1),
        ('be', 2),
        ('is', 1),
        ('question', 1),
        ('to', 2),
        ('the', 1),
        ('not', 1),
    }
    self.assertSetEqual(set(actual), expected_pairs)

    def on_time_pane():
        # A new PaneInfo for every expected row.
        return PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)

    # Truncate the precision to millis because the window coder uses
    # millis as units then gets upcast to micros.
    max_micros = GlobalWindow().max_timestamp().micros
    end_of_window = (max_micros // 1000) * 1000

    collected_df = ib.collect(counts, include_window_info=True)
    column_order = [0, 1, 'event_time', 'windows', 'pane_info']
    expected_df = pd.DataFrame(
        {
            0: [word for word, _ in actual],
            1: [total for _, total in actual],
            'event_time': [end_of_window] * len(actual),
            'windows': [[GlobalWindow()] for _ in actual],
            'pane_info': [on_time_pane() for _ in actual],
        },
        columns=column_order)
    pd.testing.assert_frame_equal(expected_df, collected_df)

    reified = result.get(counts, include_window_info=True)
    expected_reified = []
    for pair in actual:
        expected_reified.append(
            WindowedValue(
                pair,
                Timestamp(micros=end_of_window),
                [GlobalWindow()],
                on_time_pane()))
    self.assertEqual(reified, expected_reified)
def __init__(self, underlying_runner=None, cache_dir=None):
    """Set up the runner.

    Args:
        underlying_runner: runner stored as the underlying runner. When a
            falsy value is given, a new direct_runner.DirectRunner is
            created instead.
        cache_dir: passed to cache.FileBasedCacheManager.
    """
    # Fall back to a DirectRunner when the caller supplied no runner.
    if not underlying_runner:
        underlying_runner = direct_runner.DirectRunner()
    self._underlying_runner = underlying_runner

    self._cache_manager = cache.FileBasedCacheManager(cache_dir)
    # A freshly constructed runner is not inside a session.
    self._in_session = False
def setUp(self):
    """Create a new DirectRunner and a default FileBasedCacheManager
    for the test.
    """
    runner = direct_runner.DirectRunner()
    self.runner = runner

    cache_manager = cache.FileBasedCacheManager()
    self.cache_manager = cache_manager
def setUp(self):
    """Create a new DirectRunner for the test."""
    runner = direct_runner.DirectRunner()
    self.runner = runner