def test_label_detection_with_video_context(self):
    """Runs shot-mode label detection with an explicit VideoContext.

    Uses the latest builtin model and checks that at least one detected
    entity description mentions 'bicycle'.
    """
    # Build the request context up front so the pipeline body stays flat.
    video_context = types.VideoContext(
        label_detection_config=types.LabelDetectionConfig(
            label_detection_mode=enums.LabelDetectionMode.SHOT_MODE,
            model='builtin/latest'))
    with TestPipeline(is_integration_test=True) as p:
        descriptions = (
        p
        | beam.Create([(self.VIDEO_PATH, video_context)])
        | AnnotateVideoWithContext(features=[enums.Feature.LABEL_DETECTION])
        | beam.ParDo(extract_entities_descriptions)
        | beam.combiners.ToList())
        # Search for at least one entity that contains 'bicycle'.
        assert_that(
            descriptions,
            matches_all([hc.has_item(hc.contains_string('bicycle'))]))
def test_label_detection_with_video_context(self):
    """Runs shot-mode label detection and expects both known entity labels."""
    # NOTE(review): a method with the same name appears earlier in this file —
    # if both live in the same class only the later definition runs; confirm
    # they belong to different TestCase classes.
    video_context = types.VideoContext(
        label_detection_config=types.LabelDetectionConfig(
            label_detection_mode=enums.LabelDetectionMode.SHOT_MODE))
    with TestPipeline(is_integration_test=True) as p:
        descriptions = (
        p
        | beam.Create([(self.VIDEO_PATH, video_context)])
        | AnnotateVideoWithContext(
            features=[enums.Feature.LABEL_DETECTION])
        | beam.ParDo(extract_entities_descriptions)
        | beam.combiners.ToList())
        # Both 'bicycle' and 'dinosaur' must be present among the entities.
        assert_that(
            descriptions, matches_all([hc.has_items('bicycle', 'dinosaur')]))
def test_streaming_different_file_types(self):
    """Writes windowed elements to per-destination sinks and verifies names.

    Records whose 'foundation' field is 'apache' go to a CSV sink; all others
    go to a JSON sink. Four 10-second windows of data are streamed in, and the
    verification pipeline asserts one windowed output file per destination per
    window.
    """
    # Renamed from `dir`/`input`: both shadowed Python builtins.
    tmp_dir = self._new_tempdir()
    elements = iter(WriteFilesTest.SIMPLE_COLLECTION)
    ts = (
        TestStream().advance_watermark_to(0).add_elements([
            next(elements), next(elements)
        ]).advance_watermark_to(10).add_elements([
            next(elements), next(elements)
        ]).advance_watermark_to(20).add_elements([
            next(elements), next(elements)
        ]).advance_watermark_to(30).add_elements([
            next(elements), next(elements)
        ]).advance_watermark_to(40).advance_watermark_to_infinity())

    def no_colon_file_naming(*args):
        # Timestamps contain ':' which is invalid in file names on some
        # platforms (e.g. Windows); replace with '_'.
        file_name = fileio.destination_prefix_naming()(*args)
        return file_name.replace(':', '_')

    # Write pipeline: route each record to a destination-specific sink.
    with TestPipeline() as p:
        _ = (
            p
            | ts
            | beam.WindowInto(FixedWindows(10))
            | beam.io.fileio.WriteToFiles(
                path=tmp_dir,
                destination=lambda record: record['foundation'],
                sink=lambda dest: (
                    WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                    if dest == 'apache' else WriteFilesTest.JsonSink()),
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0,
            ))

    # Verification pipeline: match produced file names per destination.
    with TestPipeline() as p:
        cncf_files = (
            p
            | fileio.MatchFiles(FileSystems.join(tmp_dir, 'cncf*'))
            | "CncfFileNames" >> beam.Map(lambda fm: fm.path))
        apache_files = (
            p
            | "MatchApache" >> fileio.MatchFiles(
                FileSystems.join(tmp_dir, 'apache*'))
            | "ApacheFileNames" >> beam.Map(lambda fm: fm.path))
        assert_that(
            cncf_files,
            matches_all([
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'),
                stringmatches.matches_regexp(
                    '.*cncf-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
            ]),
            label='verifyCNCFFiles')
        assert_that(
            apache_files,
            matches_all([
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_00-1970-01-01T00_00_10--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_10-1970-01-01T00_00_20--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_20-1970-01-01T00_00_30--.*'),
                stringmatches.matches_regexp(
                    '.*apache-1970-01-01T00_00_30-1970-01-01T00_00_40--.*')
            ]),
            label='verifyApacheFiles')
def test_streaming_complex_timing(self):
    """Streams a keyed collection through 10s fixed windows and verifies
    that the written files and their combined contents cover the whole
    collection.
    """
    # Use state on the TestCase class, since other references would be pickled
    # into a closure and not have the desired side effects.
    #
    # TODO(BEAM-5295): Use assert_that after it works for the cases here in
    # streaming mode.
    WriteFilesTest.all_records = []
    # Renamed from `dir`: shadowed a Python builtin. Trailing separator kept
    # so joined paths below stay byte-identical.
    out_dir = '%s%s' % (self._new_tempdir(), os.sep)

    # Setting up the input (TestStream): advance processing time and the
    # watermark every 5 elements so each window can fire.
    ts = TestStream().advance_watermark_to(0)
    for elm in WriteFilesTest.LARGER_COLLECTION:
        timestamp = int(elm)
        ts.add_elements([('key', '%s' % elm)])
        if timestamp % 5 == 0 and timestamp != 0:
            # TODO(BEAM-3759): Add many firings per window after getting
            # PaneInfo.
            ts.advance_processing_time(5)
            ts.advance_watermark_to(timestamp)
    ts.advance_watermark_to_infinity()

    def no_colon_file_naming(*args):
        # ':' is invalid in file names on some platforms; replace with '_'.
        file_name = fileio.destination_prefix_naming()(*args)
        return file_name.replace(':', '_')

    # The pipeline that we are testing
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
        res = (
            p
            | ts
            | beam.WindowInto(
                FixedWindows(10),
                trigger=trigger.AfterWatermark(),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | beam.GroupByKey()
            | beam.FlatMap(lambda x: x[1]))
        # Triggering after 5 processing-time seconds, and on the watermark.
        # Also discarding old elements.
        _ = (
            res
            | beam.io.fileio.WriteToFiles(
                path=out_dir,
                file_naming=no_colon_file_naming,
                max_writers_per_bundle=0)
            | beam.Map(lambda fr: FileSystems.join(out_dir, fr.file_name))
            | beam.ParDo(self.record_dofn()))

    # Verification pipeline
    with TestPipeline() as p:
        files = (
            p | beam.io.fileio.MatchFiles(FileSystems.join(out_dir, '*')))
        file_names = (files | beam.Map(lambda fm: fm.path))
        file_contents = (
            files
            | beam.io.fileio.ReadMatches()
            | beam.Map(
                lambda rf: (
                    rf.metadata.path, rf.read_utf8().strip().split('\n'))))
        content = (
            file_contents
            | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

        assert_that(
            file_names,
            equal_to(WriteFilesTest.all_records),
            label='AssertFilesMatch')
        assert_that(
            content,
            matches_all(WriteFilesTest.LARGER_COLLECTION),
            label='AssertContentsMatch')