def test_transform_on_gcs(self):
  args = self.test_pipeline.get_full_options_as_args()

  with beam.Pipeline(argv=args) as p:
    matches_pc = (
        p
        | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
        | fileio.MatchAll()
        | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

    assert_that(
        matches_pc,
        equal_to([self.INPUT_FILE] + self.WIKI_FILES),
        label='Matched Files')

    checksum_pc = (
        p
        | 'SingleFile' >> beam.Create([self.INPUT_FILE])
        | 'MatchOneAll' >> fileio.MatchAll()
        | fileio.ReadMatches()
        | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
        | 'Checksums' >> beam.Map(compute_hash))

    assert_that(
        checksum_pc,
        equal_to([self.KINGLEAR_CHECKSUM]),
        label='Assert Checksums')
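# `compute_hash` above is assumed to come from apache_beam.testing.test_utils.
# A minimal sketch of such a helper, as an assumption rather than the exact
# library implementation: hash the sorted string representations of the
# elements so the checksum is insensitive to PCollection ordering.
import hashlib


def compute_hash(content, hashing_alg=hashlib.md5):
  """Returns a stable hex digest for a list of strings/bytes."""
  content = [
      str(x).encode('utf-8') if not isinstance(x, bytes) else x for x in content
  ]
  content.sort()  # Order-insensitive: PCollections carry no element order.
  m = hashing_alg()
  for elem in content:
    m.update(elem)
  return m.hexdigest()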
def test_store_fileio_file_small_buffer_flush(self, FakeClient):
  input_dict = {}
  input_dict['project_id'] = "test_project"
  input_dict['region'] = "test_region"
  input_dict['dataset_id'] = "test_dataset_id"
  input_dict['dicom_store_id'] = "test_dicom_store_id"

  fc = FakeHttpClient()
  FakeClient.return_value = fc

  temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
  dict_input_1 = {
      'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'
  }
  str_input_1 = json.dumps(dict_input_1)
  self._create_temp_file(dir=temp_dir, content=str_input_1)
  dict_input_2 = {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'}
  str_input_2 = json.dumps(dict_input_2)
  self._create_temp_file(dir=temp_dir, content=str_input_2)
  dict_input_3 = {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'}
  str_input_3 = json.dumps(dict_input_3)
  self._create_temp_file(dir=temp_dir, content=str_input_3)

  with TestPipeline() as p:
    results = (
        p
        | beam.Create([FileSystems.join(temp_dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | UploadToDicomStore(input_dict, 'fileio', buffer_size=1)
        | beam.Map(lambda x: x['success']))
    assert_that(results, equal_to([True] * 3))

  self.assertTrue(dict_input_1 in fc.dicom_metadata)
  self.assertTrue(dict_input_2 in fc.dicom_metadata)
  self.assertTrue(dict_input_3 in fc.dicom_metadata)
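# Tests in this section lean on `_new_tempdir` / `_create_temp_file` helpers
# from a shared base class that is not shown here. A minimal sketch of what
# such a base class might look like; the names of the mixin and its defaults
# are assumptions for illustration, not the exact Beam helpers:
import os
import shutil
import tempfile
import unittest


class _TestCaseWithTempDirCleanUp(unittest.TestCase):
  """TestCase that tracks temp dirs and removes them on tearDown."""
  def setUp(self):
    self._tempdirs = []

  def tearDown(self):
    for path in self._tempdirs:
      if os.path.exists(path):
        shutil.rmtree(path)

  def _new_tempdir(self):
    result = tempfile.mkdtemp()
    self._tempdirs.append(result)
    return result

  def _create_temp_file(self, name='', dir=None, content=b''):
    # Creates a uniquely-named file under `dir` and returns its path.
    if not name:
      name = tempfile.template
    file_name = tempfile.NamedTemporaryFile(
        delete=False, prefix=name, dir=dir).name
    if isinstance(content, str):
      content = content.encode('utf-8')
    with open(file_name, 'wb') as f:
      f.write(content)
    return file_name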
def expand(self, pcoll):
  return (
      pcoll
      | 'MatchAll' >> fileio.MatchAll()
      | beam.Reshuffle()
      | 'ReadEach' >> fileio.ReadMatches()
      | beam.FlatMap(
          lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open()))))
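# A hedged, self-contained usage sketch of the expand() above. `ReadCsvFiles`
# is a hypothetical name chosen here for illustration; the glob is likewise
# illustrative. Each matched file is opened lazily and its rows come out as
# dicts keyed by the CSV header.
import csv
import io

import apache_beam as beam
from apache_beam.io import fileio


class ReadCsvFiles(beam.PTransform):  # hypothetical wrapper for the expand()
  def expand(self, pcoll):
    return (
        pcoll
        | 'MatchAll' >> fileio.MatchAll()
        | beam.Reshuffle()
        | 'ReadEach' >> fileio.ReadMatches()
        | beam.FlatMap(
            lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open()))))


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['/tmp/data/*.csv'])  # illustrative file pattern
      | ReadCsvFiles()
      | beam.Map(print))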
def test_basic_file_name_provided(self):
  content = 'TestingMyContent\nIn multiple lines\nhaha!'
  dir = '%s/' % self._new_tempdir()
  self._create_temp_file(dir=dir, content=content)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([dir])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.Map(lambda f: f.read().decode('utf-8')))

    assert_that(content_pc, equal_to([content]))
def test_basic_file_name_provided(self):
  content = 'TestingMyContent\nIn multiple lines\nhaha!'
  dir = '%s%s' % (self._new_tempdir(), os.sep)
  self._create_temp_file(dir=dir, content=content)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

    assert_that(content_pc, equal_to(content.splitlines()))
def test_csv_file_source(self):
  content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
  rows = [r.split(',') for r in content.split('\n')]

  dir = '%s%s' % (self._new_tempdir(), os.sep)
  self._create_temp_file(dir=dir, content=content)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

    assert_that(content_pc, equal_to(rows))
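# `_get_file_reader` above is a module-level helper not shown in this section.
# A plausible sketch, assuming it only wraps the byte stream from
# ReadableFile.open() in a text-mode reader so csv.reader sees str lines:
import io


def _get_file_reader(readable_file):
  return io.TextIOWrapper(readable_file.open())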
def test_match_files_one_directory_failure(self):
  directories = [
      '%s%s' % (self._new_tempdir(), os.sep),
      '%s%s' % (self._new_tempdir(), os.sep)
  ]

  files = list()
  files.append(self._create_temp_file(dir=directories[0]))
  files.append(self._create_temp_file(dir=directories[0]))

  with TestPipeline() as p:
    files_pc = (
        p
        | beam.Create(['%s*' % d for d in directories])
        | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
        | beam.Map(lambda x: x.path))

    assert_that(files_pc, equal_to(files))
def test_read_gzip_compressed_file_without_suffix(self):
  dir = '%s%s' % (self._new_tempdir(), os.sep)

  file_contents = b'compressed_contents!'
  import gzip
  with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
    f.write(file_contents)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.Map(
            lambda rf: rf.open(compression_type=CompressionTypes.GZIP).read(
                len(file_contents))))

    assert_that(content_pc, equal_to([file_contents]))
def test_match_files_one_directory_failure(self):
  directories = [
      '%s%s' % (self._new_tempdir(), os.sep),
      '%s%s' % (self._new_tempdir(), os.sep)
  ]

  files = list()
  files.append(self._create_temp_file(dir=directories[0]))
  files.append(self._create_temp_file(dir=directories[0]))

  with self.assertRaises(beam.io.filesystem.BeamIOError):
    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create(directories)
          | fileio.MatchAll(fileio.EmptyMatchTreatment.DISALLOW)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))
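# The two variants above exercise fileio.EmptyMatchTreatment: the empty second
# directory passes under ALLOW_IF_WILDCARD but raises BeamIOError under
# DISALLOW. A minimal sketch of the decision rule, hedged as an approximation
# of the library's behavior rather than its exact code:
from apache_beam.io import fileio


def allow_empty_match(pattern, setting):
  if setting == fileio.EmptyMatchTreatment.ALLOW:
    return True
  elif (setting == fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD and
        '*' in pattern):
    return True
  elif setting == fileio.EmptyMatchTreatment.DISALLOW:
    return False
  else:
    raise ValueError('Unknown empty match treatment: %s' % setting)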
def test_match_all_two_directories(self):
  files = []
  directories = []

  for _ in range(2):
    # TODO: What about this having to append the ending slash?
    d = '%s/' % self._new_tempdir()
    directories.append(d)

    files.append(self._create_temp_file(dir=d))
    files.append(self._create_temp_file(dir=d))

  with TestPipeline() as p:
    files_pc = (
        p
        | beam.Create(directories)
        | fileio.MatchAll()
        | beam.Map(lambda x: x.path))

    assert_that(files_pc, equal_to(files))
def test_infer_compressed_file(self):
  dir = '%s%s' % (self._new_tempdir(), os.sep)

  file_contents = b'compressed_contents!'
  import gzip
  with gzip.GzipFile(os.path.join(dir, 'compressed.gz'), 'w') as f:
    f.write(file_contents)

  file_contents2 = b'compressed_contents_bz2!'
  import bz2
  with bz2.BZ2File(os.path.join(dir, 'compressed2.bz2'), 'w') as f:
    f.write(file_contents2)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.Map(lambda rf: rf.open().readline()))

    assert_that(content_pc, equal_to([file_contents, file_contents2]))
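# rf.open() above defaults to CompressionTypes.AUTO, which infers the codec
# from the file extension; that is why the .gz and .bz2 files decompress
# without an explicit compression_type, while the suffix-less file in the
# earlier test needed one. A small sketch of the inference (file names are
# illustrative):
from apache_beam.io.filesystem import CompressionTypes

assert CompressionTypes.detect_compression_type('f.gz') == CompressionTypes.GZIP
assert (
    CompressionTypes.detect_compression_type('f.bz2') == CompressionTypes.BZIP2)
assert (
    CompressionTypes.detect_compression_type('f.txt') ==
    CompressionTypes.UNCOMPRESSED)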
def test_csv_file_source(self):
  content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
  rows = [r.split(',') for r in content.split('\n')]

  dir = '%s/' % self._new_tempdir()
  self._create_temp_file(dir=dir, content=content)

  def get_csv_reader(readable_file):
    if sys.version_info >= (3, 0):
      return csv.reader(io.TextIOWrapper(readable_file.open()))
    else:
      return csv.reader(readable_file.open())

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([dir])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(get_csv_reader))

    assert_that(content_pc, equal_to(rows))
def expand(self, root):
  paths_pcoll = root | beam.Create([self.path])
  match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
  if not match.metadata_list:
    # TODO(BEAM-12031): This should be allowed for streaming pipelines if
    # user provides an explicit schema.
    raise FileNotFoundError(f"Found no files that match {self.path!r}")
  first_path = match.metadata_list[0].path
  with io.filesystems.FileSystems.open(first_path) as handle:
    if not self.binary:
      handle = TextIOWrapper(handle)
    if self.incremental:
      sample = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
    else:
      sample = self.reader(handle, *self.args, **self.kwargs)

  matches_pcoll = paths_pcoll | fileio.MatchAll()
  indices_pcoll = (
      matches_pcoll.pipeline
      | 'DoOnce' >> beam.Create([None])
      | beam.Map(
          lambda _, paths: {path: ix for ix, path in enumerate(sorted(paths))},
          paths=beam.pvalue.AsList(
              matches_pcoll | beam.Map(lambda match: match.path))))

  pcoll = (
      matches_pcoll
      | beam.Reshuffle()
      | fileio.ReadMatches()
      | beam.ParDo(
          _ReadFromPandasDoFn(
              self.reader,
              self.args,
              self.kwargs,
              self.binary,
              self.incremental,
              self.splitter),
          path_indices=beam.pvalue.AsSingleton(indices_pcoll)))

  from apache_beam.dataframe import convert
  return convert.to_dataframe(pcoll, proxy=sample[:0])
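# A hedged usage sketch for the expand() above, assuming it backs the
# dataframe read helpers such as apache_beam.dataframe.io.read_csv: the
# zero-row `proxy` sampled from the first matched file gives downstream
# dataframe operations a schema, while the rows themselves are read in
# parallel by _ReadFromPandasDoFn. The bucket and pattern are illustrative.
import apache_beam as beam
from apache_beam.dataframe.io import read_csv

with beam.Pipeline() as p:
  df = p | read_csv('gs://my-bucket/input*.csv')  # illustrative path
  # From here on, df behaves like a deferred pandas DataFrame.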