Example 1: matches files on GCS with fileio.MatchAll, checks the matched paths, then reads one match and verifies its checksum.
    def test_transform_on_gcs(self):
        args = self.test_pipeline.get_full_options_as_args()

        with beam.Pipeline(argv=args) as p:
            matches_pc = (
                p
                | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
                | fileio.MatchAll()
                | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

            assert_that(matches_pc,
                        equal_to([self.INPUT_FILE] + self.WIKI_FILES),
                        label='Matched Files')

            checksum_pc = (
                p
                | 'SingleFile' >> beam.Create([self.INPUT_FILE])
                | 'MatchOneAll' >> fileio.MatchAll()
                | fileio.ReadMatches()
                | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
                | 'Checksums' >> beam.Map(compute_hash))

            assert_that(checksum_pc,
                        equal_to([self.KINGLEAR_CHECKSUM]),
                        label='Assert Checksums')
Example 2: writes three JSON metadata files, reads them back through MatchAll/ReadMatches, and uploads them to a fake DICOM store via UploadToDicomStore with buffer_size=1.
  def test_store_fileio_file_small_buffer_flush(self, FakeClient):
    input_dict = {}
    input_dict['project_id'] = "test_project"
    input_dict['region'] = "test_region"
    input_dict['dataset_id'] = "test_dataset_id"
    input_dict['dicom_store_id'] = "test_dicom_store_id"

    fc = FakeHttpClient()
    FakeClient.return_value = fc

    temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
    dict_input_1 = {
        'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'
    }
    str_input_1 = json.dumps(dict_input_1)
    self._create_temp_file(dir=temp_dir, content=str_input_1)
    dict_input_2 = {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'}
    str_input_2 = json.dumps(dict_input_2)
    self._create_temp_file(dir=temp_dir, content=str_input_2)
    dict_input_3 = {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'}
    str_input_3 = json.dumps(dict_input_3)
    self._create_temp_file(dir=temp_dir, content=str_input_3)

    with TestPipeline() as p:
      results = (
          p
          | beam.Create([FileSystems.join(temp_dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | UploadToDicomStore(input_dict, 'fileio', buffer_size=1)
          | beam.Map(lambda x: x['success']))
      assert_that(results, equal_to([True] * 3))
    self.assertTrue(dict_input_1 in fc.dicom_metadata)
    self.assertTrue(dict_input_2 in fc.dicom_metadata)
    self.assertTrue(dict_input_3 in fc.dicom_metadata)
Example 3: a PTransform expand() that matches file patterns, reshuffles, and emits one dict per CSV row from each matched file via csv.DictReader.
  def expand(self, pcoll):
    return (
        pcoll
        | 'MatchAll' >> fileio.MatchAll()
        | beam.Reshuffle()
        | 'ReadEach' >> fileio.ReadMatches()
        | beam.FlatMap(
            lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open()))))
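On its own, the expand() above needs to live inside a PTransform subclass before it can be applied. A minimal self-contained sketch, assuming a hypothetical wrapper class CsvRows and a local ./data/*.csv glob (both names are illustrative, not from the original snippet):

import csv
import io

import apache_beam as beam
from apache_beam.io import fileio


class CsvRows(beam.PTransform):
  # Match the incoming file patterns, spread the matches across workers,
  # then read each matched file and emit one dict per CSV row.
  def expand(self, pcoll):
    return (
        pcoll
        | 'MatchAll' >> fileio.MatchAll()
        | beam.Reshuffle()
        | 'ReadEach' >> fileio.ReadMatches()
        | beam.FlatMap(
            lambda rfile: csv.DictReader(io.TextIOWrapper(rfile.open()))))


with beam.Pipeline() as p:
  rows = p | beam.Create(['./data/*.csv']) | CsvRows()
  rows | beam.Map(print)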
Example 4: matches a temp directory path directly, reads the single file it contains, and decodes the bytes as UTF-8.
    def test_basic_file_name_provided(self):
        content = 'TestingMyContent\nIn multiple lines\nhaha!'
        dir = '%s/' % self._new_tempdir()
        self._create_temp_file(dir=dir, content=content)

        with TestPipeline() as p:
            content_pc = (p
                          | beam.Create([dir])
                          | fileio.MatchAll()
                          | fileio.ReadMatches()
                          | beam.Map(lambda f: f.read().decode('utf-8')))

            assert_that(content_pc, equal_to([content]))
Example 5: matches a wildcard under a temp directory and flat-maps the file contents into individual lines.
  def test_basic_file_name_provided(self):
    content = 'TestingMyContent\nIn multiple lines\nhaha!'
    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (
          p
          | beam.Create([FileSystems.join(dir, '*')])
          | fileio.MatchAll()
          | fileio.ReadMatches()
          | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

      assert_that(content_pc, equal_to(content.splitlines()))
Example 6: reads a CSV file through MatchAll/ReadMatches and parses its rows with csv.reader.
  def test_csv_file_source(self):
    content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
    rows = [r.split(',') for r in content.split('\n')]

    dir = '%s%s' % (self._new_tempdir(), os.sep)
    self._create_temp_file(dir=dir, content=content)

    with TestPipeline() as p:
      content_pc = (p
                    | beam.Create([FileSystems.join(dir, '*')])
                    | fileio.MatchAll()
                    | fileio.ReadMatches()
                    | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

      assert_that(content_pc, equal_to(rows))
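The _get_file_reader helper is defined elsewhere in the test module and is not part of this snippet; a minimal sketch of what it has to do (wrap the opened match in a text-mode reader so csv.reader receives strings), assuming Python 3:

import io

def _get_file_reader(readable_file):
  # ReadMatches yields ReadableFile objects whose open() returns a binary handle;
  # csv.reader needs text, so wrap it.
  return io.TextIOWrapper(readable_file.open())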
Example 7: matches wildcard patterns over two directories, one of them empty, using EmptyMatchTreatment.ALLOW_IF_WILDCARD so the empty match does not fail the pipeline.
  def test_match_files_one_directory_failure(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)]

    files = list()
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with TestPipeline() as p:
      files_pc = (
          p
          | beam.Create(['%s*' % d for d in directories])
          | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
          | beam.Map(lambda x: x.path))

      assert_that(files_pc, equal_to(files))
Example 8: reads a gzip-compressed file that has no .gz suffix by passing an explicit compression_type to open().
    def test_read_gzip_compressed_file_without_suffix(self):
        dir = '%s%s' % (self._new_tempdir(), os.sep)

        file_contents = b'compressed_contents!'
        import gzip
        with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
            f.write(file_contents)

        with TestPipeline() as p:
            content_pc = (
                p
                | beam.Create([FileSystems.join(dir, '*')])
                | fileio.MatchAll()
                | fileio.ReadMatches()
                | beam.Map(lambda rf: rf.open(compression_type=CompressionTypes
                                              .GZIP).read(len(file_contents))))

            assert_that(content_pc, equal_to([file_contents]))
Example 9: matches two directories, one of them empty, under EmptyMatchTreatment.DISALLOW, so the empty match raises BeamIOError (contrast with Example 7).
  def test_match_files_one_directory_failure(self):
    directories = [
        '%s%s' % (self._new_tempdir(), os.sep),
        '%s%s' % (self._new_tempdir(), os.sep)]

    files = list()
    files.append(self._create_temp_file(dir=directories[0]))
    files.append(self._create_temp_file(dir=directories[0]))

    with self.assertRaises(beam.io.filesystem.BeamIOError):
      with TestPipeline() as p:
        files_pc = (
            p
            | beam.Create(directories)
            | fileio.MatchAll(fileio.EmptyMatchTreatment.DISALLOW)
            | beam.Map(lambda x: x.path))

        assert_that(files_pc, equal_to(files))
Example 10: matches all files across two temporary directories and checks the returned paths.
    def test_match_all_two_directories(self):
        files = []
        directories = []

        for _ in range(2):
            # TODO: What about this having to append the ending slash?
            d = '%s/' % self._new_tempdir()
            directories.append(d)

            files.append(self._create_temp_file(dir=d))
            files.append(self._create_temp_file(dir=d))

        with TestPipeline() as p:
            files_pc = (p
                        | beam.Create(directories)
                        | fileio.MatchAll()
                        | beam.Map(lambda x: x.path))

            assert_that(files_pc, equal_to(files))
Example 11: infers GZIP and BZIP2 compression from the .gz and .bz2 suffixes when reading matched files.
    def test_infer_compressed_file(self):
        dir = '%s%s' % (self._new_tempdir(), os.sep)

        file_contents = b'compressed_contents!'
        import gzip
        with gzip.GzipFile(os.path.join(dir, 'compressed.gz'), 'w') as f:
            f.write(file_contents)

        file_contents2 = b'compressed_contents_bz2!'
        import bz2
        with bz2.BZ2File(os.path.join(dir, 'compressed2.bz2'), 'w') as f:
            f.write(file_contents2)

        with TestPipeline() as p:
            content_pc = (p
                          | beam.Create([FileSystems.join(dir, '*')])
                          | fileio.MatchAll()
                          | fileio.ReadMatches()
                          | beam.Map(lambda rf: rf.open().readline()))

            assert_that(content_pc, equal_to([file_contents, file_contents2]))
Example 12: reads a CSV file using a Python 2/3 compatible csv reader wrapper around the ReadMatches output.
    def test_csv_file_source(self):
        content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
        rows = [r.split(',') for r in content.split('\n')]

        dir = '%s/' % self._new_tempdir()
        self._create_temp_file(dir=dir, content=content)

        def get_csv_reader(readable_file):
            if sys.version_info >= (3, 0):
                return csv.reader(io.TextIOWrapper(readable_file.open()))
            else:
                return csv.reader(readable_file.open())

        with TestPipeline() as p:
            content_pc = (p
                          | beam.Create([dir])
                          | fileio.MatchAll()
                          | fileio.ReadMatches()
                          | beam.FlatMap(get_csv_reader))

            assert_that(content_pc, equal_to(rows))
Example 13: expand() of a pandas-backed read transform: it samples the first matched file to build a proxy schema, then reads every match via MatchAll/ReadMatches and converts the result to a deferred DataFrame.
    def expand(self, root):
        paths_pcoll = root | beam.Create([self.path])
        match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not match.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines if
            # user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")
        first_path = match.metadata_list[0].path
        with io.filesystems.FileSystems.open(first_path) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        matches_pcoll = paths_pcoll | fileio.MatchAll()
        indices_pcoll = (
            matches_pcoll.pipeline
            | 'DoOnce' >> beam.Create([None])
            | beam.Map(
                lambda _, paths:
                {path: ix
                 for ix, path in enumerate(sorted(paths))},
                paths=beam.pvalue.AsList(matches_pcoll
                                         | beam.Map(lambda match: match.path)))
        )

        pcoll = (matches_pcoll
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter),
                     path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll, proxy=sample[:0])
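This expand() appears to come from Beam's pandas-backed readers (apache_beam.dataframe.io), where MatchAll/ReadMatches feed the pandas parsers. A minimal usage sketch, assuming a local input*.csv glob exists:

import apache_beam as beam
from apache_beam.dataframe.convert import to_pcollection
from apache_beam.dataframe.io import read_csv

with beam.Pipeline() as p:
  # read_csv returns a deferred DataFrame; its files are matched and read as above.
  df = p | read_csv('input*.csv')
  # Convert back to a PCollection of rows for ordinary Beam processing.
  rows = to_pcollection(df)
  rows | beam.Map(print)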