Example #1
0
 def expand(self, pcoll):
     """Write the input collection to MongoDB as a composite transform.

     Assigns object ids, redistributes elements across workers, then
     performs the batched writes.
     """
     with_ids = pcoll | beam.ParDo(_GenerateObjectIdFn())
     sharded = with_ids | Reshuffle()
     # NOTE(review): _WriteMongoFn presumably batches by self._batch_size —
     # confirm against its definition.
     writer = _WriteMongoFn(self._uri, self._db, self._coll,
                            self._batch_size, self._spec)
     return sharded | beam.ParDo(writer)
Example #2
0
 def expand(self, pcoll):
     """Expand into: singleton seed -> split query -> reshuffle -> read.

     Starts from a single-element seed so that the query-splitting and
     reading DoFns drive all of the actual work.
     """
     seed = pcoll.pipeline | 'UserQuery' >> beam.Create([1])
     pages = seed | 'SplitQuery' >> beam.ParDo(
         PaginateQueryDoFn(*self.args, **self.kwargs))
     # Redistribute the split queries across workers before reading.
     sharded = pages | "reshuffle" >> Reshuffle()
     return sharded | 'Read' >> beam.ParDo(
         SQLSourceDoFn(*self.args, **self.kwargs))
Example #3
0
    def expand(self, pcoll):
        """Read entities from Datastore as a composite transform.

        The expansion performs three steps:

        1. Create a singleton of the user-provided query and apply a
           ``ParDo`` that splits it into ``num_splits`` queries when
           possible. If ``num_splits`` is 0, the number of splits is
           computed dynamically based on the size of the data for the
           query.
        2. Shard the resulting ``PCollection`` across workers using a
           ``Reshuffle`` operation.
        3. Apply a ``ParDo`` that reads entities for each split query,
           producing a ``PCollection[Entity]``.
        """
        split_queries = (pcoll.pipeline
                         | 'UserQuery' >> Create([self._query])
                         | 'SplitQuery' >> ParDo(
                             ReadFromDatastore._SplitQueryFn(self._num_splits)))
        sharded = split_queries | Reshuffle()
        return sharded | 'Read' >> ParDo(ReadFromDatastore._QueryFn())