def expand(self, pcoll):
  return (
      pcoll
      # Ensure each document carries a generated ObjectId before writing.
      | beam.ParDo(_GenerateObjectIdFn())
      # Redistribute the documents across workers.
      | Reshuffle()
      # Write the documents to MongoDB in batches.
      | beam.ParDo(
          _WriteMongoFn(
              self._uri, self._db, self._coll, self._batch_size,
              self._spec)))
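
# A minimal usage sketch, assuming the `expand` above belongs to Beam's
# `WriteToMongoDB` transform (`apache_beam.io.mongodbio`). The connection
# URI, database, and collection names below are placeholders.
import apache_beam as beam
from apache_beam.io.mongodbio import WriteToMongoDB

with beam.Pipeline() as p:
  _ = (
      p
      | 'CreateDocs' >> beam.Create([{'name': 'alice'}, {'name': 'bob'}])
      | 'WriteToMongo' >> WriteToMongoDB(
          uri='mongodb://localhost:27017',  # placeholder connection string
          db='testdb',
          coll='testcoll',
          batch_size=100))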
def expand(self, pcoll):
  return (
      pcoll.pipeline
      # Seed the pipeline with a single element so the query can be split.
      | 'UserQuery' >> beam.Create([1])
      # Split the user query into paginated sub-queries.
      | 'SplitQuery' >> beam.ParDo(
          PaginateQueryDoFn(*self.args, **self.kwargs))
      # Redistribute the sub-queries across workers.
      | 'Reshuffle' >> Reshuffle()
      # Execute each sub-query and emit its rows.
      | 'Read' >> beam.ParDo(SQLSourceDoFn(*self.args, **self.kwargs)))
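
# A self-contained sketch of the same seed/split/reshuffle/read pattern.
# `SplitRangeFn` and `ReadRangeFn` are hypothetical DoFns invented for
# illustration, not part of any library: one seed element fans out into
# ranges, the Reshuffle spreads the ranges across workers, and each range
# is then read independently in parallel.
import apache_beam as beam


class SplitRangeFn(beam.DoFn):
  """Splits the keyspace [0, 100) into fixed-size ranges."""
  def process(self, unused_element, size=25):
    for start in range(0, 100, size):
      yield (start, start + size)


class ReadRangeFn(beam.DoFn):
  """Pretends to read the records in one range."""
  def process(self, span):
    start, stop = span
    for key in range(start, stop):
      yield {'key': key}


with beam.Pipeline() as p:
  _ = (
      p
      | 'Seed' >> beam.Create([None])
      | 'Split' >> beam.ParDo(SplitRangeFn())
      | 'Reshuffle' >> beam.Reshuffle()
      | 'Read' >> beam.ParDo(ReadRangeFn())
      | beam.Map(print))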
def expand(self, pcoll):
  # This is a composite transform that involves the following steps:
  #
  # 1. Create a singleton of the user-provided `query` and apply a ``ParDo``
  #    that splits the query into `num_splits` queries if possible.
  #
  #    If the value of `num_splits` is 0, the number of splits will be
  #    computed dynamically based on the size of the data for the `query`.
  #
  # 2. The resulting ``PCollection`` is sharded across workers using a
  #    ``Reshuffle`` operation.
  #
  # 3. In the third step, a ``ParDo`` reads entities for each query and
  #    outputs a ``PCollection[Entity]``.
  return (
      pcoll.pipeline
      | 'UserQuery' >> Create([self._query])
      | 'SplitQuery' >> ParDo(
          ReadFromDatastore._SplitQueryFn(self._num_splits))
      | Reshuffle()
      | 'Read' >> ParDo(ReadFromDatastore._QueryFn()))
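
# A usage sketch, assuming the `expand` above is Beam's `ReadFromDatastore`
# from the `apache_beam.io.gcp.datastore.v1new` module. The project and kind
# are placeholders; `num_splits=0` lets the transform compute the number of
# splits dynamically from the estimated size of the query's data.
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

with beam.Pipeline() as p:
  entities = (
      p
      | 'ReadEntities' >> ReadFromDatastore(
          query=Query(kind='MyKind', project='my-project'),  # placeholders
          num_splits=0))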