def test_limit_write_shard_limit_1(self):
  variants = self._get_sample_variants()
  input_pcoll = Create(variants)
  pipeline = TestPipeline()
  output_pcoll = (
      pipeline
      | input_pcoll
      | 'LimitWrite' >> limit_write.LimitWrite(1))
  assert_that(output_pcoll, equal_to(variants))
  pipeline.run()
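# For context, a hedged sketch of how a shard-limiting transform such as
# limit_write.LimitWrite could work (hypothetical implementation; the real
# transform in this repo may differ): key each element into at most `count`
# random groups, group by key, then re-emit the values, so downstream steps
# see at most `count` keys.
import random

import apache_beam as beam


class _LimitShardsSketch(beam.PTransform):
  """Hypothetical shard limiter: caps fan-out to at most `count` groups."""

  def __init__(self, count):
    self._count = count

  def expand(self, pcoll):
    return (pcoll
            # Assign each element a random key in [0, count).
            | beam.Map(lambda v: (random.randint(0, self._count - 1), v))
            # Collapse the collection to at most `count` keys...
            | beam.GroupByKey()
            # ...then flatten the grouped values back out unchanged.
            | beam.FlatMap(lambda kv: kv[1]))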
def expand(self, pcoll):
  bq_rows = pcoll | 'ConvertToBigQueryTableRow' >> beam.ParDo(
      ConvertVariantToRow(
          self._bigquery_row_generator,
          self._allow_incompatible_records,
          self._omit_empty_sample_calls))
  if self._num_bigquery_write_shards > 1:
    # We split the data into self._num_bigquery_write_shards random
    # partitions and then write each partition to the final BQ table by
    # appending them together. Combined with the LimitWrite transform,
    # this avoids the BQ failure that occurs when a single write produces
    # too many files.
    bq_row_partitions = bq_rows | beam.Partition(
        lambda _, n: random.randint(0, n - 1),
        self._num_bigquery_write_shards)
    bq_writes = []
    for i in range(self._num_bigquery_write_shards):
      bq_rows = (bq_row_partitions[i]
                 | 'LimitWrite' + str(i) >>
                 limit_write.LimitWrite(_WRITE_SHARDS_LIMIT))
      bq_writes.append(
          bq_rows
          | 'WriteToBigQuery' + str(i) >> beam.io.Write(
              beam.io.BigQuerySink(
                  self._output_table,
                  schema=self._schema,
                  create_disposition=(
                      beam.io.BigQueryDisposition.CREATE_NEVER),
                  write_disposition=(
                      beam.io.BigQueryDisposition.WRITE_APPEND))))
    return bq_writes
  else:
    return (bq_rows
            | 'WriteToBigQuery' >> beam.io.Write(
                beam.io.BigQuerySink(
                    self._output_table,
                    schema=self._schema,
                    create_disposition=(
                        beam.io.BigQueryDisposition.CREATE_NEVER),
                    write_disposition=(
                        beam.io.BigQueryDisposition.WRITE_APPEND
                        if self._append
                        else beam.io.BigQueryDisposition.WRITE_EMPTY))))
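# A minimal, standalone sketch (assuming only that apache_beam is installed)
# of the random-partition pattern used above: beam.Partition with a random
# partition function spreads elements uniformly across n outputs, which is
# the same trick expand() uses to fan rows out across write shards.
import random

import apache_beam as beam

with beam.Pipeline() as demo_pipeline:
  partitions = (
      demo_pipeline
      | beam.Create(list(range(10)))
      | beam.Partition(lambda _, n: random.randint(0, n - 1), 3))
  for i, part in enumerate(partitions):
    # Each branch needs a unique label, mirroring 'WriteToBigQuery' + str(i).
    _ = part | 'Print%d' % i >> beam.Map(
        lambda x, shard=i: print('shard %d: %s' % (shard, x)))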