def run(argv=None):
    """Build and run the Datastore-to-BigQuery export pipeline on Dataflow.

    Reads every entity of every kind from Cloud Datastore, converts each
    entity to a BigQuery row, and batch-loads the rows into per-kind tables.

    Args:
        argv: Optional list of command-line flags forwarded to Ds2bqOptions.
    """
    from datetime import datetime

    from apache_beam.io.gcp.bigquery_file_loads import BigQueryBatchFileLoads
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore

    options = Ds2bqOptions(flags=argv)

    # Pin the execution environment: region, worker pool sizing, the
    # Dataflow runner, and the setup file that ships the local package
    # (the `transform` modules imported below) to the workers.
    gcp_options = options.view_as(
        beam.options.pipeline_options.GoogleCloudOptions)
    gcp_options.region = "asia-northeast1"
    worker_options = options.view_as(
        beam.options.pipeline_options.WorkerOptions)
    worker_options.num_workers = 2
    worker_options.disk_size_gb = 50
    options.view_as(
        beam.options.pipeline_options.StandardOptions).runner = 'DataflowRunner'
    options.view_as(
        beam.options.pipeline_options.SetupOptions).setup_file = './setup.py'
    logging.info(options)

    project_id = gcp_options.project
    # Timestamped GCS scratch directory for the BigQuery load files, so
    # concurrent/successive runs do not collide.
    gcs_dir = "gs://{}-dataflow/temp/{}".format(
        project_id, datetime.now().strftime("%Y%m%d%H%M%S"))

    with beam.Pipeline(options=options) as p:
        from transform.datastore import convert, CreateQuery, GetKinds
        from transform.bigquery import GetBqTableMap, get_partition_conf

        # Side input mapping a Datastore kind name to its BigQuery table.
        table_names_dict = beam.pvalue.AsDict(
            p | "Get BigQuery Table Map" >> GetBqTableMap(
                project_id, options.dataset))

        # Enumerate kinds, build one query per kind, then read the entities.
        entities = (
            p
            | 'Get Kinds' >> GetKinds(project_id)
            | 'Create Query' >> beam.ParDo(CreateQuery(project_id))
            | 'Get Entity' >> beam.ParDo(ReadFromDatastore._QueryFn()))

        _ = (
            entities
            | 'Convert Entity' >> beam.Map(convert)
            | 'BigQuery Load' >> BigQueryBatchFileLoads(
                # Route each row to the table registered for its kind.
                destination=lambda row, table_dict: table_dict[
                    row["__key__"]["kind"]],
                custom_gcs_temp_location=gcs_dir,
                write_disposition='WRITE_TRUNCATE',
                table_side_inputs=(table_names_dict, ),
                additional_bq_parameters=get_partition_conf,
                schema='SCHEMA_AUTODETECT'))
def test_QueryFn_metric_on_failure(self):
    """Verify _QueryFn records service-call metrics for both outcomes.

    A fetch that raises DeadlineExceeded must increment the
    "deadline_exceeded" read-call counter; a subsequent successful
    (empty) fetch must increment the "ok" counter.
    """
    # Start from a clean process-wide metrics container so the expected
    # counts below are exact rather than cumulative.
    MetricsEnvironment.process_wide_container().reset()
    with patch.object(helper, 'get_client', return_value=self._mock_client):
        self._mock_query.project = self._PROJECT
        self._mock_query.namespace = self._NAMESPACE
        query_fn = ReadFromDatastore._QueryFn()
        client_query = self._mock_query._to_client_query()

        # Failure path: the first fetch raises DeadlineExceeded.
        client_query.fetch.side_effect = [
            exceptions.DeadlineExceeded("Deadline exceed")
        ]
        list(query_fn.process(self._mock_query))
        self.verify_read_call_metric(
            self._PROJECT, self._NAMESPACE, "deadline_exceeded", 1)

        # Success path: the next fetch yields an empty result set.
        client_query.fetch.side_effect = [[]]
        list(query_fn.process(self._mock_query))
        self.verify_read_call_metric(
            self._PROJECT, self._NAMESPACE, "ok", 1)