Example 1

This example builds a Dataflow pipeline that exports Cloud Datastore entities to BigQuery: it enumerates the kinds in a project, creates one query per kind, reads the entities with ReadFromDatastore._QueryFn inside a ParDo, converts them to rows, and loads them with BigQueryBatchFileLoads.
import logging

import apache_beam as beam


def run(argv=None):
    from apache_beam.io.gcp.bigquery_file_loads import BigQueryBatchFileLoads
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
    from datetime import datetime

    # Ds2bqOptions is a project-specific PipelineOptions subclass (not shown
    # here) that provides the dataset option used below.
    options = Ds2bqOptions(flags=argv)
    options.view_as(beam.options.pipeline_options.GoogleCloudOptions
                    ).region = "asia-northeast1"
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).num_workers = 2
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).disk_size_gb = 50

    # Setup
    options.view_as(beam.options.pipeline_options.StandardOptions
                    ).runner = 'DataflowRunner'
    options.view_as(
        beam.options.pipeline_options.SetupOptions).setup_file = './setup.py'

    logging.info(options)

    project_id = options.view_as(
        beam.options.pipeline_options.GoogleCloudOptions).project
    gcs_dir = "gs://{}-dataflow/temp/{}".format(
        project_id,
        datetime.now().strftime("%Y%m%d%H%M%S"))

    with beam.Pipeline(options=options) as p:
        # Project-specific helpers, defined elsewhere in this project: they
        # enumerate kinds, build per-kind queries, convert entities to
        # BigQuery rows, and supply the table map and partitioning config.
        from transform.datastore import convert, CreateQuery, GetKinds
        from transform.bigquery import GetBqTableMap, get_partition_conf
        table_names_dict = beam.pvalue.AsDict(
            p | "Get BigQuery Table Map" >> GetBqTableMap(
                project_id, options.dataset))

        entities = (p
                    | 'Get Kinds' >> GetKinds(project_id)
                    | 'Create Query' >> beam.ParDo(CreateQuery(project_id))
                    | 'Get Entity' >> beam.ParDo(ReadFromDatastore._QueryFn()))

        _ = (entities
             | 'Convert Entity' >> beam.Map(convert)
             # Route each row to the BigQuery table registered for its kind,
             # looked up in the table map provided as a side input.
             | 'BigQuery Load' >> BigQueryBatchFileLoads(
                 destination=lambda row, table_dict: table_dict[row["__key__"][
                     "kind"]],
                 custom_gcs_temp_location=gcs_dir,
                 write_disposition='WRITE_TRUNCATE',
                 table_side_inputs=(table_names_dict, ),
                 additional_bq_parameters=get_partition_conf,
                 schema='SCHEMA_AUTODETECT'))
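Note that Example 1 calls the internal DoFn ReadFromDatastore._QueryFn directly inside a ParDo because its queries only exist at runtime, one per discovered kind. When the query is known at pipeline-construction time, the public ReadFromDatastore transform is the usual entry point. A minimal sketch of that conventional usage follows; the kind name 'MyKind' and project 'my-project' are placeholders, not values taken from the example above.

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

# Read every entity of a single, statically known kind.
# 'MyKind' and 'my-project' are placeholder values.
with beam.Pipeline() as p:
    entities = (p
                | 'Read Entities' >> ReadFromDatastore(
                    query=Query(kind='MyKind', project='my-project')))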
Example 2

This example is a unit test exercising the service-call metrics of ReadFromDatastore._QueryFn: it verifies that the DoFn records the Datastore read-call counter both when the backend raises DeadlineExceeded and when the fetch succeeds. The mock fixtures (self._mock_client, self._mock_query) and verify_read_call_metric are defined in the enclosing test class, which is not shown here.
def test_QueryFn_metric_on_failure(self):
    MetricsEnvironment.process_wide_container().reset()
    with patch.object(helper, 'get_client',
                      return_value=self._mock_client):
        self._mock_query.project = self._PROJECT
        self._mock_query.namespace = self._NAMESPACE
        _query_fn = ReadFromDatastore._QueryFn()
        client_query = self._mock_query._to_client_query()
        # Test with exception
        client_query.fetch.side_effect = [
            exceptions.DeadlineExceeded("Deadline exceed")
        ]
        list(_query_fn.process(self._mock_query))
        self.verify_read_call_metric(
            self._PROJECT, self._NAMESPACE, "deadline_exceeded", 1)
        # Test success
        client_query.fetch.side_effect = [[]]
        list(_query_fn.process(self._mock_query))
        self.verify_read_call_metric(self._PROJECT, self._NAMESPACE, "ok", 1)
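Assigning a list to fetch.side_effect is what drives the two phases of this test: unittest.mock consumes one entry per call, raising entries that are exceptions and returning the others, so the first assignment simulates a DeadlineExceeded failure and the second a successful, empty fetch. A small self-contained sketch of that pattern (not taken from the test suite above):

from unittest.mock import MagicMock

from google.api_core import exceptions

fetch = MagicMock()
# First call raises DeadlineExceeded, second call returns an empty result set.
fetch.side_effect = [exceptions.DeadlineExceeded("deadline"), []]

try:
    fetch()
except exceptions.DeadlineExceeded:
    print("first call failed as configured")

print(fetch())  # -> []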