def filter_table_schema(self, include_fields=None):
    """Return the table schema, optionally restricted to a subset of fields.

    When ``include_fields`` is None the stored schema object is returned
    unchanged; otherwise a new TableSchema is built that keeps only the
    fields whose name appears in ``include_fields``.
    """
    if include_fields is None:
        return self.table_schema
    subset = TableSchema()
    subset.fields = [
        f for f in self.table_schema.fields if f.name in include_fields
    ]
    return subset
# Example #2
# 0
def table_schema():
    """Schema for the score table: total_score (INTEGER) and user (STRING)."""
    specs = [('total_score', 'INTEGER'), ('user', 'STRING')]
    schema = TableSchema()
    schema.fields = [table_field(name, kind) for name, kind in specs]
    return schema
def table_schema():
    """Schema for windowed team scores: team, total_score, window_start."""
    specs = [
        ('team', 'STRING'),
        ('total_score', 'INTEGER'),
        ('window_start', 'TIMESTAMP'),
    ]
    schema = TableSchema()
    schema.fields = [table_field(name, kind) for name, kind in specs]
    return schema
def table_schema():
    """
    Schema of the output table: a single STRING column named 'data'.
    """
    schema = TableSchema()
    schema.fields = [table_field('data', 'STRING')]
    return schema
def table_schema():
    """
    Schema of the output table: team, total_score, and window_start columns.
    """
    schema = TableSchema()
    schema.fields = [
        table_field(name, kind)
        for name, kind in (
            ('team', 'STRING'),
            ('total_score', 'INTEGER'),
            ('window_start', 'TIMESTAMP'),
        )
    ]
    return schema
  def generate_schema(self):
    """Build a BigQuery TableSchema exercising JSON, STRUCT and REPEATED modes.

    The schema mixes top-level JSON columns, nullable and repeated STRUCT
    columns containing JSON sub-fields, and a repeated JSON column.
    """
    from apache_beam.io.gcp.internal.clients.bigquery import TableFieldSchema
    from apache_beam.io.gcp.internal.clients.bigquery import TableSchema

    # Sub-fields of the nullable 'stats' struct.
    stats_fields = [
        TableFieldSchema(name="gdp_per_capita", type='JSON', mode='NULLABLE'),
        TableFieldSchema(name="co2_emissions", type='JSON', mode='NULLABLE'),
    ]
    # Sub-fields of the repeated 'cities' struct.
    city_fields = [
        TableFieldSchema(name="city_name", type='STRING', mode='NULLABLE'),
        TableFieldSchema(name="city", type='JSON', mode='NULLABLE'),
    ]

    return TableSchema(fields=[
        TableFieldSchema(name='country_code', type='STRING', mode='NULLABLE'),
        TableFieldSchema(name='country', type='JSON', mode='NULLABLE'),
        TableFieldSchema(
            name='stats', type='STRUCT', mode='NULLABLE', fields=stats_fields),
        TableFieldSchema(
            name='cities', type='STRUCT', mode='REPEATED', fields=city_fields),
        TableFieldSchema(name='landmarks', type='JSON', mode='REPEATED'),
    ])
def _get_bq_schema(fields):
    """Build a TableSchema from a {name: type} mapping.

    Each entry becomes a field with a generated description; an extra
    '_RAWTIMESTAMP' TIMESTAMP column is always appended at the end.
    """
    columns = [
        TableFieldSchema(name=name, type=ftype, description='Field %s' % name)
        for name, ftype in fields.items()
    ]
    columns.append(
        TableFieldSchema(name='_RAWTIMESTAMP',
                         type='TIMESTAMP',
                         description='Injected timestamp'))
    return TableSchema(fields=columns)
# Example #8
# 0
def run(args=None):
  """Build and run the name-normalization Beam pipeline.

  Reads rows from a BigQuery source table, parses/attaches timestamps,
  normalizes shipname/callsign/imo fields, and writes the result to a
  date-partitioned BigQuery destination table.

  Args:
    args: optional list of command-line arguments for PipelineOptions.

  Returns:
    0 if the pipeline reached a success state (DONE, or RUNNING when not
    waiting for completion), 1 otherwise.
  """
  pipeline_options = PipelineOptions(args)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True

  normalize_options = pipeline_options.view_as(NormalizeOptions)
  gcp_options = pipeline_options.view_as(GoogleCloudOptions)

  d1, d2 = parse_date_range(normalize_options.date_range)
  helper = QueryHelper(table=normalize_options.source_table, first_date_ts=d1, last_date_ts=d2)
  select_fields = ['mmsi', 'timestamp', 'seg_id', 'shipname', 'callsign', 'imo']
  where_sql = 'shipname is not null or callsign is not null or imo is not null'
  if normalize_options.mmsi_quotient > 1:
    # Sample roughly 1/mmsi_quotient of vessels by hashing mmsi.
    where_sql = "hash(mmsi) % {} = 0 and ({})".format(normalize_options.mmsi_quotient, where_sql)

  source_schema = helper.filter_table_schema(select_fields)
  source = BigQuerySource(query=helper.build_query(include_fields=select_fields, where_sql=where_sql))

  # Destination schema = the selected source fields plus three derived columns.
  dest_schema = TableSchema(fields=source_schema.fields)
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_SHIPNAME, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_CALLSIGN, type='STRING'))
  dest_schema.fields.append(TableFieldSchema(name=VALID_IMO, type='INTEGER'))

  pipeline = beam.Pipeline(options=pipeline_options)
  (
      pipeline
      | "ReadSource" >> ReadAsJSONDict(source)
      | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
      | "AddTimestamp" >> beam.ParDo(TimestampedValueDoFn())
      | "NormalizeNames" >> beam.ParDo(NormalizeNamesDoFn())
      | "WriteDest" >> WriteToBigQueryDatePartitioned(
          temp_gcs_location=gcp_options.temp_location,
          table=normalize_options.dest_table,
          schema=dest_schema,
          write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
  )

  result = pipeline.run()
  success_states = {PipelineState.DONE}

  if normalize_options.wait:
    result.wait_until_finish()
  else:
    # Not waiting: a pipeline still RUNNING also counts as success.
    success_states.add(PipelineState.RUNNING)

  return 0 if result.state in success_states else 1