コード例 #1
0
def anonymize_table(table_id):
    """Copy one table from the 'from' dataset to the 'to' dataset,
    anonymizing its rows along the way.

    Reads the schema straight from the BigQuery API, pulls the rows,
    passes them through anonymize_rows, and replaces the destination
    table ( WRITE_TRUNCATE ).

    Args:
      table_id: (string) id of the table to anonymize and copy.
    """

    auth = project.task['auth']
    src = project.task['bigquery']['from']
    dst = project.task['bigquery']['to']

    if project.verbose:
        print('ANONYMIZE TABLE', dst['dataset'], table_id)

    # fetch the field list for the source table via the raw API
    table = API_BigQuery(auth).tables().get(
        projectId=src['project'],
        datasetId=src['dataset'],
        tableId=table_id).execute()
    schema = table['schema']['fields']

    anonymized = anonymize_rows(
        table_to_rows(auth, src['project'], src['dataset'], table_id),
        schema)

    rows_to_table(auth,
                  dst['project'],
                  dst['dataset'],
                  table_id,
                  anonymized,
                  schema,
                  skip_rows=0,
                  disposition='WRITE_TRUNCATE')
コード例 #2
0
def bigquery():
  """Validate a BigQuery table against the expectations in the task.

  Optionally compares the table schema, then compares either query results
  or raw table rows against the expected values, all read from
  project.task['bigquery'].
  """

  task = project.task['bigquery']
  auth = project.task['auth']

  # check schema if given ( check independent of values )
  if 'schema' in task:
    actual = table_to_schema(auth, project.id, task['dataset'], task['table'])
    object_compare(actual['fields'], task['schema'])

  # if query given check it
  if 'query' in task:
    rows = query_to_rows(auth, project.id, task['dataset'], task['query'])
    object_compare(sorted(rows), sorted(task['values']))

  # simple table check ( unless query given )
  elif 'values' in task:
    rows = table_to_rows(auth, project.id, task['dataset'], task['table'])
    object_compare(sorted(rows), sorted(task['values']))
コード例 #3
0
def anonymize_table(table_id):
    """Anonymize one table: read it from the 'from' dataset, scrub the
    configured columns, and write it to the 'to' dataset.

    Rows are fetched as objects ( as_object=True ) and written back with
    json_to_table; the destination table is replaced ( WRITE_TRUNCATE ).

    Args:
      table_id: (string) id of the table to anonymize and copy.
    """

    auth = project.task['auth']
    config = project.task['bigquery']

    if project.verbose:
        print('ANONYMIZE TABLE', config['to']['dataset'], table_id)

    schema = table_to_schema(auth,
                             config['from']['project'],
                             config['from']['dataset'],
                             table_id)

    # scrub only the columns listed in the destination config
    anonymized = anonymize_rows(
        table_to_rows(auth,
                      config['from']['project'],
                      config['from']['dataset'],
                      table_id,
                      as_object=True),
        config['to'].get('columns', []))

    json_to_table(auth,
                  config['to']['project'],
                  config['to']['dataset'],
                  table_id,
                  anonymized,
                  schema,
                  disposition='WRITE_TRUNCATE')
コード例 #4
0
def bigquery():
    """Run the bigquery test: check schema deltas and/or expected values.

    When 'schema' is present, compares it field by field via schema_compare,
    printing every delta and marking the test failed or passed. Then either
    runs the configured query or reads the table and compares the sorted
    rows against the expected 'values'.
    """
    print('TEST: bigquery')

    task = project.task['bigquery']
    auth = project.task['auth']

    # check schema if given ( check independent of values )
    if 'schema' in task:
        actual = table_to_schema(auth, project.id, task['dataset'],
                                 task['table'])
        deltas = schema_compare(task['schema'], actual, path='')

        if deltas:
            print(
                '\nFAILED *******************************************************\n'
            )
            for delta in deltas.values():
                print('%(path)s: %(error)s ( %(expected)s - %(actual)s)' %
                      delta)
            print(
                '\n**************************************************************\n'
            )
            test_failed()
        else:
            test_passed()

    # if query given check it
    if 'query' in task:
        query = query_parameters(task['query'], task.get('parameters'))
        rows = query_to_rows(auth,
                             project.id,
                             task['dataset'],
                             query,
                             legacy=task.get('legacy', True))

        object_compare(sorted(rows), sorted(task['values']))

    # simple table check ( unless query given )
    elif 'values' in task:
        rows = table_to_rows(auth, project.id, task['dataset'],
                             task['table'])

        object_compare(sorted(rows), sorted(task['values']))
コード例 #5
0
def anonymize_table(table_id):
    """Anonymize one table, copying it from the 'from' dataset to the 'to'
    dataset configured in project.task['bigquery'].

    Tables containing RECORD ( nested ) fields are transferred as objects
    via json_to_table; flat tables go through rows_to_table. Either way the
    destination table is replaced ( WRITE_TRUNCATE ).

    Args:
      table_id: (string) id of the table to anonymize and copy.
    """

    auth = project.task['auth']
    config = project.task['bigquery']

    if project.verbose:
        print('ANONYMIZE TABLE', config['to']['dataset'], table_id)

    schema = API_BigQuery(auth).tables().get(
        projectId=config['from']['project'],
        datasetId=config['from']['dataset'],
        tableId=table_id).execute()['schema']['fields']

    # nested schemas cannot round-trip through flat row lists, so detect
    # RECORD fields up front ( generator short-circuits on first match,
    # unlike the original any([...]) which built the whole list )
    is_object = any(s['type'] == 'RECORD' for s in schema)

    rows = table_to_rows(auth,
                         config['from']['project'],
                         config['from']['dataset'],
                         table_id,
                         as_object=is_object)

    rows = anonymize_rows(rows, schema, is_object)

    if is_object:
        json_to_table(auth,
                      config['to']['project'],
                      config['to']['dataset'],
                      table_id,
                      rows,
                      schema,
                      disposition='WRITE_TRUNCATE')
    else:
        rows_to_table(auth,
                      config['to']['project'],
                      config['to']['dataset'],
                      table_id,
                      rows,
                      schema,
                      skip_rows=0,
                      disposition='WRITE_TRUNCATE')
コード例 #6
0
def get_rows(auth, source):
    """Processes standard read JSON block for dynamic loading of data.

  Allows us to quickly pull a column or columns of data from and use it as an
  input into a script. For example pull a list of ids from bigquery and act
  on each one.

  - When pulling a single column specify single_cell = True. Returns list AKA
    values.
  - When pulling multiple columns specify single_cell = False. Returns list
    of lists AKA rows.
  - Values are always given as a list ( single_cell will trigger necessary
    wrapping ).
  - Values, bigquery, sheet are optional, if multiple given result is one
    continuous iterator.
  - Extensible, add a handler to define a new source ( be kind update the
    documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned
  results.

    from utils.data import get_rows

    var_json = {
      "in":{
        "single_cell":[ boolean ],
        "values": [ integer list ],
        "bigquery":{
          "dataset": [ string ],
          "query": [ string - use instead of table ],
          "table": [ string - use instead of query ],
          "legacy":[ boolean ]
        },
        "sheet":{
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
        }
      }
    }

    values = get_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import get_rows

    @project.from_parameters
    def something():
      values = get_rows(project.task['auth'], project.task['in'])

    if __name__ == "__main__":
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    source: (json) A json block resembling var_json described above.

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
"""

    # if handler points to list, concatenate all the values from various
    # sources into one iterator
    if isinstance(source, list):
        for s in source:
            for r in get_rows(auth, s):
                yield r

    # if handler is an endpoint, fetch data
    else:
        # inline values: yielded as-is, single_cell wrapping does not apply
        if 'values' in source:
            if isinstance(source['values'], list):
                for value in source['values']:
                    yield value
            else:
                yield source['values']

        if 'sheet' in source:
            # BUGFIX: honor the auth parameter ( with per-source override )
            # instead of always reading project.task['auth'], consistent
            # with the bigquery handlers below.
            rows = sheets_read(
                source['sheet'].get('auth', auth),
                source['sheet']['sheet'],
                source['sheet']['tab'],
                source['sheet']['range'],
            )

            for row in rows:
                yield row[0] if source.get('single_cell', False) else row

        if 'bigquery' in source:

            rows = []

            # table given: read it directly
            if 'table' in source['bigquery']:
                rows = table_to_rows(source['bigquery'].get('auth', auth),
                                     project.id,
                                     source['bigquery']['dataset'],
                                     source['bigquery']['table'],
                                     as_object=source['bigquery'].get(
                                         'as_object', False))

            # otherwise run the query ( with optional parameters )
            else:
                rows = query_to_rows(
                    source['bigquery'].get('auth', auth),
                    project.id,
                    source['bigquery']['dataset'],
                    query_parameters(source['bigquery']['query'],
                                     source['bigquery'].get('parameters', {})),
                    legacy=source['bigquery'].get('legacy', False),
                    as_object=source['bigquery'].get('as_object', False))

            for row in rows:
                yield row[0] if source.get('single_cell', False) else row