Example #1
0
def bigquery():

    # check schema if given ( check independent of values )
    if 'schema' in project.task['bigquery']:
        schema = table_to_schema(project.task['auth'], project.id,
                                 project.task['bigquery']['dataset'],
                                 project.task['bigquery']['table'])
        object_compare(schema, project.task['bigquery']['schema'])

    # if query given check it
    if 'query' in project.task['bigquery']:
        rows = query_to_rows(
            project.task['auth'],
            project.id,
            project.task['bigquery']['dataset'],
            query_parameters(project.task['bigquery']['query'],
                             project.task['bigquery'].get('parameters')),
        )

        object_compare(sorted(rows),
                       sorted(project.task['bigquery']['values']))

    # simple table check ( unless query given )
    elif 'values' in project.task['bigquery']:
        rows = table_to_rows(project.task['auth'], project.id,
                             project.task['bigquery']['dataset'],
                             project.task['bigquery']['table'])

        object_compare(sorted(rows),
                       sorted(project.task['bigquery']['values']))
Example #2
0
def bigquery_run():
    """Execute a query without expected return results.

  See: scripts/bigquery_run_query.json
  """

    if project.verbose:
        print('RUN QUERY', project.task['run']['query'])

    run_query(
        project.task['auth'], project.id,
        query_parameters(project.task['run']['query'],
                         project.task['run'].get('parameters')),
        project.task['run'].get('legacy', True))
Example #3
0
def bigquery_run(config, task):
    """Execute a query without expected return results.

  See: scripts/bigquery_run_query.json
  """

    if config.verbose:
        print('RUN QUERY', task['run']['query'])

    bq.run_query(
        config, task['auth'], config.project,
        bq.query_parameters(task['run']['query'],
                            task['run'].get('parameters')),
        task['run'].get('legacy', True))
Example #4
0
def bigquery():
    print('TEST: bigquery')

    # check schema if given ( check independent of values )
    if 'schema' in project.task['bigquery']:
        schema = table_to_schema(project.task['auth'], project.id,
                                 project.task['bigquery']['dataset'],
                                 project.task['bigquery']['table'])
        deltas = schema_compare(project.task['bigquery']['schema'],
                                schema,
                                path='')

        if deltas:
            print(
                '\nFAILED *******************************************************\n'
            )
            for delta in deltas.values():
                print('%(path)s: %(error)s ( %(expected)s - %(actual)s)' %
                      delta)
            print(
                '\n**************************************************************\n'
            )
            test_failed()
        else:
            test_passed()

    # if query given check it
    if 'query' in project.task['bigquery']:
        rows = query_to_rows(
            project.task['auth'],
            project.id,
            project.task['bigquery']['dataset'],
            query_parameters(project.task['bigquery']['query'],
                             project.task['bigquery'].get('parameters')),
            legacy=project.task['bigquery'].get('legacy', True))

        object_compare(sorted(rows),
                       sorted(project.task['bigquery']['values']))

    # simple table check ( unless query given )
    elif 'values' in project.task['bigquery']:
        rows = table_to_rows(project.task['auth'], project.id,
                             project.task['bigquery']['dataset'],
                             project.task['bigquery']['table'])

        object_compare(sorted(rows),
                       sorted(project.task['bigquery']['values']))
Example #5
0
def bigquery():

    if 'function' in project.task:
        query = None

        if project.task['function'] == 'pearson_significance_test':
            query = pearson_significance_test()

        if query:
            run_query(project.task['auth'], project.id, query, False,
                      project.task['to']['dataset'])

    elif 'run' in project.task and 'query' in project.task.get('run', {}):
        if project.verbose:
            print('QUERY', project.task['run']['query'])
        run_query(
            project.task['auth'],
            project.id,
            query_parameters(project.task['run']['query'],
                             project.task['run'].get('parameters')),
            project.task['run'].get('legacy', True),
        )

    elif 'values' in project.task['from']:
        rows = get_rows(project.task['auth'], project.task['from'])

        rows_to_table(project.task['to'].get('auth', project.task['auth']),
                      project.id, project.task['to']['dataset'],
                      project.task['to']['table'], rows,
                      project.task.get('schema', []), 0)

    elif 'query' in project.task['from']:

        if 'table' in project.task['to']:
            if project.verbose:
                print('QUERY TO TABLE', project.task['to']['table'])

            query_to_table(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['table'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                disposition=project.task['write_disposition']
                if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
                legacy=project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql,
                target_project_id=project.task['to'].get(
                    'project_id', project.id))

        elif 'sheet' in project.task['to']:
            if project.verbose:
                print('QUERY TO SHEET', project.task['to']['sheet'])
            rows = query_to_rows(
                project.task['auth'],
                project.id,
                project.task['from']['dataset'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                legacy=project.task['from'].get('legacy', True))

            # makes sure types are correct in sheet
            rows = rows_to_type(rows)

            sheets_clear(project.task['to'].get('auth', project.task['auth']),
                         project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'))
            sheets_write(project.task['to'].get('auth', project.task['auth']),
                         project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'), rows)

        elif 'sftp' in project.task['to']:
            rows = query_to_rows(
                project.task['auth'],
                project.id,
                project.task['from']['dataset'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                legacy=project.task['from'].get('use_legacy_sql', True))

            if rows:
                if project.verbose:
                    print('QUERY TO SFTP')
                put_rows(project.task['auth'], project.task['to'], rows)

        else:
            if project.verbose:
                print('QUERY TO VIEW', project.task['to']['view'])
            query_to_view(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['view'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql
                project.task['to'].get('replace', False))

    else:
        if project.verbose:
            print('STORAGE TO TABLE', project.task['to']['table'])
        storage_to_table(
            project.task['auth'], project.id, project.task['to']['dataset'],
            project.task['to']['table'], project.task['from']['bucket'] + ':' +
            project.task['from']['path'], project.task.get('schema', []),
            project.task.get('skip_rows', 1),
            project.task.get('structure', 'CSV'),
            project.task.get('disposition', 'WRITE_TRUNCATE'))
Example #6
0
def get_rows(auth, source):
    """Processes standard read JSON block for dynamic loading of data.

  Allows us to quickly pull a column or columns of data from and use it as an
  input
  into a script. For example pull a list of ids from bigquery and act on each
  one.

  - When pulling a single column specify single_cell = True. Returns list AKA
  values.
  - When pulling a multiple columns specify single_cell = False. Returns list of
  lists AKA rows.
  - Values are always given as a list ( single_cell will trigger necessary
  wrapping ).
  - Values, bigquery, sheet are optional, if multiple given result is one
  continous iterator.
  - Extensible, add a handler to define a new source ( be kind update the
  documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import get_rows

    var_json = {
      "in":{
        "single_cell":[ boolean ],
        "values": [ integer list ],
        "bigquery":{
          "dataset": [ string ],
          "query": [ string ],
          "legacy":[ boolean ]
        },
        "bigquery":{
          "dataset": [ string ],
          "table": [ string ],
        },
        "sheet":{
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
        }
      }
    }

    values = get_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import get_rows

    @project.from_parameters
    def something():
      values = get_rows(project.task['auth'], project.task['in'])

    if __name__ == "__main__":
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    source: (json) A json block resembling var_json described above.

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
"""

    # if handler points to list, concatenate all the values from various sources into one list
    if isinstance(source, list):
        for s in source:
            for r in get_rows(auth, s):
                yield r

    # if handler is an endpoint, fetch data
    else:
        if 'values' in source:
            if isinstance(source['values'], list):
                for value in source['values']:
                    yield value
            else:
                yield source['values']

        if 'sheet' in source:
            rows = sheets_read(
                project.task['auth'],
                source['sheet']['sheet'],
                source['sheet']['tab'],
                source['sheet']['range'],
            )

            for row in rows:
                yield row[0] if source.get('single_cell', False) else row

        if 'bigquery' in source:

            rows = []

            if 'table' in source['bigquery']:
                rows = table_to_rows(source['bigquery'].get('auth', auth),
                                     project.id,
                                     source['bigquery']['dataset'],
                                     source['bigquery']['table'],
                                     as_object=source['bigquery'].get(
                                         'as_object', False))

            else:
                rows = query_to_rows(
                    source['bigquery'].get('auth', auth),
                    project.id,
                    source['bigquery']['dataset'],
                    query_parameters(source['bigquery']['query'],
                                     source['bigquery'].get('parameters', {})),
                    legacy=source['bigquery'].get('legacy', False),
                    as_object=source['bigquery'].get('as_object', False))

            for row in rows:
                yield row[0] if source.get('single_cell', False) else row
Example #7
0
def bigquery_query():
    """Execute a query and write results to table.

  TODO: Replace with get_rows and put_rows combination.

  See: scripts/bigquery_query.json
       scripts/bigquery_storage.json
       scripts/bigquery_to_sheet.json
       scripts/bigquery_view.json
  """

    if 'table' in project.task['to']:
        if project.verbose:
            print('QUERY TO TABLE', project.task['to']['table'])

        query_to_table(
            project.task['auth'],
            project.id,
            project.task['to']['dataset'],
            project.task['to']['table'],
            query_parameters(project.task['from']['query'],
                             project.task['from'].get('parameters')),
            disposition=project.task['write_disposition']
            if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
            legacy=project.task['from'].get(
                'legacy',
                project.task['from'].get('useLegacySql',
                                         True)),  # DEPRECATED: useLegacySql,
            target_project_id=project.task['to'].get('project_id', project.id))

    elif 'sheet' in project.task['to']:
        if project.verbose:
            print('QUERY TO SHEET', project.task['to']['sheet'])

        rows = query_to_rows(project.task['auth'],
                             project.id,
                             project.task['from']['dataset'],
                             query_parameters(
                                 project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                             legacy=project.task['from'].get('legacy', True))

        # makes sure types are correct in sheet
        rows = rows_to_type(rows)

        sheets_clear(project.task['to'].get('auth', project.task['auth']),
                     project.task['to']['sheet'], project.task['to']['tab'],
                     project.task['to'].get('range', 'A2'))
        sheets_write(project.task['to'].get('auth', project.task['auth']),
                     project.task['to']['sheet'], project.task['to']['tab'],
                     project.task['to'].get('range', 'A2'), rows)

    elif 'sftp' in project.task['to']:
        if project.verbose:
            print('QUERY TO SFTP')

        rows = query_to_rows(
            project.task['auth'],
            project.id,
            project.task['from']['dataset'],
            query_parameters(project.task['from']['query'],
                             project.task['from'].get('parameters')),
            legacy=project.task['from'].get('use_legacy_sql', True))

        if rows:
            put_rows(project.task['auth'], project.task['to'], rows)

    else:
        if project.verbose:
            print('QUERY TO VIEW', project.task['to']['view'])

        query_to_view(
            project.task['auth'],
            project.id,
            project.task['to']['dataset'],
            project.task['to']['view'],
            query_parameters(project.task['from']['query'],
                             project.task['from'].get('parameters')),
            project.task['from'].get(
                'legacy',
                project.task['from'].get('useLegacySql',
                                         True)),  # DEPRECATED: useLegacySql
            project.task['to'].get('replace', False))
Example #8
0
def bigquery():

    if 'run' in project.task and 'query' in project.task.get('run', {}):
        if project.verbose: print("QUERY", project.task['run']['query'])
        run_query(
            project.task['auth'],
            project.id,
            query_parameters(project.task['run']['query'],
                             project.task['run'].get('parameters')),
            project.task['run'].get('legacy', True),
        )

    elif 'values' in project.task['from']:
        rows = get_rows(project.task['auth'], project.task['from'])

        rows_to_table(project.task['to'].get('auth', project.task['auth']),
                      project.id, project.task['to']['dataset'],
                      project.task['to']['table'], rows,
                      project.task.get('schema', []), 0)

    elif 'query' in project.task['from']:
        if 'table' in project.task['to']:
            if project.verbose:
                print("QUERY TO TABLE", project.task['to']['table'])

            if 'pre_process_query' in project.task['to']:
                execute_statement(project.task['auth'],
                                  project.id,
                                  project.task['to']['dataset'],
                                  project.task['to']['pre_process_query'],
                                  use_legacy_sql=project.task['from'].get(
                                      'legacy', project.task['from'].get(
                                          'useLegacySql', True)))
            query_to_table(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['table'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                disposition=project.task['write_disposition']
                if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
                legacy=project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql,
                target_project_id=project.task['to'].get(
                    'project_id', project.id))
        # NOT USED SO RIPPING IT OUT
        # Mauriciod: Yes, it is used, look at project/mauriciod/target_winrate.json
        elif 'storage' in project.task['to']:
            if project.verbose:
                print("QUERY TO STORAGE", project.task['to']['storage'])
            local_file_name = '/tmp/%s' % str(uuid.uuid1())
            rows = query_to_rows(project.task['auth'], project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'])

            f = open(local_file_name, 'wb')
            writer = csv.writer(f)
            writer.writerows(rows)
            f.close()

            f = open(local_file_name, 'rb')
            object_put(project.task['auth'], project.task['to']['storage'], f)
            f.close()

            os.remove(local_file_name)
        elif 'sheet' in project.task['to']:
            if project.verbose:
                print("QUERY TO SHEET", project.task['to']['sheet'])
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'legacy', True))

            # makes sure types are correct in sheet
            rows = rows_to_type(rows)

            sheets_clear(project.task['auth'], project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'))
            sheets_write(project.task['auth'], project.task['to']['sheet'],
                         project.task['to']['tab'],
                         project.task['to'].get('range', 'A2'), rows)
        elif 'sftp' in project.task['to']:
            rows = query_to_rows(project.task['auth'],
                                 project.id,
                                 project.task['from']['dataset'],
                                 project.task['from']['query'],
                                 legacy=project.task['from'].get(
                                     'use_legacy_sql', True))

            if rows:
                if project.verbose: print("QUERY TO SFTP")
                put_rows(project.task['auth'], project.task['to'], rows)
        else:
            if project.verbose:
                print("QUERY TO VIEW", project.task['to']['view'])
            query_to_view(
                project.task['auth'],
                project.id,
                project.task['to']['dataset'],
                project.task['to']['view'],
                query_parameters(project.task['from']['query'],
                                 project.task['from'].get('parameters')),
                project.task['from'].get(
                    'legacy', project.task['from'].get(
                        'useLegacySql', True)),  # DEPRECATED: useLegacySql
                project.task['to'].get('replace', False))
    else:
        if project.verbose:
            print("STORAGE TO TABLE", project.task['to']['table'])
        storage_to_table(
            project.task['auth'], project.id, project.task['to']['dataset'],
            project.task['to']['table'], project.task['from']['bucket'] + ':' +
            project.task['from']['path'], project.task.get('schema', []),
            project.task.get('skip_rows', 1),
            project.task.get('structure', 'CSV'),
            project.task.get('disposition', 'WRITE_TRUNCATE'))