Ejemplo n.º 1
0
def sdf_to_bigquery(config,
                    auth,
                    sdf_zip_file,
                    project_id,
                    dataset,
                    time_partitioned_table,
                    create_single_day_table,
                    table_suffix=''):
    """Load every SDF CSV inside a zip archive into BigQuery tables.

    Args:
      config: project configuration object (provides config.verbose).
      auth: authentication scheme passed through to the BigQuery helpers.
      sdf_zip_file: path or file object of the zip containing SDF CSV files.
      project_id: destination BigQuery project id.
      dataset: destination BigQuery dataset.
      time_partitioned_table: when True, the aggregate table is created time
        partitioned and rows are appended instead of truncated.
      create_single_day_table: when True, an additional dated snapshot table
        is also written for each file.
      table_suffix: optional suffix appended to every table name.

    Returns:
      None. Side effects are table creation / row uploads in BigQuery.
    """
    with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
        for file_name in d.namelist():
            if config.verbose:
                print('SDF: Loading: ' + file_name)
            with d.open(file_name) as sdf_file:
                # Materialize the rows. The original generator was partially
                # consumed by next() (so skip_rows=1 dropped a data row, not
                # the header) and fully exhausted by the first rows_to_table
                # call, leaving nothing for the second upload.
                rows = list(csv_to_rows(sdf_file.read().decode('utf-8')))
                if not rows:
                    if config.verbose:
                        print('SDF: Empty file ' + file_name)
                    continue
                table_name = file_name.split('.')[0].replace(
                    '-', '_') + table_suffix
                # Derive the schema from the header row without consuming it,
                # so skip_rows=1 below correctly skips the header.
                schema = sdf_schema(rows[0])

                # Check if each SDF should have a dated table
                if create_single_day_table:
                    table_name_dated = table_name + date.today().strftime(
                        '%Y_%m_%d')

                    # Create table and upload data. config is passed first,
                    # consistent with the table_create call further below
                    # (it was missing here in the original).
                    table_create(config, auth, project_id, dataset,
                                 table_name_dated)
                    rows_to_table(config,
                                  auth,
                                  project_id,
                                  dataset,
                                  table_name_dated,
                                  rows,
                                  schema=schema,
                                  skip_rows=1,
                                  disposition='WRITE_TRUNCATE')

                # Create end result table if it doesn't already exist
                if not table_exists(config, auth, project_id, dataset,
                                    table_name):
                    table_create(config,
                                 auth,
                                 project_id,
                                 dataset,
                                 table_name,
                                 is_time_partition=time_partitioned_table)

                rows_to_table(config,
                              auth,
                              project_id,
                              dataset,
                              table_name,
                              rows,
                              schema=schema,
                              skip_rows=1,
                              disposition='WRITE_APPEND'
                              if time_partitioned_table else 'WRITE_TRUNCATE')
Ejemplo n.º 2
0
def dt():
  """Move matching DT storage files into BigQuery date partitions.

  Reads bucket/path patterns from project.task, skips files older than the
  optional days/hours window and partitions that already exist, then waits
  for every load job it started.
  """
  pending_jobs = []

  if project.verbose: print("DT To BigQuery")

  # legacy deprecated ( do not use )
  if 'path' in project.task: project.task['paths'] = [project.task['path']]

  # loop all dt files to match pattern or match any pattern
  print('PATHS', project.task['paths'])

  # when neither days nor hours is given, every file qualifies
  window_given = not (project.task.get('days') is None
                      and project.task.get('hours') is None)

  for prefix in (project.task['paths'] or ['']):

    print(prefix)
    bucket_path = '%s:%s' % (project.task['bucket'], prefix)
    for blob in object_list(project.task['auth'], bucket_path, raw=True):
      blob_size = blob['size']
      blob_name = blob['name']
      blob_time = dt_timestamp(blob_name)

      partition = blob_name.split('.', 1)[0]

      if window_given:
        cutoff = project.now - timedelta(
            days=project.task.get('days', 60),
            hours=project.task.get('hours', 0))
        # outside the lookback window: ignore this file
        if not blob_time > cutoff:
          continue

      if table_exists(project.task['to']['auth'], project.id,
                      project.task['to']['dataset'], partition):
        if project.verbose: print ('DT Partition Exists:', partition)
      else:
        dt_move(blob, partition, pending_jobs)

  for index, job in enumerate(pending_jobs):
    print('Waiting For Job: %d of %d' % (index + 1, len(pending_jobs)))
    job_wait(project.task['to']['auth'], job)
Ejemplo n.º 3
0
def dcm_log(config, task):
    """Incrementally pull DCM change logs into the changelogs table.

    Resumes from the newest stored changeTime when the destination table
    already exists (append), otherwise looks back task['days'] days and
    truncates.
    """
    if config.verbose:
        print('DCM LOG')

    out = task['out']
    accounts = list(get_rows(config, 'user', task['accounts']))

    # determine start log date
    if table_exists(config, out['auth'], out['project'], out['dataset'],
                    CHANGELOGS_TABLE):
        # %% escapes survive the % substitution so BigQuery sees %Y-%m-%d...
        query = (
            'SELECT FORMAT_TIMESTAMP("%%Y-%%m-%%dT%%H:%%M:%%S-00:00", MAX(changeTime), "UTC") FROM `%s`'
            % CHANGELOGS_TABLE)
        start = next(
            query_to_rows(config, out['auth'], out['project'], out['dataset'],
                          query, 1, False))[0]
        disposition = 'WRITE_APPEND'
    else:
        lookback = timedelta(days=int(task['days']))
        start = (datetime.utcnow() -
                 lookback).strftime('%Y-%m-%dT%H:%M:%S-00:00')
        disposition = 'WRITE_TRUNCATE'

    # load new logs
    rows = get_changelogs(config, task, accounts, start)
    if rows:
        rows_to_table(config, out['auth'], out['project'], out['dataset'],
                      CHANGELOGS_TABLE, rows, CHANGELOGS_SCHEMA, 0,
                      disposition)
Ejemplo n.º 4
0
    def handle(self, *args, **kwargs):
        """Tally deployment impact per recipe script and write it to the
        dashboard ST_Scripts table (or just print it when kwargs['test']).

        Resumes after the highest Deployment id already stored, so only
        recipes newer than the last run are processed.
        """

        # Rows shaped as:
        # { 'day': DATE, 'deployment': INT, 'account': INT,
        #   'product': STRING, 'script': STRING, 'user': STRING }
        impact = []
        missing = {}  # tag -> count of values we could not attribute
        id_max = 0

        project.initialize(_service=settings.UI_SERVICE, _verbose=True)

        if table_exists('service', 'google.com:starthinker', 'dashboard',
                        'ST_Scripts'):
            id_max = next(
                query_to_rows('service',
                              'google.com:starthinker',
                              'dashboard',
                              'SELECT MAX(Deployment) FROM ST_Scripts',
                              legacy=False))[0]

        def _impact_row(recipe, tag, account, product):
            # One dashboard row; user drops the @google.com suffix.
            return {
                'day': recipe.birthday,
                'deployment': recipe.id,
                'account': account,
                'script': tag,
                'product': product,
                'user': recipe.account.email.replace('@google.com', ''),
            }

        for recipe in Recipe.objects.filter(
                id__gt=id_max).order_by('id')[:kwargs['recipes']]:

            project.initialize(_user=recipe.account.get_credentials_path(),
                               _service=settings.UI_SERVICE,
                               _verbose=True)

            for v in recipe.get_values():
                tag = v['tag']
                if tag in ('dcm_to_bigquery', 'dcm_to_sheets',
                           'dcm_to_storage', 'dcm_run',
                           'conversion_upload_from_bigquery',
                           'conversion_upload_from_sheets'):
                    impact.append(
                        _impact_row(recipe, tag, v['values'].get('account'),
                                    'dcm'))
                elif tag in ('dbm_to_bigquery', 'dbm_to_sheets',
                             'dbm_to_storage'):
                    for partner in account_from_dbm_report(
                            v['values'].get('dbm_report_id'),
                            v['values'].get('dbm_report_name')):
                        impact.append(_impact_row(recipe, tag, partner, 'dbm'))
                elif tag in ('dt', ):
                    impact.append(
                        _impact_row(recipe, tag, account_from_dt(v['values']),
                                    'dcm'))
                elif tag == 'barnacle':
                    for account in v['values']['accounts']:
                        impact.append(_impact_row(recipe, tag, account, 'dcm'))
                elif tag in ('entity', ):
                    for partner in v['values']['partners']:
                        impact.append(_impact_row(recipe, tag, partner, 'dbm'))
                elif tag == 'itp':
                    # ITP touches both products: record one row for each.
                    impact.append(
                        _impact_row(recipe, tag, v['values']['dcm_account'],
                                    'dcm'))
                    impact.append(
                        _impact_row(recipe, tag, v['values']['dbm_partner'],
                                    'dbm'))
                elif tag == 'itp_audit':
                    impact.append(
                        _impact_row(recipe, tag, v['values']['cm_account_id'],
                                    'dcm'))
                    for partner in account_from_dbm_report(
                            None, v['values'].get('dv360_report_name')):
                        impact.append(_impact_row(recipe, tag, partner, 'dbm'))
                else:
                    # Unknown script tag: record it unattributed and tally it.
                    impact.append(_impact_row(recipe, tag, None, None))
                    missing.setdefault(tag, 0)
                    missing[tag] += 1

        if impact:
            if kwargs['test']:
                print(impact)
            else:

                print('WRITING TO ST_Scripts')
                rows_to_table(
                    'service',
                    'google.com:starthinker',
                    'dashboard',
                    'ST_Scripts',
                    [(i['day'], i['deployment'], i['user'], i['product'],
                      i['script'], i['account']) for i in impact],
                    # 'DATE' normalized from lowercase 'Date' for consistency
                    # with the other type names (BigQuery accepts either).
                    schema=[
                        {'mode': 'REQUIRED', 'name': 'Day', 'type': 'DATE'},
                        {'mode': 'REQUIRED', 'name': 'Deployment',
                         'type': 'INTEGER'},
                        {'mode': 'REQUIRED', 'name': 'User', 'type': 'STRING'},
                        {'mode': 'NULLABLE', 'name': 'Product',
                         'type': 'STRING'},
                        {'mode': 'NULLABLE', 'name': 'Recipe',
                         'type': 'STRING'},
                        {'mode': 'NULLABLE', 'name': 'Account',
                         'type': 'INTEGER'},
                    ],
                    skip_rows=0,
                    # Truncate on the very first run, append on resumes.
                    disposition='WRITE_TRUNCATE'
                    if id_max == 0 else 'WRITE_APPEND',
                    wait=True)

            print('MISSING', missing)
            print('Coverage:',
                  (len(impact) * 100) / (len(missing) + len(impact)))
        else:
            print('No recipes newer than:', id_max)