Example #1
def report_to_rows(report):
    """ Helper to convert DCM files into iterator of rows, memory efficient.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  ```

  Args:
    * report: (iterator or file) Either an iterator or file that will be
      converted to rows.

  Returns:
    * Iterator of lists representing each row.

  """

    # if reading from stream
    if type(report) is GeneratorType:
        leftovers = ''
        for chunk in report:
            data, extra = chunk.rsplit('\n', 1)
            for row in csv_to_rows(leftovers + data):
                yield row
            leftovers = extra

    # if reading from buffer
    else:
        for row in csv_to_rows(report):
            yield row
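
A small, self-contained demo of both input modes, assuming csv_to_rows (from the same codebase as these examples) accepts a CSV string and yields row lists:

# Demo of report_to_rows with a plain CSV buffer and with a chunked generator.
csv_text = 'id,name\n1,alpha\n2,beta\n'

def chunks():
    # Simulate a streamed report arriving in arbitrary pieces.
    yield 'id,name\n1,al'
    yield 'pha\n2,beta\n'

print(list(report_to_rows(csv_text)))   # buffer path
print(list(report_to_rows(chunks())))   # generator path, same rows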
Example #2
def sdf_to_bigquery(config,
                    auth,
                    sdf_zip_file,
                    project_id,
                    dataset,
                    time_partitioned_table,
                    create_single_day_table,
                    table_suffix=''):
    with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d:
        file_names = d.namelist()
        for file_name in file_names:
            if config.verbose:
                print('SDF: Loading: ' + file_name)
            with d.open(file_name) as sdf_file:
                # Materialize rows so the emptiness check works and the data
                # can be uploaded more than once below.
                rows = list(csv_to_rows(sdf_file.read().decode('utf-8')))
                if not rows:
                    if config.verbose:
                        print('SDF: Empty file ' + file_name)
                    continue
                table_name = file_name.split('.')[0].replace(
                    '-', '_') + table_suffix
                schema = sdf_schema(rows[0])

                # Check if each SDF should have a dated table
                if create_single_day_table:
                    table_name_dated = table_name + date.today().strftime(
                        '%Y_%m_%d')

                    # Create table and upload data
                    table_create(config, auth, project_id, dataset,
                                 table_name_dated)
                    rows_to_table(config,
                                  auth,
                                  project_id,
                                  dataset,
                                  table_name_dated,
                                  rows,
                                  schema=schema,
                                  skip_rows=1,
                                  disposition='WRITE_TRUNCATE')

                # Create end result table if it doesn't already exist
                if not table_exists(config, auth, project_id, dataset,
                                    table_name):
                    table_create(config,
                                 auth,
                                 project_id,
                                 dataset,
                                 table_name,
                                 is_time_partition=time_partitioned_table)

                rows_to_table(config,
                              auth,
                              project_id,
                              dataset,
                              table_name,
                              rows,
                              schema=schema,
                              skip_rows=1,
                              disposition='WRITE_APPEND'
                              if time_partitioned_table else 'WRITE_TRUNCATE')
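
A hedged call sketch; the Configuration arguments, paths, and ids below are placeholders rather than confirmed values:

# Placeholder configuration; Configuration is the same class used in Example #9.
config = Configuration(service='service.json', verbose=True)

sdf_to_bigquery(config,
                auth='service',
                sdf_zip_file='/tmp/sdf_download.zip',    # illustrative local path
                project_id='my-gcp-project',
                dataset='sdf',
                time_partitioned_table=True,             # append into a partitioned end table
                create_single_day_table=False,           # skip the dated per-day copies
                table_suffix='_dv360')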
Example #3
def conversions_download():
  if project.verbose:
    print('CONVERSION DOWNLOAD')

  # pull from bigquery if specified
  if 'bigquery' in project.task:
    if project.verbose:
      print('READING BIGQUERY')
    rows = query_to_rows(
        project.task['auth'],
        project.id,
        project.task['bigquery']['dataset'],
        'SELECT * FROM %s' % project.task['bigquery']['table'],
        legacy=project.task['bigquery'].get('legacy', True))
    for row in rows:
      yield row

  # pull from sheets if specified
  if 'sheets' in project.task:
    if project.verbose:
      print('READING SHEET')
    rows = sheets_read(project.task['auth'], project.task['sheets']['url'],
                       project.task['sheets']['tab'],
                       project.task['sheets']['range'])
    for row in rows:
      yield row

  # pull from csv if specified
  if 'csv' in project.task:
    if project.verbose:
      print('READING CSV FILE')
    with io.open(project.task['csv']['file']) as f:
      for row in csv_to_rows(f):
        if row[0] not in CSV_HEADERS:
          yield row
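
The generator above only reads the source keys that are present in project.task; a hedged illustration of one possible payload (all values are invented):

# Illustrative only: the keys conversions_download looks for under project.task.
project.task = {
    'auth': 'user',
    'bigquery': {                  # optional source: a BigQuery table
        'dataset': 'conversions',
        'table': 'offline_uploads',
        'legacy': False
    },
    'sheets': {                    # optional source: a Google Sheet range
        'url': 'https://docs.google.com/spreadsheets/d/...',
        'tab': 'Conversions',
        'range': 'A1:C'
    },
    'csv': {                       # optional source: a local CSV file
        'file': 'conversions.csv'
    }
}

for row in conversions_download():
    print(row)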
Example #4
def email_read():

  # process only most recent message
  email = get_email_messages(
    project.task['auth'],
    project.task['read']['from'],
    project.task['read']['to'],
    project.task['read'].get('subject', None),
    project.task['read'].get('link', None),
    project.task['read'].get('attachment', None),
    download=True
  )

  # only take the most recent email
  try:
    email = next(email)
  except StopIteration:
    if project.verbose: print('NO EMAILS FOUND')
    exit()

  if project.verbose: print('EMAIL:', email['subject'])

  # loop all attached files
  for filename, data in email_files(email):

    if project.verbose: print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):

      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)

        if project.verbose: print('EMAIL WRITE', filename)
        put_rows(project.task['auth'], project.task['read']['out'], rows, column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)

      if project.verbose: print('EMAIL WRITE', filename)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose: print('UNSUPPORTED FILE:', filename)
Example #5
def lineitem_read(auth, advertisers=[], insertion_orders=[], lineitems=[]):
  """ Reads line item configurations from DBM.
  
  Bulletproofing: https://developers.google.com/bid-manager/v1/lineitems/downloadlineitems 

  Args:
    * auth: (string) Either user or service.
    * advertisers: (list) List of advertiser ids ( exclusive with insertion_orders and lineitems ).
    * insertion_orders: (list) List of insertion_order ids ( exclusive with advertisers and lineitems ).
    * lineitems: (list) List of lineitem ids ( exclusive with insertion_orders and advertisers ).
  
  Returns:
    * Iterator of lists: https://developers.google.com/bid-manager/guides/entity-write/format

  """

  service = get_service('doubleclickbidmanager', API_VERSION, auth)

  body = {
    'format':'CSV',
    'fileSpec':'EWF'
  }

  if advertisers:
    body['filterType'] = 'ADVERTISER_ID'
    body['filterIds'] = list(advertisers) # in case it's a generator

  elif insertion_orders:
    body['filterType'] = 'INSERTION_ORDER_ID'
    body['filterIds'] = list(insertion_orders) # in case it's a generator

  elif lineitems:
    body['filterType'] = 'LINE_ITEM_ID'
    body['filterIds'] = list(lineitems) # in case it's a generator

  #print(body)

  result = API_Retry(service.lineitems().downloadlineitems(body=body))

  for count, row in enumerate(csv_to_rows(result.get('lineItems', ''))):
    if count == 0: continue # skip header
    row[0] = int(row[0] or 0) # LineItem ID
    row[2] = int(row[2] or 0) # Partner ID
    row[11] = float(row[11] or 0) # IO Budget Amount
    row[18] = float(row[18] or 0) # LineItem Budget Amount
    row[21] = float(row[21] or 0) # LineItem Pacing Amount
    row[23] = int(row[23] or 0) # LineItem Frequency Exposures
    row[25] = int(row[25] or 0) # LineItem Frequency Amount
    row[26] = float(row[26] or 0) # Bid Price (CPM)
    row[28] = float(row[28] or 0) # Partner Revenue Amount
    yield row
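
A short usage sketch; the auth mode, advertiser ids, and output file are illustrative:

import csv

# Pull line items for two (made-up) advertisers and write them to a local CSV.
with open('lineitems.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for row in lineitem_read('user', advertisers=[1234567, 7654321]):
        writer.writerow(row)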
Example #6
def get_single_sdf_rows(auth, version, partner_id, file_types, filter_type, filter_ids_obj, desired_file_type):
  sdf_zip_file = sdf_download(auth, version, partner_id, file_types, filter_type, filter_ids_obj)

  with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d: 
    file_names = d.namelist()
    for file_name in file_names:

      # make sure to only get the one file
      if desired_file_type != file_name.split('-')[1].split('.')[0] or 'Skipped' in file_name: continue

      if project.verbose: print('SDF: Loading: ' + file_name)
      with d.open(file_name) as sdf_file:
        rows = csv_to_rows(sdf_file.read().decode('utf-8'))

        return rows
Example #7
def sdf_read(auth, file_types, filter_type, filter_ids, version='3.1'):
    """ Read SDF files from the DV360 API.

    https://developers.google.com/bid-manager/v1/sdf/download

    Args:
      * auth: (string) Either user or service.
      * file_types: (list[string]) List of the requested file types.
      * filter_type: (string) Value of the type of filter ids to be expected.
      * filter_ids: (list[int]) Filter ids for the api call.
      * version: (string) SDF version to request, defaults to '3.1'.

    Returns:
      * Rows of the SDF files requested.

    """
    service = get_service('doubleclickbidmanager', API_VERSION, auth)

    body = {
        "fileTypes": file_types,
        "filterType": filter_type,
        "filterIds": filter_ids,
        "version": version
    }

    result = API_Retry(service.sdf().download(body=body))

    # Clean the date field in each row
    for key in result.keys():
        date_idx_list = []
        for count, row in enumerate(csv_to_rows(result.get(key, ''))):
            # Get date fields from header
            if count == 0:
                date_idx_list = _get_idx_sdf_date_fields(row)
                yield row
            # Clean and yield data rows
            else:
                yield _clean_sdf_row(row, date_idx_list)
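
A hedged call sketch for sdf_read; the file type and filter values are placeholders:

# With a single file type, the first yielded row is that file's header; later
# rows have their date fields cleaned before being yielded.
rows = sdf_read('user',
                file_types=['LINE_ITEM'],        # placeholder file type
                filter_type='ADVERTISER_ID',     # placeholder filter type
                filter_ids=[1234567])
header = next(rows)
for row in rows:
    print(dict(zip(header, row)))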
Example #8
def email_read():

    # process only most recent message
    try:
        message = next(
            get_email_messages(project.task['auth'],
                               project.task['read']['from'],
                               project.task['read']['to'],
                               project.task['read'].get('subject', None)))
    except StopIteration as e:
        if project.verbose: print('NO EMAILS FOUND')
        exit()

    if project.verbose: print('EMAIL:', get_subject(message))

    files = []

    if project.task['read'].get('attachment'):
        files.extend(
            get_email_attachments(project.task['auth'], message,
                                  project.task['read']['attachment']))

    if project.task['read'].get('link'):
        files.extend(
            get_email_links(project.task['auth'],
                            message,
                            project.task['read']['link'],
                            download=True))

    for filename, data in files:

        if project.verbose: print('EMAIL FILENAME:', filename)

        # decompress if necessary
        if filename.endswith('.gz'):
            data = gzip.GzipFile(fileobj=data, mode='rb')
            filename = filename[:-3]

        # if excel file, save each sheet individually
        if filename.endswith('.xlsx'):

            for sheet, rows in excel_to_rows(data):
                rows = rows_trim(rows)
                rows = rows_header_sanitize(rows)
                put_rows(project.task['auth'], project.task['read']['out'],
                         rows, sheet)

        # if CM report
        elif project.task['read']['from'] == '*****@*****.**':
            rows = cm_report_to_rows(data.read().decode())
            rows = cm_report_clean(rows)

            # if bigquery, remove header and determine schema
            schema = None
            if 'bigquery' in project.task['read']['out']:
                schema = cm_report_schema(next(rows))
                project.task['read']['out']['bigquery']['schema'] = schema
                project.task['read']['out']['bigquery']['skip_rows'] = 1

            put_rows(project.task['auth'], project.task['read']['out'], rows)

        # if dv360 report
        elif project.task['read']['from'] == '*****@*****.**':
            rows = dv360_report_to_rows(data.getvalue().decode())
            rows = dv360_report_clean(rows)
            put_rows(project.task['auth'], project.task['read']['out'], rows)

        # if csv
        elif filename.endswith('.csv'):
            rows = csv_to_rows(data.read().decode())
            rows = rows_header_sanitize(rows)
            put_rows(project.task['auth'], project.task['read']['out'], rows)

        else:
            if project.verbose: print('UNSUPPORTED FILE:', filename)
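
A hedged sketch of the project.task payload this variant reads; addresses, patterns, and the output destination are placeholders (the sender addresses matched in the code above are redacted):

# Illustrative only: keys looked up by email_read under project.task['read'].
project.task = {
    'auth': 'user',
    'read': {
        'from': 'reports@example.com',   # sender to match (redacted in the code above)
        'to': 'me@example.com',          # recipient to match
        'subject': 'Scheduled Report',   # optional subject filter
        'attachment': '.*\\.csv',        # optional attachment name pattern (assumed to be a regex)
        'link': None,                    # optional link pattern to download
        'out': {
            'bigquery': {                # destination consumed by put_rows
                'dataset': 'reports',
                'table': 'email_report'
            }
        }
    }
}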
Example #9
def main():
    # get parameters
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent("""\
    Command line to get table schema from BigQuery.

    Helps developers upload data to BigQuery and pull schemas.  These are the
    most common BigQuery tasks when developing solutions.

    Examples:
      Display table schema: `python helper.py --project [id] --dataset [name] --table [name] -s [credentials]`
      Upload csv table: `python helper.py --project [id] --dataset [name] --table [name] --csv [file] --schema [file] -s [credentials]`
      Upload excel sheet: `python helper.py --project [id] --dataset [name] --table [name] --excel_workbook [file] --excel_sheet [name] --schema [file] -s [credentials]`

  """))

    parser.add_argument('--dataset',
                        help='name of BigQuery dataset',
                        default=None)
    parser.add_argument('--table', help='name of BigQuery table', default=None)
    parser.add_argument('--csv', help='CSV file path', default=None)
    parser.add_argument('--schema', help='SCHEMA file path', default=None)
    parser.add_argument('--excel_workbook',
                        help='Excel file path',
                        default=None)
    parser.add_argument('--excel_sheet', help='Excel sheet name', default=None)

    # initialize project
    parser = commandline_parser(parser,
                                arguments=('-u', '-c', '-s', '-v', '-p'))
    args = parser.parse_args()
    config = Configuration(user=args.user,
                           client=args.client,
                           service=args.service,
                           verbose=args.verbose,
                           project=args.project)

    auth = 'service' if args.service else 'user'

    schema = json.loads(args.schema) if args.schema else None

    if args.csv:

        with open(args.csv, 'r') as csv_file:
            rows = csv_to_rows(csv_file.read())

            if not schema:
                rows, schema = get_schema(rows)
                print('DETECTED SCHEMA', json.dumps(schema))
                print('Please run again with the above schema provided.')
                exit()

            rows_to_table(config, auth, config.project, args.dataset,
                          args.table, rows, schema)

    elif args.excel_workbook and args.excel_sheet:
        with open(args.excel_workbook, 'rb') as excel_file:
            rows = excel_to_rows(excel_file, args.excel_sheet)

            if not schema:
                rows, schema = get_schema(rows)
                print('DETECTED SCHEMA', json.dumps(schema))
                print('Please run again with the above schema provided.')
                exit()

            rows_to_table(config, auth, config.project, args.dataset,
                          args.table, rows, schema)

    else:
        # print schema
        print(
            json.dumps(table_to_schema(config, auth, config.project,
                                       args.dataset, args.table),
                       indent=2))
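
For reference, the --schema flag and the printed detected schema both appear to use the standard BigQuery field-list format; a hedged example with invented field names:

import json

# Example schema in the BigQuery field-list format this helper prints and accepts.
# It could be passed on the command line as JSON, for example:
#   python helper.py --project [id] --dataset [name] --table [name] \
#     --csv data.csv --schema "$(cat schema.json)" -s [credentials]
schema = [
    {'name': 'id', 'type': 'INTEGER', 'mode': 'NULLABLE'},
    {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'created', 'type': 'DATE', 'mode': 'NULLABLE'}
]
print(json.dumps(schema))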