Example #1
0
def report_clean(rows, datastudio=False, nulls=False):
    """ Helper to fix DBM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Changes data format to match data studio if datastudio=True.
  * Changes cell string Unknown to blank ( '' ) if nulls=True.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows,  project.task.get('datastudio', False))
  ```

  Args:
    * rows: (iterator) Rows to clean.
    * datastudio: (boolean) If True, sanitize headers and reformat the date column.
    * nulls: (boolean) If True, replace 'Unknown' cells with ''.

  Returns:
    * Iterator of cleaned rows.

  """

    if project.verbose: print('DBM Report Clean')

    first = True
    date = None
    for row in rows:
        # stop if no data returned
        if row == ['No data returned by the reporting service.']: break

        # stop at blank row ( including sum row )
        if not row or row[0] is None or row[0] == '': break

        # find 'Date' column if it exists ( index taken before sanitize so it stays valid )
        if first:
            try:
                date = row.index('Date')
            except ValueError:
                pass
            if datastudio: row = [column_header_sanitize(cell) for cell in row]

        # if data studio formatting is on: rename the header, strip '/' from date values
        if datastudio and date is not None:
            row[date] = 'Report_Day' if first else row[date].replace('/', '')

        # blank out Unknown cells ( which throw off schema on import types )
        if nulls:
            row = ['' if cell.strip() == 'Unknown' else cell for cell in row]

        # return the row
        yield row

        # not first row anymore
        first = False
Example #2
0
def report_clean(rows):
  """ Helper to fix DCM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Removes '(not set)' and '-' cell values ( which throw off schema detection ).

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows)
  ```

  Args:
    * rows: (iterator) Rows to clean.

  Returns:
    * Iterator of cleaned rows.

  """

  if project.verbose: print('DCM REPORT CLEAN')

  first = True

  # find start of report ( skip the metadata block above 'Report Fields' )
  for row in rows:
    if row and row[0] == 'Report Fields': break

  # process the report
  for row in rows:
    # quit if empty report
    if 'No data returned by the reporting service.' in row: break

    # stop parsing if end of data
    if not row or row[0] == 'Grand Total:': break

    # on the header row, rename 'Date' and sanitize all headers
    if first:
      try:
        date_column = row.index('Date')
        row[date_column] = 'Report_Day'
      except ValueError: pass
      row = [column_header_sanitize(cell) for cell in row]

    # remove not set columns ( which throw off schema on import types )
    row = ['' if cell.strip() in ('(not set)', '-') else cell for cell in row]

    # return the row
    yield row

    # not first row anymore
    first = False
Example #3
0
def dt_schema(header):
  """Build a BigQuery schema from a DT header row.

  Each header is sanitized, typed via DT_Field_Lookup ( STRING when unknown ),
  and marked NULLABLE.
  """
  fields = []
  for name in header:
    clean = column_header_sanitize(name)
    field_type = DT_Field_Lookup.get(clean, 'STRING')
    fields.append({'name': clean, 'type': field_type, 'mode': 'NULLABLE'})
  return fields
Example #4
0
def email_read():
  """Fetch the most recent email matching project.task['read'] and load its files.

  Downloads matching messages, keeps only the newest one, then writes each
  attached file to project.task['read']['out'] via put_rows:
  * .gz files are decompressed first.
  * .xlsx files are written one sheet at a time.
  * .csv files are written directly.
  Exits the process if no matching email is found.
  """

  # process only most recent message
  email = get_email_messages(
    project.task['auth'],
    project.task['read']['from'],
    project.task['read']['to'],
    project.task['read'].get('subject', None),
    project.task['read'].get('link', None),
    project.task['read'].get('attachment', None),
    download=True
  )

  # only take the most recent email; catch ONLY exhaustion of the iterator
  # ( the previous bare except also hid real errors from the fetch )
  try: email = next(email)
  except StopIteration:
    traceback.print_exc()
    if project.verbose: print('NO EMAILS FOUND')
    exit()

  if project.verbose: print('EMAIL:', email['subject'])

  # loop all attached files
  for filename, data in email_files(email):

    if project.verbose: print('EMAIL FILENAME:', filename)

    # decompress if necessary
    if filename.endswith('.gz'):
      data = gzip.GzipFile(fileobj=data, mode='rb')
      filename = filename[:-3]

    # if excel file, save each sheet individually
    if filename.endswith('.xlsx'):

      for sheet, rows in excel_to_rows(data):
        rows = rows_trim(rows)
        rows = rows_header_sanitize(rows)

        if project.verbose: print('EMAIL WRITE', filename)
        put_rows(project.task['auth'], project.task['read']['out'], rows, column_header_sanitize(sheet))

    # if csv, save directly
    elif filename.endswith('.csv'):
      rows = csv_to_rows(data.read().decode())
      rows = rows_header_sanitize(rows)

      if project.verbose: print('EMAIL WRITE', filename)
      put_rows(project.task['auth'], project.task['read']['out'], rows)

    else:
      if project.verbose: print('UNSUPPORTED FILE:', filename)
Example #5
0
def report_schema(headers):
    """ Helper to determine the schema of a given set of report headers.

  Using a match table generated from the DCM proto, each report header is
  matched
  to its type and a schema is assembled. If not found defaults to STRING.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows)
  schema = report_schema(next(rows))
  ```

  Args:
    * headers: (list) First row of a report.

  Returns:
    * JSON schema definition.

  """

    def _field_type(name):
        # first try exact match
        field_type = DCM_Field_Lookup.get(name)
        if field_type is not None:
            return field_type

        # second try to match end for custom field names ( activity reports )
        for suffix, candidate in DCM_Field_Lookup.items():
            if name.endswith('_' + suffix):
                return candidate

        # finally default it to STRING
        return 'STRING'

    schema = []
    for header_name in headers:
        header_sanitized = column_header_sanitize(header_name)
        schema.append({
            'name': header_sanitized,
            'type': _field_type(header_sanitized),
            'mode': 'NULLABLE'
        })
    return schema
Example #6
0
def get_schema(sheet):
    """Build a BigQuery schema from a SmartSheet sheet's columns.

    Column titles are sanitized, types mapped through SMARTSHEET_TYPES
    ( STRING when unknown ), and every field is NULLABLE.
    """
    schema = []
    for column in sheet.columns:
        schema.append({
            "name": column_header_sanitize(column.title),
            "type": SMARTSHEET_TYPES.get(str(column.type), "STRING"),
            "mode": "NULLABLE"
        })
    return schema
Example #7
0
def get_schema(sheet_or_report):
    """Map each column of a SmartSheet sheet or report to a NULLABLE BigQuery field."""

    def _field(column):
        # sanitize the title and look up the BigQuery type, defaulting to STRING
        return {
            'name': column_header_sanitize(column.title),
            'type': SMARTSHEET_TYPES.get(str(column.type), 'STRING'),
            'mode': 'NULLABLE'
        }

    return [_field(column) for column in sheet_or_report.columns]
def report_clean(rows, datastudio=False, nulls=False):
    """ Helper to fix DBM report issues for BigQuery and ensure schema compliance.

  Memory efficiently cleans each row by fixing:
  * Strips header and footer to preserve only data rows.
  * Changes 'Date' to 'Report_Day' to avoid using reserved name in BigQuery.
  * Changes date values to use '-' instead of '/' for BigQuery compatibility.
  * Changes cell string Unknown to blank ( '' ) if nulls=True.

  Usage example:

  ```
  filename, report = report_file(...)
  rows = report_to_rows(report)
  rows = report_clean(rows,  project.task.get('datastudio', False))
  ```

  Args:
    * rows: (iterator) Rows to clean.
    * datastudio: (boolean) If True, sanitize headers and reformat date cells.
    * nulls: (boolean) If True, replace 'Unknown' cells with ''.

  Returns:
    * Iterator of cleaned rows.

  """

    if project.verbose: print('DBM Report Clean')

    first = True
    for row in rows:
        # stop if no data returned
        if row == ['No data returned by the reporting service.']: break

        # stop at blank row ( including sum row )
        if not row or row[0] is None or row[0] == '': break

        # sanitize header row
        if first:
            try:
                date_column = row.index('Date')
                row[date_column] = 'Report_Day'
            except ValueError:
                pass
            if datastudio: row = [column_header_sanitize(cell) for cell in row]

        # for all data rows clean up cells
        else:
            # check if data studio formatting is applied reformat the dates
            # ( was isinstance(cell, basestring): Python 2 only, NameError on Python 3 )
            if datastudio:
                row = [
                    cell.replace('/', '-') if isinstance(cell, str)
                    and len(cell) == 4 + 1 + 2 + 1 + 2 and cell[4] == '/'
                    and cell[7] == '/' else cell for cell in row
                ]  # 5x faster than regexp

        # blank out Unknown cells ( which throw off schema on import types )
        if nulls:
            row = ['' if cell.strip() == 'Unknown' else cell for cell in row]

        # return the row
        yield row

        # not first row anymore
        first = False