Example #1
0
def get_schema(rows, header=True, infer_type=True):
  '''
  Buffer every row and derive a column schema (name, type, optional mode) from them.

  CAUTION: Memory suck. The whole iterable is consumed and held in a list so it can be
  returned alongside the schema.
  RECOMMEND: Define the schema yourself, it will also ensure data integrity downstream.

  Args:
    rows: iterable of rows, each row a sequence of cell values. List rows shorter than
      the first row are padded in place with None; non-list rows (e.g. tuples) are
      copied into lists first.
    header: if True the first row supplies the field names (passed through
      row_header_sanitize) and is excluded from type inference; if False fields are
      named Field_0, Field_1, ... and every row, including the first, is treated as data.
    infer_type: if True, int / bool / float values map to INTEGER / BOOLEAN / FLOAT;
      if False every column is STRING.

  Returns:
    (row_buffer, schema): the buffered rows (header row included when present) and a
    list of dicts with "name" and "type", plus "mode": "NULLABLE" on any column where
    a None or empty string value was seen.

  Raises:
    IndexError: if a data row has more cells than the schema has columns.
  '''

  schema = []
  row_buffer = []

  # everything else defaults to STRING; the empty lookup turns inference off
  type_to_bq = {int: 'INTEGER', bool: 'BOOLEAN', float: 'FLOAT'} if infer_type else {}
  numeric_types = ('INTEGER', 'FLOAT')

  # columns that have already seen a non null value ( first one determines type )
  non_null_column = set()

  first = True
  ct_columns = 0

  for row in rows:

    # tuples and other sequences cannot be padded with a list, so normalize them
    if not isinstance(row, list):
      row = list(row)

    # buffer the iterator to be returned with schema ( padding is a no-op on the first row )
    row += [None] * (ct_columns - len(row))
    row_buffer.append(row)

    # define schema field names and set defaults ( if no header enumerate fields )
    if first:
      ct_columns = len(row)
      if header:
        names = row_header_sanitize(row)
      else:
        # names are purely positional, so data values are never run through the header sanitizer
        names = ['Field_%d' % index for index in range(ct_columns)]
      schema.extend({ "name": name, "type": "STRING" } for name in names)

    # then determine type of each column, skipping only a real header row
    # ( without a header the first row is data and must be inspected too )
    if not (first and header):
      for index, value in enumerate(row):
        # if null, set only mode
        if value is None or value == '':
          schema[index]['mode'] = 'NULLABLE'
          continue

        # exact type lookup on purpose: isinstance would classify bool as int
        column_type = type_to_bq.get(type(value), 'STRING')

        # if first non null value, then just set type
        if index not in non_null_column:
          schema[index]['type'] = column_type
          non_null_column.add(index)
        # otherwise change type only if its inconsistent
        elif column_type != schema[index]['type']:
          # mixed integers and floats default to floats
          if column_type in numeric_types and schema[index]['type'] in numeric_types:
            schema[index]['type'] = 'FLOAT'
          # any other mix is always a string
          else:
            schema[index]['type'] = 'STRING'

    # no longer first row
    first = False

  return row_buffer, schema
Example #2
0
def make_schema(header):
    """Build one nullable STRING field definition per sanitized header name.

    The header is passed through row_header_sanitize and each resulting name
    becomes a dict with 'name', 'type' and 'mode' keys, in the same order.
    """
    fields = []
    for column_name in row_header_sanitize(header):
        field = {'name': column_name, 'type': 'STRING', 'mode': 'NULLABLE'}
        fields.append(field)
    return fields