def get_schema(rows, header=True, infer_type=True):
    """Buffer every row and derive a BigQuery style schema from the values.

    CAUTION: Memory heavy. All of ``rows`` is consumed and held in a list,
    which is returned for use in place of the original iterator.
    RECOMMEND: Define the schema yourself, it will also ensure data integrity
    downstream.

    Every field starts as STRING. The first non-null value seen in a column
    sets its type; later values that disagree widen it: INTEGER mixed with
    FLOAT becomes FLOAT, any other mix becomes STRING. A field only receives
    ``'mode': 'NULLABLE'`` once a ``None`` or ``''`` value is seen in it.

    Args:
      rows: Iterable of rows. Rows shorter than the first row are padded in
        place with ``None``, so short rows must be lists.
      header: True if the first row holds column names. Names are then taken
        from ``row_header_sanitize(first_row)`` and that row is excluded from
        type inference. If False, fields are named ``Field_0``, ``Field_1``,
        ... and the first row is treated as data.
      infer_type: If False, no type lookup is done and every column stays
        STRING.

    Returns:
      Tuple ``(row_buffer, schema)``. ``row_buffer`` is the list of all rows
      read (including the header row when there is one); ``schema`` is a list
      of dicts with ``name``, ``type`` and optionally ``mode`` keys.

    Raises:
      IndexError: If a data row holds more values than the first row.
    """
    schema = []
    row_buffer = []

    # Exact-type lookup; anything not listed (or everything, when inference
    # is turned off) falls back to STRING.
    type_to_bq = (
        {int: 'INTEGER', bool: 'BOOLEAN', float: 'FLOAT'} if infer_type else {}
    )
    numeric = ('INTEGER', 'FLOAT')

    # Columns whose type has already been set by a non-null value.
    non_null_column = set()
    ct_columns = 0

    for row_number, row in enumerate(rows):
        first = row_number == 0

        # Pad short rows to the width of the first row. Only touch the row
        # when padding is really needed so rows that are already wide enough
        # do not have to support ``+=`` with a list.
        missing = ct_columns - len(row)
        if missing > 0:
            row += [None] * missing
        row_buffer.append(row)

        if first:
            ct_columns = len(row)
            if header:
                names = row_header_sanitize(row)
            else:
                # The first row is data here, so do not run it through the
                # header sanitizer; the names are generated instead.
                names = ['Field_%d' % index for index in range(ct_columns)]
            schema.extend({'name': name, 'type': 'STRING'} for name in names)
            if header:
                # A header row carries names, not values to infer types from.
                continue

        # Every data row takes part in inference, including the first row of
        # headerless input.
        for index, value in enumerate(row):
            if value is None or value == '':
                # Nulls never influence the type, only the mode.
                schema[index]['mode'] = 'NULLABLE'
                continue

            column_type = type_to_bq.get(type(value), 'STRING')
            field = schema[index]
            if index not in non_null_column:
                # First non-null value decides the initial type.
                field['type'] = column_type
                non_null_column.add(index)
            elif column_type != field['type']:
                if column_type in numeric and field['type'] in numeric:
                    # Mixed integers and floats widen to FLOAT.
                    field['type'] = 'FLOAT'
                else:
                    # Any other disagreement can only be held as STRING.
                    field['type'] = 'STRING'

    return row_buffer, schema
def make_schema(header):
    """Build a schema of nullable STRING fields from a row of column names.

    The names are whatever ``row_header_sanitize(header)`` yields, in order.

    Args:
      header: Row of column names, passed as is to ``row_header_sanitize``.

    Returns:
      List with one ``{'name', 'type', 'mode'}`` dict per sanitized name,
      where type is always STRING and mode is always NULLABLE.
    """
    fields = []
    for column_name in row_header_sanitize(header):
        field = {'name': column_name, 'type': 'STRING', 'mode': 'NULLABLE'}
        fields.append(field)
    return fields