def descriptor_and_rows_to_dataframe(descriptor, rows):
    """Build a ``pandas.DataFrame`` from a Table Schema descriptor and rows.

    Args:
        descriptor: a JSON Table Schema descriptor (dict) accepted by
            ``Schema``.
        rows: an iterable of row sequences, each aligned positionally with
            ``schema.fields``.

    Returns:
        ``pandas.DataFrame`` whose columns follow ``schema.headers``. If the
        schema declares a single-column primary key, that column becomes the
        frame's index (a ``DatetimeIndex`` for date/datetime key types).

    Raises:
        RuntimeError: if the schema declares a multi-column primary key.
    """
    # Prepare: only a single-column primary key can be mapped to an index.
    primary_key = None
    schema = Schema(descriptor)
    if len(schema.primary_key) == 1:
        primary_key = schema.primary_key[0]
    elif len(schema.primary_key) > 1:
        raise RuntimeError('Multi-column primary keys are not supported')

    # Get data/index: cast every cell, splitting the primary-key value out of
    # the data row so it can feed the index instead.
    data_rows = []
    index_rows = []
    # Field types overridden per-column when a None forces a float dtype.
    jtstypes_map = {}
    for row in rows:
        values = []
        index = None
        for field, value in zip(schema.fields, row):
            try:
                value = field.cast_value(value)
            except InvalidObjectType:
                # Fall back to raw JSON decoding for object-typed cells.
                value = json.loads(value)
            if value is None and field.type in ('number', 'integer'):
                # Integer columns with missing values must become float
                # ('number') so NaN can represent the gap.
                jtstypes_map[field.name] = 'number'
                # BUGFIX: np.NaN alias was removed in NumPy 2.0; np.nan is
                # the supported spelling on every NumPy version.
                value = np.nan
            if field.name == primary_key:
                index = value
            else:
                values.append(value)
        data_rows.append(tuple(values))
        index_rows.append(index)

    # Get dtypes for the structured array (primary-key column excluded).
    dtypes = []
    for field in schema.fields:
        if field.name != primary_key:
            field_name = field.name
            if six.PY2:
                # Py2 numpy structured-array field names must be bytes.
                field_name = field.name.encode('utf-8')
            dtype = jtstype_to_dtype(jtstypes_map.get(field.name, field.type))
            dtypes.append((field_name, dtype))

    # Create dataframe
    index = None
    columns = schema.headers
    array = np.array(data_rows, dtype=dtypes)
    if primary_key:
        index_field = schema.get_field(primary_key)
        index_dtype = jtstype_to_dtype(index_field.type)
        index_class = pd.Index
        if index_field.type in ['datetime', 'date']:
            index_class = pd.DatetimeIndex
        index = index_class(index_rows, name=primary_key, dtype=index_dtype)
        # BUGFIX: `filter(...)` is a lazy one-shot iterator on Python 3;
        # materialize a list so `columns` can be reused/inspected safely.
        columns = [column for column in schema.headers
                   if column != primary_key]
    dataframe = pd.DataFrame(array, index=index, columns=columns)
    return dataframe
def test_get_field():
    """Schema.get_field returns the matching field, or None when absent."""
    schema = Schema(DESCRIPTOR_MIN)
    for field_name in ('id', 'height'):
        assert schema.get_field(field_name).name == field_name
    assert schema.get_field('undefined') is None