def table(self):
    """Build and cache a table from query results"""
    if self._table is not None:
        return self._table
    rows = []
    schema_obj = Schema(self._schema)
    if 'results' in self.raw_data:
        # SELECT-style results: one OrderedDict per binding, keyed by field name.
        names = [field.name for field in schema_obj.fields]
        head_vars = self.raw_data['head']['vars']
        for binding in self.raw_data['results']['bindings']:
            terms = table_schema.order_terms_in_binding(head_vars, binding)
            raw_values = [term['value'] if term is not None else None
                          for term in terms]
            cast_values = schema_obj.cast_row(raw_values)
            rows.append(OrderedDict(zip(names, cast_values)))
    elif 'boolean' in self.raw_data:
        # Results of an ASK query
        rows = [{'boolean': self.raw_data['boolean']}]
    self._table = rows
    return self._table
def iter(self, bucket):
    """Yield schema-cast rows fetched from the BigQuery table behind *bucket*.

    Describes the bucket to obtain its Table Schema descriptor, pulls all
    rows via the BigQuery ``tabledata().list`` endpoint, normalizes
    date/datetime cells, and yields each row cast against the schema.
    """
    # Get response
    descriptor = self.describe(bucket)
    schema = Schema(descriptor)
    tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
    response = self.__service.tabledata().list(
        projectId=self.__project, datasetId=self.__dataset,
        tableId=tablename).execute()
    # Yield rows
    for fields in response['rows']:
        row = []
        # BigQuery wraps each cell as {'v': value} under the row's 'f' list.
        values = [field['v'] for field in fields['f']]
        for index, field in enumerate(schema.fields):
            value = values[index]
            # Here we fix bigquery "1.234234E9" like datetimes
            if field.type == 'date':
                # Epoch-seconds string -> datetime, then format as a date
                # string (default ISO date, or the descriptor's 'fmt:' format).
                value = datetime.datetime.utcfromtimestamp(
                    int(float(value)))
                fmt = '%Y-%m-%d'
                if field.format.startswith('fmt:'):
                    fmt = field.format.replace('fmt:', '')
                value = value.strftime(fmt)
            elif field.type == 'datetime':
                # Epoch-seconds string -> ISO-8601 with a 'Z' UTC suffix.
                value = datetime.datetime.utcfromtimestamp(
                    int(float(value)))
                value = '%sZ' % value.isoformat()
            row.append(value)
        yield schema.cast_row(row)
def sunc_rows(descriptor, rows):
    """Cast *rows* against the descriptor's schema.

    Date cells are widened to ``datetime.datetime`` (midnight) so the
    values round-trip through backends that lack a plain date type.
    """
    # NOTE(review): name looks like a typo of `sync_rows`; kept for callers.
    schema = Schema(descriptor)
    result = []
    for raw_row in rows:
        cast_row = schema.cast_row(raw_row)
        converted = []
        for position, field in enumerate(descriptor['fields']):
            cell = cast_row[position]
            if field['type'] == 'date':
                cell = datetime.datetime.fromordinal(cell.toordinal())
            converted.append(cell)
        result.append(converted)
    return result
def __write_rows_buffer(self, bucket, rows_buffer):
    """Upload a buffer of rows to the bucket's BigQuery table.

    Serializes the rows to an in-memory UTF-8 CSV stream, then submits a
    BigQuery CSV load job and waits for it to finish.
    """
    # Process data to byte stream csv
    descriptor = self.describe(bucket)
    schema = Schema(descriptor)
    # Renamed from `bytes` — the original shadowed the builtin `bytes` type.
    stream = io.BufferedRandom(io.BytesIO())
    writer = unicodecsv.writer(stream, encoding='utf-8')
    for values in rows_buffer:
        row = []
        values = schema.cast_row(values)
        for index, field in enumerate(schema.fields):
            value = values[index]
            # Here we convert date to datetime
            if field.type == 'date':
                value = datetime.datetime.fromordinal(value.toordinal())
                value = '%sZ' % value.isoformat()
            row.append(value)
        writer.writerow(row)
    stream.seek(0)
    # Prepare job body
    tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
    body = {
        'configuration': {
            'load': {
                'destinationTable': {
                    'projectId': self.__project,
                    'datasetId': self.__dataset,
                    'tableId': tablename
                },
                'sourceFormat': 'CSV',
            }
        }
    }
    # Prepare job media body
    mimetype = 'application/octet-stream'
    media_body = MediaIoBaseUpload(stream, mimetype=mimetype)
    # Make request to Big Query
    response = self.__service.jobs().insert(
        projectId=self.__project,
        body=body,
        media_body=media_body).execute()
    self.__wait_response(response)
def _iter_rows(self):
    """Yield query-result rows as ordered field-name -> value mappings.

    Handles both SELECT-style results ('results' key) and ASK-style
    results ('boolean' key). Yields nothing when the schema is absent.
    """
    if self._schema is not None:  # Not empty results
        schema_obj = Schema(self._schema)
        if 'results' in self.raw_data:
            field_names = [field.name for field in schema_obj.fields]
            result_vars = self.raw_data['head']['vars']
            for binding in self.raw_data['results']['bindings']:
                # Align RDF terms with the declared variable order; a term
                # is None when the binding has no value for that variable.
                rdf_terms = table_schema.order_terms_in_binding(
                    result_vars, binding)
                values = []
                for rdf_term in rdf_terms:
                    if rdf_term is not None:
                        values.append(rdf_term['value'])
                    else:
                        values.append(None)
                table_row = schema_obj.cast_row(values)
                # when the column is a string value, the jsontableschema
                # library is incorrectly mapping the several literal
                # string values ('null', 'none', '-', etc.) to the python
                # `None` value - a deeper fix might be to reconsider using
                # that library, or maybe fixing this issue in that
                # library (since it's probably not a good idea to render
                # a number of strings un-representable) - this fixes the
                # problem for our result sets. Essentially, this zips
                # over each result set and checks whether we mapped a
                # non-null value to `None` in a string field, and if
                # so it restores the non-null value before continuing
                table_row = map(
                    lambda field, original, mapped:
                    original if (not mapped) and original and
                    field.type == 'string' else mapped,
                    schema_obj.fields, values, table_row)
                yield OrderedDict(zip(field_names, table_row))
        elif 'boolean' in self.raw_data:
            # Results of an ASK query
            yield {'boolean': self.raw_data['boolean']}
def _iter_rows(self):
    """Yield query-result rows as ordered field-name -> value mappings."""
    if self._schema is None:
        # Empty results: nothing to yield.
        return
    schema_obj = Schema(self._schema)
    if 'results' in self.raw_data:
        # SELECT-style results.
        names = [field.name for field in schema_obj.fields]
        head_vars = self.raw_data['head']['vars']
        for binding in self.raw_data['results']['bindings']:
            terms = table_schema.order_terms_in_binding(head_vars, binding)
            raw_values = [term['value'] if term is not None else None
                          for term in terms]
            cast_values = schema_obj.cast_row(raw_values)
            yield OrderedDict(zip(names, cast_values))
    elif 'boolean' in self.raw_data:
        # Results of an ASK query
        yield {'boolean': self.raw_data['boolean']}
def sync_rows(descriptor, rows):
    """Return every row in *rows* cast against the descriptor's schema."""
    schema = Schema(descriptor)
    return [schema.cast_row(row) for row in rows]
def test_cast_row_too_short():
    """A row with fewer values than schema fields raises InvalidCastError."""
    schema = Schema(DESCRIPTOR_MAX)
    with pytest.raises(exceptions.InvalidCastError):
        schema.cast_row(['string', '10.0', '1', 'string'])
def test_cast_row_null_values():
    """Null markers ('', '-', 'null') are cast to None."""
    schema = Schema(DESCRIPTOR_MAX)
    given = ['string', '', '-', 'string', 'null']
    expected = ['string', None, None, 'string', None]
    assert schema.cast_row(given) == expected
def test_cast_row():
    """Numeric strings are cast to their field types; strings pass through."""
    schema = Schema(DESCRIPTOR_MAX)
    given = ['string', '10.0', '1', 'string', 'string']
    expected = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(given) == expected
def test_cast_row_wrong_type_multiple_errors():
    """With no_fail_fast, both bad cells are collected into MultipleInvalid."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid) as excinfo:
        schema.cast_row(bad_row, no_fail_fast=True)
    assert len(excinfo.value.errors) == 2
def test_cast_row_wrong_type_no_fail_fast_true():
    """With no_fail_fast, type errors surface as MultipleInvalid."""
    schema = Schema(DESCRIPTOR_MAX)
    bad_row = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid):
        schema.cast_row(bad_row, no_fail_fast=True)