Example 1
    def table(self):
        """Build and cache a table from query results"""
        if self._table is None:
            schema_obj = Schema(self._schema)

            table = []
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)
                    table.append(OrderedDict(zip(field_names, table_row)))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                table = [{'boolean': self.raw_data['boolean']}]

            self._table = table

        return self._table
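
All of the examples on this page follow the same pattern: build a Schema from a table-schema descriptor, then call cast_row to turn raw string values into typed Python values. A minimal, self-contained sketch of that pattern (the import path and the descriptor below are assumptions for illustration, not taken from any of the examples):

from collections import OrderedDict
from jsontableschema import Schema  # assumed import path

descriptor = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'name', 'type': 'string'},
    ]
}
schema = Schema(descriptor)
row = schema.cast_row(['1', 'english'])  # -> [1, 'english']
record = OrderedDict(zip([field.name for field in schema.fields], row))
print(record)  # OrderedDict([('id', 1), ('name', 'english')])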
Example 3
    def iter(self, bucket):

        # Get response
        descriptor = self.describe(bucket)
        schema = Schema(descriptor)
        tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
        response = self.__service.tabledata().list(
            projectId=self.__project,
            datasetId=self.__dataset,
            tableId=tablename).execute()

        # Yield rows
        for fields in response['rows']:
            row = []
            values = [field['v'] for field in fields['f']]
            for index, field in enumerate(schema.fields):
                value = values[index]
                # Fix BigQuery "1.234234E9"-style epoch seconds for date/datetime fields
                if field.type == 'date':
                    value = datetime.datetime.utcfromtimestamp(
                        int(float(value)))
                    fmt = '%Y-%m-%d'
                    if field.format.startswith('fmt:'):
                        fmt = field.format.replace('fmt:', '')
                    value = value.strftime(fmt)
                elif field.type == 'datetime':
                    value = datetime.datetime.utcfromtimestamp(
                        int(float(value)))
                    value = '%sZ' % value.isoformat()
                row.append(value)
            yield schema.cast_row(row)
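
The date/datetime branches above work around BigQuery returning DATE and DATETIME cells as epoch seconds in scientific notation (e.g. "1.234234E9"). The conversion in isolation, as a standalone sketch:

import datetime

raw = '1.234234E9'  # epoch seconds as returned by BigQuery
value = datetime.datetime.utcfromtimestamp(int(float(raw)))
print(value.strftime('%Y-%m-%d'))  # date fields: 2009-02-10
print('%sZ' % value.isoformat())   # datetime fields: 2009-02-10T02:46:40Z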
Example 4
def sunc_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        row = schema.cast_row(row)
        values = []
        for index, field in enumerate(descriptor['fields']):
            value = row[index]
            if field['type'] == 'date':
                value = datetime.datetime.fromordinal(value.toordinal())
            values.append(value)
        result.append(values)
    return result
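
The date branch above widens a datetime.date (as produced by cast_row for 'date' fields) into a datetime at midnight by round-tripping through the ordinal day number. The same conversion in isolation, with an illustrative date:

import datetime

d = datetime.date(2016, 3, 1)                      # what cast_row yields for a 'date' field
dt = datetime.datetime.fromordinal(d.toordinal())  # same day, at midnight
print(dt)  # 2016-03-01 00:00:00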
Example 7
    def __write_rows_buffer(self, bucket, rows_buffer):

        # Process data to byte stream csv
        descriptor = self.describe(bucket)
        schema = Schema(descriptor)
        bytes = io.BufferedRandom(io.BytesIO())
        writer = unicodecsv.writer(bytes, encoding='utf-8')
        for values in rows_buffer:
            row = []
            values = schema.cast_row(values)
            for index, field in enumerate(schema.fields):
                value = values[index]
                # Here we convert date to datetime
                if field.type == 'date':
                    value = datetime.datetime.fromordinal(value.toordinal())
                    value = '%sZ' % value.isoformat()
                row.append(value)
            writer.writerow(row)
        bytes.seek(0)

        # Prepare job body
        tablename = mappers.bucket_to_tablename(self.__prefix, bucket)
        body = {
            'configuration': {
                'load': {
                    'destinationTable': {
                        'projectId': self.__project,
                        'datasetId': self.__dataset,
                        'tableId': tablename
                    },
                    'sourceFormat': 'CSV',
                }
            }
        }

        # Prepare job media body
        mimetype = 'application/octet-stream'
        media_body = MediaIoBaseUpload(bytes, mimetype=mimetype)

        # Make request to Big Query
        response = self.__service.jobs().insert(
            projectId=self.__project, body=body,
            media_body=media_body).execute()
        self.__wait_response(response)
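
The load job above streams rows through an in-memory CSV buffer rather than a temporary file. A minimal sketch of that buffering step on its own (row values are illustrative; note that the original method shadows the built-in name bytes):

import io
import unicodecsv

stream = io.BufferedRandom(io.BytesIO())  # seekable, writable byte stream
writer = unicodecsv.writer(stream, encoding='utf-8')
writer.writerow(['1', 'english'])
writer.writerow(['2', '中国人'])
stream.seek(0)  # rewind before wrapping in MediaIoBaseUpload
print(stream.read())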
Example 8
    def _iter_rows(self):
        if self._schema is not None:  # Not empty results
            schema_obj = Schema(self._schema)
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)

                    # when the column is a string value, the jsontableschema
                    # library is incorrectly mapping the several literal
                    # string values ('null', 'none', '-', etc.) to the python
                    # `None` value - a deeper fix might be to reconsider using
                    # that library, or maybe fixing this issue in that
                    # library (since it's probably not a good idea to render
                    # a number of strings un-representable) - this fixes the
                    # problem for our result sets.  Essentially, this zips
                    # over each result set and checks whether we mapped a
                    # non-null value to `None` in a string field, and if
                    # so it restores the non-null value before continuing
                    table_row = map(
                        lambda field, original, mapped: original
                        if (not mapped) and original and field.type == 'string'
                        else mapped, schema_obj.fields, values, table_row)

                    yield OrderedDict(zip(field_names, table_row))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                yield {'boolean': self.raw_data['boolean']}
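
The map() expression above restores original string values that cast_row wrongly turned into None. The same check in isolation, with stand-in field types instead of schema_obj.fields:

values = ['null', '10.0']     # raw values taken from the RDF terms
table_row = [None, 10.0]      # cast_row wrongly mapped 'null' to None
field_types = ['string', 'number']

fixed = [
    original if (not mapped) and original and ftype == 'string' else mapped
    for ftype, original, mapped in zip(field_types, values, table_row)
]
print(fixed)  # ['null', 10.0]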
Example 9
    def _iter_rows(self):
        if self._schema is not None:  # Not empty results
            schema_obj = Schema(self._schema)
            if 'results' in self.raw_data:
                field_names = [field.name for field in schema_obj.fields]
                result_vars = self.raw_data['head']['vars']

                for binding in self.raw_data['results']['bindings']:
                    rdf_terms = table_schema.order_terms_in_binding(
                        result_vars, binding)

                    values = []
                    for rdf_term in rdf_terms:
                        if rdf_term is not None:
                            values.append(rdf_term['value'])
                        else:
                            values.append(None)

                    table_row = schema_obj.cast_row(values)
                    yield OrderedDict(zip(field_names, table_row))
            elif 'boolean' in self.raw_data:
                # Results of an ASK query
                yield {'boolean': self.raw_data['boolean']}
def sync_rows(descriptor, rows):
    result = []
    schema = Schema(descriptor)
    for row in rows:
        result.append(schema.cast_row(row))
    return result
def test_cast_row_too_short():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string']
    with pytest.raises(exceptions.InvalidCastError):
        schema.cast_row(source)
def test_cast_row_null_values():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '', '-', 'string', 'null']
    target = ['string', None, None, 'string', None]
    assert schema.cast_row(source) == target
def test_cast_row():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', '10.0', '1', 'string', 'string']
    target = ['string', Decimal(10.0), 1, 'string', 'string']
    assert schema.cast_row(source) == target
def test_cast_row_wrong_type_multiple_errors():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid) as excinfo:
        schema.cast_row(source, no_fail_fast=True)
    assert len(excinfo.value.errors) == 2
def test_cast_row_wrong_type_no_fail_fast_true():
    schema = Schema(DESCRIPTOR_MAX)
    source = ['string', 'notdecimal', '10.6', 'string', 'string']
    with pytest.raises(exceptions.MultipleInvalid):
        schema.cast_row(source, no_fail_fast=True)
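
Outside of pytest, the same failure-collection behaviour can be used directly. A sketch built only from the API shown in the tests above (DESCRIPTOR_MAX and the exceptions module are assumed to be imported as in the test module):

schema = Schema(DESCRIPTOR_MAX)
try:
    schema.cast_row(['string', 'notdecimal', '10.6', 'string', 'string'],
                    no_fail_fast=True)
except exceptions.MultipleInvalid as exception:
    for error in exception.errors:
        print(error)  # one entry per failed field cast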