def test_make_nullable():
    assert {'type': ['boolean', 'null']} \
           == json_schema.make_nullable({'type': 'boolean'})
    assert {'type': ['null', 'boolean']} \
           == json_schema.make_nullable({'type': ['null', 'boolean']})
    assert {'type': ['null', 'string']} \
           == json_schema.make_nullable({'type': ['null', 'string']})

    ## Make sure we're not modifying the original
    schema = {'type': ['string']}
    assert json_schema.get_type(schema) == ['string']

    assert {'type': ['string', 'null']} \
           == json_schema.make_nullable(schema)

    assert json_schema.get_type(schema) == ['string']

    assert {'definitions': {
                'address': {
                    'type': 'object',
                    'properties': {
                        'street_address': {'type': 'string'},
                        'city': {'type': 'string'},
                        'state': {'type': 'string'}
                    },
                    'required': ['street_address', 'city', 'state']
                }
            },
            'type': ['object', 'null'],
            'properties': {
                'billing_address': {'$ref': '#/definitions/address'},
                'shipping_address': {'$ref': '#/definitions/address'}}} \
           == json_schema.make_nullable(
               {'definitions': {
                    'address': {
                        'type': 'object',
                        'properties': {
                            'street_address': {'type': 'string'},
                            'city': {'type': 'string'},
                            'state': {'type': 'string'}
                        },
                        'required': ['street_address', 'city', 'state']
                    }
                },
                'type': 'object',
                'properties': {
                    'billing_address': {'$ref': '#/definitions/address'},
                    'shipping_address': {'$ref': '#/definitions/address'}}})
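## A minimal sketch of the behavior the assertions above pin down, assuming
## `json_schema.get_type` returns the schema's `type` as a list and
## `json_schema.NULL == 'null'`. The real implementation lives in the
## `json_schema` module; `_make_nullable_sketch` is a hypothetical name used
## here only for illustration.
from copy import deepcopy

def _make_nullable_sketch(schema):
    out = deepcopy(schema)  # never mutate the caller's schema
    types = json_schema.get_type(out)
    if json_schema.NULL not in types:
        out['type'] = types + [json_schema.NULL]  # append 'null' as an allowed type
    return out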
def _serialize_table_record_field_name(self, remote_schema, streamed_schema, path, value_json_schema):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param streamed_schema: TABLE_SCHEMA(local)
    :param path: (string, ...)
    :param value_json_schema: dict, JSON Schema
    :return: string
    """

    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)

    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ##  ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})

        if mapping is not None:
            return mapping

    raise Exception('Unknown column path: {} for table: {}'.format(
        path,
        remote_schema['path']))
def json_schema_to_sql_type(self, schema):
    _type = json_schema.get_type(schema)
    not_null = True
    ln = len(_type)
    if ln == 1:
        _type = _type[0]
    if ln == 2 and json_schema.NULL in _type:
        not_null = False
        if _type.index(json_schema.NULL) == 0:
            _type = _type[1]
        else:
            _type = _type[0]
    elif ln > 2:
        raise PostgresError('Multiple types per column not supported')

    sql_type = 'text'

    if 'format' in schema and \
            schema['format'] == 'date-time' and \
            _type == 'string':
        sql_type = 'timestamp with time zone'
    elif _type == 'boolean':
        sql_type = 'boolean'
    elif _type == 'integer':
        sql_type = 'bigint'
    elif _type == 'number':
        sql_type = 'double precision'

    if not_null:
        sql_type += ' NOT NULL'

    return sql_type
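## Illustrative input/output pairs for the Postgres mapping above, traced
## directly from the branches of `json_schema_to_sql_type`:
##   {'type': ['string'], 'format': 'date-time'} -> 'timestamp with time zone NOT NULL'
##   {'type': ['null', 'integer']}               -> 'bigint'
##   {'type': ['number', 'null']}                -> 'double precision'
##   {'type': ['boolean']}                       -> 'boolean NOT NULL'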
def json_schema_to_sql_type(self, schema):
    _type = json_schema.get_type(schema)
    not_null = True
    ln = len(_type)
    if ln == 1:
        _type = _type[0]
    if ln == 2 and json_schema.NULL in _type:
        not_null = False
        if _type.index(json_schema.NULL) == 0:
            _type = _type[1]
        else:
            _type = _type[0]
    elif ln > 2:
        raise SnowflakeError('Multiple types per column not supported')

    sql_type = 'text'

    if 'format' in schema and \
            schema['format'] == 'date-time' and \
            _type == 'string':
        sql_type = 'TIMESTAMP_TZ'
    elif _type == 'boolean':
        sql_type = 'BOOLEAN'
    elif _type == 'integer':
        sql_type = 'NUMBER'
    elif _type == 'number':
        sql_type = 'FLOAT'

    if not_null:
        sql_type += ' NOT NULL'

    return sql_type
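## The Snowflake variant differs from the Postgres one only in the emitted
## SQL type names. Illustrative pairs traced from the branches above:
##   {'type': ['null', 'integer']}               -> 'NUMBER'
##   {'type': ['string'], 'format': 'date-time'} -> 'TIMESTAMP_TZ NOT NULL'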
def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param path: (string, ...)
    :param value_json_schema: dict, JSON Schema
    :return: string
    """

    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)

    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ##  ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema, path, {'type': json_schema.NUMBER})

        if mapping is not None:
            return mapping

    raise Exception('A compatible column for path {} and JSONSchema {} in table {} cannot be found.'.format(
        path,
        simple_json_schema,
        remote_schema['path']))
def merge_put_schemas(self, cur, table_schema, table_name, existing_schema, new_schema):
    new_properties = new_schema['properties']
    existing_properties = existing_schema['schema']['properties']
    for name, schema in new_properties.items():
        ## Mapping exists
        if self.get_mapping(existing_schema, name, schema) is not None:
            pass

        ## New column
        elif name not in existing_properties:
            existing_properties[name] = schema
            self.add_column(cur,
                            table_schema,
                            table_name,
                            name,
                            schema)

        ## Existing column non-nullable, new column is nullable
        elif not json_schema.is_nullable(existing_properties[name]) \
                and json_schema.get_type(schema) \
                == json_schema.get_type(json_schema.make_nullable(existing_properties[name])):
            existing_properties[name] = json_schema.make_nullable(existing_properties[name])
            self.make_column_nullable(cur,
                                      table_schema,
                                      table_name,
                                      name)

        ## Existing column, types compatible
        elif json_schema.to_sql(json_schema.make_nullable(schema)) \
                == json_schema.to_sql(json_schema.make_nullable(existing_properties[name])):
            pass

        ## Column type change
        elif self.mapping_name(name, schema) not in existing_properties \
                and self.mapping_name(name, existing_properties[name]) not in existing_properties:
            self.split_column(cur,
                              table_schema,
                              table_name,
                              name,
                              schema,
                              existing_properties)

        ## Error
        else:
            raise PostgresError(
                'Cannot handle column type change for: {}.{} columns {} and {}. Name collision likely.'.format(
                    table_schema,
                    table_name,
                    name,
                    self.mapping_name(name, schema)))
def _literal_only_schema(schema):
    ret = deepcopy(schema)

    ret_type = json_schema.get_type(ret)

    if json_schema.is_object(ret):
        ret_type.remove(json_schema.OBJECT)
    if json_schema.is_iterable(ret):
        ret_type.remove(json_schema.ARRAY)

    ret['type'] = ret_type

    return ret
def add_column_mapping(self, cur, table_name, from_path, to_name, mapped_schema):
    metadata = self._get_table_metadata(cur, table_name)

    mapping = {'type': json_schema.get_type(mapped_schema),
               'from': from_path}

    if 't' == json_schema.shorthand(mapped_schema):
        mapping['format'] = 'date-time'

    metadata['mappings'][to_name] = mapping

    self._set_table_metadata(cur, table_name, metadata)
def add_column_mapping(self, cur, table_name, from_path, to_name, mapped_schema):
    metadata = self._get_table_metadata(cur, table_name)

    if not metadata:
        metadata = {}

    if 'mappings' not in metadata:
        metadata['mappings'] = {}

    metadata['mappings'][to_name] = {'type': json_schema.get_type(mapped_schema),
                                     'from': from_path}

    self._set_table_metadata(cur, table_name, metadata)
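## Illustrative effect of the call above (the table and column names here are
## made up, and tuples may be serialized as lists when metadata is persisted):
##   self.add_column_mapping(cur, 'users', ('age',), 'age__i', {'type': ['integer']})
## leaves the table metadata containing:
##   {'mappings': {'age__i': {'type': ['integer'], 'from': ('age',)}}}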
def add_column_mapping(self, cur, table_schema, table_name, column_name, mapped_name, mapped_schema):
    metadata = self.get_table_metadata(cur, table_schema, table_name)

    if not metadata:
        metadata = {}

    if 'mappings' not in metadata:
        metadata['mappings'] = {}

    metadata['mappings'][mapped_name] = {'type': json_schema.get_type(mapped_schema),
                                         'from': column_name}

    self.set_table_metadata(cur, table_schema, table_name, metadata)
def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema_tuple):
    """
    Returns the appropriate remote field (column) name for `path`.

    :param remote_schema: TABLE_SCHEMA(remote)
    :param path: (string, ...)
    :param value_json_schema_tuple: tuple, JSON Schema
    :return: string
    """
    # rebuild the dict that needs to be passed further down the call stack
    if len(value_json_schema_tuple) == 1:
        value_json_schema = {'type': value_json_schema_tuple[0]}
    else:
        value_json_schema = {'type': value_json_schema_tuple[0],
                             'format': value_json_schema_tuple[1]}

    simple_json_schema = json_schema.simple_type(value_json_schema)

    mapping = self._get_mapping(remote_schema, path, simple_json_schema)

    if mapping is not None:
        return mapping

    ## Numbers are valid as `float` OR `int`
    ##  ie, 123.0 and 456 are valid 'number's
    if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
        mapping = self._get_mapping(remote_schema,
                                    path,
                                    {'type': json_schema.NUMBER})

        if mapping is not None:
            return mapping

    raise Exception('A compatible column for path {} and JSONSchema {} in table {} cannot be found.'.format(
        path,
        simple_json_schema,
        remote_schema['path']))
def _literal_only_schema(schema):
    ret_types = json_schema.get_type(schema)

    if json_schema.is_object(schema):
        ret_types.remove(json_schema.OBJECT)
    if json_schema.is_iterable(schema):
        ret_types.remove(json_schema.ARRAY)
    if json_schema.is_nullable(schema):
        ret_types.remove(json_schema.NULL)

    ret_schemas = []
    for t in ret_types:
        s = deepcopy(schema)
        s['type'] = [t]

        if json_schema.is_nullable(schema):
            s = json_schema.make_nullable(s)

        ret_schemas.append(s)

    return {'anyOf': ret_schemas}
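## Illustrative example of the splitting above, assuming `make_nullable`
## appends 'null' to the type list (as the tests earlier in this section show):
##   _literal_only_schema({'type': ['object', 'string', 'integer', 'null']})
##   -> {'anyOf': [{'type': ['string', 'null']},
##                 {'type': ['integer', 'null']}]}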
def write_batch(self, stream_buffer):
    if not self.persist_empty_tables and stream_buffer.count == 0:
        return None

    with self.conn.cursor() as cur:
        try:
            cur.execute('BEGIN;')

            self.setup_table_mapping_cache(cur)

            root_table_name = self.add_table_mapping_helper((stream_buffer.stream,),
                                                            self.table_mapping_cache)['to']
            current_table_schema = self.get_table_schema(cur, root_table_name)

            current_table_version = None

            if current_table_schema:
                current_table_version = current_table_schema.get('version', None)

                if set(stream_buffer.key_properties) \
                        != set(current_table_schema.get('key_properties')):
                    raise PostgresError(
                        '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'.format(
                            current_table_schema.get('key_properties'),
                            stream_buffer.key_properties))

                for key_property in stream_buffer.key_properties:
                    canonicalized_key, remote_column_schema = self.fetch_column_from_path(
                        (key_property,),
                        current_table_schema)
                    if self.json_schema_to_sql_type(remote_column_schema) \
                            != self.json_schema_to_sql_type(stream_buffer.schema['properties'][key_property]):
                        raise PostgresError(
                            ('`key_properties` type change detected for "{}". ' +
                             'Existing values are: {}. ' +
                             'Streamed values are: {}, {}, {}').format(
                                key_property,
                                json_schema.get_type(
                                    current_table_schema['schema']['properties'][key_property]),
                                json_schema.get_type(
                                    stream_buffer.schema['properties'][key_property]),
                                self.json_schema_to_sql_type(
                                    current_table_schema['schema']['properties'][key_property]),
                                self.json_schema_to_sql_type(
                                    stream_buffer.schema['properties'][key_property])))

            target_table_version = current_table_version or stream_buffer.max_version

            self.LOGGER.info('Stream {} ({}) with max_version {} targeting {}'.format(
                stream_buffer.stream,
                root_table_name,
                stream_buffer.max_version,
                target_table_version))

            root_table_name = stream_buffer.stream
            if current_table_version is not None and \
                    stream_buffer.max_version is not None:
                if stream_buffer.max_version < current_table_version:
                    self.LOGGER.warning('{} - Records from an earlier table version detected.'.format(
                        stream_buffer.stream))
                    cur.execute('ROLLBACK;')
                    return None

                elif stream_buffer.max_version > current_table_version:
                    root_table_name += SEPARATOR + str(stream_buffer.max_version)
                    target_table_version = stream_buffer.max_version

            self.LOGGER.info('Root table name {}'.format(root_table_name))

            written_batches_details = self.write_batch_helper(cur,
                                                              root_table_name,
                                                              stream_buffer.schema,
                                                              stream_buffer.key_properties,
                                                              stream_buffer.get_batch(),
                                                              {'version': target_table_version})

            cur.execute('COMMIT;')

            return written_batches_details
        except Exception as ex:
            cur.execute('ROLLBACK;')
            message = 'Exception writing records'
            self.LOGGER.exception(message)
            raise PostgresError(message, ex)
def upsert_table_helper(self, connection, schema, metadata):
    """
    Upserts the `schema` to remote by:
    - creating table if necessary
    - adding columns
    - adding column mappings
    - migrating data from old columns to new, etc.

    :param connection: remote connection, type left to be determined by implementing class
    :param schema: TABLE_SCHEMA(local)
    :param metadata: additional information necessary for downstream operations
    :return: TABLE_SCHEMA(remote)
    """
    table_path = schema['path']

    table_name = self.add_table_mapping(connection, table_path, metadata)

    existing_schema = self.get_table_schema(connection, table_path, table_name)

    if existing_schema is None:
        self.add_table(connection, table_name, metadata)
        existing_schema = self.get_table_schema(connection, table_path, table_name)

    self.add_key_properties(connection, table_name, schema.get('key_properties', None))

    ## Only process columns which have single, nullable, types
    single_type_columns = []
    for column_name__or__path, column_schema in schema['schema']['properties'].items():
        column_path = column_name__or__path
        if isinstance(column_name__or__path, str):
            column_path = (column_name__or__path,)

        single_type_column_schema = deepcopy(column_schema)
        column_types = json_schema.get_type(single_type_column_schema)
        make_nullable = json_schema.is_nullable(column_schema)

        for type in column_types:
            if type == json_schema.NULL:
                continue

            single_type_column_schema['type'] = [type]

            if make_nullable:
                single_type_columns.append((column_path, json_schema.make_nullable(single_type_column_schema)))
            else:
                single_type_columns.append((column_path, single_type_column_schema))

    ## Process new columns against existing
    raw_mappings = existing_schema.get('mappings', {})

    mappings = []

    for to, m in raw_mappings.items():
        mappings.append({'from': tuple(m['from']),
                         'to': to,
                         'type': m['type']})

    table_empty = self.is_table_empty(connection, table_name)

    for column_path, column_schema in single_type_columns:
        canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
        nullable_column_schema = json_schema.make_nullable(column_schema)

        ## NEW COLUMN
        if column_path not in [m['from'] for m in mappings]:
            ### NON EMPTY TABLE
            if not table_empty:
                self.LOGGER.warning(
                    'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                        column_path,
                        table_name))
                column_schema = nullable_column_schema

            self.add_column(connection, table_name, canonicalized_column_name, column_schema)
            self.add_column_mapping(connection, table_name, column_path, canonicalized_column_name, column_schema)
            mappings.append({'from': column_path,
                             'to': canonicalized_column_name,
                             'type': json_schema.get_type(column_schema)})

            continue

        ## EXISTING COLUMNS
        ### SCHEMAS MATCH
        if [True for m in mappings
                if m['from'] == column_path
                and json_schema.to_sql(m) == json_schema.to_sql(column_schema)]:
            continue
        ### NULLABLE SCHEMAS MATCH
        ###  New column _is not_ nullable, existing column _is_
        if [True for m in mappings
                if m['from'] == column_path
                and json_schema.to_sql(m) == json_schema.to_sql(nullable_column_schema)]:
            continue

        ### NULL COMPATIBILITY
        ###  New column _is_ nullable, existing column is _not_
        non_null_original_column = [m for m in mappings
                                    if m['from'] == column_path
                                    and json_schema.sql_shorthand(m) == json_schema.sql_shorthand(column_schema)]
        if non_null_original_column:
            ## MAKE NULLABLE
            self.make_column_nullable(connection, table_name, canonicalized_column_name)
            self.drop_column_mapping(connection, table_name, canonicalized_column_name)
            self.add_column_mapping(connection, table_name, column_path, canonicalized_column_name,
                                    nullable_column_schema)

            mappings = [m for m in mappings
                        if not (m['from'] == column_path
                                and json_schema.sql_shorthand(m) == json_schema.sql_shorthand(column_schema))]
            mappings.append({'from': column_path,
                             'to': canonicalized_column_name,
                             'type': json_schema.get_type(nullable_column_schema)})

            continue

        ### FIRST MULTI TYPE
        ###  New column matches existing column path, but the types are incompatible
        duplicate_paths = [m for m in mappings if m['from'] == column_path]

        if 1 == len(duplicate_paths):
            existing_mapping = duplicate_paths[0]
            existing_column_name = existing_mapping['to']

            if existing_column_name:
                self.drop_column_mapping(connection, table_name, existing_column_name)

            ## Update existing properties
            mappings = [m for m in mappings if m['from'] != column_path]
            mappings.append({'from': column_path,
                             'to': canonicalized_column_name,
                             'type': json_schema.get_type(nullable_column_schema)})

            existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                       existing_mapping,
                                                                                       mappings)
            mappings.append({'from': column_path,
                             'to': existing_column_new_normalized_name,
                             'type': json_schema.get_type(json_schema.make_nullable(existing_mapping))})

            ## Add new columns
            ### NOTE: all migrated columns will be nullable and remain that way

            #### Table Metadata
            self.add_column_mapping(connection, table_name, column_path,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))
            self.add_column_mapping(connection, table_name, column_path,
                                    canonicalized_column_name,
                                    nullable_column_schema)

            #### Columns
            self.add_column(connection, table_name,
                            existing_column_new_normalized_name,
                            json_schema.make_nullable(existing_mapping))
            self.add_column(connection, table_name,
                            canonicalized_column_name,
                            nullable_column_schema)

            ## Migrate existing data
            self.migrate_column(connection, table_name,
                                existing_mapping['to'],
                                existing_column_new_normalized_name)

            ## Drop existing column
            self.drop_column(connection, table_name, existing_mapping['to'])

        ## REST MULTI TYPE
        elif 1 < len(duplicate_paths):
            ## Add new column
            self.add_column_mapping(connection, table_name, column_path,
                                    canonicalized_column_name,
                                    nullable_column_schema)
            self.add_column(connection, table_name,
                            canonicalized_column_name,
                            nullable_column_schema)
            mappings.append({'from': column_path,
                             'to': canonicalized_column_name,
                             'type': json_schema.get_type(nullable_column_schema)})

        ## UNKNOWN
        else:
            raise Exception(
                'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                    column_path,
                    canonicalized_column_name,
                    table_name))

    return self.get_table_schema(connection, table_path, table_name)
def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
    """
    Upserts the `schema` to remote by:
    - creating table if necessary
    - adding columns
    - adding column mappings
    - migrating data from old columns to new, etc.

    :param connection: remote connection, type left to be determined by implementing class
    :param schema: TABLE_SCHEMA(local)
    :param metadata: additional information necessary for downstream operations
    :param log_schema_changes: defaults to True, set to false to disable logging of table level schema changes
    :return: TABLE_SCHEMA(remote)
    """
    table_path = schema['path']

    _metadata = deepcopy(metadata)
    _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

    table_name = self.add_table_mapping(connection, table_path, _metadata)

    existing_schema = self._get_table_schema(connection, table_path, table_name)

    if existing_schema is None:
        self.add_table(connection, table_name, _metadata)
        existing_schema = self._get_table_schema(connection, table_path, table_name)

    self.add_key_properties(connection, table_name, schema.get('key_properties', None))

    ## Only process columns which have single, nullable, types
    single_type_columns = []
    for column_name__or__path, column_schema in schema['schema']['properties'].items():
        column_path = column_name__or__path
        if isinstance(column_name__or__path, str):
            column_path = (column_name__or__path,)

        single_type_column_schema = deepcopy(column_schema)
        column_types = json_schema.get_type(single_type_column_schema)
        make_nullable = json_schema.is_nullable(column_schema)

        for type in column_types:
            if type == json_schema.NULL:
                continue

            single_type_column_schema['type'] = [type]

            if make_nullable:
                single_type_columns.append((column_path, json_schema.make_nullable(single_type_column_schema)))
            else:
                single_type_columns.append((column_path, deepcopy(single_type_column_schema)))

    ## Process new columns against existing
    raw_mappings = existing_schema.get('mappings', {})

    mappings = []

    for to, m in raw_mappings.items():
        mapping = json_schema.simple_type(m)
        mapping['from'] = tuple(m['from'])
        mapping['to'] = to
        mappings.append(mapping)

    table_empty = self.is_table_empty(connection, table_name)

    for column_path, column_schema in single_type_columns:
        upsert_table_helper__start__column = time.monotonic()

        canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
        nullable_column_schema = json_schema.make_nullable(column_schema)

        def log_message(msg):
            if log_schema_changes:
                self.LOGGER.info(
                    'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                        table_name,
                        column_path,
                        canonicalized_column_name,
                        msg,
                        _duration_millis(upsert_table_helper__start__column)))

        ## NEW COLUMN
        if column_path not in [m['from'] for m in mappings]:
            upsert_table_helper__column = "New column"
            ### NON EMPTY TABLE
            if not table_empty:
                upsert_table_helper__column += ", non empty table"
                self.LOGGER.warning(
                    'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                        column_path,
                        table_name))
                column_schema = nullable_column_schema

            self.add_column(connection, table_name, canonicalized_column_name, column_schema)
            self.add_column_mapping(connection, table_name, column_path, canonicalized_column_name, column_schema)

            mapping = json_schema.simple_type(column_schema)
            mapping['from'] = column_path
            mapping['to'] = canonicalized_column_name
            mappings.append(mapping)

            log_message(upsert_table_helper__column)

            continue

        ## EXISTING COLUMNS
        ### SCHEMAS MATCH
        if [True for m in mappings
                if m['from'] == column_path
                and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
            continue
        ### NULLABLE SCHEMAS MATCH
        ###  New column _is not_ nullable, existing column _is_
        if [True for m in mappings
                if m['from'] == column_path
                and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
            continue

        ### NULL COMPATIBILITY
        ###  New column _is_ nullable, existing column is _not_
        non_null_original_column = [m for m in mappings
                                    if m['from'] == column_path
                                    and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]
        if non_null_original_column:
            ## MAKE NULLABLE
            self.make_column_nullable(connection, table_name, canonicalized_column_name)
            self.drop_column_mapping(connection, table_name, canonicalized_column_name)
            self.add_column_mapping(connection, table_name, column_path, canonicalized_column_name,
                                    nullable_column_schema)

            mappings = [m for m in mappings
                        if not (m['from'] == column_path
                                and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

            mapping = json_schema.simple_type(nullable_column_schema)
            mapping['from'] = column_path
            mapping['to'] = canonicalized_column_name
            mappings.append(mapping)

            log_message("Made existing column nullable. New column is nullable, existing column is not")

            continue

        ### FIRST MULTI TYPE
        ###  New column matches existing column path, but the types are incompatible
        duplicate_paths = [m for m in mappings if m['from'] == column_path]

        if 1 == len(duplicate_paths):
            existing_mapping = duplicate_paths[0]
            existing_column_name = existing_mapping['to']

            if existing_column_name:
                self.drop_column_mapping(connection, table_name, existing_column_name)

            ## Update existing properties
            mappings = [m for m in mappings if m['from'] != column_path]

            mapping = json_schema.simple_type(nullable_column_schema)
            mapping['from'] = column_path
            mapping['to'] = canonicalized_column_name
            mappings.append(mapping)

            existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                       existing_mapping,
                                                                                       mappings)

            mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
            mapping['from'] = column_path
            mapping['to'] = existing_column_new_normalized_name
            mappings.append(mapping)

            ## Add new columns
            ### NOTE: all migrated columns will be nullable and remain that way

            #### Table Metadata
            self.add_column_mapping(connection, table_name, column_path,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))
            self.add_column_mapping(connection, table_name, column_path,
                                    canonicalized_column_name,
                                    nullable_column_schema)

            #### Columns
            self.add_column(connection, table_name,
                            existing_column_new_normalized_name,
                            json_schema.make_nullable(existing_mapping))
            self.add_column(connection, table_name,
                            canonicalized_column_name,
                            nullable_column_schema)

            ## Migrate existing data
            self.migrate_column(connection, table_name,
                                existing_mapping['to'],
                                existing_column_new_normalized_name)

            ## Drop existing column
            self.drop_column(connection, table_name, existing_mapping['to'])

            upsert_table_helper__column = 'Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.'.format(
                existing_column_name,
                existing_column_new_normalized_name,
                canonicalized_column_name)

        ## REST MULTI TYPE
        elif 1 < len(duplicate_paths):
            ## Add new column
            self.add_column_mapping(connection, table_name, column_path,
                                    canonicalized_column_name,
                                    nullable_column_schema)
            self.add_column(connection, table_name,
                            canonicalized_column_name,
                            nullable_column_schema)

            mapping = json_schema.simple_type(nullable_column_schema)
            mapping['from'] = column_path
            mapping['to'] = canonicalized_column_name
            mappings.append(mapping)

            upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                column_path)

        ## UNKNOWN
        else:
            raise Exception(
                'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                    column_path,
                    canonicalized_column_name,
                    table_name))

        log_message(upsert_table_helper__column)

    return self._get_table_schema(connection, table_path, table_name)
def write_batch(self, stream_buffer):
    if stream_buffer.count == 0:
        return None

    with self.conn.cursor() as cur:
        try:
            self._validate_identifier(stream_buffer.stream)

            cur.execute('BEGIN;')

            current_table_schema = self.get_table_schema(cur,
                                                         (stream_buffer.stream,),
                                                         stream_buffer.stream)

            current_table_version = None

            if current_table_schema:
                current_table_version = current_table_schema.get('version', None)

                if set(stream_buffer.key_properties) \
                        != set(current_table_schema.get('key_properties')):
                    raise PostgresError(
                        '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'.format(
                            current_table_schema.get('key_properties'),
                            stream_buffer.key_properties))

                for key in stream_buffer.key_properties:
                    if self.json_schema_to_sql_type(current_table_schema['schema']['properties'][key]) \
                            != self.json_schema_to_sql_type(stream_buffer.schema['properties'][key]):
                        raise PostgresError(
                            ('`key_properties` type change detected for "{}". ' +
                             'Existing values are: {}. ' +
                             'Streamed values are: {}, {}, {}').format(
                                key,
                                json_schema.get_type(
                                    current_table_schema['schema']['properties'][key]),
                                json_schema.get_type(
                                    stream_buffer.schema['properties'][key]),
                                self.json_schema_to_sql_type(
                                    current_table_schema['schema']['properties'][key]),
                                self.json_schema_to_sql_type(
                                    stream_buffer.schema['properties'][key])))

            root_table_name = stream_buffer.stream
            target_table_version = current_table_version or stream_buffer.max_version

            if current_table_version is not None and \
                    stream_buffer.max_version is not None:
                if stream_buffer.max_version < current_table_version:
                    self.LOGGER.warning('{} - Records from an earlier table version detected.'.format(
                        stream_buffer.stream))
                    cur.execute('ROLLBACK;')
                    return None

                elif stream_buffer.max_version > current_table_version:
                    root_table_name = stream_buffer.stream + SEPARATOR + str(stream_buffer.max_version)
                    target_table_version = stream_buffer.max_version

            self._validate_identifier(root_table_name)

            written_batches_details = self.write_batch_helper(cur,
                                                              root_table_name,
                                                              stream_buffer.schema,
                                                              stream_buffer.key_properties,
                                                              stream_buffer.get_batch(),
                                                              {'version': target_table_version})

            cur.execute('COMMIT;')

            return written_batches_details
        except Exception as ex:
            cur.execute('ROLLBACK;')
            message = 'Exception writing records'
            self.LOGGER.exception(message)
            raise PostgresError(message, ex)
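## Illustrative walk-through of the version gating above (values made up;
## SEPARATOR is the module-level table-name separator):
##   current_table_version=2, stream_buffer.max_version=3
##     -> writes to stream + SEPARATOR + '3', targeting version 3
##   current_table_version=2, stream_buffer.max_version=1
##     -> batch rolled back and skipped (records from an earlier version)
##   current_table_version=2, stream_buffer.max_version=2
##     -> writes to the stream's table unchanged, targeting version 2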
def write_batch(self, stream_buffer):
    if stream_buffer.count == 0:
        return

    with self.conn.cursor() as cur:
        try:
            cur.execute('BEGIN;')

            processed_records = map(partial(self.process_record_message,
                                            stream_buffer.use_uuid_pk,
                                            self.get_postgres_datetime()),
                                    stream_buffer.peek_buffer())
            versions = set()
            max_version = None
            records_all_versions = []
            for record in processed_records:
                record_version = record.get(SINGER_TABLE_VERSION)
                if record_version is not None and \
                        (max_version is None or record_version > max_version):
                    max_version = record_version
                versions.add(record_version)
                records_all_versions.append(record)

            current_table_schema = self.get_schema(cur,
                                                   self.postgres_schema,
                                                   stream_buffer.stream)

            current_table_version = None

            if current_table_schema:
                current_table_version = current_table_schema.get('version', None)

                if set(stream_buffer.key_properties) \
                        != set(current_table_schema.get('key_properties')):
                    raise PostgresError(
                        '`key_properties` change detected. Existing values are: {}. Streamed values are: {}'.format(
                            current_table_schema.get('key_properties'),
                            stream_buffer.key_properties))

            if max_version is not None:
                target_table_version = max_version
            else:
                target_table_version = None

            if current_table_version is not None and \
                    min(versions) < current_table_version:
                self.logger.warning('{} - Records from an earlier table version detected.'.format(
                    stream_buffer.stream))
            if len(versions) > 1:
                self.logger.warning('{} - Multiple table versions in stream, only using the latest.'.format(
                    stream_buffer.stream))

            if current_table_version is not None and \
                    target_table_version > current_table_version:
                root_table_name = stream_buffer.stream + self.SEPARATOR + str(target_table_version)
            else:
                root_table_name = stream_buffer.stream

            if target_table_version is not None:
                records = filter(lambda x: x.get(SINGER_TABLE_VERSION) == target_table_version,
                                 records_all_versions)
            else:
                records = records_all_versions

            root_table_schema = json_schema.simplify(stream_buffer.schema)

            ## Add singer columns to root table
            self.add_singer_columns(root_table_schema, stream_buffer.key_properties)

            subtables = {}
            key_prop_schemas = {}
            for key in stream_buffer.key_properties:
                if current_table_schema \
                        and json_schema.get_type(current_table_schema['schema']['properties'][key]) \
                        != json_schema.get_type(root_table_schema['properties'][key]):
                    raise PostgresError(
                        ('`key_properties` type change detected for "{}". ' +
                         'Existing values are: {}. ' +
                         'Streamed values are: {}').format(
                            key,
                            json_schema.get_type(
                                current_table_schema['schema']['properties'][key]),
                            json_schema.get_type(
                                root_table_schema['properties'][key])))

                key_prop_schemas[key] = root_table_schema['properties'][key]
            self.denest_schema(root_table_name, root_table_schema, key_prop_schemas, subtables)

            root_temp_table_name = self.upsert_table_schema(cur,
                                                            root_table_name,
                                                            root_table_schema,
                                                            stream_buffer.key_properties,
                                                            target_table_version)

            nested_upsert_tables = []
            for table_name, subtable_json_schema in subtables.items():
                temp_table_name = self.upsert_table_schema(cur,
                                                           table_name,
                                                           subtable_json_schema,
                                                           None,
                                                           None)
                nested_upsert_tables.append({
                    'table_name': table_name,
                    'json_schema': subtable_json_schema,
                    'temp_table_name': temp_table_name})

            records_map = {}
            self.denest_records(root_table_name,
                                records,
                                records_map,
                                stream_buffer.key_properties)
            self.persist_rows(cur,
                              root_table_name,
                              root_temp_table_name,
                              root_table_schema,
                              stream_buffer.key_properties,
                              records_map[root_table_name])

            for nested_upsert_table in nested_upsert_tables:
                key_properties = []
                for key in stream_buffer.key_properties:
                    key_properties.append(SINGER_SOURCE_PK_PREFIX + key)
                self.persist_rows(cur,
                                  nested_upsert_table['table_name'],
                                  nested_upsert_table['temp_table_name'],
                                  nested_upsert_table['json_schema'],
                                  key_properties,
                                  records_map[nested_upsert_table['table_name']])

            cur.execute('COMMIT;')
        except Exception as ex:
            cur.execute('ROLLBACK;')
            message = 'Exception writing records'
            self.logger.exception(message)
            raise PostgresError(message, ex)

    stream_buffer.flush_buffer()