Example #1
def _make_schema_nullable(schema):
    # Redshift does not allow creating non-null columns without a default
    nullable_schema = deepcopy(schema)
    nullable_properties = nullable_schema['properties']

    for field in nullable_properties:
        nullable_properties[field] = json_schema.make_nullable(nullable_properties[field])

        if 'anyOf' in nullable_properties[field]:
            nullable_properties[field]['anyOf'] = [
                json_schema.make_nullable(sub_schema)
                for sub_schema in nullable_properties[field]['anyOf']
            ]

    return nullable_schema
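
A hypothetical usage sketch (not part of the original snippet), assuming the make_nullable behaviour demonstrated in the test below: every property in the copied schema gains 'null' as an accepted type, and the input schema is left untouched.

# Illustrative input/output only; the property names are made up.
_make_schema_nullable({'properties': {'id': {'type': 'integer'},
                                      'name': {'type': ['string']}}})
# == {'properties': {'id': {'type': ['integer', 'null']},
#                    'name': {'type': ['string', 'null']}}}
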
Example #2
def test_make_nullable():
    assert {'type': ['boolean', 'null']} \
           == json_schema.make_nullable({'type': 'boolean'})
    assert {'type': ['null', 'boolean']} \
           == json_schema.make_nullable({'type': ['null', 'boolean']})
    assert {'type': ['null', 'string']} \
           == json_schema.make_nullable({'type': ['null', 'string']})

    ## Make sure we're not modifying the original
    schema = {'type': ['string']}
    assert json_schema.get_type(schema) == ['string']
    assert {'type': ['string', 'null']} \
           == json_schema.make_nullable(schema)
    assert json_schema.get_type(schema) == ['string']

    assert {
               'definitions': {
                   'address': {
                       'type': 'object',
                       'properties': {
                           'street_address': {'type': 'string'},
                           'city': {'type': 'string'},
                           'state': {'type': 'string'}
                       },
                       'required': ['street_address', 'city', 'state']
                   }
               },
               'type': ['object', 'null'],
               'properties': {
                   'billing_address': {'$ref': '#/definitions/address'},
                   'shipping_address': {'$ref': '#/definitions/address'}}} \
           == json_schema.make_nullable(
        {
            'definitions': {
                'address': {
                    'type': 'object',
                    'properties': {
                        'street_address': {'type': 'string'},
                        'city': {'type': 'string'},
                        'state': {'type': 'string'}
                    },
                    'required': ['street_address', 'city', 'state']
                }
            },
            'type': 'object',
            'properties': {
                'billing_address': {'$ref': '#/definitions/address'},
                'shipping_address': {'$ref': '#/definitions/address'}}})
Example #3
def _denest_schema_helper(table_path, prop_path, table_json_schema, nullable,
                          top_level_schema, key_prop_schemas, subtables,
                          level):

    for prop, item_json_schema in _denest_schema__singular_schemas(
            table_json_schema):

        if json_schema.is_object(item_json_schema):
            _denest_schema_helper(table_path + (prop, ), prop_path + (prop, ),
                                  item_json_schema, nullable, top_level_schema,
                                  key_prop_schemas, subtables, level)

        elif json_schema.is_iterable(item_json_schema):
            _create_subtable(table_path + (prop, ), item_json_schema,
                             key_prop_schemas, subtables, level + 1)

        elif json_schema.is_literal(item_json_schema):
            if nullable:
                item_json_schema = json_schema.make_nullable(item_json_schema)

            p = prop_path + (prop, )
            if p in top_level_schema:
                top_level_schema[p]['anyOf'].append(item_json_schema)
            else:
                top_level_schema[p] = {'anyOf': [item_json_schema]}
Example #4
    def add_table(self, cur, path, name, metadata):
        sql.valid_identifier(name)

        cur.execute('''
            CREATE TABLE {}.{}.{} ({} {})
            '''.format(
            sql.identifier(self.connection.configured_database),
            sql.identifier(self.connection.configured_schema),
            sql.identifier(name),
            # Snowflake does not allow for creation of tables with no columns
            sql.identifier(self.CREATE_TABLE_INITIAL_COLUMN),
            self.CREATE_TABLE_INITIAL_COLUMN_TYPE))

        self._set_table_metadata(
            cur, name, {
                'path': path,
                'version': metadata.get('version', None),
                'schema_version': metadata['schema_version'],
                'mappings': {}
            })

        self.add_column_mapping(
            cur, name, (self.CREATE_TABLE_INITIAL_COLUMN, ),
            self.CREATE_TABLE_INITIAL_COLUMN,
            json_schema.make_nullable({'type': json_schema.BOOLEAN}))
Example #5
    def sql_type_to_json_schema(self, sql_type, is_nullable):
        if sql_type == 'character varying':
            schema = {'type': [json_schema.STRING]}
            if is_nullable:
                return json_schema.make_nullable(schema)
            return schema

        return PostgresTarget.sql_type_to_json_schema(self, sql_type, is_nullable)
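
A hedged illustration of the override above, with return values inferred from the snippet rather than from the library's documentation ('target' stands for a hypothetical instance of this class):

# target.sql_type_to_json_schema('character varying', False)
# == {'type': ['string']}
# target.sql_type_to_json_schema('character varying', True)
# == {'type': ['string', 'null']}
# Any other sql_type falls through to PostgresTarget.sql_type_to_json_schema.
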
Example #6
    def merge_put_schemas(self, cur, table_schema, table_name, existing_schema,
                          new_schema):
        new_properties = new_schema['properties']
        existing_properties = existing_schema['schema']['properties']
        for name, schema in new_properties.items():
            ## Mapping exists
            if self.get_mapping(existing_schema, name, schema) is not None:
                pass

            ## New column
            elif name not in existing_properties:

                existing_properties[name] = schema
                self.add_column(cur, table_schema, table_name, name, schema)

            ## Existing column non-nullable, new column is nullable
            elif not json_schema.is_nullable(existing_properties[name]) \
                    and json_schema.get_type(schema) \
                    == json_schema.get_type(json_schema.make_nullable(existing_properties[name])):

                existing_properties[name] = json_schema.make_nullable(
                    existing_properties[name])
                self.make_column_nullable(cur, table_schema, table_name, name)

            ## Existing column, types compatible
            elif json_schema.to_sql(json_schema.make_nullable(schema)) \
                    == json_schema.to_sql(json_schema.make_nullable(existing_properties[name])):
                pass

            ## Column type change
            elif self.mapping_name(name, schema) not in existing_properties \
                and self.mapping_name(name, existing_properties[name]) not in existing_properties:

                self.split_column(cur, table_schema, table_name, name, schema,
                                  existing_properties)

            ## Error
            else:
                raise PostgresError(
                    'Cannot handle column type change for: {}.{} columns {} and {}. Name collision likely.'
                    .format(table_schema, table_name, name,
                            self.mapping_name(name, schema)))
Example #7
def test_loading__invalid__column_type_change__pks__nullable():
    main(CONFIG, input_stream=CatStream(20))

    stream = CatStream(20)
    stream.schema = deepcopy(stream.schema)
    stream.schema['schema']['properties']['id'] = json_schema.make_nullable(
        stream.schema['schema']['properties']['id'])

    with pytest.raises(postgres.PostgresError,
                       match=r'.*key_properties. type change detected'):
        main(CONFIG, input_stream=stream)
Example #8
    def split_column(self, cur, table_schema, table_name, column_name,
                     column_schema, existing_properties):
        ## column_name -> column_name__<current-type>, column_name__<new-type>
        existing_column_mapping = self.mapping_name(
            column_name, existing_properties[column_name])
        new_column_mapping = self.mapping_name(column_name, column_schema)

        ## Update existing properties
        existing_properties[
            existing_column_mapping] = json_schema.make_nullable(
                existing_properties[column_name])
        existing_properties[new_column_mapping] = json_schema.make_nullable(
            column_schema)

        ## Add new columns
        ### NOTE: all migrated columns will be nullable and remain that way

        #### Table Metadata
        self.add_column_mapping(cur, table_schema, table_name, column_name,
                                existing_column_mapping,
                                existing_properties[existing_column_mapping])
        self.add_column_mapping(cur, table_schema, table_name, column_name,
                                new_column_mapping,
                                existing_properties[new_column_mapping])

        #### Columns
        self.add_column(cur, table_schema, table_name, existing_column_mapping,
                        existing_properties[existing_column_mapping])

        self.add_column(cur, table_schema, table_name, new_column_mapping,
                        existing_properties[new_column_mapping])

        ## Migrate existing data
        self.migrate_column(cur, table_schema, table_name, column_name,
                            existing_column_mapping)

        ## Drop existing column
        self.drop_column(cur, table_schema, table_name, column_name)

        ## Remove column (field) from existing_properties
        del existing_properties[column_name]
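
A rough before/after sketch of the split; the '__<current-type>'/'__<new-type>' suffixes are placeholders taken from the comment at the top of the method, since the real names come from mapping_name, which is not shown here:

# Before: existing_properties == {'age': {'type': ['integer']}}
#         incoming column_schema == {'type': ['string']}
# After split_column(...):
#   existing_properties == {
#       'age__<current-type>': {'type': ['integer', 'null']},
#       'age__<new-type>':     {'type': ['string', 'null']},
#   }
# The original 'age' column's data is migrated into 'age__<current-type>'
# and the 'age' column itself is then dropped.
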
Example #9
def test_make_nullable():
    assert {'type': ['boolean', 'null']} \
           == json_schema.make_nullable({'type': 'boolean'})
    assert {'type': ['null', 'boolean']} \
           == json_schema.make_nullable({'type': ['null', 'boolean']})
    assert {'type': ['null', 'string']} \
           == json_schema.make_nullable({'type': ['null', 'string']})
    assert {
               'definitions': {
                   'address': {
                       'type': 'object',
                       'properties': {
                           'street_address': {'type': 'string'},
                           'city': {'type': 'string'},
                           'state': {'type': 'string'}
                       },
                       'required': ['street_address', 'city', 'state']
                   }
               },
               'type': ['object', 'null'],
               'properties': {
                   'billing_address': {'$ref': '#/definitions/address'},
                   'shipping_address': {'$ref': '#/definitions/address'}}} \
           == json_schema.make_nullable(
        {
            'definitions': {
                'address': {
                    'type': 'object',
                    'properties': {
                        'street_address': {'type': 'string'},
                        'city': {'type': 'string'},
                        'state': {'type': 'string'}
                    },
                    'required': ['street_address', 'city', 'state']
                }
            },
            'type': 'object',
            'properties': {
                'billing_address': {'$ref': '#/definitions/address'},
                'shipping_address': {'$ref': '#/definitions/address'}}})
Example #10
    def add_table(self, cur, name, metadata):
        self._validate_identifier(name)

        create_table_sql = sql.SQL('CREATE TABLE {}.{} ({} {})').format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(name),
            # Redshift does not allow for creation of tables with no columns
            sql.Identifier(self.CREATE_TABLE_INITIAL_COLUMN),
            sql.SQL(self.CREATE_TABLE_INITIAL_COLUMN_TYPE))

        cur.execute(sql.SQL('{};').format(create_table_sql))

        self._set_table_metadata(cur, name,
                                 {'version': metadata.get('version', None)})

        self.add_column_mapping(
            cur, name, (self.CREATE_TABLE_INITIAL_COLUMN, ),
            self.CREATE_TABLE_INITIAL_COLUMN,
            json_schema.make_nullable({'type': json_schema.BOOLEAN}))
Example #11
    def add_column(self, cur, table_schema, table_name, column_name,
                   column_schema):
        data_type = json_schema.to_sql(column_schema)

        if not json_schema.is_nullable(column_schema) \
                and not self.is_table_empty(cur, table_schema, table_name):
            self.logger.warning(
                'Forcing new column `{}.{}.{}` to be nullable due to table not empty.'
                .format(table_schema, table_name, column_name))
            data_type = json_schema.to_sql(
                json_schema.make_nullable(column_schema))

        to_execute = sql.SQL('ALTER TABLE {table_schema}.{table_name} ' +
                             'ADD COLUMN {column_name} {data_type};').format(
                                 table_schema=sql.Identifier(table_schema),
                                 table_name=sql.Identifier(table_name),
                                 column_name=sql.Identifier(column_name),
                                 data_type=sql.SQL(data_type))

        cur.execute(to_execute)
Example #12
def _literal_only_schema(schema):

    ret_types = json_schema.get_type(schema)

    if json_schema.is_object(schema):
        ret_types.remove(json_schema.OBJECT)
    if json_schema.is_iterable(schema):
        ret_types.remove(json_schema.ARRAY)
    if json_schema.is_nullable(schema):
        ret_types.remove(json_schema.NULL)

    ret_schemas = []
    for t in ret_types:
        s = deepcopy(schema)
        s['type'] = [t]

        if json_schema.is_nullable(schema):
            s = json_schema.make_nullable(s)

        ret_schemas.append(s)

    return {'anyOf': ret_schemas}
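
A hedged walk-through of the helper above on a mixed-type schema (the input is invented for illustration):

# _literal_only_schema({'type': ['object', 'string', 'integer', 'null'],
#                       'properties': {...}})
# removes the 'object' and 'null' entries (an 'array' entry would be removed
# too), then emits one nullable, single-type copy of the schema per remaining
# literal type:
# == {'anyOf': [{'type': ['string', 'null'],  'properties': {...}},
#               {'type': ['integer', 'null'], 'properties': {...}}]}
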
Example #13
    def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True; set to False to disable logging of table-level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(),
                                  'upsert_table_schema',
                                  table_path) as timer:

            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path, _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name, schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable, types
            column_paths_seen = set()
            single_type_columns = []

            for column_path, column_schema in schema['schema']['properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append((column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            for m in mappings:
                if not m['from'] in column_paths_seen:
                    single_type_columns.append((m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(column_schema)

                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                                table_name,
                                column_path,
                                canonicalized_column_name,
                                msg,
                                _duration_millis(upsert_table_helper__start__column)))

                ## NEW COLUMN
                if not column_path in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"
                    ### NON EMPTY TABLE
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'.format(
                                column_path,
                                table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    column_schema)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)]:
                    continue
                ### NULLABLE SCHEMAS MATCH
                ###  New column _is not_ nullable, existing column _is_
                if [True for m in mappings if
                    m['from'] == column_path
                    and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)]:
                    continue

                ### NULL COMPATIBILITY
                ###  New column _is_ nullable, existing column is _not_
                non_null_original_column = [m for m in mappings if
                                            m['from'] == column_path and json_schema.shorthand(
                                                m) == json_schema.shorthand(column_schema)]
                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection,
                                              table_name,
                                              canonicalized_column_name)
                    self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    mappings = [m for m in mappings if not (m['from'] == column_path and json_schema.shorthand(
                        m) == json_schema.shorthand(column_schema))]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ###  New column matches existing column path, but the types are incompatible
                duplicate_paths = [m for m in mappings if m['from'] == column_path]

                if 1 == len(duplicate_paths):
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name, existing_column_name)

                    ## Update existing properties
                    mappings = [m for m in mappings if m['from'] != column_path]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                               existing_mapping,
                                                                                               mappings)

                    mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            existing_column_new_normalized_name,
                                            json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(connection,
                                    table_name,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection,
                                        table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection,
                                     table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name
                    )

                ## REST MULTI TYPE
                elif 1 < len(duplicate_paths):
                    ## Add new column
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path
                    )

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                            column_path,
                            canonicalized_column_name,
                            table_name
                        ))

                log_message(upsert_table_helper__column)

            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)
Example #14
    def upsert_table_helper(self, connection, schema, metadata):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        table_name = self.add_table_mapping(connection, table_path, metadata)

        existing_schema = self.get_table_schema(connection, table_path,
                                                table_name)

        if existing_schema is None:
            self.add_table(connection, table_name, metadata)
            existing_schema = self.get_table_schema(connection, table_path,
                                                    table_name)

        self.add_key_properties(connection, table_name,
                                schema.get('key_properties', None))

        ## Only process columns which have single, nullable, types
        single_type_columns = []
        for column_name__or__path, column_schema in schema['schema'][
                'properties'].items():
            column_path = column_name__or__path
            if isinstance(column_name__or__path, str):
                column_path = (column_name__or__path, )

            single_type_column_schema = deepcopy(column_schema)
            column_types = json_schema.get_type(single_type_column_schema)
            make_nullable = json_schema.is_nullable(column_schema)

            for type in column_types:
                if type == json_schema.NULL:
                    continue

                single_type_column_schema['type'] = [type]

                if make_nullable:
                    single_type_columns.append(
                        (column_path,
                         json_schema.make_nullable(single_type_column_schema)))
                else:
                    single_type_columns.append(
                        (column_path, single_type_column_schema))

        ## Process new columns against existing
        raw_mappings = existing_schema.get('mappings', {})

        mappings = []

        for to, m in raw_mappings.items():
            mappings.append({
                'from': tuple(m['from']),
                'to': to,
                'type': m['type']
            })

        table_empty = self.is_table_empty(connection, table_name)

        for column_path, column_schema in single_type_columns:
            canonicalized_column_name = self._canonicalize_column_identifier(
                column_path, column_schema, mappings)
            nullable_column_schema = json_schema.make_nullable(column_schema)

            ## NEW COLUMN
            if not column_path in [m['from'] for m in mappings]:
                ### NON EMPTY TABLE
                if not table_empty:
                    self.LOGGER.warning(
                        'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable due to table not empty.'
                        .format(column_path, table_name))
                    column_schema = nullable_column_schema

                self.add_column(connection, table_name,
                                canonicalized_column_name, column_schema)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        column_schema)
                mappings.append({
                    'from': column_path,
                    'to': canonicalized_column_name,
                    'type': json_schema.get_type(column_schema)
                })

                continue

            ## EXISTING COLUMNS
            ### SCHEMAS MATCH
            if [
                    True for m in mappings if m['from'] == column_path and
                    json_schema.to_sql(m) == json_schema.to_sql(column_schema)
            ]:
                continue
            ### NULLABLE SCHEMAS MATCH
            ###  New column _is not_ nullable, existing column _is_
            if [
                    True for m in mappings
                    if m['from'] == column_path and json_schema.to_sql(m) ==
                    json_schema.to_sql(nullable_column_schema)
            ]:
                continue

            ### NULL COMPATIBILITY
            ###  New column _is_ nullable, existing column is _not_
            non_null_original_column = [
                m for m in mappings
                if m['from'] == column_path and json_schema.sql_shorthand(m) ==
                json_schema.sql_shorthand(column_schema)
            ]
            if non_null_original_column:
                ## MAKE NULLABLE
                self.make_column_nullable(connection, table_name,
                                          canonicalized_column_name)
                self.drop_column_mapping(connection, table_name,
                                         canonicalized_column_name)
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                mappings = [
                    m for m in mappings if not (
                        m['from'] == column_path and json_schema.sql_shorthand(
                            m) == json_schema.sql_shorthand(column_schema))
                ]
                mappings.append({
                    'from':
                    column_path,
                    'to':
                    canonicalized_column_name,
                    'type':
                    json_schema.get_type(nullable_column_schema)
                })

                continue

            ### FIRST MULTI TYPE
            ###  New column matches existing column path, but the types are incompatible
            duplicate_paths = [m for m in mappings if m['from'] == column_path]

            if 1 == len(duplicate_paths):

                existing_mapping = duplicate_paths[0]
                existing_column_name = existing_mapping['to']

                if existing_column_name:
                    self.drop_column_mapping(connection, table_name,
                                             existing_column_name)

                ## Update existing properties
                mappings = [m for m in mappings if m['from'] != column_path]
                mappings.append({
                    'from':
                    column_path,
                    'to':
                    canonicalized_column_name,
                    'type':
                    json_schema.get_type(nullable_column_schema)
                })

                existing_column_new_normalized_name = self._canonicalize_column_identifier(
                    column_path, existing_mapping, mappings)
                mappings.append({
                    'from':
                    column_path,
                    'to':
                    existing_column_new_normalized_name,
                    'type':
                    json_schema.get_type(
                        json_schema.make_nullable(existing_mapping))
                })

                ## Add new columns
                ### NOTE: all migrated columns will be nullable and remain that way

                #### Table Metadata
                self.add_column_mapping(
                    connection, table_name, column_path,
                    existing_column_new_normalized_name,
                    json_schema.make_nullable(existing_mapping))
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)

                #### Columns
                self.add_column(connection, table_name,
                                existing_column_new_normalized_name,
                                json_schema.make_nullable(existing_mapping))

                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                ## Migrate existing data
                self.migrate_column(connection, table_name,
                                    existing_mapping['to'],
                                    existing_column_new_normalized_name)

                ## Drop existing column
                self.drop_column(connection, table_name,
                                 existing_mapping['to'])

            ## REST MULTI TYPE
            elif 1 < len(duplicate_paths):

                ## Add new column
                self.add_column_mapping(connection, table_name, column_path,
                                        canonicalized_column_name,
                                        nullable_column_schema)
                self.add_column(connection, table_name,
                                canonicalized_column_name,
                                nullable_column_schema)

                mappings.append({
                    'from':
                    column_path,
                    'to':
                    canonicalized_column_name,
                    'type':
                    json_schema.get_type(nullable_column_schema)
                })

            ## UNKNOWN
            else:
                raise Exception(
                    'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'
                    .format(column_path, canonicalized_column_name,
                            table_name))

        return self.get_table_schema(connection, table_path, table_name)
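
For orientation, a sketch of the mapping entries this helper builds and compares; the shape follows the code above, while the column name and types are illustrative only:

# existing_schema.get('mappings', {}) is keyed by the remote column name:
#   {'age__i': {'from': ['age'], 'type': ['integer', 'null']}}
# which the loop flattens into:
#   [{'from': ('age',), 'to': 'age__i', 'type': ['integer', 'null']}]
# Each new single-type column is then matched against these entries by its
# 'from' path and its SQL type.
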
Example #15
def test_loading__column_type_change__nullable(db_cleanup):
    cat_count = 20
    main(CONFIG, input_stream=CatStream(cat_count))

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            columns = cur.fetchall()

            assert set(columns) == {
                ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
                ('_sdc_received_at', 'timestamp with time zone', 'YES'),
                ('_sdc_sequence', 'bigint', 'YES'),
                ('_sdc_table_version', 'bigint', 'YES'),
                ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
                ('adoption__was_foster', 'boolean', 'YES'),
                ('age', 'bigint', 'YES'), ('id', 'bigint', 'NO'),
                ('name', 'text', 'NO'), ('paw_size', 'bigint', 'NO'),
                ('paw_colour', 'text', 'NO'),
                ('flea_check_complete', 'boolean', 'NO'),
                ('pattern', 'text', 'YES')
            }

            cur.execute(
                sql.SQL('SELECT {} FROM {}').format(sql.Identifier('name'),
                                                    sql.Identifier('cats')))
            persisted_records = cur.fetchall()

            ## Assert that the original data is present
            assert cat_count == len(persisted_records)
            assert cat_count == len(
                [x for x in persisted_records if x[0] is not None])

    class NameNullCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + cat_count
            record['name'] = None
            return record

    stream = NameNullCatStream(cat_count)
    stream.schema = deepcopy(stream.schema)
    stream.schema['schema']['properties']['name'] = json_schema.make_nullable(
        stream.schema['schema']['properties']['name'])

    main(CONFIG, input_stream=stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            columns = cur.fetchall()

            assert set(columns) == {
                ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
                ('_sdc_received_at', 'timestamp with time zone', 'YES'),
                ('_sdc_sequence', 'bigint', 'YES'),
                ('_sdc_table_version', 'bigint', 'YES'),
                ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
                ('adoption__was_foster', 'boolean', 'YES'),
                ('age', 'bigint', 'YES'), ('id', 'bigint', 'NO'),
                ('name', 'text', 'YES'), ('paw_size', 'bigint', 'NO'),
                ('paw_colour', 'text', 'NO'),
                ('flea_check_complete', 'boolean', 'NO'),
                ('pattern', 'text', 'YES')
            }

            cur.execute(
                sql.SQL('SELECT {} FROM {}').format(sql.Identifier('name'),
                                                    sql.Identifier('cats')))
            persisted_records = cur.fetchall()

            ## Assert that the column has migrated data
            assert 2 * cat_count == len(persisted_records)
            assert cat_count == len(
                [x for x in persisted_records if x[0] is not None])
            assert cat_count == len(
                [x for x in persisted_records if x[0] is None])

    class NameNonNullCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + 2 * cat_count
            return record

    main(CONFIG, input_stream=NameNonNullCatStream(cat_count))

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            columns = cur.fetchall()

            assert set(columns) == {
                ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
                ('_sdc_received_at', 'timestamp with time zone', 'YES'),
                ('_sdc_sequence', 'bigint', 'YES'),
                ('_sdc_table_version', 'bigint', 'YES'),
                ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
                ('adoption__was_foster', 'boolean', 'YES'),
                ('age', 'bigint', 'YES'), ('id', 'bigint', 'NO'),
                ('name', 'text', 'YES'), ('paw_size', 'bigint', 'NO'),
                ('paw_colour', 'text', 'NO'),
                ('flea_check_complete', 'boolean', 'NO'),
                ('pattern', 'text', 'YES')
            }

            cur.execute(
                sql.SQL('SELECT {} FROM {}').format(sql.Identifier('name'),
                                                    sql.Identifier('cats')))
            persisted_records = cur.fetchall()

            ## Assert that the column has migrated data
            assert 3 * cat_count == len(persisted_records)
            assert 2 * cat_count == len(
                [x for x in persisted_records if x[0] is not None])
            assert cat_count == len(
                [x for x in persisted_records if x[0] is None])