def test_simple_type():
    assert {'type': ['integer', 'null']} \
           == json_schema.simple_type({'type': ['integer', 'null']})
    assert {'type': ['string'], 'format': 'date-time'} \
           == json_schema.simple_type({'type': 'string',
                                       'format': 'date-time',
                                       'something': 1,
                                       'extra': 2})
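From the assertions above, `json_schema.simple_type` evidently keeps only the `type` and `format` keys of a schema and normalizes `type` to a list. A minimal sketch of such a function, assuming exactly that behavior (the library's real implementation may differ):

def simple_type(schema):
    """Reduce a JSON Schema to its `type` (normalized to a list) and,
    when present, its `format`; every other key is dropped.

    Sketch inferred from the test above, not the actual library code.
    """
    types = schema.get('type', [])
    if isinstance(types, str):
        types = [types]  # normalize 'string' -> ['string']
    simplified = {'type': types}
    if 'format' in schema:
        simplified['format'] = schema['format']
    return simplified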
Example #2
    def _serialize_table_record_field_name(self, remote_schema,
                                           streamed_schema, path,
                                           value_json_schema):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param path: (string, ...)
        :param value_json_schema: dict, JSON Schema
        :return: string
        """

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`,
        ##  i.e., 123.0 and 456 are both valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception('Unknown column path: {} for table: {}'.format(
            path, remote_schema['path']))
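`_get_mapping` is not shown on this page. Judging by the `mappings` structure visible in the later examples, a plausible sketch (an assumption, not the actual implementation) is a lookup of the remote column whose source path and simplified schema both match, reusing the `simple_type` sketch above:

def _get_mapping(remote_schema, path, value_json_schema):
    """Return the remote column name mapped from `path` whose simplified
    JSON Schema matches `value_json_schema`, or None if nothing matches.

    Hypothetical sketch; the real method lives on the serializer class.
    """
    for to, mapping in remote_schema.get('mappings', {}).items():
        if tuple(mapping['from']) == path \
                and simple_type(mapping) == simple_type(value_json_schema):
            return to
    return None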
Example #3
    def _serialize_table_record_field_name(self, remote_schema, path,
                                           value_json_schema):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :param value_json_schema: dict, JSON Schema
        :return: string
        """

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema, path, simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`,
        ##  i.e., 123.0 and 456 are both valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema, path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception(
            "A compatible column for path {} and JSONSchema {} in table {} cannot be found."
            .format(path, simple_json_schema, remote_schema['path']))
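The integer fallback above means a value streamed as `integer` can still land in a column that was created for `number`. A small demonstration built on the hypothetical sketches from earlier (the column name `price__f` and schema data are made up):

# Direct lookup with 'integer' fails; the 'number' retry succeeds.
remote_schema = {
    'path': ('orders',),
    'mappings': {'price__f': {'from': ('price',), 'type': ['number']}},
}
print(_get_mapping(remote_schema, ('price',), {'type': ['integer']}))  # None
print(_get_mapping(remote_schema, ('price',), {'type': 'number'}))     # price__f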
Example #4
    def fetch_column_from_path(self, path, table_schema):
        """
        Return the remote column (name, simplified schema) for `path`.

        Should only be used for paths which have already been established,
        i.e., for which the schema will not be changing.

        :param path: (string, ...)
        :param table_schema: TABLE_SCHEMA(remote)
        :return: (string, dict)
        """

        for to, m in table_schema.get('mappings', {}).items():
            if tuple(m['from']) == path:
                return to, json_schema.simple_type(m)

        raise Exception('Unknown column path: {} for table: {}'.format(
            path, table_schema['path']))
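A quick illustration of the lookup, with hypothetical mapping data (the column name `id__i` and the schema shape are assumptions):

# `fetch_column_from_path(('id',), table_schema)` would return
# ('id__i', {'type': ['integer', 'null']}) for this table schema.
table_schema = {
    'path': ('users',),
    'mappings': {
        'id__i': {'from': ('id',), 'type': ['integer', 'null']},
    },
}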
Example #5
    def _serialize_table_record_field_name(self, remote_schema, path, value_json_schema_tuple):
        """
        Returns the appropriate remote field (column) name for `path`.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param path: (string, ...)
        :param value_json_schema_tuple: tuple, JSON Schema as (type, [format])
        :return: string
        """

        # Rebuild the dict that needs to be passed further down the call stack
        if len(value_json_schema_tuple) == 1:
            value_json_schema = {'type': value_json_schema_tuple[0]}
        else:
            value_json_schema = {'type': value_json_schema_tuple[0],
                                 'format': value_json_schema_tuple[1]}

        simple_json_schema = json_schema.simple_type(value_json_schema)

        mapping = self._get_mapping(remote_schema,
                                    path,
                                    simple_json_schema)

        if mapping is not None:
            return mapping

        ## Numbers are valid as `float` OR `int`,
        ##  i.e., 123.0 and 456 are both valid 'number's
        if json_schema.INTEGER in json_schema.get_type(simple_json_schema):
            mapping = self._get_mapping(remote_schema,
                                        path,
                                        {'type': json_schema.NUMBER})

            if mapping is not None:
                return mapping

        raise Exception("A compatible column for path {} and JSONSchema {} in table {} cannot be found.".format(
            path,
            simple_json_schema,
            remote_schema['path']
        ))
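This variant receives the schema as a `(type,)` or `(type, format)` tuple, presumably because tuples are hashable and cheap to pass around, and rebuilds the dict before delegating. A runnable illustration of the rebuild (values are illustrative):

for schema_tuple in [('integer',), ('string', 'date-time')]:
    if len(schema_tuple) == 1:
        rebuilt = {'type': schema_tuple[0]}
    else:
        rebuilt = {'type': schema_tuple[0], 'format': schema_tuple[1]}
    print(schema_tuple, '->', rebuilt)
# ('integer',) -> {'type': 'integer'}
# ('string', 'date-time') -> {'type': 'string', 'format': 'date-time'}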
Example #6
    def upsert_table_helper(self, connection, schema, metadata, log_schema_changes=True):
        """
        Upserts the `schema` to remote by:
        - creating table if necessary
        - adding columns
        - adding column mappings
        - migrating data from old columns to new, etc.

        :param connection: remote connection, type left to be determined by implementing class
        :param schema: TABLE_SCHEMA(local)
        :param metadata: additional information necessary for downstream operations
        :param log_schema_changes: defaults to True; set to False to disable logging of table-level schema changes
        :return: TABLE_SCHEMA(remote)
        """
        table_path = schema['path']

        with self._set_timer_tags(metrics.job_timer(),
                                  'upsert_table_schema',
                                  table_path) as timer:

            _metadata = deepcopy(metadata)
            _metadata['schema_version'] = CURRENT_SCHEMA_VERSION

            table_name = self.add_table_mapping(connection, table_path, _metadata)

            self._set_metrics_tags__table(timer, table_name)

            existing_schema = self._get_table_schema(connection, table_name)

            existing_table = True
            if existing_schema is None:
                self.add_table(connection, table_path, table_name, _metadata)
                existing_schema = self._get_table_schema(connection, table_name)
                existing_table = False

            self.add_key_properties(connection, table_name, schema.get('key_properties', None))

            ## Build up mappings to compare new columns against existing
            mappings = []

            for to, m in existing_schema.get('mappings', {}).items():
                mapping = json_schema.simple_type(m)
                mapping['from'] = tuple(m['from'])
                mapping['to'] = to
                mappings.append(mapping)

            ## Only process columns which have single, nullable types
            column_paths_seen = set()
            single_type_columns = []

            for column_path, column_schema in schema['schema']['properties'].items():
                column_paths_seen.add(column_path)
                for sub_schema in column_schema['anyOf']:
                    single_type_columns.append((column_path, deepcopy(sub_schema)))

            ### Add any columns missing from new schema
            for m in mappings:
                if m['from'] not in column_paths_seen:
                    single_type_columns.append((m['from'], json_schema.make_nullable(m)))

            ## Process new columns against existing
            table_empty = self.is_table_empty(connection, table_name)

            for column_path, column_schema in single_type_columns:
                upsert_table_helper__start__column = time.monotonic()

                canonicalized_column_name = self._canonicalize_column_identifier(column_path, column_schema, mappings)
                nullable_column_schema = json_schema.make_nullable(column_schema)

                def log_message(msg):
                    if log_schema_changes:
                        self.LOGGER.info(
                            'Table Schema Change [`{}`.`{}`:`{}`] {} (took {} millis)'.format(
                                table_name,
                                column_path,
                                canonicalized_column_name,
                                msg,
                                _duration_millis(upsert_table_helper__start__column)))

                ## NEW COLUMN
                if column_path not in [m['from'] for m in mappings]:
                    upsert_table_helper__column = "New column"
                    ### NON EMPTY TABLE
                    if not table_empty:
                        upsert_table_helper__column += ", non empty table"
                        self.LOGGER.warning(
                            'NOT EMPTY: Forcing new column `{}` in table `{}` to be nullable because the table is not empty.'.format(
                                column_path,
                                table_name))
                        column_schema = nullable_column_schema

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    column_schema)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            column_schema)

                    mapping = json_schema.simple_type(column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message(upsert_table_helper__column)

                    continue

                ## EXISTING COLUMNS
                ### SCHEMAS MATCH
                if any(m['from'] == column_path
                       and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(column_schema)
                       for m in mappings):
                    continue
                ### NULLABLE SCHEMAS MATCH
                ###  New column _is not_ nullable, existing column _is_
                if any(m['from'] == column_path
                       and self.json_schema_to_sql_type(m) == self.json_schema_to_sql_type(nullable_column_schema)
                       for m in mappings):
                    continue

                ### NULL COMPATIBILITY
                ###  New column _is_ nullable, existing column is _not_
                non_null_original_column = [
                    m for m in mappings
                    if m['from'] == column_path
                    and json_schema.shorthand(m) == json_schema.shorthand(column_schema)]
                if non_null_original_column:
                    ## MAKE NULLABLE
                    self.make_column_nullable(connection,
                                              table_name,
                                              canonicalized_column_name)
                    self.drop_column_mapping(connection, table_name, canonicalized_column_name)
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    mappings = [
                        m for m in mappings
                        if not (m['from'] == column_path
                                and json_schema.shorthand(m) == json_schema.shorthand(column_schema))]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    log_message("Made existing column nullable.")

                    continue

                ### FIRST MULTI TYPE
                ###  New column matches existing column path, but the types are incompatible
                duplicate_paths = [m for m in mappings if m['from'] == column_path]

                if len(duplicate_paths) == 1:
                    existing_mapping = duplicate_paths[0]
                    existing_column_name = existing_mapping['to']

                    if existing_column_name:
                        self.drop_column_mapping(connection, table_name, existing_column_name)

                    ## Update existing properties
                    mappings = [m for m in mappings if m['from'] != column_path]

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    existing_column_new_normalized_name = self._canonicalize_column_identifier(column_path,
                                                                                               existing_mapping,
                                                                                               mappings)

                    mapping = json_schema.simple_type(json_schema.make_nullable(existing_mapping))
                    mapping['from'] = column_path
                    mapping['to'] = existing_column_new_normalized_name
                    mappings.append(mapping)

                    ## Add new columns
                    ### NOTE: all migrated columns will be nullable and remain that way

                    #### Table Metadata
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            existing_column_new_normalized_name,
                                            json_schema.make_nullable(existing_mapping))
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)

                    #### Columns
                    self.add_column(connection,
                                    table_name,
                                    existing_column_new_normalized_name,
                                    json_schema.make_nullable(existing_mapping))

                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    ## Migrate existing data
                    self.migrate_column(connection,
                                        table_name,
                                        existing_mapping['to'],
                                        existing_column_new_normalized_name)

                    ## Drop existing column
                    self.drop_column(connection,
                                     table_name,
                                     existing_mapping['to'])

                    upsert_table_helper__column = "Splitting `{}` into `{}` and `{}`. New column matches existing column path, but the types are incompatible.".format(
                        existing_column_name,
                        existing_column_new_normalized_name,
                        canonicalized_column_name
                    )

                ## REST MULTI TYPE
                elif len(duplicate_paths) > 1:
                    ## Add new column
                    self.add_column_mapping(connection,
                                            table_name,
                                            column_path,
                                            canonicalized_column_name,
                                            nullable_column_schema)
                    self.add_column(connection,
                                    table_name,
                                    canonicalized_column_name,
                                    nullable_column_schema)

                    mapping = json_schema.simple_type(nullable_column_schema)
                    mapping['from'] = column_path
                    mapping['to'] = canonicalized_column_name
                    mappings.append(mapping)

                    upsert_table_helper__column = "Adding new column to split column `{}`. New column matches existing column's path, but no types were compatible.".format(
                        column_path
                    )

                ## UNKNOWN
                else:
                    raise Exception(
                        'UNKNOWN: Cannot handle merging column `{}` (canonicalized as: `{}`) in table `{}`.'.format(
                            column_path,
                            canonicalized_column_name,
                            table_name
                        ))

                log_message(upsert_table_helper__column)

            if not existing_table:
                for column_names in self.new_table_indexes(schema):
                    self.add_index(connection, table_name, column_names)

            return self._get_table_schema(connection, table_name)
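`upsert_table_helper` leans on two `json_schema` helpers not shown on this page: `make_nullable` and `shorthand`. Minimal sketches consistent with how they are called above (assumptions, not the library's actual code):

from copy import deepcopy

def make_nullable(schema):
    """Return a copy of `schema` whose type list also admits 'null'.

    Sketch only; inferred from the nullable-column handling above.
    """
    nullable = deepcopy(schema)
    types = nullable.get('type', [])
    if isinstance(types, str):
        types = [types]
    if 'null' not in types:
        types = types + ['null']
    nullable['type'] = types
    return nullable

def shorthand(schema):
    """Collapse a schema to a comparable tag that ignores nullability, so
    nullable and non-nullable columns of the same type compare equal.

    Sketch only; the real tag format is an assumption.
    """
    types = schema.get('type', [])
    if isinstance(types, str):
        types = [types]
    tag = '-'.join(sorted(t for t in types if t != 'null'))
    if schema.get('format'):
        tag += ':' + schema['format']
    return tag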
Example #7
    def _serialize_table_records(self, remote_schema, streamed_schema,
                                 records):
        """
        Parse the given table's `records` in preparation for persistence to the remote target.

        Base implementation returns a list of dictionaries, where _every_ dictionary has the
        same keys as `remote_schema`'s properties.

        :param remote_schema: TABLE_SCHEMA(remote)
        :param streamed_schema: TABLE_SCHEMA(local)
        :param records: [{(path_0, path_1, ...): (_json_schema_string_type, value), ...}, ...]
        :return: [{...}, ...]
        """

        datetime_paths = [
            k for k, v in streamed_schema['schema']['properties'].items()
            if json_schema.is_datetime(v)
        ]

        default_paths = {
            k: v.get('default')
            for k, v in streamed_schema['schema']['properties'].items()
            if v.get('default') is not None
        }

        ## Get the default NULL value so we can assign row values when value is _not_ NULL
        NULL_DEFAULT = self.serialize_table_record_null_value(
            remote_schema, streamed_schema, None, None)

        serialized_rows = []

        remote_fields = set(remote_schema['schema']['properties'].keys())
        default_row = dict([(field, NULL_DEFAULT) for field in remote_fields])

        paths = streamed_schema['schema']['properties'].keys()
        for record in records:

            row = deepcopy(default_row)

            for path in paths:
                json_schema_string_type, value = record.get(path, (None, None))

                ## Serialize fields which are not present but have default values set
                if path in default_paths \
                        and value is None:
                    value = default_paths[path]
                    json_schema_string_type = json_schema.python_type(value)

                ## Serialize datetime to compatible format
                if path in datetime_paths \
                        and json_schema_string_type == json_schema.STRING \
                        and value is not None:
                    value = self.serialize_table_record_datetime_value(
                        remote_schema, streamed_schema, path, value)
                    value_json_schema = {
                        'type': json_schema.STRING,
                        'format': json_schema.DATE_TIME_FORMAT
                    }
                elif json_schema_string_type:
                    value_json_schema = {'type': json_schema_string_type}
                else:
                    value_json_schema = json_schema.simple_type(
                        streamed_schema['schema']['properties'][path])

                ## Serialize NULL default value
                value = self.serialize_table_record_null_value(
                    remote_schema, streamed_schema, path, value)

                field_name = self._serialize_table_record_field_name(
                    remote_schema, streamed_schema, path, value_json_schema)

                if field_name in remote_fields \
                        and (field_name not in row
                             or row[field_name] == NULL_DEFAULT):
                    row[field_name] = value

            serialized_rows.append(row)

        return serialized_rows
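For reference, each record this method consumes maps a column path tuple to a `(json_schema_string_type, value)` pair, per the docstring; paths absent from a record fall back to `(None, None)` via `record.get`. A hypothetical record (field names and values are illustrative only):

record = {
    ('id',): ('integer', 123),
    ('updated_at',): ('string', '2020-01-01T00:00:00Z'),
}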