Example #1
def schema_for_column(c):
    '''Returns the Schema object for the given Column.'''
    data_type = c.data_type.lower()

    inclusion = 'available'

    result = Schema(inclusion=inclusion)

    if data_type == 'boolean':
        result.type = ['null', 'boolean']

    elif data_type in ('number', 'real', 'float', 'fixed'):
        result.type = ['null', 'number']

    elif data_type == 'text':
        result.type = ['null', 'string']

    elif data_type == 'timestamp_ntz':
        result.type = ['null', 'string']
        result.format = 'date-time'

    elif data_type in ('variant', 'array'):
        result.type = ['null', 'string']

    else:
        result = Schema(None,
                        inclusion='unsupported',
                        description='Unsupported column type {}'.format(data_type))
    
    return result
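The Column record this function consumes is not defined in the example; a minimal sketch of driving it, assuming a hypothetical namedtuple that carries only the data_type field the function actually reads:

# Hypothetical stand-in for the tap's column record (an assumption, not the
# original tap's type): only data_type is needed by schema_for_column.
from collections import namedtuple

Column = namedtuple('Column', ['data_type'])

print(schema_for_column(Column('timestamp_ntz')).to_dict())
# Expected (assuming singer-python's Schema; key order may vary):
# {'type': ['null', 'string'], 'format': 'date-time', 'inclusion': 'available'}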
Example #2
class TestSchema(unittest.TestCase):

    # Raw data structures for several schema types
    string_dict = {'type': 'string', 'maxLength': 32}

    integer_dict = {'type': 'integer', 'maximum': 1000000}

    array_dict = {'type': 'array', 'items': integer_dict}

    object_dict = {
        'type': 'object',
        'properties': {
            'a_string': string_dict,
            'an_array': array_dict
        }
    }

    # Schema object forms of the same schemas as above
    string_obj = Schema(type='string', maxLength=32)

    integer_obj = Schema(type='integer', maximum=1000000)

    array_obj = Schema(type='array', items=integer_obj)

    object_obj = Schema(type='object',
                        properties={
                            'a_string': string_obj,
                            'an_array': array_obj
                        })

    def test_string_to_dict(self):
        self.assertEqual(self.string_dict, self.string_obj.to_dict())

    def test_integer_to_dict(self):
        self.assertEqual(self.integer_dict, self.integer_obj.to_dict())

    def test_array_to_dict(self):
        self.assertEqual(self.array_dict, self.array_obj.to_dict())

    def test_object_to_dict(self):
        self.assertEqual(self.object_dict, self.object_obj.to_dict())

    def test_string_from_dict(self):
        self.assertEqual(self.string_obj, Schema.from_dict(self.string_dict))

    def test_integer_from_dict(self):
        self.assertEqual(self.integer_obj,
                         Schema.from_dict(self.integer_dict))

    def test_array_from_dict(self):
        self.assertEqual(self.array_obj, Schema.from_dict(self.array_dict))

    def test_object_from_dict(self):
        self.assertEqual(self.object_obj, Schema.from_dict(self.object_dict))

    def test_repr_atomic(self):
        self.assertEqual(self.string_obj, eval(repr(self.string_obj)))

    def test_repr_recursive(self):
        self.assertEqual(self.object_obj, eval(repr(self.object_obj)))
Example #3
def schema_for_column(c, pks_for_table):
    # Return Schema(None) to avoid calling lower() on a column with no datatype
    if c.data_type is None:
        LOGGER.info('Skipping column %s since it had no datatype',
                    c.column_name)
        return Schema(None)

    data_type = c.data_type.lower()
    result = Schema()

    # Scale of None indicates default of 6 digits
    numeric_scale = c.numeric_scale

    if data_type == 'number' and numeric_scale is not None and numeric_scale <= 0:
        result.type = nullable_column(c.column_name, 'integer', pks_for_table)

        return result

    elif data_type == 'number':
        # NB: Due to scale and precision variations in Oracle version, and
        #     among numeric types, we're using a custom `singer.decimal` string
        #     formatter for this, with no opinion on scale/precision.
        result.type = nullable_column(c.column_name, 'string', pks_for_table)
        result.format = 'singer.decimal'

        return result

    elif data_type == 'date' or data_type.startswith("timestamp"):
        result.type = nullable_column(c.column_name, 'string', pks_for_table)

        result.format = 'date-time'
        return result

    elif data_type in FLOAT_TYPES:
        result.type = nullable_column(c.column_name, 'number', pks_for_table)
        return result

    elif data_type in STRING_TYPES:
        character_used = c.character_used
        result.type = nullable_column(c.column_name, 'string', pks_for_table)

        if character_used == 'C':
            result.maxLength = c.char_length
        return result

    # NB: These column types ("float", "double_precision", "real") are not
    #     actually IEEE 754 floats; they are represented as decimals, yet we
    #     can say nothing about their max or min, so they are emitted as
    #     singer.decimal strings.
    elif data_type in ['float', 'double_precision']:
        result.type = nullable_column(c.column_name, 'string', pks_for_table)
        result.format = 'singer.decimal'
        return result

    return Schema(None)
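The nullable_column helper is not shown above; a plausible reconstruction, assuming it keeps primary-key columns non-nullable (singer taps always emit key properties) and makes everything else nullable:

def nullable_column(col_name, col_type, pks_for_table):
    # Assumed behavior: primary keys are never null, so no 'null' variant.
    if col_name in pks_for_table:
        return [col_type]
    return ['null', col_type]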
Example #4
 def test_unknown_inclusion(self):
     selected_cols = {'col1'}
     table_schema = Schema(type='object',
                           properties={
                               'col1': Schema(None, inclusion='unknown'),
                               'col2': Schema(None, inclusion='unsupported')
                           })
     assert_that(
         calling(tap_redshift.resolve.desired_columns).with_args(
             selected_cols, table_schema), raises(Exception))
Example #5
def schema_for_column(c):
    """Returns the Schema object for the given Column."""
    data_type = c.data_type.lower()

    inclusion = "available"

    if c.is_primary_key == 1:
        inclusion = "automatic"

    result = Schema(inclusion=inclusion)

    if data_type == "bit":
        result.type = ["null", "boolean"]

    # In SQL Server, tinyint is unsigned.
    elif data_type in BYTES_FOR_UNSIGNED_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_UNSIGNED_INTEGER_TYPE[data_type] * 8
        result.minimum = 0
        result.maximum = 2**bits - 1

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        result.minimum = 0 - 2**(bits - 1)
        result.maximum = 2**(bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ["null", "number"]

    elif data_type in PRECISE_NUMERIC_TYPES:
        result.type = ["null", "number"]
        result.multipleOf = 10**(0 - c.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ["null", "string"]
        if c.character_maximum_length is not None and c.character_maximum_length > 0:
            result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ["null", "string"]
        result.format = "date-time"

    elif data_type in VARIANT_TYPES:
        result.type = ["null", "object"]

    else:
        result = Schema(
            None,
            inclusion="unsupported",
            description="Unsupported column type",
        )
    return result
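The BYTES_FOR_UNSIGNED_INTEGER_TYPE and BYTES_FOR_INTEGER_TYPE lookups are not shown; plausible definitions for SQL Server, where tinyint is the only unsigned integer type:

# Assumed byte widths; the real tap may name or populate these differently.
BYTES_FOR_UNSIGNED_INTEGER_TYPE = {'tinyint': 1}   # 0 .. 255

BYTES_FOR_INTEGER_TYPE = {
    'smallint': 2,  # -32768 .. 32767
    'int': 4,       # -2**31 .. 2**31 - 1
    'bigint': 8,    # -2**63 .. 2**63 - 1
}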
Example #6
def schema_for_column(c):
    '''Returns the Schema object for the given Column.'''
    data_type = c.data_type.lower()
    column_type = c.column_type.lower()

    inclusion = 'available'
    # We want to automatically include all primary key columns
    if c.column_key.lower() == 'pri':
        inclusion = 'automatic'

    result = Schema(inclusion=inclusion)

    if data_type == 'bit' or column_type.startswith('tinyint(1)'):
        result.type = ['null', 'boolean']

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if 'unsigned' in c.column_type:
            result.minimum = 0
            result.maximum = 2**bits - 1
        else:
            result.minimum = 0 - 2**(bits - 1)
            result.maximum = 2**(bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        result.exclusiveMaximum = True
        result.maximum = 10**(c.numeric_precision - c.numeric_scale)
        result.multipleOf = 10**(0 - c.numeric_scale)
        if 'unsigned' in column_type:
            result.minimum = 0
        else:
            result.exclusiveMinimum = True
            result.minimum = -10**(c.numeric_precision - c.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    else:
        result = Schema(
            None,
            inclusion='unsupported',
            description='Unsupported column type {}'.format(column_type))
    return result
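To make the decimal branch concrete: a MySQL decimal(11,2) column arrives with numeric_precision=11 and numeric_scale=2, so the constraints work out as follows (a worked sketch, not part of the original example):

precision, scale = 11, 2
maximum = 10 ** (precision - scale)     # 10**9, exclusive upper bound
multipleOf = 10 ** (0 - scale)          # 0.01, i.e. two decimal places
minimum = -10 ** (precision - scale)    # -10**9, exclusive lower bound (signed case)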
Example #7
 def test_select_desired_column(self):
     selected_cols = {'col1', 'col2', 'col3'}
     table_schema = Schema(type='object',
                           properties={
                               'col1': Schema(None, inclusion='available'),
                               'col2': Schema(None,
                                              inclusion='unsupported'),
                               'col4': Schema(None, inclusion='automatic')
                           })
     desired_columns = tap_redshift.resolve.desired_columns(
         selected_cols, table_schema)
     assert_that(desired_columns, equal_to({'col1', 'col4'}))
Example #8
def schema_for_column(c):
    """Returns the Schema object for the given Column."""
    data_type = c.data_type.lower()
    column_type = c.column_type.lower()

    inclusion = "available"
    # We want to automatically include all primary key columns
    if c.column_key.lower() == "pri":
        inclusion = "automatic"

    result = Schema(inclusion=inclusion)

    if data_type == "bit" or column_type.startswith("tinyint(1)"):
        result.type = ["null", "boolean"]

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if "unsigned" in c.column_type:
            result.minimum = 0
            result.maximum = 2 ** bits - 1
        else:
            result.minimum = 0 - 2 ** (bits - 1)
            result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ["null", "number"]
        result.multipleOf = 10 ** (0 - 6)

    elif data_type == "decimal":
        result.type = ["null", "number"]
        result.multipleOf = 10 ** (0 - c.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ["null", "string"]
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ["null", "string"]
        result.format = "date-time"

    elif data_type in VARIANT_TYPES:
        result.type = ["null", "object"]

    else:
        result = Schema(
            None,
            inclusion="unsupported",
            description="Unsupported column type {}".format(column_type),
        )
    return result
Example #9
    def runTest(self):
        selected_cols = set(['a', 'b', 'd'])
        table_schema = Schema(type='object',
                              properties={
                                  'a': Schema(None, inclusion='available'),
                                  'b': Schema(None, inclusion='unsupported'),
                                  'c': Schema(None, inclusion='automatic')})

        got_cols = tap_mysql.discover_utils.desired_columns(selected_cols, table_schema)

        self.assertEqual(got_cols,
                         {'a', 'c'},
                         'Keep automatic as well as selected, available columns.')
Example #10
 def test_one_selected_stream(self):
     selected_entry = CatalogEntry(tap_stream_id='a',
                                   schema=Schema(),
                                   metadata=[{'metadata':
                                              {'selected': True},
                                              'breadcrumb': []}])
     catalog = Catalog(
         [selected_entry,
          CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
          CatalogEntry(tap_stream_id='c', schema=Schema(), metadata=[])])
     state = {}
     selected_streams = catalog.get_selected_streams(state)
     self.assertEqual(list(selected_streams), [selected_entry])
Example #11
def populate_schema_node(schema, field_info, id_field_map, breadcrumb, metadata):
    """
    Populates a node in the schema.  A node corresponds to a JSON object, which has
    properties (children)
    """
    # add metadata
    metadata.append(
        {
            'metadata': {
                'inclusion': 'available'
            },
            'breadcrumb': list(breadcrumb)
        }
    )

    # populate schema
    schema.type = ['null', 'object']
    schema.properties = {}
    for id_num in field_info.get('composite_fields'):
        child_field_info = id_field_map[id_num]
        breadcrumb.extend(['properties', child_field_info.get('name')])

        child_schema = Schema()
        if child_field_info.get('composite_fields'):
            populate_schema_node(child_schema, child_field_info, id_field_map, breadcrumb, metadata)
        else:
            populate_schema_leaf(child_schema, child_field_info, id_num, breadcrumb, metadata)

        schema.properties[child_field_info.get('name')] = child_schema

        # remove 'properties' and the child field's name from breadcrumb
        breadcrumb.pop()
        breadcrumb.pop()
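A minimal driver for the recursion, assuming a hypothetical two-field id_field_map where field 1 is a composite wrapping leaf field 2, and a trivial populate_schema_leaf (the real one maps field types to JSON schema):

def populate_schema_leaf(schema, field_info, id_num, breadcrumb, metadata):
    # Assumed leaf handler: record metadata and emit a nullable string.
    metadata.append({'metadata': {'inclusion': 'available'},
                     'breadcrumb': list(breadcrumb)})
    schema.type = ['null', 'string']

id_field_map = {
    1: {'name': 'parent', 'composite_fields': [2]},
    2: {'name': 'child', 'composite_fields': None},
}
schema, metadata = Schema(), []
populate_schema_node(schema, id_field_map[1], id_field_map, [], metadata)
# schema.properties['child'].type == ['null', 'string']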
Example #12
 def test_tinyint_1_unsigned(self):
     self.assertEqual(self.schema.properties['c_tinyint_1_unsigned'],
                      Schema(['null', 'boolean'],
                             inclusion='available'))
     self.assertEqual(self.get_metadata_for_column('c_tinyint_1_unsigned'),
                      {'selected-by-default': True,
                       'sql-datatype': 'tinyint(1) unsigned'})
Example #13
    def runTest(self):
        selected_cols = set(["a", "b", "d"])
        table_schema = Schema(
            type="object",
            properties={
                "a": Schema(None, inclusion="available"),
                "b": Schema(None, inclusion="unsupported"),
                "c": Schema(None, inclusion="automatic"),
            },
        )

        got_cols = tap_mysql.desired_columns(selected_cols, table_schema)

        self.assertEqual(
            got_cols, set(["a", "c"]),
            "Keep automatic as well as selected, available columns.")
Example #14
    def test_retries_on_facebook_request_error_sync_batches(
            self, mocked_schema, mocked_api):
        """ 
            AdCreative.sync_batches calls a `facebook_business` method,`api_batch.execute()`, to get a batch of ad creatives. 
            We mock this method to raise a `FacebookRequestError` and expect the tap to retry this that function up to 5 times,
            which is the current hard coded `max_tries` value.
        """
        # Mock new_batch() function of API
        mocked_api.new_batch = Mock()
        mocked_api.new_batch.return_value = MockBatch(
            exception="FacebookRequestError"
        )  # Raise FacebookRequestError exception

        # Initialize AdCreative and mock catalog_entry
        mock_catalog_entry = CatalogEntry(schema=Schema())
        ad_creative_object = AdCreative('', '', '', '')
        ad_creative_object.catalog_entry = mock_catalog_entry

        # Call sync_batches() function of AdCreatives and verify FacebookRequestError is raised
        with self.assertRaises(FacebookRequestError):
            ad_creative_object.sync_batches([])

        # Verify the calls inside sync_batches are retried 5 times, matching
        # the function's max_tries of 5
        self.assertEqual(5, mocked_api.new_batch.call_count)
        self.assertEqual(5, mocked_schema.call_count)
Example #15
def generate_catalog(streams: List["StreamCatalogEntry"]) -> Catalog:
    """Generates a catalog with an entry for each stream in `streams`"""

    entries = []

    for stream_entry in streams:
        if "replication_method" not in stream_entry:
            stream_entry["replication_method"] = "INCREMENTAL"
        if "replication_key" not in stream_entry:
            stream_entry["replication_key"] = "updated_date"

        entries.append(
            CatalogEntry(
                tap_stream_id=stream_entry["tap_stream_id"],
                schema=Schema(),
                replication_method=stream_entry["replication_method"],
                replication_key=stream_entry["replication_key"],
                metadata=[{
                    "breadcrumb": tuple([]),
                    "metadata": {
                        "selected": stream_entry["selected"],
                    },
                }],
            ))

    return Catalog(entries)
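Hypothetical usage, assuming StreamCatalogEntry is a plain dict carrying at least tap_stream_id and selected:

catalog = generate_catalog([
    {'tap_stream_id': 'users', 'selected': True},
    {'tap_stream_id': 'orders', 'selected': False,
     'replication_method': 'FULL_TABLE', 'replication_key': None},
])
# 'users' falls back to INCREMENTAL replication keyed on 'updated_date'.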
Example #16
 def test_float(self):
     self.assertEqual(self.dt_schema.properties['C_FLOAT'],
                      Schema(['null', 'number'], inclusion='available'))
     self.assertEqual(self.get_dt_metadata_for_column('C_FLOAT'), {
         'selected-by-default': True,
         'sql-datatype': 'float'
     })
Example #17
 def test_decimal_with_defined_scale_and_precision(self):
     self.assertEqual(self.dt_schema.properties['C_DECIMAL_2'],
                      Schema(['null', 'number'], inclusion='available'))
     self.assertEqual(self.get_dt_metadata_for_column('C_DECIMAL_2'), {
         'selected-by-default': True,
         'sql-datatype': 'number'
     })
Example #18
 def test_double(self):
     self.assertEqual(self.schema.properties['c_double'],
                      Schema(['null', 'number'], inclusion='available'))
     self.assertEqual(self.get_metadata_for_column('c_double'), {
         'selected-by-default': True,
         'sql-datatype': 'double'
     })
Example #19
 def test_bit(self):
     self.assertEqual(self.schema.properties['c_bit'],
                      Schema(['null', 'boolean'], inclusion='available'))
     self.assertEqual(self.get_metadata_for_column('c_bit'), {
         'selected-by-default': True,
         'sql-datatype': 'bit(4)'
     })
Example #20
def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")

    columns.append(SDC_DELETED_AT)

    return columns
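A usage sketch, assuming SDC_DELETED_AT is the conventional '_sdc_deleted_at' marker column that log-based replication adds for deleted rows:

SDC_DELETED_AT = '_sdc_deleted_at'  # assumed value of the constant

entry = CatalogEntry(schema=Schema(type='object', properties={}))
columns = add_automatic_properties(entry, ['id', 'name'])
# columns == ['id', 'name', '_sdc_deleted_at'], and entry.schema now carries
# a nullable date-time property under that key.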
Example #21
 def test_decimal_with_defined_scale_and_precision(self):
     self.assertEqual(
         self.schema.properties['c_decimal_2'],
         Schema(['null', 'number'], inclusion='available', multipleOf=0.01))
     self.assertEqual(self.get_metadata_for_column('c_decimal_2'), {
         'selected-by-default': True,
         'sql-datatype': 'decimal(11,2)'
     })
Example #22
 def test_decimal(self):
     self.assertEqual(
         self.schema.properties['c_decimal'],
         Schema(['null', 'number'], inclusion='available', multipleOf=1))
     self.assertEqual(self.get_metadata_for_column('c_decimal'), {
         'selected-by-default': True,
         'sql-datatype': 'decimal(10,0)'
     })
Example #23
 def test_varbinary(self):
     self.assertEqual(
         self.schema.properties['c_varbinary'],
         Schema(['null', 'string'], maxLength=32, inclusion='available'))
     self.assertEqual(self.get_metadata_for_column('c_varbinary'), {
         'selected-by-default': True,
         'sql-datatype': 'varbinary(16)'
     })
Example #24
 def test_array(self):
     self.assertEqual(
         self.dt_schema.properties['C_ARRAY'],
         Schema(['null', 'object', 'array'], inclusion='available'))
     self.assertEqual(self.get_dt_metadata_for_column('C_ARRAY'), {
         'selected-by-default': True,
         'sql-datatype': 'array'
     })
Example #25
 def test_varbinary(self):
     self.assertEqual(
         self.dt_schema.properties['C_VARBINARY'],
         Schema(['null', 'string'], format='binary', inclusion='available'))
     self.assertEqual(self.get_dt_metadata_for_column('C_VARBINARY'), {
         'selected-by-default': True,
         'sql-datatype': 'binary'
     })
Example #26
 def test_time(self):
     self.assertEqual(
         self.dt_schema.properties['C_TIME'],
         Schema(['null', 'string'], format='time', inclusion='available'))
     self.assertEqual(self.get_dt_metadata_for_column('C_TIME'), {
         'selected-by-default': True,
         'sql-datatype': 'time'
     })
Example #27
def discover_columns(connection, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table_name in table_info[schema_name].keys():
            mdata = {}
            columns = table_info[schema_name][table_name]['columns']
            table_pks = [
                col_name for col_name, col_info in columns.items()
                if col_info.is_primary_key
            ]
            with connection.cursor(
                    cursor_factory=psycopg2.extras.DictCursor) as cur:
                cur.execute(" SELECT current_database()")
                database_name = cur.fetchone()[0]

            metadata.write(mdata, (), 'table-key-properties', table_pks)
            metadata.write(mdata, (), 'schema-name', schema_name)
            metadata.write(mdata, (), 'database-name', database_name)
            metadata.write(mdata, (), 'row-count',
                           table_info[schema_name][table_name]['row_count'])
            metadata.write(mdata, (), 'is-view',
                           table_info[schema_name][table_name].get('is_view'))

            column_schemas = {
                col_name: schema_for_column(col_info)
                for col_name, col_info in columns.items()
            }

            schema = Schema(type='object', properties=column_schemas)
            for c_name in column_schemas.keys():
                mdata = write_sql_data_type_md(mdata, columns[c_name])
                if column_schemas[c_name].type is None:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'unsupported')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', False)
                elif table_info[schema_name][table_name]['columns'][
                        c_name].is_primary_key:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'automatic')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)
                else:
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'inclusion', 'available')
                    mdata = metadata.write(mdata, ('properties', c_name),
                                           'selected-by-default', True)

            entry = CatalogEntry(table=table_name,
                                 stream=table_name,
                                 metadata=metadata.to_list(mdata),
                                 tap_stream_id=database_name + '-' +
                                 schema_name + '-' + table_name,
                                 schema=schema)

            entries.append(entry)

    return entries
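The expected shape of table_info is implicit above; a hypothetical two-level example for a single table (col_info_id and col_info_email stand in for the column records consumed by schema_for_column):

table_info = {
    'public': {                      # schema name
        'users': {                   # table name
            'columns': {'id': col_info_id, 'email': col_info_email},
            'row_count': 42,
            'is_view': False,
        },
    },
}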
Example #28
 def test_resumes_currently_syncing_stream(self):
     selected_entry_a = CatalogEntry(tap_stream_id='a',
                                     schema=Schema(),
                                     metadata=[{'metadata':
                                                {'selected': True},
                                                'breadcrumb': []}])
     selected_entry_c = CatalogEntry(tap_stream_id='c',
                                     schema=Schema(),
                                     metadata=[{'metadata':
                                                {'selected': True},
                                                'breadcrumb': []}])
     catalog = Catalog(
         [selected_entry_a,
          CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
          selected_entry_c])
     state = {'currently_syncing': 'c'}
     selected_streams = catalog.get_selected_streams(state)
     self.assertEqual(list(selected_streams)[0], selected_entry_c)
Example #29
 def test_bigint(self):
     self.assertEqual(self.schema.properties['c_bigint'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-9223372036854775808,
                             maximum=9223372036854775807))
     self.assertEqual(self.get_metadata_for_column('c_bigint'),
                      {'selected-by-default': True,
                       'sql-datatype': 'bigint(20)'})
Example #30
 def test_tinyint(self):
     self.assertEqual(self.schema.properties['c_tinyint'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-128,
                             maximum=127))
     self.assertEqual(self.get_metadata_for_column('c_tinyint'),
                      {'selected-by-default': True,
                       'sql-datatype': 'tinyint(4)'})