Example #1
def add_synthetic_keys_to_stream_schema(stream):
    """Add synthetic keys to stream's schema."""
    stream.schema.properties["_sdc_report_datetime"] = Schema.from_dict({
        "description":
        "DateTime of Report Run",
        "type":
        "string",
        "format":
        "date-time",
    })
    if stream.tap_stream_id in STATISTICS_REPORT_TYPES:
        stream.schema.properties["_sdc_report_currency"] = Schema.from_dict({
            "description":
            "Currency of all costs in report",
            "type":
            "string",
        })
        stream.schema.properties[
            "_sdc_report_ignore_x_device"] = Schema.from_dict({
                "description":
                "Ignore cross-device data. Also can explicitly " +
                "set to null for TransactionID ReportType to get all data.",
                "type":
                "boolean",
            })
    return stream
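Most of the snippets in this collection build on `Schema.from_dict` and `Schema.to_dict` from singer-python. A minimal round-trip sketch, assuming the standard singer-python `Schema` class (illustration only, not part of the example above):

from singer.schema import Schema

# Build a Schema object from a plain JSON-schema dict...
prop = Schema.from_dict({
    "description": "DateTime of Report Run",
    "type": "string",
    "format": "date-time",
})

# ...and serialize it back; this is the dict that ultimately gets emitted
# with singer.write_schema(...).
print(prop.to_dict())
# -> {'type': 'string', 'description': 'DateTime of Report Run', 'format': 'date-time'}
# (key order may differ)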
Example #2
def do_sync(client, catalog, state, config):
    selected_stream_names = get_selected_streams(catalog)
    validate_dependencies(selected_stream_names)
    populate_class_schemas(catalog, selected_stream_names)
    all_sub_stream_names = get_sub_stream_names()

    for stream in catalog.streams:
        stream_name = stream.tap_stream_id
        mdata = metadata.to_map(stream.metadata)
        if stream_name not in selected_stream_names:
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        key_properties = metadata.get(mdata, (), 'table-key-properties')
        sideload_objects = metadata.get(mdata, (), 'sideload-objects')
        if sideload_objects:
            stream_schema = get_side_load_schemas(sideload_objects, stream)
            stream.schema = Schema.from_dict(stream_schema)

        singer.write_schema(stream_name, stream.schema.to_dict(),
                            key_properties)

        sub_stream_names = SUB_STREAMS.get(stream_name)
        if sub_stream_names:
            for sub_stream_name in sub_stream_names:
                if sub_stream_name not in selected_stream_names:
                    continue
                sub_stream = STREAMS[sub_stream_name].stream
                sub_mdata = metadata.to_map(sub_stream.metadata)
                sub_key_properties = metadata.get(sub_mdata, (),
                                                  'table-key-properties')
                sideload_objects = metadata.get(mdata, (), 'sideload-objects')
                if sideload_objects:
                    sub_stream_schema = get_side_load_schemas(
                        sideload_objects, sub_stream)
                    sub_stream.schema = Schema.from_dict(sub_stream_schema)
                singer.write_schema(sub_stream.tap_stream_id,
                                    sub_stream.schema.to_dict(),
                                    sub_key_properties)

        # parent stream will sync sub stream
        if stream_name in all_sub_stream_names:
            continue

        LOGGER.info("%s: Starting sync", stream_name)
        instance = STREAMS[stream_name](client, config)
        counter_value = sync_stream(state, config.get('start_date'), instance)
        singer.write_state(state)
        LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter_value)
        zendesk_metrics.log_aggregate_rates()

    singer.write_state(state)
    LOGGER.info("Finished sync")
    zendesk_metrics.log_aggregate_rates()
Example #3
def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")

    catalog_entry.schema.properties[SYS_UPDATED_AT] = Schema(
        type=["null", "string"], format="date-time")

    catalog_entry.schema.properties[SYS_EVENT_TYPE] = Schema(type="integer")

    catalog_entry.schema.properties[SYS_HASHDIFF] = Schema(
        type=["null", "string"], format="date-time")

    catalog_entry.schema.properties[SYS_HASHKEY] = Schema(
        type=["null", "string"], format="date-time")

    catalog_entry.schema.properties[SYS_LOG_POS] = Schema(type='integer')

    catalog_entry.schema.properties[SYS_LOG_FILE] = Schema(type='integer')

    catalog_entry.schema.properties[SYS_LINENO] = Schema(type='integer')

    columns.append(SDC_DELETED_AT)
    columns.append(SYS_UPDATED_AT)
    columns.append(SYS_EVENT_TYPE)
    columns.append(SYS_HASHKEY)
    columns.append(SYS_HASHDIFF)
    columns.append(SYS_LINENO)
    columns.append(SYS_LOG_POS)
    columns.append(SYS_LOG_FILE)

    return columns
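A hedged usage sketch for `add_automatic_properties` above. The `SDC_*`/`SYS_*` constants are module-level column names in the original tap; the values below are invented for illustration, and the function itself is assumed to be in scope:

from singer.catalog import CatalogEntry
from singer.schema import Schema

# Hypothetical constant values; the real tap defines its own.
SDC_DELETED_AT = "_sdc_deleted_at"
SYS_UPDATED_AT = "_sys_updated_at"
SYS_EVENT_TYPE = "_sys_event_type"
SYS_HASHDIFF = "_sys_hashdiff"
SYS_HASHKEY = "_sys_hashkey"
SYS_LOG_POS = "_sys_log_pos"
SYS_LOG_FILE = "_sys_log_file"
SYS_LINENO = "_sys_lineno"

entry = CatalogEntry(
    stream="orders",
    tap_stream_id="orders",
    schema=Schema.from_dict({"type": "object", "properties": {}}),
)
columns = add_automatic_properties(entry, ["id", "amount"])
# columns now ends with the synthetic column names, and
# entry.schema.properties holds a Schema object for each of them.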
Example #4
def generate_catalog(client, report_config, standard_fields, custom_fields,
                     all_cubes, cubes_lookup, profile_ids):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []
    # for report in PREMADE_REPORTS:
    for report in report_config:
        # change to safe name for bigquery
        temp = report['name'].replace(' ', '_').lower()
        report['name'] = temp

        metrics_dimensions = set(report['metrics'] + report['dimensions'])
        selected_by_default = {
            *report['metrics'][:10],  # Use first 10 metrics in definition
            *report.get('default_dimensions', [])
        }
        premade_fields = [
            field for field in standard_fields
            if field['id'] in metrics_dimensions
        ]
        schema, mdata = generate_premade_catalog_entry(premade_fields,
                                                       all_cubes, cubes_lookup)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['name'],
                         metadata=metadata.to_list(mdata)))

    # for report in report_config:
    for report in []:
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['id'],
                         metadata=metadata.to_list(mdata)))
    return Catalog(catalog_entries)
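The `reduce(...)` call above simply marks each chosen field as `selected-by-default` in the compiled metadata map. A small illustration of that pattern, assuming singer-python's `metadata` helpers; the field names are invented:

from functools import reduce
from singer import metadata

mdata = metadata.new()  # empty compiled metadata map
mdata = reduce(
    lambda acc, field_name: metadata.write(
        acc, ("properties", field_name), "selected-by-default", True),
    ["ga:sessions", "ga:date"],
    mdata)

# metadata.to_list(mdata) then yields entries roughly like:
# {"breadcrumb": ("properties", "ga:sessions"),
#  "metadata": {"selected-by-default": True}}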
Example #5
    def test_one_pk_value_with_bookmark(self):
        catalog_entry = CatalogEntry(tap_stream_id='foo',
                                     schema=Schema.from_dict({
                                         'properties': {
                                             'id': {
                                                 'type': ['integer']
                                             }
                                         }
                                     }),
                                     metadata=[{
                                         'breadcrumb': (),
                                         'metadata': {
                                             'table-key-properties': ['id']
                                         }
                                     }])
        state = {
            'bookmarks': {
                'foo': {
                    'last_pk_fetched': {
                        'id': 4
                    },
                    'max_pk_values': {
                        'id': 10
                    }
                }
            }
        }

        expected = ' WHERE ((`id` > 4)) AND `id` <= 10 ORDER BY `id` ASC'
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
Example #6
 def test_should_ignore_fields_that_dont_parse(self):
     record = {
         "id": "5d6ca50762a07c00045125fb",
         "created_at": "not a date",
         "edited_at": "2019-09-02T05:13:43.151"
     }
     schema = Schema.from_dict({
         "properties": {
             "created_at": {
                 "format": "date-time",
                 "type": "string"
             },
             "edited_at": {
                 "format": "date-time",
                 "type": "string"
             },
             "id": {
                 "type": "string"
             },
         }
     })
     updated_record = Stream.convert_dates_to_rfc3339(record, schema)
     assert updated_record["created_at"] == "not a date"
     assert updated_record["edited_at"] == "2019-09-02T05:13:43.151000+00:00"
Example #7
def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")

    columns.append(SDC_DELETED_AT)

    return columns
Example #8
    def test_fails_with_null_bookmark(self):
        catalog_entry = CatalogEntry(
            schema=Schema.from_dict({'properties': {}}))
        key_properties = []
        last_pk_fetched = None

        with self.assertRaises(AssertionError):
            generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                        catalog_entry)
Example #9
    def test_no_pk_values(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict(
            {'properties': {}}),
                                     metadata=[])
        state = {}

        expected = ''
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
Example #10
    def test_empty_pk(self):
        catalog_entry = CatalogEntry(
            schema=Schema.from_dict({'properties': {}}))
        key_properties = []
        last_pk_fetched = {}

        expected = ''
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
Example #11
def generate_catalog(client, standard_fields, custom_fields, all_cubes,
                     cubes_lookup, profile_id):
    schema, mdata = generate_catalog_entry(client, standard_fields,
                                           custom_fields, all_cubes,
                                           cubes_lookup, profile_id)
    # Build the single catalog entry for the report stream
    catalog_entry = CatalogEntry(schema=Schema.from_dict(schema),
                                 key_properties=['_sdc_record_hash'],
                                 stream='report',
                                 tap_stream_id='report',
                                 metadata=metadata.to_list(mdata))
    return Catalog([catalog_entry])
Example #12
 def test_sync_users(self, m):
     loop = asyncio.get_event_loop()
     record_value = load_file_current('teams_output.json', 'data_test')
     with mock.patch('tap_sentry.SentryClient.users',
                     return_value=[record_value]):
         dataSync = SentrySync(self.client)
         schema = load_file('users.json', 'tap_sentry/schemas')
         resp = dataSync.sync_users(Schema(schema))
         with mock.patch('singer.write_record') as patching:
             task = asyncio.gather(resp)
             loop.run_until_complete(task)
             patching.assert_called_with('users', record_value)
Example #13
    def test_one_pk(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict(
            {'properties': {
                'id1': {
                    'type': ['integer']
                }
            }}))
        key_properties = ['id1']
        last_pk_fetched = {'id1': 4}

        expected = '(`id1` > 4)'
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
Example #14
def generate_streams(conn, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table in table_info[schema_name].keys():

            with conn.cursor() as cur:
                sql = f"""
SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu
    INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME 
                                                                 AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY'
WHERE kcu.TABLE_SCHEMA = '{schema_name}' AND kcu.TABLE_NAME = '{table}'"""
                cur.execute(sql)
                table_pks = [
                    col['COLUMN_NAME'] for col in convert_result_to_dict(cur)
                ]

                sql = """SELECT db_name()"""
                cur.execute(sql)
                database = cur.fetchone()[0]

            meta = {}
            columns = table_info[schema_name][table]['columns']

            metadata.write(meta, (), 'table-key-properties', table_pks)
            metadata.write(meta, (), 'schema-name', schema_name)
            metadata.write(meta, (), 'database-name', database)
            metadata.write(meta, (), 'row-count',
                           table_info[schema_name][table]['row_count'])
            metadata.write(meta, (), 'is-view',
                           table_info[schema_name][table]['is_view'])

            column_schemas = {
                col_name: schema_for_column(col_info, table_pks)
                for col_name, col_info in columns.items()
            }

            schema = Schema(type='object', properties=column_schemas)

            entry = CatalogEntry(table=table,
                                 stream=table,
                                 metadata=metadata.to_list(meta),
                                 tap_stream_id=get_tap_stream_id(
                                     database, schema_name, table),
                                 schema=schema)
            entries.append(entry)

    return Catalog(entries)
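One caveat about the snippet above: the primary-key query interpolates `schema_name` and `table` straight into the SQL string. A safer variant (not what the original tap does) passes them as driver parameters; this fragment assumes a DB-API cursor whose paramstyle is `%s` (e.g. pymssql) and would replace the `sql = f"""..."""` / `cur.execute(sql)` pair inside the loop:

                # Parameterized version of the primary-key lookup; placeholder
                # syntax (%s vs ?) depends on the database driver in use.
                sql = (
                    "SELECT kcu.COLUMN_NAME "
                    "FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu "
                    "INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc "
                    "  ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME "
                    " AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY' "
                    "WHERE kcu.TABLE_SCHEMA = %s AND kcu.TABLE_NAME = %s"
                )
                cur.execute(sql, (schema_name, table))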
Example #15
def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get("replication-key")

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                catalog_entry.table,
            )
            continue

        selected = {
            k
            for k, v in catalog_entry.schema.properties.items()
            if common.property_is_selected(catalog_entry, k) or k == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(
                tap_stream_id=catalog_entry.tap_stream_id,
                metadata=catalog_entry.metadata,
                stream=catalog_entry.tap_stream_id,
                table=catalog_entry.table,
                schema=Schema(
                    type="object",
                    properties={
                        col: discovered_table.schema.properties[col] for col in columns
                    },
                ),
            )
        )

    return result
Example #16
 def schema(self):
     if self._schema is None:
         raw_schema = self._client.item_schema(self.api)
         if self._config.merge_fields_array:
             # Replace merge fields object with array to make a separate table.
             mf_desc = raw_schema['properties']['merge_fields'][
                 'description']
             raw_schema['properties']['merge_fields'] = {
                 'description': mf_desc,
                 'type': 'array',
                 'items': {
                     'type': 'object',
                     'properties': {
                         'merge_id': {
                             'type': 'number'
                         },
                         'tag': {
                             'type': 'string'
                         },
                         'name': {
                             'type': 'string'
                         },
                         'type': {
                             'type': 'string'
                         },
                         'value': {
                             'type': 'string'
                         }
                     }
                 }
             }
         if self._config.interests_array:
             # Replace interest object with array to make a separate table.
             int_desc = raw_schema['properties']['interests']['description']
             raw_schema['properties']['interests'] = {
                 'description': int_desc,
                 'type': 'array',
                 'items': {
                     'type': 'object'
                 }
             }
         self._schema = Schema.from_dict(raw_schema)
     return self._schema
Example #17
    def test_two_pks(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict({
            'properties': {
                'id1': {
                    'type': ['integer']
                },
                'str': {
                    'type': ['string']
                }
            }
        }))
        key_properties = ['id1', 'str']
        last_pk_fetched = {'id1': 4, 'str': 'apples'}

        expected = '(`id1` > 4) OR (`id1` = 4 AND `str` > \'apples\')'
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
Example #18
    def test_three_pk_values_with_bookmark(self):
        catalog_entry = CatalogEntry(tap_stream_id='foo',
                                     schema=Schema.from_dict({
                                         'properties': {
                                             'id1': {
                                                 'type': ['integer']
                                             },
                                             'id2': {
                                                 'type': ['string']
                                             },
                                             'id3': {
                                                 'type': ['integer']
                                             }
                                         }
                                     }),
                                     metadata=[{
                                         'breadcrumb': (),
                                         'metadata': {
                                             'table-key-properties':
                                             ['id1', 'id2', 'id3']
                                         }
                                     }])
        state = {
            'bookmarks': {
                'foo': {
                    'last_pk_fetched': {
                        'id1': 4,
                        'id2': 6,
                        'id3': 2
                    },
                    'max_pk_values': {
                        'id1': 10,
                        'id2': 8,
                        'id3': 3
                    }
                }
            }
        }

        expected = ' WHERE ((`id1` > 4) OR (`id1` = 4 AND `id2` > \'6\') OR (`id1` = 4 AND `id2` = \'6\' AND `id3` > 2)) AND `id1` <= 10 AND `id2` <= \'8\' AND `id3` <= 3 ORDER BY `id1`, `id2`, `id3` ASC'
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
Example #19
def get_schema_for_table(table):
    """ Given a table object, output its schema
    """
    schema = {"type": "object", "properties": {}, "required": []}
    table_fields = table.get("table_fields", [])

    # Add __id (record id) to every table

    record_id_field = {
        "id": "__id",
        "type": "integer",
        "required": True,
        "is_multiple": False
    }

    table_fields.append(record_id_field)

    for field in table_fields:
        property_schema = {"inclusion": "automatic"}
        property_schema['type'] = []

        if field["required"]:
            schema["required"].append(field["id"])
        else:
            property_schema['type'].append("null")

        if field["type"] in (STRING_TYPES | DATE_TYPES) \
                or field["is_multiple"]:
            property_schema['type'].append("string")

        if field["type"] in DATE_TYPES:
            property_schema["format"] = "date-time"

        if field["type"] in NUMERIC_TYPES:
            property_schema["type"].append("number")

        if field["type"] in INTEGER_TYPES or field["type"] == "integer":
            property_schema["type"].append("integer")

        schema["properties"][field["id"]] = property_schema

    return Schema.from_dict(schema)
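A quick sketch of calling `get_schema_for_table` with a minimal table definition. `STRING_TYPES`, `DATE_TYPES`, `NUMERIC_TYPES`, and `INTEGER_TYPES` are module-level sets in the original tap, so the values below are placeholders:

# Placeholder type sets; the real tap defines its own.
STRING_TYPES = {"text"}
DATE_TYPES = {"date"}
NUMERIC_TYPES = {"float"}
INTEGER_TYPES = {"int"}

table = {
    "table_fields": [
        {"id": "title", "type": "text", "required": True, "is_multiple": False},
        {"id": "due_on", "type": "date", "required": False, "is_multiple": False},
    ]
}
schema = get_schema_for_table(table)
# schema.properties["__id"]   -> type ["integer"], added to every table
# schema.properties["due_on"] -> type ["null", "string"], format "date-time"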
Example #20
 def test_should_not_update_datetime_that_contains_timezone(self):
     record = {
         "id": "5d6ca50762a07c00045125fb",
         "domain": "pageup",
         "created_at": "2019-09-02T05:13:43.151+10:00"
     }
     schema = Schema.from_dict({
         "properties": {
             "created_at": {
                 "format": "date-time",
                 "type": "string"
             },
             "domain": {
                 "type": "string"
             },
             "id": {
                 "type": "string"
             },
         }
     })
     updated_record = Stream.convert_dates_to_rfc3339(record, schema)
     assert updated_record["created_at"] == "2019-09-02T05:13:43.151+10:00"
Example #21
 def test_should_append_timezone_to_nested_datetime_fields(self):
     record = {
         "id": "5d6ca50762a07c00045125fb",
         "date": "2019-08-13T00:00:00",
         "edited": {
             "author": "Sam Witwicky",
             "date": "2019-10-09T06:14:58.877"
         },
     }
     schema = Schema.from_dict({
         "properties": {
             "edited": {
                 "properties": {
                     "author": {
                         "type": "string"
                     },
                     "date": {
                         "format": "date-time",
                         "type": "string"
                     }
                 },
                 "type": "object",
             },
             "date": {
                 "format": "date-time",
                 "type": "string"
             },
             "id": {
                 "type": "string"
             },
         }
     })
     updated_record = Stream.convert_dates_to_rfc3339(record, schema)
     assert updated_record["date"] == "2019-08-13T00:00:00+00:00"
     assert updated_record["edited"][
         "date"] == "2019-10-09T06:14:58.877000+00:00"
Example #22
def generate_catalog(
    client,
    report_config,
    standard_fields,
    custom_fields,
    all_cubes,
    cubes_lookup,
    profile_ids,
):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []

    for report in report_config:
        selected_by_default = {
            *report['metrics'][:10], *report.get('dimensions', [])
        }
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(
                schema=Schema.from_dict(schema),
                key_properties=['_sdc_record_hash'],
                stream=report['name'],
                tap_stream_id=report['name'],
                metadata=metadata.to_list(mdata),
            ))
    return Catalog(catalog_entries)
Example #23
def discover_catalog(mysql_conn: Dict,
                     dbs: str = None,
                     tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""

    if dbs:
        filter_dbs_clause = ",".join(
            [f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
        'information_schema',
        'performance_schema',
        'mysql',
        'sys'
        )"""

    tables_clause = ''

    if tables is not None and tables != '':
        filter_tables_clause = ",".join(
            [f"'{table_name}'" for table_name in tables.split(",")])
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {table_schema_clause}{tables_clause}
            """)

            table_info = {}

            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}

                table_info[db_name][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute(f"""
                SELECT table_schema,
                       table_name,
                       column_name,
                       data_type,
                       character_maximum_length,
                       numeric_precision,
                       numeric_scale,
                       column_type,
                       column_key
                    FROM information_schema.columns
                    {table_schema_clause}{tables_clause}
                    ORDER BY table_schema, table_name
            """)

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k

                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[
                        table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and s.properties[
                        c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
Example #24
def schema_for_column(column):  # pylint: disable=too-many-branches
    """Returns the Schema object for the given Column."""

    data_type = column.data_type.lower()
    column_type = column.column_type.lower()

    inclusion = 'available'
    # We want to automatically include all primary key columns
    if column.column_key.lower() == 'pri':
        inclusion = 'automatic'

    result = Schema(inclusion=inclusion)

    if data_type == 'bit' or column_type.startswith('tinyint(1)'):
        result.type = ['null', 'boolean']

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if 'unsigned' in column.column_type:
            result.minimum = 0
            result.maximum = 2**bits - 1
        else:
            result.minimum = 0 - 2**(bits - 1)
            result.maximum = 2**(bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'json':
        result.type = ['null', 'object']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        result.multipleOf = 10**(0 - column.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = column.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    elif data_type == 'time':
        result.type = ['null', 'string']
        result.format = 'time'

    elif data_type in BINARY_TYPES:
        result.type = ['null', 'string']
        result.format = 'binary'

    elif data_type in SPATIAL_TYPES:
        result.type = ['null', 'object']
        result.format = 'spatial'

    else:
        result = Schema(None,
                        inclusion='unsupported',
                        description=f'Unsupported column type {column_type}')
    return result
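For instance, the `decimal` branch above turns a `decimal(12,2)` column (numeric_scale = 2) into a nullable number with `multipleOf = 10**(0 - 2) = 0.01`; a rough sketch of the equivalent Schema, following the same attribute-assignment style as the function itself:

from singer.schema import Schema

# Roughly what schema_for_column returns for a non-key decimal(12,2) column.
expected = Schema(inclusion='available')
expected.type = ['null', 'number']
expected.multipleOf = 0.01
print(expected.to_dict())
# -> {'inclusion': 'available', 'type': ['null', 'number'], 'multipleOf': 0.01}
# (key order may differ)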
Example #25
def schema_for_column(c):
    '''Returns the Schema object for the given Column.'''
    data_type = c.data_type.lower()
    # column_type = c.column_type.lower()

    inclusion = 'available'
    # We want to automatically include all primary key columns
    # if c.column_key.lower() == 'pri':
    #     inclusion = 'automatic'

    result = Schema(inclusion=inclusion)

    # if data_type == 'bit' or column_type.startswith('tinyint(1)'):
    #     result.type = ['null', 'boolean']

    if data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        # if 'unsigned' in c.column_type:
        #     result.minimum = 0
        #     result.maximum = 2 ** bits - 1
        # else:
        #     result.minimum = 0 - 2 ** (bits - 1)
        #     result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        result.multipleOf = 10**(0 - c.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    else:
        result = Schema(None,
                        inclusion='unsupported',
                        description='Unsupported column type')
    return result
Example #26
def discover(conn, config):

    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE From INFORMATION_SCHEMA.TABLES
            """)

            table_info = {}

            schemas = cur.fetchall()
            for (db, schema, table, table_type) in schemas:
                if db not in table_info:
                    table_info[db] = {}
                if schema not in table_info[db]:
                    table_info[db][schema] = {}

                table_info[db][schema][table] = {
                    # 'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
            SELECT
       C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE, C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION,
       C.NUMERIC_PRECISION, TC.CONSTRAINT_TYPE
FROM INFORMATION_SCHEMA.COLUMNS C
    LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU On C.COLUMN_NAME = CCU.COLUMN_NAME
    LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC ON CCU.CONSTRAINT_NAME = Tc.CONSTRAINT_NAME
ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)
            # res = cur.fetchall()

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[db][table_schema][table_name]['is_view']

                if table_schema in table_info[db] and table_name in table_info[db][table_schema]:
                    row_count = table_info[db][table_schema][table_name].get('row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.constraint_type == 'PRI' and s.properties[
                        c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

        return Catalog(entries)

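    # NOTE: Everything below appears to be leftover scaffolding from the singer tap
    # template. It is unreachable, because the function already returned
    # Catalog(entries) inside the `with` block above.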
    raw_schemas = load_schemas()
    streams = []

    for schema_name, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        stream_metadata = []
        stream_key_properties = []

        # create and add catalog entry
        catalog_entry = {
            'stream': schema_name,
            'tap_stream_id': schema_name,
            'schema': schema,
            'metadata': [],
            'key_properties': []
        }
        streams.append(catalog_entry)

    return {'streams': streams}
Example #27
def schema_for_column(column):  # pylint: disable=too-many-branches
    """Returns the Schema object for the given Column."""

    data_type = column.data_type.lower()
    column_type = column.column_type.lower()

    inclusion = "available"
    # We want to automatically include all primary key columns
    if column.column_key.lower() == "pri":
        inclusion = "automatic"

    result = Schema(inclusion=inclusion)

    if data_type == "bit" or column_type.startswith("tinyint(1)"):
        result.type = ["null", "boolean"]

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if "unsigned" in column.column_type:
            result.minimum = 0
            result.maximum = 2 ** bits - 1
        else:
            result.minimum = 0 - 2 ** (bits - 1)
            result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ["null", "number"]

    elif data_type == "json":
        result.type = ["null", "object"]

    elif data_type == "decimal":
        result.type = ["null", "number"]
        result.multipleOf = 10 ** (0 - column.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ["null", "string"]
        if data_type in ("longtext", "mediumtext"):
            result.maxLength = 65535
        else:
            result.maxLength = column.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ["null", "string"]
        result.format = "date-time"

    elif data_type == "time":
        result.type = ["null", "string"]
        result.format = "time"

    elif data_type in BINARY_TYPES:
        result.type = ["null", "string"]
        result.format = "binary"
        if data_type in ("longblob", "blob"):
            result.maxLength = 65535

    else:
        result = Schema(
            None,
            inclusion="unsupported",
            description=f"Unsupported column type {column_type}",
        )
    return result
Example #28
def load_static_schemas(streams):
    """ Load default schemas for all streams
    """
    for stream in streams:
        LOGGER.info('Loading schema for %s', stream.tap_stream_id)
        stream.discovered_schema.update(load_static_schema(stream))


STATIC_STREAMS = "members pipes cards tables".split()

catalog_entries = [
    CatalogEntry(tap_stream_id=stream,
                 stream=stream,
                 key_properties=["id"],
                 schema=Schema.from_dict(load_static_schema(stream)))
    for stream in STATIC_STREAMS
]

CATALOG = Catalog(catalog_entries)

LOGGER.info("There are %s static streams", len(CATALOG.streams))
LOGGER.info("STREAMS: %s", [stream.stream for stream in CATALOG.streams])


def get_schema_for_table(table):
    """ Given a table object, output its schema
    """
    schema = {"type": "object", "properties": {}, "required": []}
    table_fields = table.get("table_fields", [])
Example #29
 def schema(self):
     if self._schema is None:
         self._schema = Schema.from_dict(self._client.item_schema(self.api))
     return self._schema
Example #30
def schema_for_column(col_info: Column, table_pks: List[str]):
    data_type = col_info.sql_data_type
    result = Schema()

    if data_type in INTEGER_TYPES:
        result.type = nullable_column(col_info.column_name, 'integer',
                                      table_pks)
        result.minimum = -1 * (2**(col_info.numeric_precision - 1))
        result.maximum = 2**(col_info.numeric_precision - 1)
        return result
    elif data_type in DATE_TIME_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        if data_type == 'time':
            result.format = 'time'
        else:
            result.format = 'date-time'
        return result
    elif data_type in FLOAT_TYPES:
        result.type = nullable_column(col_info.column_name, 'number',
                                      table_pks)
        result.exclusiveMaximum = True
        result.exclusiveMinimum = True
        result.maximum = 10**(col_info.numeric_precision -
                              col_info.numeric_scale)
        result.minimum = -10**(col_info.numeric_precision -
                               col_info.numeric_scale)
        result.multipleOf = 10**(0 - col_info.numeric_scale)
        return result
    elif data_type == 'bit':
        result.type = nullable_column(col_info.column_name, 'boolean',
                                      table_pks)
        return result
    elif data_type in STRING_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        result.maxLength = col_info.character_maximum_length
        return result
    elif data_type in INFERRED_STRING_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        return result
    return Schema(None)