def test_decimal_unsigned(self):
     self.assertEqual(
         self.schema.properties['c_decimal_2_unsigned'],
         Schema(['null', 'number'], inclusion='available', multipleOf=0.01))
     self.assertEqual(self.get_metadata_for_column('c_decimal_2_unsigned'),
                      {
                          'selected-by-default': True,
                          'sql-datatype': 'decimal(5,2) unsigned'
                      })
def test_bigint(self):
     self.assertEqual(self.schema.properties['c_bigint'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-9223372036854775808,
                             maximum=9223372036854775807))
     self.assertEqual(self.get_metadata_for_column('c_bigint'),
                      {'selected-by-default': True,
                       'sql-datatype': 'bigint(20)'})
def test_bigint_unsigned(self):
     self.assertEqual(self.schema.properties['c_bigint_unsigned'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=0,
                             maximum=18446744073709551615))
     self.assertEqual(self.get_metadata_for_column('c_bigint_unsigned'),
                      {'selected-by-default': True,
                       'sql-datatype': 'bigint(20) unsigned'})
def test_int(self):
     self.assertEqual(self.schema.properties['c_int'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-2147483648,
                             maximum=2147483647))
     self.assertEqual(self.get_metadata_for_column('c_int'),
                      {'selected-by-default': True,
                       'sql-datatype': 'int(11)'})
def test_mediumint(self):
     self.assertEqual(self.schema.properties['c_mediumint'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-8388608,
                             maximum=8388607))
     self.assertEqual(self.get_metadata_for_column('c_mediumint'),
                      {'selected-by-default': True,
                       'sql-datatype': 'mediumint(9)'})
def test_smallint(self):
     self.assertEqual(self.schema.properties['c_smallint'],
                      Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-32768,
                             maximum=32767))
     self.assertEqual(self.get_metadata_for_column('c_smallint'),
                      {'selected-by-default': True,
                       'sql-datatype': 'smallint(6)'})
def load_schemas():
    """ Load schemas from schemas folder """
    schemas = {}
    for filename in os.listdir(get_abs_path('schemas')):
        path = get_abs_path('schemas') + '/' + filename
        file_raw = filename.replace('.json', '')
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))
    return schemas
Example #8
def load_schemas() -> Dict[str, Any]:
    """ Load schemas from schemas folder """
    schemas = {}
    for filename in os.listdir(_get_abs_path("schemas")):
        path = _get_abs_path("schemas") + "/" + filename
        file_raw = filename.replace(".json", "")
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))
    return schemas
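
Both load_schemas variants above depend on a get_abs_path/_get_abs_path helper that is not shown. In the standard Singer tap template it is usually implemented along these lines (a sketch, not taken from either source):

import os

def _get_abs_path(path: str) -> str:
    # Resolve a path relative to this module's directory so the schemas
    # folder is found regardless of the current working directory.
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), path)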
def discover_catalog(connection):
    cursor = connection.cursor()
    cursor.execute("""
        SELECT table_schema,
               table_name,
               column_name,
               data_type,
               character_maximum_length,
               numeric_precision,
               numeric_scale
            FROM information_schema.columns
            WHERE table_schema != 'INFORMATION_SCHEMA'
            ORDER BY table_schema, table_name
        """)

    columns = []
    rec = cursor.fetchone()
    while rec is not None:
        columns.append(Column(*rec))
        rec = cursor.fetchone()

    entries = []
    for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k
        schema = Schema(type='object',
                        properties={c.column_name: schema_for_column(c) for c in cols})
        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)

        if "events" in table_name.lower():
            key_properties = ['UUID']
            replication_key = "EVENT_TIME"
        elif "merge" in table_name.lower():
            key_properties = []
            replication_key = "MERGE_EVENT_TIME"
        else:
            replication_key = ""
            key_properties = []

        md_map = metadata.write(md_map,
                                (),
                                'table-key-properties',
                                key_properties)

        entry = CatalogEntry(
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=table_schema + "-" + table_name,
            replication_key=replication_key, # This is a non-discoverable key.
            replication_method="INCREMENTAL", # This is a non-discoverable key.
            schema=schema)

        entries.append(entry)

    return Catalog(entries)
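
discover_catalog delegates per-column typing to schema_for_column and create_column_metadata, neither of which is shown. A minimal, hypothetical schema_for_column consistent with the column tests elsewhere on this page might look like:

def schema_for_column(column):
    # Hypothetical mapping from information_schema data types to JSON
    # Schema types; a real implementation would also use precision,
    # scale, and character_maximum_length.
    data_type = (column.data_type or '').lower()
    if data_type in ('tinyint', 'smallint', 'int', 'integer', 'bigint'):
        return Schema(['null', 'integer'], inclusion='available')
    if data_type in ('decimal', 'numeric', 'float', 'double'):
        return Schema(['null', 'number'], inclusion='available')
    if data_type in ('date', 'datetime', 'time', 'timestamp'):
        return Schema(['null', 'string'], format='date-time', inclusion='available')
    return Schema(['null', 'string'], inclusion='available')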
Example #10
def test_should_output_records(self, mock_stdout, requests_mock):
     requests_mock.get(
         "https://api.nikabot.com/api/v1/users?limit=1000&page=0",
         json=json.loads(USERS_RESPONSE))
     requests_mock.get(
         "https://api.nikabot.com/api/v1/users?limit=1000&page=1",
         json=json.loads(EMPTY_RESPONSE))
     config = {"access_token": "my-access-token", "page_size": 1000}
     state = {}
     catalog = Catalog(streams=[
         CatalogEntry(
             tap_stream_id="users",
             stream="users",
             schema=Schema.from_dict({}),
             key_properties=["id"],
             metadata=[{
                 "breadcrumb": [],
                 "metadata": {
                     "selected": True
                 }
             }],
         )
     ])
     sync(config, state, catalog)
     assert mock_stdout.mock_calls == [
         call(
             '{"type": "SCHEMA", "stream": "users", "schema": {}, "key_properties": ["id"]}\n'
         ),
         call(
             '{"type": "RECORD", "stream": "users", "record": {"id": "5de459977292020014fb601c", "name": "Billy", "deleted": true, "presence": "away", "user_id": "UR5B0QABX", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": false, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": true, "created_at": "2019-12-02T00:23:51.087", "groups": [], "updated_at": "2020-06-14T22:47:29.617"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
         ),
         call(
             '{"type": "RECORD", "stream": "users", "record": {"id": "68QMxnnt8YcpPdfmM", "name": "paul.heasley", "deleted": false, "presence": "active", "user_id": "U04AX35QP", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": true, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": false, "create_date": "2019-09-02T05:13:47.88", "created_at": "2019-09-02T05:13:47.882", "role": "0.1", "groups": ["TA Stream", "TA Squad 1", "TA Squad 2", "TA Squad 3", "TA Squad 4", "Learning Applications", "Notification Capability"], "updated_at": "2020-06-15T06:07:58.272"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
         ),
     ]
     assert LOGGER.info.mock_calls == [
         call("Syncing stream: %s", "users"),
         call(
             "Making %s request to %s with params %s",
             "GET",
             "https://api.nikabot.com/api/v1/users",
             {
                 "limit": "1000",
                 "page": "0"
             },
         ),
         call(
             "Making %s request to %s with params %s",
             "GET",
             "https://api.nikabot.com/api/v1/users",
             {
                 "limit": "1000",
                 "page": "1"
             },
         ),
     ]
Example #11
def test_bit(self):
     self.assertEqual(self.schema.properties["c_bit"],
                      Schema(["null", "boolean"], inclusion="available"))
     self.assertEqual(
         self.get_metadata_for_column("c_bit"),
         {
             "selected-by-default": True,
             "sql-datatype": "bit(4)"
         },
     )
Example #12
def test_double(self):
     self.assertEqual(self.schema.properties["c_double"],
                      Schema(["null", "number"], inclusion="available"))
     self.assertEqual(
         self.get_metadata_for_column("c_double"),
         {
             "selected-by-default": True,
             "sql-datatype": "double"
         },
     )
def test_time(self):
     self.assertEqual(
         self.schema.properties['c_time'],
         Schema(['null', 'string'],
                format='date-time',
                inclusion='available'))
     self.assertEqual(self.get_metadata_for_column('c_time'), {
         'selected-by-default': True,
         'sql-datatype': 'time'
     })
Example #14
def test_decimal_with_defined_scale_and_precision(self):
     self.assertEqual(
         self.schema.properties["c_decimal_2"],
         Schema(["null", "number"], inclusion="available", multipleOf=0.01),
     )
     self.assertEqual(
         self.get_metadata_for_column("c_decimal_2"),
         {
             "selected-by-default": True,
             "sql-datatype": "decimal(11,2)"
         },
     )
Example #15
def test_decimal_unsigned(self):
     self.assertEqual(
         self.schema.properties["c_decimal_2_unsigned"],
         Schema(["null", "number"], inclusion="available", multipleOf=0.01),
     )
     self.assertEqual(
         self.get_metadata_for_column("c_decimal_2_unsigned"),
         {
             "selected-by-default": True,
             "sql-datatype": "decimal(5,2) unsigned"
         },
     )
def test_decimal_with_defined_scale_and_precision(self):
     self.assertEqual(self.schema.properties['c_decimal_2'],
                      Schema(['null', 'number'],
                             inclusion='available',
                             maximum=1000000000,
                             exclusiveMaximum=True,
                             minimum=-1000000000,
                             exclusiveMinimum=True,
                             multipleOf=0.01))
     self.assertEqual(self.get_metadata_for_column('c_decimal_2'),
                      {'selected-by-default': True,
                       'sql-datatype': 'decimal(11,2)'})
Example #17
def test_tinyint_1_unsigned(self):
     self.assertEqual(
         self.schema.properties["c_tinyint_1_unsigned"],
         Schema(["null", "boolean"], inclusion="available"),
     )
     self.assertEqual(
         self.get_metadata_for_column("c_tinyint_1_unsigned"),
         {
             "selected-by-default": True,
             "sql-datatype": "tinyint(1) unsigned"
         },
     )
Example #18
def discover(client, custom_reports):
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        stream_instance = STREAMS[stream_id]
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=stream_instance.key_properties,
            valid_replication_keys=stream_instance.replication_key,
            replication_method=stream_instance.replication_method)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=stream_instance.key_properties,
                metadata=stream_metadata,
                replication_key=stream_instance.replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=stream_instance.replication_method,
            ))
    if custom_reports:
        for report in custom_reports:
            schema = build_schema(client, report)
            schema = Schema.from_dict(schema)
            key_properties = report.get('key_properties')
            replication_key = report.get('valid_replication_keys')
            stream_metadata = metadata.get_standard_metadata(
                schema=schema.to_dict(),
                key_properties=key_properties,
                valid_replication_keys=replication_key,
                replication_method=None)
            streams.append(
                CatalogEntry(
                    tap_stream_id=report['stream_id'],
                    stream=report['stream_id'],
                    schema=schema,
                    key_properties=report.get('key_properties'),
                    metadata=stream_metadata,
                    replication_key=report.get('valid_replication_keys'),
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=report,
                    replication_method=None,
                ))
    return Catalog(streams)
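
For reference, a custom_reports entry consistent with the keys this function reads (the shape is inferred from the code above, not from any documentation):

custom_reports = [{
    'stream_id': 'my_custom_report',            # used as tap_stream_id and stream
    'key_properties': ['id'],                   # optional
    'valid_replication_keys': ['updated_at'],   # optional
}]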
Example #19
def test_time(self):
     self.assertEqual(
         self.schema.properties["c_time"],
         Schema(["null", "string"],
                format="date-time",
                inclusion="available"),
     )
     self.assertEqual(
         self.get_metadata_for_column("c_time"),
         {
             "selected-by-default": True,
             "sql-datatype": "time"
         },
     )
Example #20
def produce_column_metadata(connection, table_info, table_schema, table_name,
                            pk_constraints, column_schemas, cols):
    mdata = {}

    table_pks = pk_constraints.get(table_schema, {}).get(table_name, [])

    # NB: sadly, some system tables like XDB$STATS have 'P' (primary key)
    # constraints for columns that do not exist, so we must protect against this.
    table_pks = list(
        filter(
            lambda pk: column_schemas.get(pk, Schema(None)).type is not None,
            table_pks))

    database_name = get_database_name(connection)

    metadata.write(mdata, (), 'table-key-properties', table_pks)
    metadata.write(mdata, (), 'schema-name', table_schema)
    metadata.write(mdata, (), 'database-name', database_name)

    if table_schema in table_info and table_name in table_info[table_schema]:
        metadata.write(mdata, (), 'is-view',
                       table_info[table_schema][table_name]['is_view'])

        row_count = table_info[table_schema][table_name].get('row_count')

        if row_count is not None:
            metadata.write(mdata, (), 'row-count', row_count)

    for c in cols:
        c_name = c.column_name
        # Write the data_type or "None" when the column has no datatype
        metadata.write(mdata, ('properties', c_name), 'sql-datatype',
                       (c.data_type or "None"))
        if column_schemas[c_name].type is None:
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'unsupported')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', False)
        elif c_name in pk_constraints.get(table_schema,
                                          {}).get(table_name, []):
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'automatic')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', True)
        else:
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'available')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', True)

    return mdata
Example #21
def mock_catalog():
    return Catalog(
        streams=[
            CatalogEntry(
                tap_stream_id="records",
                stream="records",
                schema=Schema.from_dict(json.loads(SCHEMA)),
                key_properties=["id"],
                metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
                replication_key="date",
                replication_method="INCREMENTAL",
            )
        ]
    )
Example #22
def generate_schema(table_spec, samples):
    metadata_schema = {
        '_smart_source_bucket': {'type': 'string'},
        '_smart_source_file': {'type': 'string'},
        '_smart_source_lineno': {'type': 'integer'},
    }
    prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
    data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer)
    inferred_schema = {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }

    merged_schema = override_schema_with_config(inferred_schema, table_spec)
    return Schema.from_dict(merged_schema)
Example #23
def build_schema(query_resource):
    return Schema.from_dict({
        'type': ['null', 'object'],
        'additionalProperties': False,
        'properties': {
            **{
                key: {'type': ['null', 'string']}
                for key in query_resource['params']['groupBys']
            },
            **{
                key: {'type': ['null', 'number']}
                for key in query_resource['params']['metrics']
            },
        }
    })
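
A quick usage sketch, assuming a query_resource shaped like the payload the function indexes into:

query_resource = {
    'params': {
        'groupBys': ['date', 'campaign_id'],
        'metrics': ['impressions', 'clicks'],
    }
}
schema = build_schema(query_resource)
# Group-by keys become nullable strings and metrics nullable numbers:
assert schema.to_dict()['properties']['date'] == {'type': ['null', 'string']}
assert schema.to_dict()['properties']['clicks'] == {'type': ['null', 'number']}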
Example #24
def test_int(self):
     self.assertEqual(
         self.schema.properties["c_int"],
         Schema(["null", "integer"],
                inclusion="available",
                minimum=-2147483648,
                maximum=2147483647),
     )
     self.assertEqual(
         self.get_metadata_for_column("c_int"),
         {
             "selected-by-default": True,
             "sql-datatype": "int(11)"
         },
     )
Example #25
def test_bigint_unsigned(self):
        self.assertEqual(
            self.schema.properties["c_bigint_unsigned"],
            Schema(["null", "integer"],
                   inclusion="available",
                   minimum=0,
                   maximum=18446744073709551615),
        )

        self.assertEqual(
            self.get_metadata_for_column("c_bigint_unsigned"),
            {
                "selected-by-default": True,
                "sql-datatype": "bigint(20) unsigned"
            },
        )
Example #26
def discover():
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         tap_stream_id=stream_name,
                         key_properties=STREAMS[stream_name]['key_properties'],
                         schema=schema,
                         metadata=mdata))

    return catalog
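
The STREAMS lookup used above is assumed to map each stream name to its settings, e.g.:

STREAMS = {
    'users': {'key_properties': ['id']},
    'orders': {'key_properties': ['order_id']},
}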
Example #27
def test_should_output_nothing_given_no_streams_selected(
         self, mock_stdout):
     config = {"access_token": "my-access-token", "page_size": 1000}
     state = {}
     catalog = Catalog(streams=[
         CatalogEntry(
             tap_stream_id="users",
             stream="users",
             schema=Schema.from_dict({}),
             key_properties=["id"],
             metadata=[],
         )
     ])
     sync(config, state, catalog)
     mock_stdout.assert_not_called()
     assert LOGGER.info.mock_calls == [call("Skipping stream: %s", "users")]
Example #28
def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get("replication-key")

        discovered_table = discovered_catalog.get_stream(
            catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                catalog_entry.table,
            )
            continue

        selected = {
            k
            for k, v in discovered_table.schema.properties.items()
            if common.property_is_selected(catalog_entry, k)
            or k == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)
        result.streams.append(
            CatalogEntry(
                tap_stream_id=catalog_entry.tap_stream_id,
                metadata=catalog_entry.metadata,
                stream=catalog_entry.tap_stream_id,
                table=catalog_entry.table,
                schema=Schema(
                    type="object",
                    properties={
                        col: discovered_table.schema.properties[col]
                        for col in columns
                    },
                ),
            ))

    return result
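
resolve_catalog relies on a desired_columns helper that is not shown. In tap-mysql it roughly intersects the selection with what the discovered schema can emit, always keeping automatic columns; a sketch under that assumption:

def desired_columns(selected, table_schema):
    # Hypothetical simplification of the real helper, which also warns
    # about selected-but-unsupported columns.
    available, automatic = set(), set()
    for column, column_schema in table_schema.properties.items():
        if column_schema.inclusion == 'available':
            available.add(column)
        elif column_schema.inclusion == 'automatic':
            automatic.add(column)
    return (set(selected) & available) | automatic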
Example #29
def discover():
    entries = []

    for stream in streams:
        schema = Schema.from_dict(stream.get_schema())
        stream_metadata = []
        key_properties = stream.key_properties

        # Table-level metadata is written once per stream.
        stream_metadata.append({
            'breadcrumb': [],
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'schema-name': stream.tap_stream_id,
                'selected': True,
            }
        })

        # Property-level metadata: key properties and the bookmark
        # column are automatically included.
        for prop in schema.properties:
            inclusion = 'available'
            if prop in key_properties or prop == 'start_date':
                inclusion = 'automatic'

            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })

        entries.append(
            CatalogEntry(
                tap_stream_id=stream.tap_stream_id,
                stream=stream.tap_stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )
    return Catalog(entries)
def load_schemas(config):
    """ Load schemas from schemas folder """
    schemas = {}
    schema_dir_path = get_abs_path(config['schema_dir'])
    if os.path.isdir(schema_dir_path):
        for filename in os.listdir(schema_dir_path):
            path = get_abs_path(config['schema_dir']) + '/' + filename
            file_raw = filename.replace('.json', '')
            if os.path.isfile(path):
                with open(path) as file:
                    try:
                        schemas[file_raw] = Schema.from_dict(json.load(file))
                    except json.decoder.JSONDecodeError as err:
                        LOGGER.warning("Schema file : " + file_raw + " is invalid or not JSON : " + err.msg)
    else:
        LOGGER.warning("%s is not a valid directory", schema_dir_path)
    return schemas
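
A minimal call sketch, assuming a config whose schema_dir key points at a folder of *.json schema files:

config = {'schema_dir': 'schemas'}
schemas = load_schemas(config)
for stream_name, schema in schemas.items():
    print(stream_name, schema.to_dict().get('type'))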