def schema_for_column(c):
    column_schema = {}
    # NB: from the postgres docs, "The current implementation does not enforce
    # the declared number of dimensions either." This means we can say nothing
    # about an array column: its items may be more arrays or primitive types
    # like integers, and this can vary on a row-by-row basis.
    if c.array_dimensions > 0:
        column_schema["type"] = ["null", "array"]
        column_schema["items"] = {}
        return Schema.from_dict(column_schema)
    return Schema.from_dict(schema_for_column_datatype(c))
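A minimal usage sketch for the function above, assuming it is in scope. FakeColumn is a hypothetical stand-in for the tap's column-metadata object, modelling only the array_dimensions attribute the function reads:

from collections import namedtuple

from singer.schema import Schema

# Hypothetical stand-in for the tap's column metadata object.
FakeColumn = namedtuple('FakeColumn', ['array_dimensions'])

# Postgres does not enforce array dimensionality, so the resulting schema is
# deliberately open-ended: any item type, any nesting depth.
schema = schema_for_column(FakeColumn(array_dimensions=1))
print(schema.to_dict())  # {'type': ['null', 'array'], 'items': {}}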
def generate_schemas(config: Dict) -> Dict:
    """Generate schemas based on the files or urls supplied in the config

    Args:
        config (Dict): Configuration

    Returns:
        Dict: Return a dictionary with stream_id: schema, key value pairs
    """
    # Storage place for the individual schemas
    schemas = {}

    # Create a geo datasource instance
    geo_source = GeoSource(path=config['path'], config=config)

    # Each layer is a stream
    for layer_name, layer in geo_source.layers.items():
        LOGGER.info(f'Found layer {layer_name}')
        schemas[layer_name] = Schema.from_dict({
            'type': ['null', 'object'],
            'additionalProperties': False,
            'selected': True,
            'properties': layer.schema
        })

    return schemas
def test_should_output_records(self, mock_stdout, requests_mock): requests_mock.get("https://api.nikabot.com/api/v1/groups?limit=1000&page=0", json=json.loads(GROUPS_RESPONSE)) requests_mock.get("https://api.nikabot.com/api/v1/groups?limit=1000&page=1", json=json.loads(EMPTY_RESPONSE)) config = {"access_token": "my-access-token", "page_size": 1000} state = {} catalog = Catalog( streams=[ CatalogEntry( tap_stream_id="groups", stream="groups", schema=Schema.from_dict({}), key_properties=["id"], metadata=[{"breadcrumb": [], "metadata": {"selected": True}}], ) ] ) sync(config, state, catalog) assert mock_stdout.mock_calls == [ call('{"type": "SCHEMA", "stream": "groups", "schema": {}, "key_properties": ["id"]}\n'), call( '{"type": "RECORD", "stream": "groups", "record": {"id": "f1b4b37cc2658672770b789f", "team_id": "T034F9NPW", "name": "TA Squad 5"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), call( '{"type": "RECORD", "stream": "groups", "record": {"id": "3176700ac4f2203b825fae6c", "team_id": "T034F9NPW", "name": "Platform Toolkit"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), ]
def test_should_output_records(self, mock_stdout, requests_mock): requests_mock.get( "https://api.nikabot.com/api/v1/projects?limit=1000&page=0", json=json.loads(PROJECTS_RESPONSE)) requests_mock.get( "https://api.nikabot.com/api/v1/projects?limit=1000&page=1", json=json.loads(EMPTY_RESPONSE)) config = {"access_token": "my-access-token", "page_size": 1000} state = {} catalog = Catalog(streams=[ CatalogEntry( tap_stream_id="projects", stream="projects", schema=Schema.from_dict({}), key_properties=["id"], metadata=[{ "breadcrumb": [], "metadata": { "selected": True } }], ) ]) sync(config, state, catalog) assert mock_stdout.mock_calls == [ call( '{"type": "SCHEMA", "stream": "projects", "schema": {}, "key_properties": ["id"]}\n' ), call( '{"type": "RECORD", "stream": "projects", "record": {"id": "5d6ca95e62a07c00045126e7", "project_name": "CAP - Analytics", "team_id": "T034F9NPW", "author": "U6K26HMGV", "pto": {"status": false}, "custom_ref": "", "create_date": "2019-09-02T05:32:14.23", "client": "", "type": "Capability Custodian", "created_at": "2019-09-02T05:32:14.23", "assigned_groups": ["Analytics"]}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), call( '{"type": "RECORD", "stream": "projects", "record": {"id": "5d6ca97c62a07c00045126e8", "project_name": "CAP - Authentication", "team_id": "T034F9NPW", "author": "U6K26HMGV", "pto": {"status": false}, "custom_ref": "", "create_date": "2019-09-02T05:32:44.172", "client": "", "type": "Capability Custodian", "created_at": "2019-09-02T05:32:44.172", "assigned_groups": ["Authentication"]}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), ]
def test_should_output_records(self, mock_stdout, requests_mock): requests_mock.get( "https://api.nikabot.com/api/v1/roles?limit=1000&page=0", json=json.loads(ROLES_RESPONSE)) requests_mock.get( "https://api.nikabot.com/api/v1/roles?limit=1000&page=1", json=json.loads(EMPTY_RESPONSE)) config = {"access_token": "my-access-token", "page_size": 1000} state = {} catalog = Catalog(streams=[ CatalogEntry( tap_stream_id="roles", stream="roles", schema=Schema.from_dict({}), key_properties=["id"], metadata=[{ "breadcrumb": [], "metadata": { "selected": True } }], ) ]) sync(config, state, catalog) assert mock_stdout.mock_calls == [ call( '{"type": "SCHEMA", "stream": "roles", "schema": {}, "key_properties": ["id"]}\n' ), call( '{"type": "RECORD", "stream": "roles", "record": {"id": "d893ebf32d49c35c1d754774", "team_id": "T034F9NPW", "name": "0.5"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), call( '{"type": "RECORD", "stream": "roles", "record": {"id": "cfabd9aa6f3e6381a716da58", "team_id": "T034F9NPW", "name": "0.1"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), ]
def test_insights_start_dates(self):
    insights = AdsInsights(
        name='insights',
        account=None,
        stream_alias="insights",
        options={},
        annotated_schema=Schema.from_dict({
            'selected': True,
            'properties': {
                'something': {
                    'type': 'object'
                }
            }
        }),
        state={'bookmarks': {
            'insights': {
                'date_start': '2017-01-31'
            }
        }})
    params = list(itertools.islice(insights.job_params(), 5))
    self.assertEqual(params[0]['time_ranges'], [{
        'since': '2017-01-03',
        'until': '2017-01-03'
    }])
    self.assertEqual(params[4]['time_ranges'], [{
        'since': '2017-01-07',
        'until': '2017-01-07'
    }])
def test_should_output_records(self, mock_stdout, requests_mock): requests_mock.get("https://api.nikabot.com/api/v1/teams", json=json.loads(TEAMS_RESPONSE)) config = {"access_token": "my-access-token", "page_size": 1000} state = {} catalog = Catalog(streams=[ CatalogEntry( tap_stream_id="teams", stream="teams", schema=Schema.from_dict({}), key_properties=["id"], metadata=[{ "breadcrumb": [], "metadata": { "selected": True } }], ) ]) sync(config, state, catalog) assert mock_stdout.mock_calls == [ call( '{"type": "SCHEMA", "stream": "teams", "schema": {}, "key_properties": ["id"]}\n' ), call( '{"type": "RECORD", "stream": "teams", "record": {"id": "5d6ca50762a07c00045125fb", "domain": "pageup", "bot_token": "e31d3b7ae51ff1feec8be578f23eb017e8143f66a7a085342c664544b81618ec41b87810d61a9c1f6133fe0c7d88aa3976232bb2a2665c4f89c38058b51cd20c", "activated_by": "U6K26HMGV", "status": "ACTIVE", "platform_id": "T034F9NPW", "created_at": "2019-09-02T05:13:43.151", "subscription": {"active_until": "2020-07-08T23:59:59", "status": "active", "number_of_users": 69, "subscriber_id": "U93KT77T6"}, "icon": {"image_34": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_34.png", "image_44": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_44.png", "image_68": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_68.png", "image_88": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_88.png", "image_102": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_102.png", "image_132": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_132.png", "image_230": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_230.png", "image_original": "https://avatars.slack-edge.com/2017-09-15/241678543093_b2ad80be9268cdbd89c3_original.png"}}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), ]
def test_object_from_dict_with_defaults(self):
    schema = Schema.from_dict(self.object_dict, inclusion='automatic')
    self.assertEquals('whatever', schema.inclusion,
                      msg='The schema value should override the default')
    self.assertEquals('automatic', schema.properties['a_string'].inclusion)
    self.assertEquals('automatic', schema.properties['an_array'].items.inclusion)
def _map_to_schema(self, swagger: JsonResult) -> Schema:
    schema_with_refs = {
        **swagger["definitions"]["RecordDTO"],
        **{"definitions": swagger["definitions"]}
    }
    schema = resolve_schema_references(schema_with_refs)
    return Schema.from_dict(schema)
def load_schemas():
    # TODO: This example loads schemas from a project folder,
    # replace with whatever discovery mechanism is available
    schemas = {}

    for filename in os.listdir(get_abs_path('schemas')):
        path = get_abs_path('schemas') + '/' + filename
        file_raw = filename.replace('.json', '')
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))

    return schemas
def load_schemas() -> Dict[str, Any]:
    """ Load schemas from schemas folder """
    schemas = {}

    for filename in os.listdir(_get_abs_path("schemas")):
        path = _get_abs_path("schemas") + "/" + filename
        file_raw = filename.replace(".json", "")
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))

    return schemas
def load_schemas():
    """ Load schemas from schemas folder """
    schemas = {}

    for filename in os.listdir(get_abs_path('schemas')):
        path = get_abs_path('schemas') + '/' + filename
        file_raw = filename.replace('.json', '')
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))

    return schemas
def discover(config):
    streams = []
    for table_spec in config['tables']:
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_input_files_for_table(table_spec, modified_since)
        sample_rate = table_spec.get('sample_rate', 10)
        max_sampling_read = table_spec.get('max_sampling_read', 1000)
        max_sampled_files = table_spec.get('max_sampled_files', 5)
        prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
        samples = file_utils.sample_files(table_spec, target_files,
                                          sample_rate=sample_rate,
                                          max_records=max_sampling_read,
                                          max_files=max_sampled_files)

        metadata_schema = {
            '_smart_source_bucket': {'type': 'string'},
            '_smart_source_file': {'type': 'string'},
            '_smart_source_lineno': {'type': 'integer'},
        }
        data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer)
        inferred_schema = {
            'type': 'object',
            'properties': merge_dicts(data_schema, metadata_schema)
        }

        merged_schema = override_schema_with_config(inferred_schema, table_spec)
        schema = Schema.from_dict(merged_schema)

        stream_metadata = []
        key_properties = table_spec.get('key_properties', [])
        streams.append(
            CatalogEntry(
                tap_stream_id=table_spec['name'],
                stream=table_spec['name'],
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
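A hedged sketch of the per-table configuration the discover() above consumes. Only keys this function actually reads are shown, with made-up values; any keys read elsewhere (for example the bucket or path pattern consumed by file_utils) are omitted:

example_config = {
    'tables': [
        {
            'name': 'orders',                      # becomes tap_stream_id / stream
            'start_date': '2020-01-01T00:00:00Z',  # parsed into modified_since
            'key_properties': ['id'],
            'sample_rate': 10,                     # defaults shown in the code above
            'max_sampling_read': 1000,
            'max_sampled_files': 5,
            'prefer_number_vs_integer': False,
        }
    ]
}
# catalog = discover(example_config)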
def test_should_output_records(self, mock_stdout, requests_mock): requests_mock.get( "https://api.nikabot.com/api/v1/users?limit=1000&page=0", json=json.loads(USERS_RESPONSE)) requests_mock.get( "https://api.nikabot.com/api/v1/users?limit=1000&page=1", json=json.loads(EMPTY_RESPONSE)) config = {"access_token": "my-access-token", "page_size": 1000} state = {} catalog = Catalog(streams=[ CatalogEntry( tap_stream_id="users", stream="users", schema=Schema.from_dict({}), key_properties=["id"], metadata=[{ "breadcrumb": [], "metadata": { "selected": True } }], ) ]) sync(config, state, catalog) assert mock_stdout.mock_calls == [ call( '{"type": "SCHEMA", "stream": "users", "schema": {}, "key_properties": ["id"]}\n' ), call( '{"type": "RECORD", "stream": "users", "record": {"id": "5de459977292020014fb601c", "name": "Billy", "deleted": true, "presence": "away", "user_id": "UR5B0QABX", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": false, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": true, "created_at": "2019-12-02T00:23:51.087", "groups": [], "updated_at": "2020-06-14T22:47:29.617"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), call( '{"type": "RECORD", "stream": "users", "record": {"id": "68QMxnnt8YcpPdfmM", "name": "paul.heasley", "deleted": false, "presence": "active", "user_id": "U04AX35QP", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": true, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": false, "create_date": "2019-09-02T05:13:47.88", "created_at": "2019-09-02T05:13:47.882", "role": "0.1", "groups": ["TA Stream", "TA Squad 1", "TA Squad 2", "TA Squad 3", "TA Squad 4", "Learning Applications", "Notification Capability"], "updated_at": "2020-06-15T06:07:58.272"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n' ), ] assert LOGGER.info.mock_calls == [ call("Syncing stream: %s", "users"), call( "Making %s request to %s with params %s", "GET", "https://api.nikabot.com/api/v1/users", { "limit": "1000", "page": "0" }, ), call( "Making %s request to %s with params %s", "GET", "https://api.nikabot.com/api/v1/users", { "limit": "1000", "page": "1" }, ), ]
def discover(client, custom_reports):
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        stream_instance = STREAMS[stream_id]
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=stream_instance.key_properties,
            valid_replication_keys=stream_instance.replication_key,
            replication_method=stream_instance.replication_method)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=stream_instance.key_properties,
                metadata=stream_metadata,
                replication_key=stream_instance.replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=stream_instance.replication_method,
            ))

    if custom_reports:
        for report in custom_reports:
            schema = build_schema(client, report)
            schema = Schema.from_dict(schema)
            key_properties = report.get('key_properties')
            replication_key = report.get('valid_replication_keys')
            stream_metadata = metadata.get_standard_metadata(
                schema=schema.to_dict(),
                key_properties=key_properties,
                valid_replication_keys=replication_key,
                replication_method=None)
            streams.append(
                CatalogEntry(
                    tap_stream_id=report['stream_id'],
                    stream=report['stream_id'],
                    schema=schema,
                    key_properties=report.get('key_properties'),
                    metadata=stream_metadata,
                    replication_key=report.get('valid_replication_keys'),
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=report,
                    replication_method=None,
                ))
    return Catalog(streams)
def mock_catalog():
    return Catalog(
        streams=[
            CatalogEntry(
                tap_stream_id="records",
                stream="records",
                schema=Schema.from_dict(json.loads(SCHEMA)),
                key_properties=["id"],
                metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
                replication_key="date",
                replication_method="INCREMENTAL",
            )
        ]
    )
def generate_schema(table_spec, samples):
    metadata_schema = {
        '_smart_source_bucket': {'type': 'string'},
        '_smart_source_file': {'type': 'string'},
        '_smart_source_lineno': {'type': 'integer'},
    }
    prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
    data_schema = conversion.generate_schema(samples, prefer_number_vs_integer=prefer_number_vs_integer)
    inferred_schema = {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }

    merged_schema = override_schema_with_config(inferred_schema, table_spec)
    return Schema.from_dict(merged_schema)
def build_schema(query_resource):
    return Schema.from_dict({
        'type': ['null', 'object'],
        'additionalProperties': False,
        'properties': {
            **{
                key: {'type': ['null', 'string']}
                for key in query_resource['params']['groupBys']
            },
            **{
                key: {'type': ['null', 'number']}
                for key in query_resource['params']['metrics']
            },
        }
    })
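A small sketch of how build_schema() above maps a query resource into a flat schema: groupBys become nullable strings and metrics become nullable numbers. The query_resource dict is invented for illustration:

query_resource = {
    'params': {
        'groupBys': ['campaign_id', 'date'],
        'metrics': ['impressions', 'clicks'],
    }
}

schema = build_schema(query_resource)
# schema.to_dict()['properties'] ->
# {'campaign_id': {'type': ['null', 'string']},
#  'date':        {'type': ['null', 'string']},
#  'impressions': {'type': ['null', 'number']},
#  'clicks':      {'type': ['null', 'number']}}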
def discover():
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = field_metadata[stream_name]

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         tap_stream_id=stream_name,
                         key_properties=STREAMS[stream_name]['key_properties'],
                         schema=schema,
                         metadata=mdata))

    return catalog
def test_should_output_nothing_given_no_streams_selected(self, mock_stdout):
    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}
    catalog = Catalog(streams=[
        CatalogEntry(
            tap_stream_id="users",
            stream="users",
            schema=Schema.from_dict({}),
            key_properties=["id"],
            metadata=[],
        )
    ])
    sync(config, state, catalog)
    mock_stdout.assert_not_called()
    assert LOGGER.info.mock_calls == [call("Skipping stream: %s", "users")]
def discover():
    entries = []

    for stream in streams:
        schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties

        # One stream-level metadata entry, then one entry per property.
        stream_metadata = [{
            'breadcrumb': [],
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'schema-name': stream.tap_stream_id,
                'selected': True,
            }
        }]
        for prop, json_schema in schema.properties.items():
            inclusion = 'available'
            if prop in key_properties or prop == 'start_date':
                inclusion = 'automatic'
            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })

        entries.append(
            CatalogEntry(
                tap_stream_id=stream.tap_stream_id,
                stream=stream.tap_stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )

    return Catalog(entries)
def load_schemas(config):
    """ Load schemas from schemas folder """
    schemas = {}
    schema_dir_path = get_abs_path(config['schema_dir'])

    if os.path.isdir(schema_dir_path):
        for filename in os.listdir(schema_dir_path):
            path = get_abs_path(config['schema_dir']) + '/' + filename
            file_raw = filename.replace('.json', '')
            if os.path.isfile(path):
                with open(path) as file:
                    try:
                        schemas[file_raw] = Schema.from_dict(json.load(file))
                    except json.decoder.JSONDecodeError as err:
                        LOGGER.warning("Schema file : " + file_raw + " is invalid or not JSON : " + err.msg)
    else:
        LOGGER.warning(schema_dir_path + " : Is not a valid directory")

    return schemas
def load_schemas() -> dict:
    """Load schemas from schemas folder.

    Returns:
        dict -- Schemas
    """
    schemas: dict = {}

    # For every file in the schemas directory
    for filename in os.listdir(get_abs_path('schemas')):
        abs_path: str = get_abs_path('schemas')
        file_raw: str = filename.replace('.json', '')

        # Open and load the schema
        with open(f'{abs_path}/{filename}') as schema_file:
            schemas[file_raw] = Schema.from_dict(json.load(schema_file))

    return schemas
def discover(config):
    client = Client(config)
    streams = []
    for _, stream in STREAM_OBJECTS.items():
        raw_schema = load_schema(stream.tap_stream_id)
        schema = Schema.from_dict(raw_schema)
        streams.append(
            CatalogEntry(stream=stream.tap_stream_id,
                         tap_stream_id=stream.tap_stream_id,
                         key_properties=stream.pk_fields,
                         schema=schema,
                         metadata=metadata.get_standard_metadata(
                             schema=raw_schema,
                             schema_name=stream.tap_stream_id,
                             key_properties=stream.pk_fields,
                             valid_replication_keys=stream.replication_keys,
                             replication_method=stream.replication_method)))
    return Catalog(streams)
def from_dict(cls, data):
    # TODO: We may want to store streams as a dict where the key is a
    # tap_stream_id and the value is a CatalogEntry. This will allow
    # faster lookup based on tap_stream_id. This would be a breaking
    # change, since callers typically access the streams property
    # directly.
    streams = []
    for stream in data['streams']:
        entry = CatalogEntry()
        entry.tap_stream_id = stream.get('tap_stream_id')
        entry.stream = stream.get('stream')
        entry.replication_key = stream.get('replication_key')
        entry.key_properties = stream.get('key_properties')
        entry.database = stream.get('database_name')
        entry.table = stream.get('table_name')
        entry.schema = Schema.from_dict(stream.get('schema'))
        entry.is_view = stream.get('is_view')
        streams.append(entry)
    return Catalog(streams)
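A hedged usage sketch of the classmethod above. The catalog dict is invented for illustration and mirrors only the keys the method reads (note that database_name and table_name map onto entry.database and entry.table):

catalog_dict = {
    'streams': [
        {
            'tap_stream_id': 'users',
            'stream': 'users',
            'key_properties': ['id'],
            'replication_key': 'updated_at',
            'database_name': None,
            'table_name': None,
            'is_view': False,
            'schema': {
                'type': 'object',
                'properties': {'id': {'type': 'string'}}
            },
        }
    ]
}

catalog = Catalog.from_dict(catalog_dict)
print(catalog.streams[0].schema.to_dict())
# {'type': 'object', 'properties': {'id': {'type': 'string'}}}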
def get_schemas():
    schemas = {}
    schemas_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema = None
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)

        meta = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        schemas[stream_name] = Schema.from_dict(schema)
        schemas_metadata[stream_name] = meta

    return schemas, schemas_metadata
def get_schema(self, sheet_name, worksheet_name=None):
    data = self.get_data(sheet_name, worksheet_name)

    # Add records to the schema builder so it can infer the schema.
    builder = SchemaBuilder()
    if len(data) == 0:
        # Build a sample record to be used for schema inference if the
        # spreadsheet is empty.
        sample_record = {
            key: "some string"
            for key in self.headers[worksheet_name]
        }
        builder.add_object(sample_record)
    else:
        for record in data:
            builder.add_object(record)

    # Create a singer Schema from the JSON Schema.
    singer_schema = Schema.from_dict(builder.to_schema())
    self.schema[worksheet_name] = singer_schema.to_dict()
    return self.schema[worksheet_name]
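The inference step in isolation, as a minimal sketch using the genson SchemaBuilder the method above relies on; the sample records are made up:

from genson import SchemaBuilder
from singer.schema import Schema

builder = SchemaBuilder()
for record in [{'name': 'Ada', 'age': 36}, {'name': 'Grace', 'age': 45}]:
    builder.add_object(record)

# genson produces a plain JSON Schema dict, which Schema.from_dict turns
# into a singer Schema object (unknown keys such as $schema are ignored).
singer_schema = Schema.from_dict(builder.to_schema())
print(singer_schema.to_dict())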
def test_should_output_no_records_given_no_records_available(self, mock_stdout, requests_mock):
    requests_mock.get(
        "https://api.nikabot.com/api/v1/users?limit=1000&page=0",
        json=json.loads(EMPTY_RESPONSE))
    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}
    catalog = Catalog(streams=[
        CatalogEntry(
            tap_stream_id="users",
            stream="users",
            schema=Schema.from_dict({}),
            key_properties=["id"],
            metadata=[{
                "breadcrumb": [],
                "metadata": {
                    "selected": True
                }
            }],
        )
    ])
    sync(config, state, catalog)
    assert mock_stdout.mock_calls == [
        call(
            '{"type": "SCHEMA", "stream": "users", "schema": {}, "key_properties": ["id"]}\n'
        )
    ]
    assert LOGGER.info.mock_calls == [
        call("Syncing stream: %s", "users"),
        call(
            "Making %s request to %s with params %s",
            "GET",
            "https://api.nikabot.com/api/v1/users",
            {
                "limit": "1000",
                "page": "0"
            },
        ),
    ]
def test_insights_job_params_stops(self):
    start_date = tap_facebook.TODAY.subtract(days=2)
    insights = AdsInsights(
        name='insights',
        account=None,
        stream_alias="insights",
        options={},
        annotated_schema=Schema.from_dict({
            'selected': True,
            'properties': {
                'something': {
                    'type': 'object'
                }
            }
        }),
        state={
            'bookmarks': {
                'insights': {
                    'date_start': start_date.to_date_string()
                }
            }
        })
    self.assertEqual(31, len(list(insights.job_params())))
def load_schemas():
    ### Load schemas from schemas folder ###
    schemas = {}
    schemas_path = get_abs_path('schemas')

    # If the /schemas/ folder is missing, create it.
    if not os.path.isdir(schemas_path):
        os.mkdir(schemas_path)

    # If no schemas are found in the /schemas/ folder, generate them using create_schemas.py.
    if len([
            name for name in os.listdir(schemas_path)
            if os.path.isfile(os.path.join(schemas_path, name))
    ]) == 0:
        create_schemas()

    # Now grab the .json files in /schemas/ and output the catalog.json file.
    for filename in os.listdir(schemas_path):
        path = schemas_path + '/' + filename
        file_raw = filename.replace('.json', '')
        with open(path) as file:
            schemas[file_raw] = Schema.from_dict(json.load(file))

    return schemas  # returns a 'dict' that contains <class 'singer.schema.Schema'> objects