def get_schemas():
    """Load the JSON schema for every replicating stream and build its
    Singer discovery metadata.

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream name.
    """
    schemas = {}
    field_metadata = {}

    flat_streams = flatten_streams()
    for stream_name, stream_metadata in flat_streams.items():
        # Streams flagged replication_ind=False are excluded from discovery.
        replication_ind = stream_metadata.get('replication_ind', True)
        if not replication_ind:
            continue

        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (A stray no-op `metadata.new()` call, whose result was discarded,
        # has been removed: get_standard_metadata builds the metadata itself.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def get_schemas(client, spreadsheet_id):
    """Build schemas and Singer metadata for the static streams, plus one
    dynamic stream per non-empty worksheet in the spreadsheet.

    Args:
        client: API client used to fetch spreadsheet metadata.
        spreadsheet_id: ID of the spreadsheet being discovered.

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream/sheet name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (Dead `metadata.new()` assignments, immediately overwritten by
        # get_standard_metadata, have been removed.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(
                stream_metadata.get('path').replace('{spreadsheet_id}',
                                                    spreadsheet_id),
                querystring)

            # GET spreadsheet_metadata, which incl. sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring,
                                                api=api, endpoint=stream_name)
            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop thru each worksheet in spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(
                        sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE')
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
def test_create_column_metadata(self):
    """create_column_metadata must emit exactly the metadata list built here
    breadcrumb-by-breadcrumb (same entries, same order)."""
    # Simulated column descriptions as returned by the catalog query.
    cols = [{'pos': 1, 'name': 'col1', 'type': 'int2', 'nullable': 'NO'},
            {'pos': 2, 'name': 'col2', 'type': 'float8', 'nullable': 'YES'},
            {'pos': 3, 'name': 'col3', 'type': 'timestamptz', 'nullable': 'NO'}]
    db_name = 'test-db'
    table_name = 'test_table'
    key_properties = ['col1']
    is_view = False
    # Expected metadata: table-level (empty breadcrumb) entries first.
    expected_mdata = metadata.new()
    metadata.write(expected_mdata, (), 'selected-by-default', False)
    # NOTE(review): presumably 'col3' (timestamptz) is the only valid
    # replication key for this table — confirm against the tap's discovery rules.
    metadata.write(expected_mdata, (), 'valid-replication-keys', ['col3'])
    metadata.write(expected_mdata, (), 'table-key-properties', key_properties)
    metadata.write(expected_mdata, (), 'is-view', is_view)
    metadata.write(expected_mdata, (), 'schema-name', table_name)
    metadata.write(expected_mdata, (), 'database-name', db_name)
    # Then per-column entries under the ('properties', <name>) breadcrumb.
    for col in cols:
        schema = tap_redshift.schema_for_column(col)
        metadata.write(expected_mdata, (
            'properties', col['name']), 'selected-by-default', True)
        metadata.write(expected_mdata, (
            'properties', col['name']), 'sql-datatype', col['type'])
        metadata.write(expected_mdata, (
            'properties', col['name']), 'inclusion', schema.inclusion)
    actual_mdata = tap_redshift.create_column_metadata(
        db_name, cols, is_view, table_name, key_properties)
    assert_that(actual_mdata, equal_to(metadata.to_list(expected_mdata)))
def discover(ctx):
    """Build and return the discovery catalog for every known stream."""
    check_credentials_are_authorized(ctx)
    catalog = Catalog([])
    for stream in streams.STREAMS:
        stream_id = stream.tap_stream_id
        pk_fields = streams.PK_FIELDS[stream_id]
        schema = Schema.from_dict(streams.load_schema(stream_id),
                                  inclusion="available")
        mdata = metadata.new()
        for prop in schema.properties:
            # Primary-key fields must always be replicated.
            inclusion = 'automatic' if prop in pk_fields else 'available'
            mdata = metadata.write(mdata, ('properties', prop),
                                   'inclusion', inclusion)
        catalog.streams.append(
            CatalogEntry(
                stream=stream_id,
                tap_stream_id=stream_id,
                key_properties=pk_fields,
                schema=schema,
                metadata=metadata.to_list(mdata)))
    return catalog
def get_schemas():
    """Load schemas and build metadata; each stream's default-selected
    fields are pre-marked selected in the catalog.

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (Dead `metadata.new()` call and commented-out code removed.)
        mdata = get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None),
        )
        for field in stream_metadata['default_selected_fields']:
            # Write a JSON boolean, not the string 'true': the Singer spec
            # defines 'selected' as a boolean, and string values can confuse
            # downstream truthiness checks. Capture write()'s return value
            # for consistency with the rest of the function.
            mdata = write(mdata, ('properties', field), 'selected', True)
        field_metadata[stream_name] = to_list(mdata)

    return schemas, field_metadata
def get_schemas():
    """Load each stream's schema and standard metadata; the ad-analytics
    streams get their pivot/date fields forced to automatic inclusion.

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (Dead `metadata.new()` call removed.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        # Add additional metadata: force these fields to automatic inclusion
        # for the analytics streams.
        if stream_name in ('ad_analytics_by_campaign', 'ad_analytics_by_creative'):
            mdata_map = metadata.to_map(mdata)
            for field in ('date_range', 'pivot', 'pivot_value'):
                mdata_map[('properties', field)]['inclusion'] = 'automatic'
            mdata = metadata.to_list(mdata_map)

        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def get_metadata(self):
    """Build Singer metadata for this stream.

    Any field whose name contains 'date' is treated as a key property
    (recorded on the instance as a side effect); replication is forced
    to INCREMENTAL and every field is selected by default.
    """
    keys = self.schema.get('properties').keys()
    # Side effect: remember the derived key properties on the instance.
    self.key_properties = [k for k in keys if 'date' in k]

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', self.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', 'INCREMENTAL')

    for field_name in keys:
        inclusion = 'automatic' if field_name in self.key_properties else 'available'
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', inclusion)
        mdata = metadata.write(mdata, ('properties', field_name),
                               'selected-by-default', True)

    return metadata.to_list(mdata)
def load_discovered_schema(stream):
    """Return (schema, metadata-list) for one discovered stream."""
    schema = load_schema(stream.tap_stream_id)

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', stream.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', stream.replication_method)
    if stream.replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               [stream.replication_key])

    for field_name in schema['properties']:
        is_key = (field_name in stream.key_properties
                  or field_name == stream.replication_key)
        mdata = metadata.write(mdata, ('properties', field_name), 'inclusion',
                               'automatic' if is_key else 'available')

    # The engagements stream has nested data that we synthesize; the
    # engagement field therefore has to be automatic.
    if stream.tap_stream_id == "engagements":
        mdata = metadata.write(mdata, ('properties', 'engagement'),
                               'inclusion', 'automatic')

    return schema, metadata.to_list(mdata)
def load_discovered_schema(stream):
    """Load the stream's schema and pair it with its discovery metadata."""
    schema = load_schema(stream.tap_stream_id)
    mdata = metadata.new()

    mdata = metadata.write(mdata, (), "table-key-properties", stream.key_properties)
    mdata = metadata.write(mdata, (), "forced-replication-method", stream.replication_method)
    if stream.replication_key:
        mdata = metadata.write(
            mdata, (), "valid-replication-keys", [stream.replication_key])

    for field_name in schema["properties"]:
        # Keys and the replication key are always replicated.
        if field_name in stream.key_properties or field_name == stream.replication_key:
            inclusion = "automatic"
        else:
            inclusion = "available"
        mdata = metadata.write(
            mdata, ("properties", field_name), "inclusion", inclusion)

    # The engagements stream has nested data that we synthesize; the
    # engagement field needs to be automatic.
    if stream.tap_stream_id == "engagements":
        mdata = metadata.write(
            mdata, ("properties", "engagement"), "inclusion", "automatic")

    return schema, metadata.to_list(mdata)
def discover_catalog(name, automatic_inclusion, **kwargs):
    """Load the named schema file and attach per-field inclusion metadata.

    Keyword args:
        unsupported: fields to mark as unsupported (default: empty).
        stream_automatic_inclusion: mark the whole stream automatic.
    """
    unsupported = kwargs.get("unsupported", frozenset([]))
    stream_automatic_inclusion = kwargs.get("stream_automatic_inclusion", False)

    root = os.path.dirname(os.path.realpath(__file__))
    path = os.path.join(root, 'schemas/{}.json'.format(name))
    with open(path, "r") as f:
        discovered_schema = json.load(f)

    mdata = metadata.new()
    for field in discovered_schema["schema"]["properties"]:
        if field in automatic_inclusion:
            inclusion = 'automatic'
        elif field in unsupported:
            inclusion = 'unsupported'
        else:
            inclusion = 'available'
        mdata = metadata.write(mdata, ('properties', field), 'inclusion', inclusion)

    if stream_automatic_inclusion:
        mdata = metadata.write(mdata, (), 'inclusion', 'automatic')

    discovered_schema["metadata"] = metadata.to_list(mdata)
    return discovered_schema
def get_schemas(client, properties_flag, denest_properties_flag):
    """Discover schemas and metadata for all streams.

    Args:
        client: API client; its disable_engage_endpoint flag gates
            discovery of the 'engage' stream.
        properties_flag: passed through to get_schema().
        denest_properties_flag: passed through to get_schema().

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        # When the client detects disable_engage_endpoint, skip discovering the stream
        if stream_name == 'engage' and client.disable_engage_endpoint:
            LOGGER.warning(
                'Mixpanel returned a 402 indicating the Engage endpoint and stream is unavailable. Skipping.'
            )
            continue

        schema = get_schema(client, properties_flag, denest_properties_flag,
                            stream_name)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (Dead `metadata.new()` call removed; get_standard_metadata builds
        # the metadata itself.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def discover():
    """Allow discovery of all streams and metadata."""
    catalog_streams = []
    for schema_name, schema in load_schemas().items():
        mdata = metadata.new()
        # 'id' is the primary key; 'updated_at' is the replication key.
        mdata = metadata.write(mdata, (), 'table-key-properties', ['id'])
        mdata = metadata.write(mdata, ('properties', 'id'),
                               'inclusion', 'automatic')
        mdata = metadata.write(mdata, (), 'valid-replication-keys', ['updated_at'])
        mdata = metadata.write(mdata, ('properties', 'updated_at'),
                               'inclusion', 'automatic')
        for field_name in schema['properties']:
            if field_name not in {'id', 'updated_at'}:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'inclusion', 'available')

        # create and add catalog entry
        catalog_streams.append({
            'stream': schema_name,
            'tap_stream_id': schema_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata),
            'key_properties': ['id'],
        })

    return {'streams': catalog_streams}
def generate_catalog(self):
    """Return a one-entry catalog list for this stream's table.

    Replication keys are forced to 'automatic' inclusion so they are
    always synced.
    """
    cls = self.__class__

    # get the reference schemas, then resolve refs into the final schema
    refs = load_schema_references()
    schema = singer.resolve_schema_references(load_schema(cls.TABLE), refs)

    # use 'get_standard_metadata' with primary key, replication key and
    # replication method (a dead `mdata = metadata.new()` that was
    # immediately overwritten has been removed)
    mdata = metadata.get_standard_metadata(
        schema=schema,
        key_properties=self.KEY_PROPERTIES,
        valid_replication_keys=self.REPLICATION_KEYS if self.REPLICATION_KEYS else None,
        replication_method=self.REPLICATION_METHOD)

    mdata_map = metadata.to_map(mdata)
    # make 'automatic' inclusion for replication keys
    for replication_key in self.REPLICATION_KEYS:
        mdata_map[('properties', replication_key)]['inclusion'] = 'automatic'

    return [{
        'tap_stream_id': cls.TABLE,
        'stream': cls.TABLE,
        'key_properties': cls.KEY_PROPERTIES,
        'schema': schema,
        'metadata': metadata.to_list(mdata_map)
    }]
def _populate_metadata(schema_name: str, schema: Dict) -> Dict:
    """Populate initial metadata for each field in a schema.

    Args:
        schema_name: The schema name to generate metadata for
            e.g. 'general_ledger_accounts'.
        schema: The corresponding JSON schema.

    Returns:
        Metadata dictionary for the selected stream. Fields are disabled
        by default.
    """
    key_fields = KEY_PROPERTIES[schema_name]
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_fields)
    mdata = metadata.write(mdata, (), 'selected', False)

    for field_name in schema['properties']:
        inclusion = 'automatic' if field_name in key_fields else 'available'
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', inclusion)
        mdata = metadata.write(mdata, ('properties', field_name),
                               'selected', False)

    return mdata
def load_metadata(self):
    """Build metadata for this stream; for period-based streams, drop the
    time column that does not apply to the configured period.

    Returns:
        list: Singer metadata entries.
    """
    schema = self.load_schema()
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', self.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', self.replication_method)
    if self.replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys', [self.replication_key])

    for field_name in schema['properties'].keys():
        if field_name in self.key_properties or field_name == self.replication_key:
            mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic')
        else:
            mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available')

    # For period stream adjust schema for time period. pop() now has a
    # default so a schema that never declared the column does not raise
    # KeyError (previously this crashed discovery for such schemas).
    if hasattr(self, 'period') and self.period == 'hourRange':
        mdata.pop(('properties', 'day'), None)
    elif hasattr(self, 'period') and self.period == 'dayRange':
        mdata.pop(('properties', 'hour'), None)

    return metadata.to_list(mdata)
def get_metadata(schema, key_properties, replication_method, replication_key):
    """Construct Singer discovery metadata for one stream.

    Key properties, the replication key, and the "updated" field are
    automatic; all other fields are available.
    """
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', replication_method)
    if replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys', [replication_key])

    for field_name in schema['properties']:
        automatic = (field_name in key_properties
                     or field_name in [replication_key, "updated"])
        mdata = metadata.write(mdata, ('properties', field_name), 'inclusion',
                               'automatic' if automatic else 'available')

    return metadata.to_list(mdata)
def get_schemas():
    """Load each stream class's schema (custom path or default) and build
    its metadata; replication keys are forced to automatic inclusion.

    Returns:
        tuple: (schemas, field_metadata) dicts keyed by stream name.
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_class in STREAMS.items():
        # A stream may override the default schema file location.
        base_schema_path = 'schemas/{}.json'.format(stream_name)
        schema_file_path = stream_class.json_schema or base_schema_path
        schema_path = get_abs_path(schema_file_path)
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        # (Dead `metadata.new()` call removed.)
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_class.key_properties or None,
            valid_replication_keys=stream_class.replication_keys or None,
            replication_method=stream_class.replication_method or None)

        mdata_map = metadata.to_map(mdata)
        # update inclusion of "replication keys" as "automatic"
        for replication_key in (stream_class.replication_keys or []):
            mdata_map[('properties', replication_key)]['inclusion'] = 'automatic'
        field_metadata[stream_name] = metadata.to_list(mdata_map)

    return schemas, field_metadata
def generate_metadata(schema):
    """Mark every schema property automatic; 'id' is the table key."""
    mdata = metadata.write(metadata.new(), (), 'table-key-properties', ['id'])
    for prop in schema['properties']:
        mdata = metadata.write(mdata, ('properties', prop),
                               'inclusion', 'automatic')
    return metadata.to_list(mdata)
def load_metadata(schema):
    """Mark every field automatic; RECORDNO, when present, is the table key.

    Returns:
        list: Singer metadata entries.
    """
    mdata = metadata.new()
    for field_name in schema.get('properties', {}).keys():
        mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic')
        if field_name == "RECORDNO":
            # table-key-properties must be a LIST of column names; the
            # previous bare string "RECORDNO" would be consumed as an
            # iterable of single characters by spec-conformant consumers.
            mdata = metadata.write(mdata, (), 'table-key-properties', ["RECORDNO"])
    return metadata.to_list(mdata)
def generate_metadata(schema_name, schema):
    """Build metadata for one schema; primary-key fields become automatic."""
    primary_keys = SCHEMA_PRIMARY_KEYS[schema_name]
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', primary_keys)

    for field_name in schema['properties']:
        inclusion = 'automatic' if field_name in primary_keys else 'available'
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', inclusion)

    return metadata.to_list(mdata)
def populate_metadata(schema_name, schema):
    """Build the metadata map for one stream; key fields become automatic."""
    key_fields = KEY_PROPERTIES[schema_name]
    mdata = metadata.write(metadata.new(), (), 'table-key-properties', key_fields)

    for field_name in schema['properties']:
        mdata = metadata.write(
            mdata, ('properties', field_name), 'inclusion',
            'automatic' if field_name in key_fields else 'available')

    # Note: returns the compiled metadata map (not a list).
    return mdata
def load_metadata(table_spec, schema):
    """Build metadata for one configured table.

    Fields listed in the table's key_properties become automatic; all
    others are available. A table_spec without 'key_properties' is
    treated as having none (previously the first lookup raised KeyError
    while the per-field lookups defensively used .get()).
    """
    key_properties = table_spec.get('key_properties', [])
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

    for field_name in schema.get('properties', {}).keys():
        if field_name in key_properties:
            mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic')
        else:
            mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available')

    return metadata.to_list(mdata)
def generate_metadata(stream, schema):
    """Build metadata for a stream object; its pk_fields become automatic."""
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', stream.pk_fields)

    for name in schema.properties.keys():
        inclusion = 'automatic' if name in stream.pk_fields else 'available'
        mdata = metadata.write(mdata, ('properties', name), 'inclusion', inclusion)

    return metadata.to_list(mdata)
def populate_metadata(schema_name, schema):
    """Build the metadata map; 'id' is the key and the only automatic field."""
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), "table-key-properties", ["id"])

    for field_name in schema["properties"]:
        if field_name == "id":
            mdata = metadata.write(
                mdata, ("properties", field_name), "inclusion", "automatic")
        else:
            mdata = metadata.write(
                mdata, ("properties", field_name), "inclusion", "available")

    # Returns the compiled metadata map (not a list).
    return mdata
def populate_metadata(schema_name, schema):
    """Build the metadata map for one stream; key fields become automatic."""
    key_fields = KEY_PROPERTIES[schema_name]
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), "table-key-properties", key_fields)

    for field_name in schema["properties"]:
        mdata = metadata.write(
            mdata,
            ("properties", field_name),
            "inclusion",
            "automatic" if field_name in key_fields else "available",
        )

    # Returns the compiled metadata map (not a list).
    return mdata
def load_metadata(schema):
    """All fields are automatic; the synthetic source-file and line-number
    columns form the table key."""
    key_properties = [
        sampling.SDC_SOURCE_FILE_COLUMN,
        sampling.SDC_SOURCE_LINENO_COLUMN,
    ]
    mdata = metadata.write(metadata.new(), (), 'table-key-properties', key_properties)

    # Make all fields automatic
    for field_name in schema.get('properties', {}):
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', 'automatic')

    return metadata.to_list(mdata)
def load_metadata(table_name, schema):
    """Build metadata for a table; its key columns become automatic."""
    key_properties = get_key_properties(table_name)
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

    for column in schema.get('properties', {}):
        inclusion = 'automatic' if column in key_properties else 'available'
        mdata = metadata.write(mdata, ('properties', column),
                               'inclusion', inclusion)

    return metadata.to_list(mdata)
def get_discovery_metadata(stream, schema):
    """Build discovery metadata for a stream object.

    Key properties and the replication key are automatic; everything
    else is available.
    """
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', stream.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', stream.replication_method)
    if stream.replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               [stream.replication_key])

    for field_name in schema['properties']:
        is_automatic = (field_name in stream.key_properties
                        or field_name == stream.replication_key)
        mdata = metadata.write(mdata, ('properties', field_name), 'inclusion',
                               'automatic' if is_automatic else 'available')

    return metadata.to_list(mdata)
def generate_catalog(self):
    """Return a single-entry catalog for this stream; every field available."""
    cls = self.__class__
    mdata = metadata.new()
    # NOTE(review): these writes discard the return value and rely on
    # metadata.write mutating the map in place — confirm against the
    # singer-python version in use.
    metadata.write(mdata, (), 'inclusion', 'available')
    for prop in cls.SCHEMA['properties']:  # pylint:disable=unsubscriptable-object
        metadata.write(mdata, ('properties', prop), 'inclusion', 'available')

    return [{
        'tap_stream_id': cls.TABLE,
        'stream': cls.TABLE,
        'key_properties': cls.KEY_PROPERTIES,
        'schema': cls.SCHEMA,
        'metadata': metadata.to_list(mdata),
    }]
def load_metadata(self):
    """Build Singer metadata for this stream from its loaded schema."""
    schema = self.load_schema()

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', self.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method', self.replication_method)
    if self.replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               [self.replication_key])

    for field_name in schema['properties']:
        automatic = (field_name in self.key_properties
                     or field_name == self.replication_key)
        mdata = metadata.write(mdata, ('properties', field_name), 'inclusion',
                               'automatic' if automatic else 'available')

    return metadata.to_list(mdata)