def __init__(self, log_name):
    self.log_name = log_name
    load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
    self.config = get_config()
    self.log = logging.getLogger(self.log_name)
    self._setup_logging()
    self.schematizer = get_schematizer()
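# A minimal standalone sketch of the same setup sequence, assuming the import
# paths below (they are a guess at this codebase's layout, not confirmed by
# the snippets here):
import logging

from data_pipeline.config import get_config
from data_pipeline.schematizer_clientlib.schematizer import get_schematizer
from yelp_servlib.config_util import load_package_config

load_package_config('/nail/srv/configs/data_pipeline_tools.yaml')
config = get_config()
log = logging.getLogger('my_tool')  # hypothetical log name
schematizer = get_schematizer()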
def __init__(self):
    super(BaseParseReplicationStream, self).__init__()
    self.db_connections = get_connection(
        config.env_config.topology_path,
        config.env_config.rbr_source_cluster,
        config.env_config.schema_tracker_cluster,
        config.env_config.rbr_state_cluster,
        is_avoid_internal_packages_set(),
        config.env_config.rbr_source_cluster_topology_name,
    )
    self.schema_wrapper = SchemaWrapper(
        db_connections=self.db_connections,
        schematizer_client=get_schematizer()
    )
    self.register_dry_run = config.env_config.register_dry_run
    self.publish_dry_run = config.env_config.publish_dry_run
    self._running = True
    self._profiler_running = False
    self._changelog_mode = config.env_config.changelog_mode
    if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
        # Printing here, since this executes *before* logging is configured.
        sys.stderr.write(
            "Shutting down because kafka_producer_buffer_size was greater "
            "than recovery_queue_size"
        )
        sys.exit(1)
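# The guard above presumably exists because, on failure, everything still
# sitting in the producer buffer has to fit back into the recovery queue.
# A tiny illustration of the invariant being enforced (numbers hypothetical):
def buffer_sizes_compatible(kafka_producer_buffer_size, recovery_queue_size):
    # True when every buffered message can be replayed through the queue.
    return kafka_producer_buffer_size <= recovery_queue_size

assert buffer_sizes_compatible(1000, 2000)
assert not buffer_sizes_compatible(5000, 1000)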
def schematizer():
    schematizer = get_schematizer()
    # schematizer is a singleton; re-initialize its internal state for each
    # module so cached data does not leak between test modules.
    schematizer._client = get_config().schematizer_client  # swaggerpy client
    schematizer._cache = _Cache()
    schematizer._avro_schema_cache = {}
    return schematizer
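# Why the reset matters: get_schematizer() hands back the same instance on
# every call, so stale caches would otherwise survive across test modules.
# A minimal check of the singleton property:
s1 = get_schematizer()
s2 = get_schematizer()
assert s1 is s2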
def meta_attribute_avro_schema(self, meta_attribute_avro_schema_json):
    return get_schematizer().register_schema_from_schema_json(
        namespace="test_namespace",
        source="meta_me_meta",
        schema_json=meta_attribute_avro_schema_json,
        source_owner_email="*****@*****.**",
        contains_pii=False
    )
def test_missing_mandatory_meta_attributes(
    self, valid_message_data, meta_param, mandatory_meta_attr_ids
):
    with mock.patch.object(
        get_schematizer(),
        'get_meta_attributes_by_schema_id',
        return_value=mandatory_meta_attr_ids
    ):
        with pytest.raises(MissingMetaAttributeException) as e:
            self._get_dry_run_message_with_meta(
                valid_message_data,
                meta_param
            )
        assert e.value.args
        missing_ids = set(mandatory_meta_attr_ids) - {m.schema_id for m in meta_param}
        expected_error = (
            "Meta Attributes with IDs `{0}` are not found for "
            "schema_id `{1}`.".format(
                ", ".join(str(m) for m in missing_ids),
                valid_message_data['schema_id']
            )
        )
        assert expected_error in e.value.args[0]
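# Worked example of the missing-id computation above: with mandatory
# attribute ids {1, 2, 3} and meta attributes covering schema_ids {2}, the
# missing ids are {1, 3}, so the message names ids 1 and 3 (in set-iteration
# order).
missing_ids = {1, 2, 3} - {2}
assert missing_ids == {1, 3}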
def monitor_schema(self):
    return get_schematizer().register_schema(
        namespace=self._monitor_schema['namespace'],
        source=self._monitor_schema['name'],
        schema_str=simplejson.dumps(self._monitor_schema),
        source_owner_email='*****@*****.**',
        contains_pii=False
    )
def __init__(self):
    super(BaseParseReplicationStream, self).__init__()
    self.db_connections = get_connection(
        config.env_config.topology_path,
        config.env_config.rbr_source_cluster,
        config.env_config.schema_tracker_cluster,
        config.env_config.rbr_state_cluster,
        config.env_config.rbr_source_cluster_topology_name,
    )
    self.schema_wrapper = SchemaWrapper(
        db_connections=self.db_connections,
        schematizer_client=get_schematizer()
    )
    self.register_dry_run = config.env_config.register_dry_run
    self.publish_dry_run = config.env_config.publish_dry_run
    self._running = True
    self._profiler_running = False
    self._changelog_mode = config.env_config.changelog_mode
    if get_config().kafka_producer_buffer_size > config.env_config.recovery_queue_size:
        # Printing here, since this executes *before* logging is configured.
        sys.stderr.write(
            "Shutting down because kafka_producer_buffer_size was greater "
            "than recovery_queue_size"
        )
        sys.exit(1)
def process_commandline_options(self, args=None):
    super(CompactionSetter, self).process_commandline_options(args=args)
    load_package_config(self.options.config_path)
    self.dry_run = self.options.dry_run
    self.whitelist_topic = self.options.whitelist_topic
    self.schematizer = get_schematizer()
def process_commandline_options(self, args=None):
    super(FullRefreshRequester, self).process_commandline_options(args=args)
    if (self.options.avg_rows_per_second_cap is not None and
            self.options.avg_rows_per_second_cap <= 0):
        raise ValueError("--avg-rows-per-second-cap must be greater than 0")
    if self.options.batch_size <= 0:
        raise ValueError("--batch-size option must be greater than 0.")
    if not self.options.source_id and not (
        self.options.source_name and self.options.namespace
    ):
        raise ValueError(
            "--source-id or both of --source-name and --namespace must be defined"
        )
    if self.options.source_id and (
        self.options.source_name or self.options.namespace
    ):
        raise ValueError(
            "Cannot use both --source-id and either of --namespace and --source-name"
        )
    load_package_config(self.options.config_path)
    self.schematizer = get_schematizer()

    source_ids = self.get_source_ids()
    if len(source_ids) == 0:
        raise ValueError(
            "Found no sources with namespace_name {} and source_name {}".format(
                self.options.namespace, self.options.source_name
            )
        )
    elif len(source_ids) > 1:
        raise ValueError(
            "namespace_name {} and source_name {} unexpectedly matched more "
            "than one source; investigate before rerunning.".format(
                self.options.namespace, self.options.source_name
            )
        )
    self.source_id = source_ids[0]
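# Hypothetical invocations illustrating the validation above (the script name
# and exact flag spellings are inferred from the option names, not from docs):
#   valid:   full_refresh_requester --source-id 42
#   valid:   full_refresh_requester --namespace yelp.biz --source-name business
#   invalid: full_refresh_requester --source-id 42 --namespace yelp.biz
#   invalid: full_refresh_requester --batch-size 0 --source-id 42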
def registration_schema(self):
    schema_json = self._registration_schema
    return get_schematizer().register_schema(
        namespace=schema_json['namespace'],
        source=schema_json['name'],
        schema_str=simplejson.dumps(schema_json),
        source_owner_email='*****@*****.**',
        contains_pii=False
    )
def test_setup_contains_pii_from_schematizer_once(self, message):
    schematizer_client = get_schematizer()
    # The first access should fetch the schema from the Schematizer...
    with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
        message.contains_pii
        assert spy.call_count == 1
    # ...and later accesses should be served from the cache.
    with attach_spy_on_func(schematizer_client, 'get_schema_by_id') as spy:
        message.contains_pii
        assert spy.call_count == 0
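# The test above relies on contains_pii being fetched once and then memoized.
# A hypothetical sketch of that pattern (not the real Message implementation;
# the topic.contains_pii attribute is an assumption):
class CachedPiiMessage(object):
    def __init__(self, schema_id, schematizer):
        self.schema_id = schema_id
        self._schematizer = schematizer
        self._contains_pii = None

    @property
    def contains_pii(self):
        # Only the first access hits the Schematizer; the result is cached.
        if self._contains_pii is None:
            schema = self._schematizer.get_schema_by_id(self.schema_id)
            self._contains_pii = schema.topic.contains_pii
        return self._contains_pii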
def get_schema_json(cls):
    return get_schematizer().register_schema(
        schema_str=cls.SOURCE_SCHEMA,
        namespace='test_namespace',
        source="test_source_{}".format(randint(0, 100)),
        source_owner_email='*****@*****.**',
        contains_pii=False
    )
def mock_get_topics_by_criteria(self, topics):
    with mock.patch.object(
        get_schematizer(),
        'get_topics_by_criteria',
        return_value=topics,
        autospec=True
    ) as mock_schematizer:
        yield mock_schematizer
def test_set_meta_with_valid_meta_attributes(
    self, valid_message_data, meta_param, mandatory_meta_attr_ids
):
    with mock.patch.object(
        get_schematizer(),
        'get_meta_attributes_by_schema_id',
        return_value=mandatory_meta_attr_ids
    ):
        dry_run_message = self._get_dry_run_message_with_meta(
            valid_message_data, meta_param
        )
        assert dry_run_message._meta == meta_param
def check_schematizer_has_correct_source_info(context):
    schematizer = get_schematizer()
    sources = schematizer.get_sources_by_namespace(context.data['namespace'])
    source = next(
        src for src in reversed(sources)
        if src.name == context.data['table_name']
    )
    topic = unlist(schematizer.get_topics_by_source_id(source.source_id))
    schema = schematizer.get_latest_schema_by_topic_name(topic.name)
    context.data['kafka_topic'] = topic.name
    setup_kafka_topic(topic.name)
    assert schema.topic.source.name == context.data['table_name']
    assert schema.topic.source.namespace.name == context.data['namespace']
    assert schema.schema_json == context.data['expected_avro_schema']
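# unlist() above is assumed to unwrap a single-element list; a minimal
# version under that assumption:
def unlist(items):
    (only_item,) = items  # raises ValueError unless there is exactly one item
    return only_item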
def _register_schema(self, namespace, source, containers):
    avro_schema = {
        'type': 'record',
        'name': source,
        'namespace': namespace,
        'doc': 'test',
        'fields': [{'type': 'int', 'doc': 'test', 'name': 'id'}]
    }
    reg_schema = get_schematizer().register_schema_from_schema_json(
        namespace=namespace,
        source=source,
        schema_json=avro_schema,
        source_owner_email='*****@*****.**',
        contains_pii=False
    )
    containers.create_kafka_topic(str(reg_schema.topic.name))
    return reg_schema
def _setup_schematizer_topics(self):
    if self.options.namespace or self.options.source:
        schematizer = get_schematizer()
        additional_topics = schematizer.get_topics_by_criteria(
            namespace_name=self.options.namespace,
            source_name=self.options.source
        )
        if self.options.only_newest:
            additional_topics = self._filter_by_most_recently_updated(
                additional_topics
            )
        logger.info(
            "Received {} new topics from --source and --namespace options".format(
                len(additional_topics)
            )
        )
        for topic in additional_topics:
            if str(topic.name) not in self.topic_to_offsets_map:
                self.topic_to_offsets_map[str(topic.name)] = None
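# Standalone sketch of the same topic lookup (namespace and source names are
# hypothetical), mapping each topic to None, which the code above uses to
# mean "no saved offset yet":
schematizer = get_schematizer()
topics = schematizer.get_topics_by_criteria(
    namespace_name='yelp.biz',
    source_name='business',
)
topic_to_offsets_map = {str(t.name): None for t in topics}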
def get_transaction_id_schema_id(gtid_enabled):
    if gtid_enabled:
        file_name = GLOBAL_TRANSACTION_ID_SCHEMA_FILEPATH
        source = 'global_transaction_id'
    else:
        file_name = LOG_TRANSACTION_ID_SCHEMA_FILEPATH
        source = 'log_transaction_id'
    with open(file_name, 'r') as schema_file:
        avro_schema = simplejson.loads(schema_file.read())
    schema = get_schematizer().register_schema_from_schema_json(
        namespace='yelp.replication_handler',
        source=source,
        schema_json=avro_schema,
        source_owner_email='*****@*****.**',
        contains_pii=False,
    )
    return schema.schema_id
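# Usage sketch: gtid_enabled selects which schema file gets registered, and
# the returned schema_id is presumably what callers attach to transaction-id
# meta attributes.
schema_id = get_transaction_id_schema_id(gtid_enabled=False)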
def _register_avro_schema(self, namespace, source, two_fields):
    fields = [{'type': 'int', 'doc': 'test', 'name': 'foo'}]
    if two_fields:
        fields.append({'type': 'int', 'doc': 'test', 'name': 'bar'})
    schema_json = {
        'type': 'record',
        'name': source,
        'namespace': namespace,
        'doc': 'test',
        'fields': fields
    }
    return get_schematizer().register_schema(
        namespace=namespace,
        source=source,
        schema_str=simplejson.dumps(schema_json),
        source_owner_email=self.source_owner_email,
        contains_pii=False,
        base_schema_id=None
    )
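# Hypothetical companion test sketching how the helper above is used:
# registering the one-field and two-field variants under the same source
# yields two distinct schemas.
def test_register_avro_schema_versions(self):
    v1 = self._register_avro_schema('test_namespace', 'test_source', two_fields=False)
    v2 = self._register_avro_schema('test_namespace', 'test_source', two_fields=True)
    assert v1.schema_id != v2.schema_id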
def process_commandline_options(self, args=None):
    super(FullRefreshJob, self).process_commandline_options(args=args)
    if (self.options.avg_rows_per_second_cap is not None and
            self.options.avg_rows_per_second_cap <= 0):
        raise ValueError("--avg-rows-per-second-cap must be greater than 0")
    if self.options.batch_size <= 0:
        raise ValueError("--batch-size option must be greater than 0.")
    if not self.options.source_id and not (
        self.options.source_name and self.options.namespace
    ):
        raise ValueError(
            "--source-id or both of --source-name and --namespace must be defined"
        )
    if self.options.source_id and (
        self.options.source_name or self.options.namespace
    ):
        raise ValueError(
            "Cannot use both --source-id and either of --namespace and --source-name"
        )
    load_package_config(self.options.config_path)
    self.schematizer = get_schematizer()
def schematizer(self, containers):
    return get_schematizer()
def __init__(self):
    self._schematizer = get_schematizer()
    self._schema_id_cache = {}
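# A hypothetical cached lookup built on the two fields initialized above; the
# cache key and the register call are illustrative, not from the real class:
def _get_schema_id(self, namespace, source, schema_str):
    key = (namespace, source, schema_str)
    if key not in self._schema_id_cache:
        schema = self._schematizer.register_schema(
            namespace=namespace,
            source=source,
            schema_str=schema_str,
            source_owner_email='*****@*****.**',
            contains_pii=False,
        )
        self._schema_id_cache[key] = schema.schema_id
    return self._schema_id_cache[key]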
def _schematizer(self):
    return get_schematizer()
def schematizer_client(containers):
    return get_schematizer()