class FirebaseJob(BaseJob):
    name = 'job'
    # Any type here needs to be registered in the API as APIServer._allowed_types
    _resources = [FirebaseInstance, Subscription]
    schema = schemas.FB_JOB

    public_actions = BaseJob.public_actions + [
        'get_logs',
        'list_topics',
        'list_subscribed_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks -> user reportable
    consumer: KafkaConsumer = None
    # processing artifacts
    _indices: dict
    _schemas: dict
    _previous_topics: list
    _firebase: FirebaseInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._schemas = {}
        self._subscriptions = []
        self._previous_topics = []
        self.log_stack = []
        self.log = callback_logger('JOB', self.log_stack, 100)
        self.group_name = f'{self.tenant}.firebaseconsumer.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)

    def _job_firebase(self, config=None) -> FirebaseInstance:
        if config:
            fb: List[FirebaseInstance] = self.get_resources('firebase', config)
            if not fb:
                raise ConsumerHttpException('No Firebase associated with Job', 400)
            self._firebase = fb[0]
        return self._firebase

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException('No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic) -> Subscription:
        return next(iter(
            sorted([
                i for i in self._job_subscriptions()
                if i._handles_topic(topic, self.tenant)
            ])), None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_firebase(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(
                timeout=5,
                num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            import traceback
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err
        sleep(self.sleep_delay)
        return []

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        MAX_SUBMIT = 50
        count = 0
        subs = {}
        firebase: FirebaseInstance = self._job_firebase()
        cfs: firestore.Client = firebase.get_cloud_firestore()
        batch = cfs.batch()
        for msg in messages:
            topic = msg.topic
            if topic not in subs:
                subs[topic] = self._job_subscription_for_topic(topic)
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._update_topic(topic, schema)
                self._schemas[topic] = schema
            else:
                self.log.debug('Schema unchanged.')
            self.add_message(msg.value, topic, subs[topic], cfs, batch)
            count += 1
            if (count % MAX_SUBMIT) == 0:
                batch.commit()
                batch = cfs.batch()
        batch.commit()
        self.log.info(f'processed {count} {topic} docs in tenant {self.tenant}')

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        # TODO check rules for topic in Firebase
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options)
                )
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options)
                )
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _name_from_topic(self, topic):
        # str.lstrip() strips a character set rather than a prefix and can eat
        # leading characters of the topic name, so remove the tenant prefix explicitly
        return topic.replace(f'{self.tenant}.', '', 1)

    def _update_topic(self, topic, schema: Mapping[Any, Any]):
        self.log.debug(f'{self.tenant} is updating topic schema: {topic},'
                       f' firebase does not care...')

    def add_message(
        self,
        doc,
        topic: str,
        sub: Subscription,
        cfs: firestore.Client,
        batch: firestore_v1.batch.WriteBatch
    ):
        mode: helpers.SyncMode = sub.sync_mode()
        if mode in [helpers.SyncMode.NONE, helpers.SyncMode.CONSUME]:
            # these don't send data to FB
            return None
        topic_name = self._name_from_topic(topic)
        path = sub.path_for_topic(topic_name)
        if mode is helpers.SyncMode.SYNC:
            # todo:
            # check the hash..
            # needs_update = something()
            # if needs_update:
            #     return helpers.cfs_ref(cfs, path, doc.get('id'))
            pass
        elif mode is helpers.SyncMode.FORWARD:
            # we don't care about the hashes
            ref = helpers.cfs_ref(cfs, path, doc.get('id'))
            batch.set(ref, doc)

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        You can also use a wildcard at the end of names like:
        Name* which would capture both Name1 && Name2, etc
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except KafkaException as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1] for t in iter(md.topics.values())
            if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A list of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def get_logs(self, *arg, **kwargs):
        '''
        A list of the last 100 log entries from this job in format
        [
            (timestamp, log_level, message),
            (timestamp, log_level, message),
            ...
        ]
        '''
        return self.log_stack[:]
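# NOTE: _handle_new_subscriptions() above builds the Kafka subscription list by
# prefixing each subscription's topic_pattern with the tenant; a trailing '*'
# switches the entry to Kafka's regex form. For a tenant named 'acme'
# (illustrative name only):
#   'surveys*' -> '^acme.surveys*'   (regex subscription)
#   'surveys'  -> 'acme.surveys'     (literal topic name)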
class ESJob(BaseJob):
    name = 'job'
    # Any type here needs to be registered in the API as APIServer._allowed_types
    _resources = [
        ESInstance, LocalESInstance,
        KibanaInstance, LocalKibanaInstance,
        Subscription
    ]
    schema = schemas.ES_JOB

    public_actions = BaseJob.public_actions + [
        'list_topics',
        'list_subscribed_topics',
        'list_assigned_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks -> user reportable
    consumer: KafkaConsumer = None
    # processing artifacts
    _indices: dict
    _schemas: dict
    _processors: dict
    _doc_types: dict
    _routes: dict
    _previous_topics: list
    _kibana: KibanaInstance
    _elasticsearch: ESInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._indices = {}
        self._schemas = {}
        self._processors = {}
        self._doc_types = {}
        self._routes = {}
        self._subscriptions = []
        self._previous_topics = []
        self.group_name = f'{self.tenant}.esconsumer.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)

    def _job_elasticsearch(self, config=None) -> ESInstance:
        if config:
            es = self.get_resources('local_elasticsearch', config) + \
                self.get_resources('elasticsearch', config)
            if not es:
                raise ConsumerHttpException('No ES associated with Job', 400)
            self._elasticsearch = es[0]
        return self._elasticsearch

    def _job_kibana(self, config=None) -> KibanaInstance:
        if config:
            kibana = self.get_resources('local_kibana', config) + \
                self.get_resources('kibana', config)
            if not kibana:
                raise ConsumerHttpException('No Kibana associated with Job', 400)
            self._kibana = kibana[0]
        return self._kibana

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException('No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic):
        return next(iter(
            sorted([
                i for i in self._job_subscriptions()
                if i._handles_topic(topic, self.tenant)
            ])), None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_elasticsearch(config).test_connection()  # raises CHE
        self._job_kibana(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(
                timeout=5,
                num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err
        sleep(self.sleep_delay)
        return []

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        count = 0
        for msg in messages:
            topic = msg.topic
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._update_topic(topic, schema)
                self._schemas[topic] = schema
            else:
                self.log.debug('Schema unchanged.')
            processor = self._processors[topic]
            index_name = self._indices[topic]['name']
            doc_type = self._doc_types[topic]
            route_getter = self._routes[topic]
            doc = processor.process(msg.value)
            self.submit(
                index_name,
                doc_type,
                doc,
                topic,
                route_getter,
            )
            count += 1
        self.log.info(f'processed {count} {topic} docs in tenant {self.tenant}')

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options)
                )
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options)
                )
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _name_from_topic(self, topic):
        return topic[:].replace(f'{self.tenant}.', '', 1)

    def _update_topic(self, topic, schema: Mapping[Any, Any]):
        self.log.debug(f'{self.tenant} is updating topic: {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        node: Node = Node(schema)
        self.log.debug('getting index')
        es_index = index_handler.get_es_index_from_subscription(
            subscription.definition.get('es_options'),
            name=self._name_from_topic(topic),
            tenant=self.tenant.lower(),
            schema=node
        )
        self.log.debug(f'index {es_index}')
        alias_request = subscription.definition.get('es_options', {}).get('alias_name')
        if alias_request:
            alias = f'{alias_request}'.lower()
        else:
            alias = index_handler.get_alias_from_namespace(node.namespace)
        # Try to add the indices / ES alias
        es_instance = self._job_elasticsearch().get_session()
        if index_handler.es_index_changed(es_instance, es_index, self.tenant):
            self.log.debug(f'{self.tenant} updated schema for {topic}')
            self.log.debug(f'registering ES index:\n{json.dumps(es_index, indent=2)}')
            index_handler.update_es_index(
                es_instance,
                es_index,
                self.tenant,
                alias
            )
        conn: KibanaInstance = self._job_kibana()
        old_schema = self._schemas.get(topic)
        updated_kibana = index_handler.kibana_handle_schema_change(
            self.tenant.lower(),
            alias,
            old_schema,
            schema,
            subscription.definition,
            es_index,
            es_instance,
            conn
        )
        if updated_kibana:
            self.log.info(
                f'Registered kibana index {alias} for {self.tenant}'
            )
        else:
            self.log.info(
                f'Kibana index {alias} did not need update.'
            )
        self._indices[topic] = es_index
        self.log.debug(f'{self.tenant}:{topic} | idx: {es_index}')
        # update processor for type
        doc_type, instr = list(es_index['body']['mappings'].items())[0]
        self._doc_types[topic] = doc_type
        self._processors[topic] = ESItemProcessor(topic, instr, node)
        self._routes[topic] = self._processors[topic].create_route()

    def submit(self, index_name, doc_type, doc, topic, route_getter):
        es = self._job_elasticsearch().get_session()
        parent = doc.get('_parent', None)
        if parent:  # _parent field can only be in metadata apparently
            del doc['_parent']
        route = route_getter(doc)
        _id = doc.get('id')
        try:
            es.create(
                index=index_name,
                id=_id,
                routing=route,
                doc_type=doc_type,
                body=doc
            )
            self.log.debug(
                f'ES CREATE-OK [{index_name}:{self.group_name}]'
                f' -> {_id}')
        except (Exception, ESTransportError) as ese:
            self.log.debug('Could not create doc because of error: %s\nAttempting update.' % ese)
            try:
                route = self._routes[topic](doc)
                es.update(
                    index=index_name,
                    id=_id,
                    routing=route,
                    doc_type=doc_type,
                    body={'doc': doc}
                )
                self.log.debug(
                    f'ES UPDATE-OK [{index_name}:{self.group_name}]'
                    f' -> {_id}')
            except ESTransportError as ese2:
                self.log.info(
                    f'conflict!, ignoring doc with id {_id}'
                    f'{ese2}'
                )

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        You can also use a wildcard at the end of names like:
        Name* which would capture both Name1 && Name2, etc
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except KafkaException as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1] for t in iter(md.topics.values())
            if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A list of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def list_assigned_topics(self, *arg, **kwargs):
        '''
        A list of topics currently assigned to this consumer
        '''
        return self._previous_topics[:]
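# NOTE: ESJob.submit() above follows a create-then-update strategy: es.create() is
# tried first, and on any error (typically a document-already-exists conflict) the
# same document is retried with es.update(body={'doc': doc}); a second transport
# error is logged and the document is skipped rather than retried again.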
class PipelinePubSub(object):
    def __init__(
        self,
        tenant: str,
        kafka_group=None,
        definition: Dict = None,
        zeebe: 'ZeebeInstance' = None  # noqa  type hint
    ):
        self.tenant = tenant
        self.kafka_group_id = kafka_group
        self.definition = definition
        self.kafka_consumer = None
        self.kafka_producer = self.__get_kafka_producer()
        self.source_type = None
        self.zeebe: 'ZeebeInstance' = zeebe  # noqa
        self.zeebe_connection: ZeebeConnection = None

    def _make_context(self, evt: Event):
        data = {}
        if self.definition.get('const'):
            data = {'const': self.definition['const']}
        if not self.zeebe_connection:
            self.zeebe_connection = (None if not self.zeebe
                                     else self.zeebe.get_connection())
        return PipelineContext(
            evt,
            zeebe=self.zeebe_connection,
            kafka_producer=self.kafka_producer,
            data=data)

    def commit(self):
        if self.kafka_consumer:
            self.kafka_consumer.commit()

    def __has_kafka_setter(self) -> bool:
        if 'error_handling' in self.definition:
            return True
        return 'kafkamessage' in set(
            [stage.get('type') for stage in self.definition.get('stages', [])])

    def __get_kafka_producer(self) -> KafkaProducer:
        if self.__has_kafka_setter():
            return KafkaProducer(**get_kafka_admin_config())

    def __message_getter(self):
        if not self.source_type:
            if 'kafka_subscription' in self.definition:
                self.source_type = PipelineConnection.KAFKA
                self.__source = self.__make_kafka_getter()
            elif 'zeebe_subscription' in self.definition:
                self.source_type = PipelineConnection.ZEEBE
                self.__source = self.__make_zeebe_getter()
        return self.__source

    def __make_kafka_getter(self):
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        # the usual Kafka Client Configuration
        args['group.id'] = self.kafka_group_id
        args['auto.offset.reset'] = \
            self.definition['kafka_subscription'].get('auto_offset_reset', 'earliest')
        self.kafka_consumer = KafkaConsumer(**args)
        pattern = self.definition['kafka_subscription'].get('topic_pattern', '*')
        # only allow regex on the end of patterns
        if pattern.endswith('*'):
            topic = f'^{self.tenant}.{pattern}'
        else:
            topic = f'{self.tenant}.{pattern}'
        self.kafka_consumer.subscribe([topic], on_assign=self._on_kafka_assign)

        def _getter() -> Iterable[KafkaMessage]:
            messages = self.kafka_consumer.poll_and_deserialize(timeout=5, num_messages=1)
            for msg in messages:
                yield KafkaMessage(msg)
        return _getter

    def __make_zeebe_getter(self):
        workflow_id = self.definition.get('zeebe_subscription')
        if not self.zeebe_connection:
            self.zeebe_connection = (None if not self.zeebe
                                     else self.zeebe.get_connection())

        def _getter() -> Iterable[ZeebeJob]:
            jobs = self.zeebe_connection.job_iterator(workflow_id, self.kafka_group_id, max=50)
            c = 0
            for job in jobs:
                c += 1
                yield job
            LOG.debug(f'No more jobs available on {workflow_id}, got {c}')
        return _getter

    def _get_events(self) -> Iterable[Event]:
        _source = self.__message_getter()
        yield from _source()

    def get(self) -> Iterable[PipelineContext]:
        evts: Iterable[Event] = self._get_events()
        for evt in evts:
            yield self._make_context(evt)

    def test(self, evt: Event) -> PipelineContext:
        return self._make_context(evt)

    # called when a subscription causes a new assignment to be given to the consumer
    def _on_kafka_assign(self, *args, **kwargs):
        assignment = args[1]
        topics = set([_part.topic for _part in assignment])
        for topic in list(topics):
            self._apply_consumer_filters(topic)

    def _apply_consumer_filters(self, topic):
        try:
            opts = self.definition['kafka_subscription'].get('topic_options', {})
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.kafka_consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options))
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.kafka_consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options))
        except AttributeError as aer:
            LOG.error(f'No topic options for {self._id}| {aer}')
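# Usage sketch for PipelinePubSub (illustrative only; the tenant, group id and topic
# pattern below are made-up values and a reachable Kafka broker is assumed):
#
#   pubsub = PipelinePubSub(
#       tenant='demo',
#       kafka_group='demo.pipeline.example',
#       definition={
#           'kafka_subscription': {
#               'topic_pattern': 'form*',          # trailing '*' -> regex subscription
#               'auto_offset_reset': 'earliest',
#           },
#           'const': {'origin': 'example'},        # passed into the PipelineContext data
#       },
#   )
#   for ctx in pubsub.get():    # yields one PipelineContext per consumed message
#       ...                     # run pipeline stages against ctx
#   pubsub.commit()             # commit Kafka offsets once the batch is handled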
class CKANJob(BaseJob):
    name = 'job'
    # Any type here needs to be registered in the API as APIServer._allowed_types
    _resources = [CKANInstance, Subscription]
    schema = schemas.CKAN_JOB

    public_actions = BaseJob.public_actions + [
        'get_logs',
        'list_topics',
        'list_subscribed_topics'
    ]
    # publicly available list of topics
    subscribed_topics: dict
    log_stack: list
    log: Callable  # each job instance has its own log object to keep log_stacks -> user reportable
    consumer: KafkaConsumer = None
    # processing artifacts
    _schemas: dict
    _doc_types: dict
    _routes: dict
    _previous_topics: list
    _ckan: CKANInstance
    _subscriptions: List[Subscription]

    def _setup(self):
        self.subscribed_topics = {}
        self._schemas = {}
        self._topic_fields = {}
        self._subscriptions = []
        self._previous_topics = []
        self.log_stack = []
        self.log = callback_logger('JOB', self.log_stack, 100)
        self.group_name = f'{self.tenant}.{self._id}'
        self.sleep_delay: float = 0.5
        self.report_interval: int = 100
        args = {k.lower(): v for k, v in KAFKA_CONFIG.copy().items()}
        args['group.id'] = self.group_name
        LOG.debug(args)
        self.consumer = KafkaConsumer(**args)
        self.rename_fields = {}
        self.bad_terms = []

    def _job_ckan(self, config=None) -> CKANInstance:
        if config:
            ckan = self.get_resources('ckan', config)
            if not ckan:
                raise ConsumerHttpException('No CKAN instance associated with Job', 400)
            self._ckan = ckan[0]
        return self._ckan

    def _job_subscriptions(self, config=None) -> List[Subscription]:
        if config:
            subs = self.get_resources('subscription', config)
            if not subs:
                raise ConsumerHttpException('No Subscriptions associated with Job', 400)
            self._subscriptions = subs
        return self._subscriptions

    def _job_subscription_for_topic(self, topic):
        return next(iter(
            sorted([
                i for i in self._job_subscriptions()
                if i._handles_topic(topic, self.tenant)
            ])), None)

    def _test_connections(self, config):
        self._job_subscriptions(config)
        self._job_ckan(config).test_connection()  # raises CHE
        return True

    def _get_messages(self, config):
        try:
            self.log.debug(f'{self._id} checking configurations...')
            self._test_connections(config)
            subs = self._job_subscriptions()
            self._handle_new_subscriptions(subs)
            self.log.debug(f'Job {self._id} getting messages')
            return self.consumer.poll_and_deserialize(
                timeout=5,
                num_messages=1)  # max
        except ConsumerHttpException as cer:
            # don't fetch messages if we can't post them
            self.log.debug(f'Job not ready: {cer}')
            self.status = JobStatus.RECONFIGURE
            sleep(self.sleep_delay * 10)
            return []
        except Exception as err:
            import traceback
            traceback_str = ''.join(traceback.format_tb(err.__traceback__))
            self.log.critical(f'unhandled error: {str(err)} | {traceback_str}')
            raise err
        sleep(self.sleep_delay)
        return []

    def _handle_new_subscriptions(self, subs):
        old_subs = list(sorted(set(self.subscribed_topics.values())))
        for sub in subs:
            pattern = sub.definition.topic_pattern
            # only allow regex on the end of patterns
            if pattern.endswith('*'):
                self.subscribed_topics[sub.id] = f'^{self.tenant}.{pattern}'
            else:
                self.subscribed_topics[sub.id] = f'{self.tenant}.{pattern}'
        new_subs = list(sorted(set(self.subscribed_topics.values())))
        _diff = list(set(old_subs).symmetric_difference(set(new_subs)))
        if _diff:
            self.log.info(f'{self.tenant} added subs to topics: {_diff}')
            self.consumer.subscribe(new_subs, on_assign=self._on_assign)

    def _handle_messages(self, config, messages):
        self.log.debug(f'{self.group_name} | reading {len(messages)} messages')
        ckan_instance = self._job_ckan(config=config)
        server_url = ckan_instance.definition.get('url')
        api_key = ckan_instance.definition.get('key')
        ckan_remote = RemoteCKAN(server_url, apikey=api_key)
        count = 0
        records = []
        topic = None
        for msg in messages:
            topic = msg.topic
            schema = msg.schema
            if schema != self._schemas.get(topic):
                self.log.info(f'{self._id} Schema change on {topic}')
                self._schemas[topic] = schema
                fields, definition_names = extract_fields_from_schema(schema)
                fields = prepare_fields_for_resource(fields, definition_names)
                self._topic_fields[topic] = fields
            else:
                self.log.debug('Schema unchanged.')
            records.append(msg.value)
            resource = self.submit_artefacts(topic, schema, ckan_remote)
            count += 1

        if resource:
            self._create_resource_in_datastore(resource, ckan_remote)
            self.send_data_to_datastore(self._topic_fields[topic], records, resource, ckan_remote)
        self.log.info(f'processed {count} {topic} docs')

    def submit_artefacts(self, topic, schema, ckan_remote):
        subscription = self._job_subscription_for_topic(topic)
        target_options = subscription.definition.get('target_options')
        target_dataset_metadata = CONSUMER_CONFIG.get('metadata', {})
        target_dataset_metadata.update(target_options.get('dataset_metadata'))
        dataset = self._create_dataset_in_ckan(target_dataset_metadata, ckan_remote)
        if dataset:
            resource_name = schema.get('name')
            return self._create_resource_in_ckan(resource_name, dataset, ckan_remote)

    # called when a subscription causes a new
    # assignment to be given to the consumer
    def _on_assign(self, *args, **kwargs):
        assignment = args[1]
        for _part in assignment:
            if _part.topic not in self._previous_topics:
                self.log.info(f'New topic to configure: {_part.topic}')
                self._apply_consumer_filters(_part.topic)
                self._previous_topics.append(_part.topic)

    def _apply_consumer_filters(self, topic):
        self.log.debug(f'{self._id} applying filter for new topic {topic}')
        subscription = self._job_subscription_for_topic(topic)
        if not subscription:
            self.log.error(f'Could not find subscription for topic {topic}')
            return
        try:
            opts = subscription.definition.topic_options
            _flt = opts.get('filter_required', False)
            if _flt:
                _filter_options = {
                    'check_condition_path': opts.get('filter_field_path', ''),
                    'pass_conditions': opts.get('filter_pass_values', []),
                    'requires_approval': _flt
                }
                self.log.info(_filter_options)
                self.consumer.set_topic_filter_config(
                    topic,
                    FilterConfig(**_filter_options))
            mask_annotation = opts.get('masking_annotation', None)
            if mask_annotation:
                _mask_options = {
                    'mask_query': mask_annotation,
                    'mask_levels': opts.get('masking_levels', []),
                    'emit_level': opts.get('masking_emit_level')
                }
                self.log.info(_mask_options)
                self.consumer.set_topic_mask_config(
                    topic,
                    MaskConfig(**_mask_options))
            self.log.info(f'Filters applied for topic {topic}')
        except AttributeError as aer:
            self.log.error(f'No topic options for {subscription.id}| {aer}')

    def _create_dataset_in_ckan(self, dataset, ckan):
        # ckan allows only lower case dataset names
        dataset_name = dataset.get('name').lower()
        org_name = dataset.get('owner_org').lower()
        dataset.update({'name': dataset_name, 'owner_org': org_name})
        try:
            ckan.action.organization_show(id=org_name)
        except ckanapi_errors.NotFound:
            self.log.debug(f'Creating {org_name} organization')
            try:
                org = {
                    'name': org_name,
                    'state': 'active',
                }
                ckan.action.organization_create(**org)
                self.log.debug(f'Successfully created {org_name} organization')
            except ckanapi_errors.ValidationError as e:
                self.log.error(
                    f'Cannot create organization {org_name} '
                    f'because of the following errors: {json.dumps(e.error_dict)}')
                return
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Could not find {org_name} organization. {json.dumps(e.error_dict)}')
            return

        try:
            return ckan.action.package_show(id=dataset_name)
        except ckanapi_errors.NotFound:
            # Dataset does not exist, so continue with execution to create it.
            pass

        try:
            new_dataset = ckan.action.package_create(**dataset)
            self.log.debug(f'Dataset {dataset_name} created in CKAN portal.')
            return new_dataset
        except ckanapi_errors.NotAuthorized as e:
            self.log.error(f'Cannot create dataset {dataset_name}. {str(e)}')
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Cannot create dataset {dataset_name}. Payload is not valid. '
                f'Check the following errors: {json.dumps(e.error_dict)}')

    def _create_resource_in_ckan(self, resource_name, dataset, ckan):
        try:
            resources = ckan.action.resource_search(query=f'name:{resource_name}')
            # todo: filter resource on dataset too
            if resources['count']:
                return resources['results'][0]
        except Exception:
            pass

        try:
            self.log.debug(f'Creating {resource_name} resource')
            resource = {
                'package_id': dataset.get('name'),
                'name': resource_name,
                'url_type': 'datastore',
            }
            new_resource = ckan.action.resource_create(**resource)
            self.log.debug(f'Successfully created {resource_name} resource')
            return new_resource
        except ckanapi_errors.NotAuthorized as e:
            self.log.error(f'Cannot create resource {resource_name}. {str(e)}')
        except ckanapi_errors.ValidationError as e:
            self.log.error(
                f'Cannot create resource {resource_name}. Payload is not valid. '
                f'Check the following errors: {json.dumps(e.error_dict)}')

    def _create_resource_in_datastore(self, resource, ckan):
        payload = {
            'resource_id': resource.get('id'),
        }
        try:
            ckan.action.datastore_create(**payload)
        except ckanapi_errors.CKANAPIError as e:
            self.log.error(
                f'An error occurred while creating resource '
                f'{resource.get("name")} in Datastore. {str(e)}')

    def send_data_to_datastore(self, fields, records, resource, ckan):
        resource_id = resource.get('id')
        resource_name = resource.get('name')
        payload = {
            'id': resource_id,
            'limit': 1,
        }
        try:
            response = ckan.action.datastore_search(**payload)
        except ckanapi_errors.CKANAPIError as e:
            self.log.error(
                f'An error occurred while getting Datastore fields for resource '
                f'{resource_id}. {str(e)}')
            return

        new_fields = response.get('fields')
        new_fields[:] = [
            field for field in new_fields if field.get('id') != '_id'
        ]
        schema_changes = self.get_schema_changes(new_fields, fields)

        if len(new_fields) == 0 or len(schema_changes) > 0:
            self.log.info('Datastore detected schema changes')
            for new_field in schema_changes:
                new_fields.append(new_field)
            payload = {
                'resource_id': resource_id,
                'fields': new_fields,
            }
            try:
                ckan.action.datastore_create(**payload)
            except ckanapi_errors.CKANAPIError as cke:
                self.log.error(
                    f'An error occurred while adding new fields for resource '
                    f'{resource_name} in Datastore.')
                label = str(cke)
                self.log.error('ResourceType: {0} Error: {1}'.format(resource_name, label))
                bad_fields = literal_eval(label).get('fields', None)
                if not isinstance(bad_fields, list):
                    raise ValueError('Bad field could not be identified.')
                issue = bad_fields[0]
                bad_term = str(issue.split(' ')[0]).strip("'").strip('"')
                self.bad_terms.append(bad_term)
                self.log.info('Recovery from error: bad field name %s' % bad_term)
                self.log.info('Reverting %s' % (schema_changes, ))
                for new_field in schema_changes:
                    new_fields.remove(new_field)
                return self.send_data_to_datastore(fields, records, resource, ckan)

        records = self.convert_item_to_array(records, new_fields)
        payload = {
            'resource_id': resource_id,
            'method': 'insert',
            'records': records,
        }
        try:
            ckan.action.datastore_upsert(**payload)
            self.log.info(f'Updated resource {resource_id} in {ckan.address}.')
        except ckanapi_errors.CKANAPIError as cke:
            self.log.error(
                f'An error occurred while inserting data into resource {resource_name}')
            self.log.error(f'ResourceType: {resource} Error: {str(cke)}')

    def get_schema_changes(self, schema, fields):
        '''
        Only check if new field has been added.
        '''
        new_fields = []
        for field in fields:
            field_found = False
            for schema_field in schema:
                if field.get('id') == schema_field.get('id'):
                    field_found = True
                    break
            if not field_found:
                if field.get('id') in self.bad_terms:
                    new_fields.append(self.rename_field(field))
                else:
                    new_fields.append(field)
        return new_fields

    def rename_field(self, field):
        bad_name = field.get('id')
        new_name = 'ae' + bad_name
        self.rename_fields[bad_name] = new_name
        field['id'] = new_name
        return field

    def convert_item_to_array(self, records, new_fields):
        '''
        If a field is of type array, and the value for it contains a primitive
        type, then convert it to an array of that primitive type.
        This mutation is required for all records, otherwise CKAN will raise
        an exception.

        Example:
            For a given field which is of type array of integers
            {'type': '_int', 'id': 'scores'}

            Original record {'scores': 10}
            Changed record  {'scores': [10]}
        '''
        array_fields = []
        records = records[:]
        for field in new_fields:
            if field.get('type').startswith('_'):
                array_fields.append(field.get('id'))

        for record in records:
            # iterate over a static copy of the items so renamed keys can be
            # deleted from the record while looping
            for key, value in list(record.items()):
                if self.bad_terms:
                    name = self.rename_fields.get(key, key)
                    if name != key:
                        del record[key]
                else:
                    name = key
                if key in array_fields:
                    record[name] = [value]
                else:
                    record[name] = value
        return records

    # public
    def list_topics(self, *args, **kwargs):
        '''
        Get a list of topics to which the job can subscribe.
        You can also use a wildcard at the end of names like:
        Name* which would capture both Name1 && Name2, etc
        '''
        timeout = 5
        try:
            md = self.consumer.list_topics(timeout=timeout)
        except KafkaException as ker:
            raise ConsumerHttpException(str(ker) + f'@timeout: {timeout}', 500)
        topics = [
            str(t).split(f'{self.tenant}.')[1] for t in iter(md.topics.values())
            if str(t).startswith(self.tenant)
        ]
        return topics

    # public
    def list_subscribed_topics(self, *arg, **kwargs):
        '''
        A list of topics currently subscribed to by this job
        '''
        return list(self.subscribed_topics.values())

    # public
    def get_logs(self, *arg, **kwargs):
        '''
        A list of the last 100 log entries from this job in format
        [
            (timestamp, log_level, message),
            (timestamp, log_level, message),
            ...
        ]
        '''
        return self.log_stack[:]
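# NOTE: send_data_to_datastore() above recovers from rejected field names: when
# datastore_create fails, the offending field is parsed out of the error payload,
# remembered in self.bad_terms, the attempted schema changes are reverted and the
# call retries; on the retry, rename_field() prefixes the bad name with 'ae' so the
# records can still be inserted under a valid column name.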