def authenticated_api(host_uri: str, token: Optional[str] = None) -> DefaultApi:
    """
    Note: if this helper is called, it's implied that an actual API connection
    is needed, so it does try to connect and verify credentials.
    """
    conf = Configuration()
    conf.host = host_uri
    if not token:
        # use .get() so a missing env var falls through to the error message
        # below instead of raising KeyError
        token = os.environ.get("FATCAT_API_AUTH_TOKEN")
    if not token:
        sys.stderr.write(
            "This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n"
        )
        sys.exit(-1)

    conf.api_key["Authorization"] = token
    conf.api_key_prefix["Authorization"] = "Bearer"
    api = DefaultApi(ApiClient(conf))

    # verify up front that auth is working
    api.auth_check()

    return api
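
# Hedged usage sketch for the helper above, not part of the library: the host
# URI is an example value, and FATCAT_API_AUTH_TOKEN must be set in the
# environment (or a token passed explicitly).
def _example_authenticated_usage() -> None:
    api = authenticated_api("https://api.fatcat.wiki/v0")
    # auth_check() already ran inside authenticated_api(); calls on `api` now
    # carry the Bearer token automatically
    print(api.auth_check())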
def generic_entity_create_from_toml(
    user_api: ApiClient, entity_type: str, editgroup_id: str, toml_str: str
) -> EntityEdit:
    if entity_type == "container":
        entity = entity_from_toml(toml_str, ContainerEntity)
        edit = user_api.create_container(editgroup_id, entity)
    elif entity_type == "creator":
        entity = entity_from_toml(toml_str, CreatorEntity)
        edit = user_api.create_creator(editgroup_id, entity)
    elif entity_type == "file":
        entity = entity_from_toml(toml_str, FileEntity)
        edit = user_api.create_file(editgroup_id, entity)
    elif entity_type == "fileset":
        entity = entity_from_toml(toml_str, FilesetEntity)
        edit = user_api.create_fileset(editgroup_id, entity)
    elif entity_type == "webcapture":
        entity = entity_from_toml(toml_str, WebcaptureEntity)
        edit = user_api.create_webcapture(editgroup_id, entity)
    elif entity_type == "release":
        entity = entity_from_toml(toml_str, ReleaseEntity)
        edit = user_api.create_release(editgroup_id, entity)
    elif entity_type == "work":
        entity = entity_from_toml(toml_str, WorkEntity)
        edit = user_api.create_work(editgroup_id, entity)
    else:
        raise NotImplementedError
    return edit
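
# Illustrative driver for the TOML dispatch helper above. The TOML body and
# editgroup id are made-up example values; `user_api` is assumed to come from
# authenticated_api().
def _example_create_from_toml(user_api: DefaultApi) -> EntityEdit:
    toml_str = 'name = "Journal of Examples"\n'
    return generic_entity_create_from_toml(
        user_api, "container", "aaaaaaaaaaaabo53aaaaaaaaae", toml_str
    )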
def public_api(host_uri: str) -> DefaultApi:
    """
    Note: unlike the authenticated variant, this helper might get called even
    if the API isn't going to be used, so it's important that it doesn't try
    to actually connect to the API host or something.
    """
    conf = Configuration()
    conf.host = host_uri
    return DefaultApi(ApiClient(conf))
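
# Contrast sketch: public_api() does no network I/O at construction time, so
# it is safe to call during startup even if the API is never used. The release
# ident below is a made-up example value.
def _example_public_read() -> None:
    api = public_api("https://api.fatcat.wiki/v0")
    release = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai")
    print(release.title)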
def entity_from_json(
    json_str: str, entity_type: Any, api_client: Optional[ApiClient] = None
) -> Any:
    """
    Hack to take advantage of the code-generated deserialization code

    See note on `entity_to_dict()` about the api_client argument.
    """
    if not api_client:
        api_client = ApiClient()
    # ApiClient.deserialize() expects a response-like object exposing a `data`
    # attribute, so wrap the JSON string in a minimal stand-in
    Thing = collections.namedtuple("Thing", ["data"])
    thing = Thing(data=json_str)
    return api_client.deserialize(thing, entity_type)
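
# Round-trip sketch for the two serde hacks (entity_from_json/entity_to_dict):
# reusing a single ApiClient avoids the per-instance threadpool cost noted in
# entity_to_dict(). The release title is an example value.
def _example_serde_roundtrip() -> None:
    ac = ApiClient()
    release = entity_from_json('{"title": "Example Paper"}', ReleaseEntity, api_client=ac)
    assert entity_to_dict(release, api_client=ac)["title"] == "Example Paper"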
def __init__(self, api: ApiClient, entity_type: Any, **kwargs) -> None:

    eg_extra = kwargs.get("editgroup_extra", dict())
    # decode only the subprocess output; a caller-supplied git_rev is already
    # a str and must not be decoded again
    eg_extra["git_rev"] = eg_extra.get(
        "git_rev",
        subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8"),
    )
    eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner")

    self.api = api
    self.entity_type = entity_type
    self.dry_run_mode = kwargs.get("dry_run_mode", True)
    self.edit_batch_size = kwargs.get("edit_batch_size", 50)
    self.editgroup_description = kwargs.get("editgroup_description", "Generic Entity Cleaner Bot")
    self.editgroup_extra = eg_extra
    self.reset()
    self.ac = ApiClient()

    if self.dry_run_mode:
        print("Running in dry-run mode!")
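
# Instantiation sketch: assumes this __init__ belongs to a class named
# EntityCleaner (per the default agent string above). The kwargs shown are the
# ones the constructor reads; dry_run_mode=True means no edits get submitted.
def _example_cleaner_setup(api: ApiClient) -> None:
    cleaner = EntityCleaner(
        api,
        ReleaseEntity,
        dry_run_mode=True,
        edit_batch_size=100,
        editgroup_description="Example cleanup run",
    )
    print(cleaner.editgroup_extra["agent"])  # "fatcat_tools.EntityCleaner"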
def generic_entity_delete_entity(
    user_api: ApiClient, entity_type: str, editgroup_id: str, entity_ident: str
) -> EntityEdit:
    if entity_type == "container":
        edit = user_api.delete_container(editgroup_id, entity_ident)
    elif entity_type == "creator":
        edit = user_api.delete_creator(editgroup_id, entity_ident)
    elif entity_type == "file":
        edit = user_api.delete_file(editgroup_id, entity_ident)
    elif entity_type == "fileset":
        edit = user_api.delete_fileset(editgroup_id, entity_ident)
    elif entity_type == "webcapture":
        edit = user_api.delete_webcapture(editgroup_id, entity_ident)
    elif entity_type == "release":
        edit = user_api.delete_release(editgroup_id, entity_ident)
    elif entity_type == "work":
        edit = user_api.delete_work(editgroup_id, entity_ident)
    else:
        raise NotImplementedError
    return edit
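
# Deletion flow sketch: the editgroup and entity idents are made-up example
# values; creating and accepting the editgroup is assumed to happen elsewhere.
def _example_delete_entity(user_api: DefaultApi) -> EntityEdit:
    return generic_entity_delete_entity(
        user_api, "file", "aaaaaaaaaaaabo53aaaaaaaaae", "aaaaaaaaaaaaamztaaaaaaaaam"
    )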
import collections
import json
from typing import Any, Optional

from fatcat_openapi_client import ApiClient

_global_serde_api_client = ApiClient()


def entity_to_dict(entity: Any, api_client: Optional[ApiClient] = None) -> dict:
    """
    Hack to take advantage of the code-generated serialization code.

    Initializing/destroying ApiClient objects is surprisingly expensive
    (because it involves a threadpool), so we allow passing an existing
    instance. If you already have a full-on API connection `api`, you can
    access the ApiClient object as `api.api_client`. This is such a speed-up
    that this argument may become mandatory.
    """
    if not api_client:
        api_client = _global_serde_api_client
    return api_client.sanitize_for_serialization(entity)
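
# Micro-benchmark sketch backing the docstring claim above: constructing an
# ApiClient per call is far slower than reusing one, since each instance sets
# up a threadpool. Timings are machine-dependent and purely illustrative.
def _example_serde_cost() -> None:
    import time

    t0 = time.monotonic()
    for _ in range(100):
        entity_to_dict({"title": "x"}, api_client=ApiClient())  # fresh client each call
    fresh = time.monotonic() - t0

    t0 = time.monotonic()
    for _ in range(100):
        entity_to_dict({"title": "x"})  # falls back to _global_serde_api_client
    reused = time.monotonic() - t0

    print(f"fresh: {fresh:.3f}s  reused: {reused:.3f}s")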
def generic_entity_delete_edit(
    user_api: ApiClient, entity_type: str, editgroup_id: str, edit_id: str
) -> None:
    try:
        if entity_type == "container":
            user_api.delete_container_edit(editgroup_id, edit_id)
        elif entity_type == "creator":
            user_api.delete_creator_edit(editgroup_id, edit_id)
        elif entity_type == "file":
            user_api.delete_file_edit(editgroup_id, edit_id)
        elif entity_type == "fileset":
            user_api.delete_fileset_edit(editgroup_id, edit_id)
        elif entity_type == "webcapture":
            user_api.delete_webcapture_edit(editgroup_id, edit_id)
        elif entity_type == "release":
            user_api.delete_release_edit(editgroup_id, edit_id)
        elif entity_type == "work":
            user_api.delete_work_edit(editgroup_id, edit_id)
        else:
            raise NotImplementedError
    except ApiException as ae:
        # a 404 means the edit is already gone; treat deletion as idempotent
        if ae.status != 404:
            raise
def run(self) -> None:
    ac = ApiClient()
    api = public_api(self.api_host)

    # only used by container indexing query_stats code path
    es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

    def fail_fast(err: Any, partitions: List[Any]) -> None:
        if err is not None:
            print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
            print("Bailing out...", file=sys.stderr)
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        # print("Kafka consumer commit successful")

    def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print(
            "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
            file=sys.stderr,
        )

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update(
        {
            "group.id": self.consumer_group,
            "on_commit": fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            "enable.auto.commit": True,
            "enable.auto.offset.store": False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            "max.poll.interval.ms": 60000,
            "default.topic.config": {
                "auto.offset.reset": "latest",
            },
        }
    )
    consumer = Consumer(consumer_conf)
    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )

    while True:
        batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
        if not batch:
            if not consumer.assignment():
                print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
            print(
                "... nothing new from kafka, try again (interval: {})".format(
                    self.poll_interval
                ),
                file=sys.stderr,
            )
            continue
        print("... got {} kafka messages".format(len(batch)), file=sys.stderr)

        # first check errors on entire batch...
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())

        # ... then process
        bulk_actions = []
        for msg in batch:
            json_str = msg.value().decode("utf-8")
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            assert isinstance(entity, self.entity_type)
            if self.entity_type == ChangelogEntry:
                key = entity.index
                # might need to fetch from API
                if not (
                    entity.editgroup  # pylint: disable=no-member # (TODO)
                    and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                ):
                    entity = api.get_changelog_entry(entity.index)
            else:
                key = entity.ident  # pylint: disable=no-member # (TODO)
            if self.entity_type != ChangelogEntry and entity.state == "wip":
                print(
                    f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                    file=sys.stderr,
                )
                continue
            if self.entity_type == ContainerEntity and self.query_stats:
                stats = query_es_container_stats(
                    entity.ident,
                    es_client=es_client,
                    es_index=self.elasticsearch_release_index,
                    merge_shadows=True,
                )
                doc_dict = container_to_elasticsearch(entity, stats=stats)
            else:
                doc_dict = self.transform_func(entity)
            # TODO: handle deletions from index
            bulk_actions.append(
                json.dumps(
                    {
                        "index": {
                            "_id": key,
                        },
                    }
                )
            )
            bulk_actions.append(json.dumps(doc_dict))

        # if only WIP entities, then skip
        if not bulk_actions:
            for msg in batch:
                consumer.store_offsets(message=msg)
            continue

        print(
            "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                key, len(batch), self.entity_type.__name__
            ),
            file=sys.stderr,
        )
        elasticsearch_endpoint = "{}/{}/_bulk".format(
            self.elasticsearch_backend, self.elasticsearch_index
        )
        resp = requests.post(
            elasticsearch_endpoint,
            headers={"Content-Type": "application/x-ndjson"},
            data="\n".join(bulk_actions) + "\n",
        )
        resp.raise_for_status()
        if resp.json()["errors"]:
            desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
            print(desc, file=sys.stderr)
            print(resp.content, file=sys.stderr)
            raise Exception(desc)
        for msg in batch:
            # offsets are *committed* (to brokers) automatically, but need
            # to be marked as processed here
            consumer.store_offsets(message=msg)
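
# Sketch of the NDJSON payload the loop above sends to the Elasticsearch _bulk
# endpoint: one action line followed by one document line per entity. The
# ident and document fields are made-up example values.
def _example_bulk_payload() -> str:
    bulk_actions = [
        json.dumps({"index": {"_id": "aaaaaaaaaaaaarceaaaaaaaaai"}}),
        json.dumps({"title": "Example Paper", "state": "active"}),
    ]
    # the trailing newline is required by the _bulk API
    return "\n".join(bulk_actions) + "\n"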