Example 1
def entity_from_json(json_str, entity_type, api_client=None):
    """
    Hack to take advantage of the code-generated deserialization code

    See note on `entity_to_dict()` about the api_client argument.
    """
    if not api_client:
        api_client = ApiClient()
    thing = collections.namedtuple('Thing', ['data'])
    thing.data = json_str
    return api_client.deserialize(thing, entity_type)
Example 2
def entity_to_dict(entity, api_client=None):
    """
    Hack to take advantage of the code-generated serialization code.

    Initializing/destroying ApiClient objects is surprisingly expensive
    (because it involves a threadpool), so we allow passing an existing
    instance. If you already have a full-on API connection `api`, you can
    access the ApiClient object as `api.api_client`. This is such a speed-up
    that this argument may become mandatory.
    """
    if not api_client:
        api_client = ApiClient()
    return api_client.sanitize_for_serialization(entity)
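The speed-up the docstring describes comes from constructing a single ApiClient up front and passing it to every call. A minimal sketch under that assumption; the entity values are purely illustrative:

import json

from fatcat_openapi_client import ApiClient, ReleaseEntity

shared_client = ApiClient()  # built once, reused for all (de)serialization below

release = ReleaseEntity(title="Example Release", release_type="article-journal")

# serialize and deserialize without paying the ApiClient construction cost each time
release_dict = entity_to_dict(release, api_client=shared_client)
round_tripped = entity_from_json(json.dumps(release_dict), ReleaseEntity, api_client=shared_client)
assert round_tripped.title == release.title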
Example 3
def authenticated_api(host_uri: str,
                      token: Optional[str] = None) -> DefaultApi:
    """
    Note: if this helper is called, it's implied that an actual API connection
    is needed, so it does try to connect and verify credentials.
    """

    conf = Configuration()
    conf.host = host_uri
    if not token:
        # use .get() so a missing env var falls through to the error message below
        token = os.environ.get("FATCAT_API_AUTH_TOKEN")
    if not token:
        sys.stderr.write(
            "This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n"
        )
        sys.exit(-1)

    conf.api_key["Authorization"] = token
    conf.api_key_prefix["Authorization"] = "Bearer"
    api = DefaultApi(ApiClient(conf))

    # verify up front that auth is working
    api.auth_check()

    return api
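A hedged usage sketch: the host URI is illustrative, and the token is either passed explicitly or picked up from FATCAT_API_AUTH_TOKEN inside the helper, as shown above.

import os

api = authenticated_api(
    "https://api.fatcat.wiki/v0",                  # illustrative host URI
    token=os.environ.get("FATCAT_API_AUTH_TOKEN"),
)
# credentials were already verified by auth_check() inside the helper,
# so an invalid token fails here rather than on some later request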
Example 4
def generic_entity_create_from_toml(
    user_api: ApiClient, entity_type: str, editgroup_id: str, toml_str: str
) -> EntityEdit:
    if entity_type == "container":
        entity = entity_from_toml(toml_str, ContainerEntity)
        edit = user_api.create_container(editgroup_id, entity)
    elif entity_type == "creator":
        entity = entity_from_toml(toml_str, CreatorEntity)
        edit = user_api.create_creator(editgroup_id, entity)
    elif entity_type == "file":
        entity = entity_from_toml(toml_str, FileEntity)
        edit = user_api.create_file(editgroup_id, entity)
    elif entity_type == "fileset":
        entity = entity_from_toml(toml_str, FilesetEntity)
        edit = user_api.create_fileset(editgroup_id, entity)
    elif entity_type == "webcapture":
        entity = entity_from_toml(toml_str, WebcaptureEntity)
        edit = user_api.create_webcapture(editgroup_id, entity)
    elif entity_type == "release":
        entity = entity_from_toml(toml_str, ReleaseEntity)
        edit = user_api.create_release(editgroup_id, entity)
    elif entity_type == "work":
        entity = entity_from_toml(toml_str, WorkEntity)
        edit = user_api.create_work(editgroup_id, entity)
    else:
        raise NotImplementedError
    return edit
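A usage sketch, assuming an authenticated `user_api` connection and an open `editgroup_id` created earlier (not shown); the TOML field names follow the ContainerEntity schema, but the values are made up:

container_toml = """
name = "Journal of Illustrative Examples"
container_type = "journal"
"""

# user_api and editgroup_id are assumed to exist from earlier setup
edit = generic_entity_create_from_toml(user_api, "container", editgroup_id, container_toml)
print(edit.ident, edit.revision)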
Example 5
def public_api(host_uri: str) -> DefaultApi:
    """
    Note: unlike the authenticated variant, this helper might get called even
    if the API isn't going to be used, so it's important that it doesn't try to
    actually connect to the API host or something.
    """
    conf = Configuration()
    conf.host = host_uri
    return DefaultApi(ApiClient(conf))
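Because no connection is attempted at construction time, the helper is safe to call unconditionally. A small sketch under that assumption (host URI is illustrative, and `release_ident` is a hypothetical variable):

api = public_api("https://api.fatcat.wiki/v0")   # no network traffic yet

# the first actual HTTP request happens only when a method is called
release = api.get_release(release_ident)         # release_ident is hypothetical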
Example 6
def entity_from_json(
    json_str: str, entity_type: Any, api_client: Optional[ApiClient] = None
) -> Any:
    """
    Hack to take advantage of the code-generated deserialization code

    See note on `entity_to_dict()` about api_client argument.
    """
    if not api_client:
        api_client = ApiClient()
    thing = collections.namedtuple("thing", ["data"])
    thing.data = json_str
    return api_client.deserialize(thing, entity_type)
Example 7
    def __init__(self, api: ApiClient, entity_type: Any, **kwargs) -> None:

        eg_extra = kwargs.get("editgroup_extra", dict())
        eg_extra["git_rev"] = eg_extra.get(
            "git_rev",
            subprocess.check_output(["git", "describe",
                                     "--always"]).strip()).decode("utf-8")
        eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.EntityCleaner")

        self.api = api
        self.entity_type = entity_type
        self.dry_run_mode = kwargs.get("dry_run_mode", True)
        self.edit_batch_size = kwargs.get("edit_batch_size", 50)
        self.editgroup_description = kwargs.get("editgroup_description",
                                                "Generic Entity Cleaner Bot")
        self.editgroup_extra = eg_extra
        self.reset()
        self.ac = ApiClient()

        if self.dry_run_mode:
            print("Running in dry-run mode!")
Example 8
    def __init__(self, api, entity_type, **kwargs):

        eg_extra = kwargs.get('editgroup_extra', dict())
        eg_extra['git_rev'] = eg_extra.get(
            'git_rev',
            subprocess.check_output(["git", "describe",
                                     "--always"]).strip()).decode('utf-8')
        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.EntityCleaner')

        self.api = api
        self.entity_type = entity_type
        self.dry_run_mode = kwargs.get('dry_run_mode', True)
        self.edit_batch_size = kwargs.get('edit_batch_size', 50)
        self.editgroup_description = kwargs.get('editgroup_description',
                                                "Generic Entity Cleaner Bot")
        self.editgroup_extra = eg_extra
        self.reset()
        self.ac = ApiClient()

        if self.dry_run_mode:
            print("Running in dry-run mode!")
Example 9
def generic_entity_delete_entity(
    user_api: ApiClient, entity_type: str, editgroup_id: str, entity_ident: str
) -> EntityEdit:
    try:
        if entity_type == "container":
            edit = user_api.delete_container(editgroup_id, entity_ident)
        elif entity_type == "creator":
            edit = user_api.delete_creator(editgroup_id, entity_ident)
        elif entity_type == "file":
            edit = user_api.delete_file(editgroup_id, entity_ident)
        elif entity_type == "fileset":
            edit = user_api.delete_fileset(editgroup_id, entity_ident)
        elif entity_type == "webcapture":
            edit = user_api.delete_webcapture(editgroup_id, entity_ident)
        elif entity_type == "release":
            edit = user_api.delete_release(editgroup_id, entity_ident)
        elif entity_type == "work":
            edit = user_api.delete_work(editgroup_id, entity_ident)
        else:
            raise NotImplementedError
    except ApiException as ae:
        raise ae
    return edit
Example 10
    def run(self):
        ac = ApiClient()

        def fail_fast(err, partitions):
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            #print("Kafka consumer commit successful")
            pass

        def on_rebalance(consumer, partitions):
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            'group.id': self.consumer_group,
            'on_commit': fail_fast,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            'enable.auto.commit': True,
            'enable.auto.offset.store': False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            'max.poll.interval.ms': 60000,
            'default.topic.config': {
                'auto.offset.reset': 'latest',
            },
        })
        consumer = Consumer(consumer_conf)
        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )

        while True:
            batch = consumer.consume(num_messages=self.batch_size,
                                     timeout=self.poll_interval)
            if not batch:
                if not consumer.assignment():
                    print("... no Kafka consumer partitions assigned yet")
                print("... nothing new from kafka, try again (interval: {}".
                      format(self.poll_interval))
                continue
            print("... got {} kafka messages".format(len(batch)))
            # first check errors on entire batch...
            for msg in batch:
                if msg.error():
                    raise KafkaException(msg.error())
            # ... then process
            bulk_actions = []
            for msg in batch:
                json_str = msg.value().decode('utf-8')
                # HACK: work around a bug where container entities got published to
                # release_v03 topic
                if self.elasticsearch_document_name == "release":
                    entity_dict = json.loads(json_str)
                    if entity_dict.get(
                            'name') and not entity_dict.get('title'):
                        continue
                entity = entity_from_json(json_str,
                                          self.entity_type,
                                          api_client=ac)
                # TODO: handle deletions from index
                bulk_actions.append(
                    json.dumps({
                        "index": {
                            "_id": entity.ident,
                        },
                    }))
                bulk_actions.append(json.dumps(self.transform_func(entity)))
            print("Upserting, eg, {} (of {} releases in elasticsearch)".format(
                entity.ident, len(batch)))
            elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
                self.elasticsearch_backend, self.elasticsearch_index,
                self.elasticsearch_document_name)
            resp = requests.post(
                elasticsearch_endpoint,
                headers={"Content-Type": "application/x-ndjson"},
                data="\n".join(bulk_actions) + "\n")
            resp.raise_for_status()
            if resp.json()['errors']:
                desc = "Elasticsearch errors from post to {}:".format(
                    elasticsearch_endpoint)
                print(desc)
                print(resp.content)
                raise Exception(desc)
            for msg in batch:
                # offsets are *committed* (to brokers) automatically, but need
                # to be marked as processed here
                consumer.store_offsets(message=msg)
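The payload posted to the _bulk endpoint is newline-delimited JSON: an action line naming the document `_id`, followed by the document itself. A minimal sketch of what the loop assembles for two made-up entities:

import json

bulk_actions = [
    json.dumps({"index": {"_id": "ident-one"}}),
    json.dumps({"ident": "ident-one", "title": "first transformed document"}),
    json.dumps({"index": {"_id": "ident-two"}}),
    json.dumps({"ident": "ident-two", "title": "second transformed document"}),
]
payload = "\n".join(bulk_actions) + "\n"   # trailing newline is required by the _bulk API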
Example 11
import collections
import json
from typing import Any, Optional

from fatcat_openapi_client import ApiClient

_global_serde_api_client = ApiClient()


def entity_to_dict(entity: Any,
                   api_client: Optional[ApiClient] = None) -> dict:
    """
    Hack to take advantage of the code-generated serialization code.

    Initializing/destroying ApiClient objects is surprisingly expensive
    (because it involves a threadpool), so we allow passing an existing
    instance. If you already have a full-on API connection `api`, you can
    access the ApiClient object as `api.api_client`. This is such a speed-up
    that this argument may become mandatory.
    """
    if not api_client:
        api_client = _global_serde_api_client
    return api_client.sanitize_for_serialization(entity)


def entity_from_json(json_str: str,
                     entity_type: Any,
                     api_client: Optional[ApiClient] = None) -> Any:
    """
    Hack to take advantage of the code-generated deserialization code
Example 12
def generic_entity_delete_edit(
    user_api: ApiClient, entity_type: str, editgroup_id: str, edit_id: str
) -> None:
    try:
        if entity_type == "container":
            user_api.delete_container_edit(editgroup_id, edit_id)
        elif entity_type == "creator":
            user_api.delete_creator_edit(editgroup_id, edit_id)
        elif entity_type == "file":
            user_api.delete_file_edit(editgroup_id, edit_id)
        elif entity_type == "fileset":
            user_api.delete_fileset_edit(editgroup_id, edit_id)
        elif entity_type == "webcapture":
            user_api.delete_webcapture_edit(editgroup_id, edit_id)
        elif entity_type == "release":
            user_api.delete_release_edit(editgroup_id, edit_id)
        elif entity_type == "work":
            user_api.delete_work_edit(editgroup_id, edit_id)
        else:
            raise NotImplementedError
    except ApiException as ae:
        if ae.status == 404:
            pass
        else:
            raise ae
Example 13
    def run(self) -> None:
        ac = ApiClient()
        api = public_api(self.api_host)

        # only used by container indexing query_stats code path
        es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend)

        def fail_fast(err: Any, partitions: List[Any]) -> None:
            if err is not None:
                print("Kafka consumer commit error: {}".format(err), file=sys.stderr)
                print("Bailing out...", file=sys.stderr)
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # check for partition-specific commit errors
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr)
                    print("Bailing out...", file=sys.stderr)
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            # print("Kafka consumer commit successful")
            pass

        def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print(
                "Kafka partitions rebalanced: {} / {}".format(consumer, partitions),
                file=sys.stderr,
            )

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update(
            {
                "group.id": self.consumer_group,
                "on_commit": fail_fast,
                # messages don't have offset marked as stored until pushed to
                # elastic, but we do auto-commit stored offsets to broker
                "enable.auto.commit": True,
                "enable.auto.offset.store": False,
                # user code timeout; if no poll after this long, assume user code
                # hung and rebalance (default: 5min)
                "max.poll.interval.ms": 60000,
                "default.topic.config": {
                    "auto.offset.reset": "latest",
                },
            }
        )
        consumer = Consumer(consumer_conf)
        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )

        while True:
            batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval)
            if not batch:
                if not consumer.assignment():
                    print("... no Kafka consumer partitions assigned yet", file=sys.stderr)
                print(
                    "... nothing new from kafka, try again (interval: {}".format(
                        self.poll_interval
                    ),
                    file=sys.stderr,
                )
                continue
            print("... got {} kafka messages".format(len(batch)), file=sys.stderr)
            # first check errors on entire batch...
            for msg in batch:
                if msg.error():
                    raise KafkaException(msg.error())
            # ... then process
            bulk_actions = []
            for msg in batch:
                json_str = msg.value().decode("utf-8")
                entity = entity_from_json(json_str, self.entity_type, api_client=ac)
                assert isinstance(entity, self.entity_type)
                if self.entity_type == ChangelogEntry:
                    key = entity.index
                    # might need to fetch from API
                    if not (
                        entity.editgroup  # pylint: disable=no-member # (TODO)
                        and entity.editgroup.editor  # pylint: disable=no-member # (TODO)
                    ):
                        entity = api.get_changelog_entry(entity.index)
                else:
                    key = entity.ident  # pylint: disable=no-member # (TODO)

                if self.entity_type != ChangelogEntry and entity.state == "wip":
                    print(
                        f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}",
                        file=sys.stderr,
                    )
                    continue

                if self.entity_type == ContainerEntity and self.query_stats:
                    stats = query_es_container_stats(
                        entity.ident,
                        es_client=es_client,
                        es_index=self.elasticsearch_release_index,
                        merge_shadows=True,
                    )
                    doc_dict = container_to_elasticsearch(entity, stats=stats)
                else:
                    doc_dict = self.transform_func(entity)

                # TODO: handle deletions from index
                bulk_actions.append(
                    json.dumps(
                        {
                            "index": {
                                "_id": key,
                            },
                        }
                    )
                )
                bulk_actions.append(json.dumps(doc_dict))

            # if only WIP entities, then skip
            if not bulk_actions:
                for msg in batch:
                    consumer.store_offsets(message=msg)
                continue

            print(
                "Upserting, eg, {} (of {} {} in elasticsearch)".format(
                    key, len(batch), self.entity_type.__name__
                ),
                file=sys.stderr,
            )
            elasticsearch_endpoint = "{}/{}/_bulk".format(
                self.elasticsearch_backend, self.elasticsearch_index
            )
            resp = requests.post(
                elasticsearch_endpoint,
                headers={"Content-Type": "application/x-ndjson"},
                data="\n".join(bulk_actions) + "\n",
            )
            resp.raise_for_status()
            if resp.json()["errors"]:
                desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
                print(desc, file=sys.stderr)
                print(resp.content, file=sys.stderr)
                raise Exception(desc)
            for msg in batch:
                # offsets are *committed* (to brokers) automatically, but need
                # to be marked as processed here
                consumer.store_offsets(message=msg)
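The combination of enable.auto.commit=True and enable.auto.offset.store=False gives at-least-once delivery: the background committer only commits offsets that store_offsets() has marked as processed. A stripped-down sketch of that loop shape, with placeholder broker, topic, and processing:

from confluent_kafka import Consumer

consumer = Consumer({
    "bootstrap.servers": "localhost:9092",   # placeholder broker
    "group.id": "example-group",
    "enable.auto.commit": True,              # background commit of *stored* offsets
    "enable.auto.offset.store": False,       # nothing is stored until we say so
})
consumer.subscribe(["example-topic"])

while True:
    msgs = consumer.consume(num_messages=10, timeout=5.0)
    for msg in msgs:
        if msg.error():
            continue
        handle(msg.value())                    # placeholder for indexing work
        consumer.store_offsets(message=msg)    # only now can this offset be committed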