Example #1
File: index.py Project: ziqizh/h
    def index(self, annotation_ids=None):
        """
        Reindex annotations.

        :param annotation_ids: a list of ids to reindex; reindexes all when `None`.
        :type annotation_ids: collection

        :returns: a set of errored ids
        :rtype: set
        """
        if not annotation_ids:
            annotations = _all_annotations(session=self.session,
                                           windowsize=PG_WINDOW_SIZE)
        else:
            annotations = _filtered_annotations(session=self.session,
                                                ids=annotation_ids)

        # Report indexing status as we go
        annotations = _log_status(annotations, log_every=PG_WINDOW_SIZE)

        indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                             chunk_size=ES_CHUNK_SIZE,
                                             raise_on_error=False,
                                             expand_action_callback=self._prepare)
        errored = set()
        for ok, item in indexing:
            if not ok:
                status = item[self.op_type]

                was_doc_exists_err = 'document already exists' in status['error']
                if self.op_type == 'create' and was_doc_exists_err:
                    continue

                errored.add(status['_id'])
        return errored
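
Note: the `_prepare` callback is not shown in this example. The
`expand_action_callback` that `streaming_bulk` accepts must turn each item
from the iterable into an (action, document) pair; a minimal, hypothetical
sketch (the index name, op type, and document shape are assumptions, not the
project's actual values):

def _prepare(annotation):
    # Hypothetical expand_action_callback: map one annotation to the
    # (action, document) pair that streaming_bulk will serialize.
    action = {"create": {"_index": "annotation", "_id": annotation["id"]}}
    document = {k: v for k, v in annotation.items() if k != "id"}
    return action, document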
Example #2
    def index(self, annotation_ids=None, windowsize=PG_WINDOW_SIZE, chunk_size=ES_CHUNK_SIZE):
        """
        Reindex annotations.

        :param annotation_ids: a list of ids to reindex; reindexes all when `None`.
        :type annotation_ids: collection
        :param windowsize: the number of annotations to index between progress log statements
        :type windowsize: integer
        :param chunk_size: the number of docs in one chunk sent to ES
        :type chunk_size: integer

        :returns: a set of errored ids
        :rtype: set
        """
        if not annotation_ids:
            annotations = _all_annotations(session=self.session, windowsize=windowsize)
        else:
            annotations = _filtered_annotations(session=self.session,
                                                ids=annotation_ids)

        # Report indexing status as we go
        annotations = _log_status(annotations, log_every=windowsize)

        indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                             chunk_size=chunk_size,
                                             raise_on_error=False,
                                             expand_action_callback=self._prepare)
        errored = set()
        for ok, item in indexing:
            if not ok:
                status = item[self.op_type]

                was_doc_exists_err = 'document already exists' in status['error']
                if self.op_type == 'create' and was_doc_exists_err:
                    continue

                errored.add(status['_id'])
        return errored
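
A hypothetical call site for this variant, tuning both knobs (the `indexer`
and `log` names are assumptions):

# Reindex everything, logging progress every 2000 annotations and
# sending 500-document bulk requests to Elasticsearch.
errored = indexer.index(windowsize=2000, chunk_size=500)
if errored:
    log.warning("failed to reindex %d annotations", len(errored))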
Example #3
def es_index(es, actions, errorsfp, logger, _dbg=0):
    """
    Perform the indexing specified by the given actions.
    """
    # These need to be defined before the closure below. They work because
    # a closure captures the binding of a name to an object. If a plain
    # integer were used for the retry count, incrementing it inside the
    # closure would rebind the name locally (raising UnboundLocalError
    # without `nonlocal`) and would never update the outer scope's view.
    # With a Counter, the binding stays fixed while its contents change.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            for field in ('_id', '_index', '_type'):
                assert field in cl_action, "Action missing '{}' field:" \
                        " {!r}".format(field, cl_action)
            assert _op_type == cl_action['_op_type'], "Unexpected _op_type" \
                    " value '{}' in action {!r}".format(
                    cl_action['_op_type'], cl_action)

            actions_deque.append((0, cl_action))   # Append to the right side ...
            yield cl_action
            # If, after yielding an action, some actions appear on the retry
            # deque, start yielding those actions until we drain the retry
            # queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                time.sleep(_calc_backoff_sleep(backoff))
                retries_tracker['retries'] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # If, after yielding all the actions to be retried, some show
                # up on the retry deque again, we extend our sleep backoff to
                # avoid pounding on the ES instance.
                backoff += 1

    beg, end = _do_ts(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
            es, generator, raise_on_error=False,
            raise_on_exception=False, request_timeout=_request_timeout)

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
        except KeyError as e:
            assert not ok, "{!r}".format(ok)
            assert e.args[0] == _op_type, "e.args = {!r}, _op_type = {!r}".format(e.args, _op_type)
            # For whatever reason, some errors are always returned using
            # the "index" operation type instead of _op_type (e.g. "create"
            # op type still comes back as an "index" response).
            try:
                resp = resp_payload['index']
            except KeyError:
                # resp is not of expected form; set it to the complete
                # payload, so that it can be reported properly below.
                resp = resp_payload
        try:
            status = resp['status']
        except KeyError as e:
            assert not ok
            # resp has no status field; log the error and fall through
            # with a sentinel status so the accounting below still works.
            logger.error("{!r}", e)
            status = 999
        else:
            assert action['_id'] == resp['_id']
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                jsonstr = json.dumps({ "action": action, "ok": ok, "resp": resp, "retry_count": retry_count, "timestamp": tstos(_do_ts()) }, indent=4, sort_keys=True)
                print(jsonstr, file=errorsfp)
                errorsfp.flush()
                failures += 1
            else:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                try:
                    error = resp['error']
                except KeyError:
                    error = ""
                if status == 403 and error.startswith("IndexClosedException"):
                    # Don't retry closed index exceptions
                    jsonstr = json.dumps({ "action": action, "ok": ok, "resp": resp, "retry_count": retry_count, "timestamp": tstos(_do_ts()) }, indent=4, sort_keys=True)
                    print(jsonstr, file=errorsfp)
                    errorsfp.flush()
                    failures += 1
                else:
                    # Retry all other errors.
                    # Limit the length of the error message.
                    logger.warning("retrying action: {}", json.dumps(resp)[:_MAX_ERRMSG_LENGTH])
                    actions_retry_deque.append((retry_count + 1, action))

    end = _do_ts()

    assert len(actions_deque) == 0
    assert len(actions_retry_deque) == 0

    return (beg, end, successes, duplicates, failures, retries_tracker['retries'])
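
The comment above `retries_tracker` describes a standard Python closure
pitfall; a minimal, self-contained sketch of why mutating a Counter works
where rebinding an integer would not (`nonlocal` would be the modern
alternative):

from collections import Counter

def make_counters():
    plain = 0
    tracker = Counter()

    def bump():
        # plain += 1 would raise UnboundLocalError here: the assignment
        # makes `plain` local to bump(), hiding the outer binding.
        tracker["retries"] += 1  # item mutation works: `tracker` is read,
                                 # never rebound, so the closure binding holds

    bump()
    return plain, tracker["retries"]

print(make_counters())  # -> (0, 1)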
Example #4
def streaming_bulk(es, actions):
    """
    streaming_bulk(es, actions)
    Arguments:
        es - An Elasticsearch client object already constructed
        actions - An iterable for the documents to be indexed
    Returns:
        A tuple with the start and end times, the # of successfully indexed,
        duplicate, and failed documents, along with number of times a bulk
        request was retried.
    """

    # These need to be defined before the closure below. They work because
    # a closure captures the binding of a name to an object. If a plain
    # integer were used for the retry count, incrementing it inside the
    # closure would rebind the name locally (raising UnboundLocalError
    # without `nonlocal`) and would never update the outer scope's view.
    # With a Counter, the binding stays fixed while its contents change.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            assert '_id' in cl_action
            assert '_index' in cl_action
            assert '_type' in cl_action
            assert _op_type == cl_action['_op_type']

            actions_deque.append((0, cl_action))   # Append to the right side ...
            yield cl_action
            # If, after yielding an action, some actions appear on the retry
            # deque, start yielding those actions until we drain the retry
            # queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                time.sleep(_calc_backoff_sleep(backoff))
                retries_tracker['retries'] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    actions_deque.append((retry_count, retry_action))   # Append to the right side ...
                    yield retry_action
                # If, after yielding all the actions to be retried, some show
                # up on the retry deque again, we extend our sleep backoff to
                # avoid pounding on the ES instance.
                backoff += 1

    beg, end = time.time(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
            es, generator, raise_on_error=False,
            raise_on_exception=False, request_timeout=_request_timeout)

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
            status = resp['status']
        except KeyError:
            assert not ok
            # resp_payload is not of the expected form; resp may be unbound
            # here, so fall back to the whole payload for reporting below.
            resp = resp_payload
            print(resp)
            status = 999
        else:
            assert action['_id'] == resp['_id']
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                doc = {
                        "action": action,
                        "ok": ok,
                        "resp": resp,
                        "retry_count": retry_count,
                        "timestamp": _tstos(time.time())
                        }
                jsonstr = json.dumps(doc, indent=4, sort_keys=True)
                print(jsonstr)
                failures += 1
            else:
                # Retry all other errors
                print(resp)
                actions_retry_deque.append((retry_count + 1, action))

    end = time.time()

    assert len(actions_deque) == 0
    assert len(actions_retry_deque) == 0

    return (beg, end, successes, duplicates, failures, retries_tracker['retries'])
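
`_calc_backoff_sleep` is referenced but not defined in these examples; a
plausible sketch of an exponential backoff with full jitter (the cap and the
jitter strategy are assumptions):

import random

_MAX_SLEEP_SECONDS = 120  # hypothetical cap on any single sleep

def _calc_backoff_sleep(backoff):
    # Full-jitter exponential backoff: sleep a random amount in
    # [0, min(cap, 2**backoff)] so concurrent retriers spread out.
    return random.uniform(0, min(_MAX_SLEEP_SECONDS, 2 ** backoff))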
Example #5
def streaming_bulk(es, actions, errorsfp, logger):
    """
    streaming_bulk(es, actions, errorsfp, logger)

    Arguments:

        es - An Elasticsearch client object already constructed
        actions - An iterable for the documents to be indexed
        errorsfp - A file pointer for where to write 400 errors
        logger - A Python logging object used to report behaviors
                 (the logger is expected to handle {} formatting)

    Returns:

        A tuple with the start and end times, the # of successfully indexed,
        duplicate, and failed documents, along with number of times a bulk
        request was retried.
    """

    # These need to be defined before the closure below. They work because
    # a closure captures the binding of a name to an object. If a plain
    # integer were used for the retry count, incrementing it inside the
    # closure would rebind the name locally (raising UnboundLocalError
    # without `nonlocal`) and would never update the outer scope's view.
    # With a Counter, the binding stays fixed while its contents change.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            for field in ("_id", "_index", "_type"):
                assert (field in cl_action
                        ), f"Action missing '{field}' field: {cl_action!r}"
            assert _op_type == cl_action["_op_type"], (
                "Unexpected _op_type"
                f" value \"{cl_action['_op_type']}\" in action {cl_action!r}")

            # Append to the right side ...
            actions_deque.append((0, cl_action))
            yield cl_action
            # If after yielding an action some actions appear on the retry
            # deque, start yielding those actions until we drain the retry
            # queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                _sleep_w_backoff(backoff)
                retries_tracker["retries"] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # If after yielding all the actions to be retried, some show
                # up on the retry deque again, we extend our sleep backoff to
                # avoid pounding on the ES instance.
                backoff += 1

    beg, end = time.time(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
        es,
        generator,
        raise_on_error=False,
        raise_on_exception=False,
        request_timeout=_request_timeout,
    )

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
        except KeyError as e:
            assert not ok, f"ok = {ok!r}, e = {e!r}"
            assert (e.args[0] == _op_type
                    ), f"e.args = {e.args!r}, _op_type = {_op_type!r}"
            # For whatever reason, some errors are always returned using
            # the "index" operation type instead of _op_type (e.g. "create"
            # op type still comes back as an "index" response).
            try:
                resp = resp_payload["index"]
            except KeyError:
                # resp is not of expected form; set it to the complete
                # payload, so that it can be reported properly below.
                resp = resp_payload
        try:
            status = resp["status"]
        except KeyError as e:
            assert not ok, f"ok = {ok!r}, e = {e!r}"
            logger.error("{!r}", e)
            status = 999
        else:
            assert action["_id"] == resp["_id"], (
                "Response encountered out of order from actions, "
                f"action = {action!r}, response = {resp!r}")
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                try:
                    exc_payload = resp["exception"]
                except KeyError:
                    pass
                else:
                    # We have an exception object in the response object
                    # which is not always JSON serializable, so we use
                    # `repr` to turn that exception into a serializable
                    # string while maintaining as much information about
                    # the exception as possible.
                    resp["exception"] = repr(exc_payload)
                jsonstr = json.dumps(
                    {
                        "action": action,
                        "ok": ok,
                        "resp": resp,
                        "retry_count": retry_count,
                        "timestamp": _tstos(),
                    },
                    indent=4,
                    sort_keys=True,
                )
                print(jsonstr, file=errorsfp)
                errorsfp.flush()
                failures += 1
            else:
                try:
                    exc_payload = resp["exception"]
                except KeyError:
                    pass
                else:
                    resp["exception"] = repr(exc_payload)
                try:
                    error = resp["error"]
                except KeyError:
                    error = ""
                if status == 403 and error.startswith("IndexClosedException"):
                    # Don't retry closed index exceptions
                    jsonstr = json.dumps(
                        {
                            "action": action,
                            "ok": ok,
                            "resp": resp,
                            "retry_count": retry_count,
                            "timestamp": _tstos(),
                        },
                        indent=4,
                        sort_keys=True,
                    )
                    print(jsonstr, file=errorsfp)
                    errorsfp.flush()
                    failures += 1
                else:
                    # Retry all other errors.
                    # Limit the length of the warning message.
                    logger.warning("retrying action: {}",
                                   json.dumps(resp)[:_MAX_ERRMSG_LENGTH])
                    actions_retry_deque.append((retry_count + 1, action))

    end = time.time()

    if len(actions_deque) > 0:
        logger.error("We still have {:d} actions in the deque",
                     len(actions_deque))
    if len(actions_retry_deque) > 0:
        logger.error("We still have {:d} retry actions in the deque",
                     len(actions_retry_deque))

    return (beg, end, successes, duplicates, failures,
            retries_tracker["retries"])
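
A hypothetical caller unpacking the result tuple (names are assumptions; as
the docstring notes, the logger is expected to handle {} formatting):

beg, end, successes, duplicates, failures, retries = streaming_bulk(
    es, actions, errorsfp, logger)
logger.info("indexed {:d} docs in {:.1f}s ({:d} duplicates, {:d} failures,"
            " {:d} retries)", successes, end - beg, duplicates, failures,
            retries)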
Example #6
def es_index(es, actions, errorsfp, logger, _dbg=0):
    """
    Perform the indexing specified by the given actions.
    """
    # These need to be defined before the closure below. They work because
    # a closure captures the binding of a name to an object. If a plain
    # integer were used for the retry count, incrementing it inside the
    # closure would rebind the name locally (raising UnboundLocalError
    # without `nonlocal`) and would never update the outer scope's view.
    # With a Counter, the binding stays fixed while its contents change.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            for field in ('_id', '_index', '_type'):
                assert field in cl_action, "Action missing '{}' field:" \
                        " {!r}".format(field, cl_action)
            assert _op_type == cl_action['_op_type'], "Unexpected _op_type" \
                    " value '{}' in action {!r}".format(
                    cl_action['_op_type'], cl_action)

            actions_deque.append(
                (0, cl_action))  # Append to the right side ...
            yield cl_action
            # If, after yielding an action, some actions appear on the retry
            # deque, start yielding those actions until we drain the retry
            # queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                time.sleep(_calc_backoff_sleep(backoff))
                retries_tracker['retries'] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # If, after yielding all the actions to be retried, some show
                # up on the retry deque again, we extend our sleep backoff to
                # avoid pounding on the ES instance.
                backoff += 1

    beg, end = _do_ts(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
        es,
        generator,
        raise_on_error=False,
        raise_on_exception=False,
        request_timeout=_request_timeout)

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
        except KeyError as e:
            assert not ok, "{!r}".format(ok)
            assert e.args[0] == _op_type, \
                "e.args = {!r}, _op_type = {!r}".format(e.args, _op_type)
            # For whatever reason, some errors are always returned using
            # the "index" operation type instead of _op_type (e.g. "create"
            # op type still comes back as an "index" response).
            try:
                resp = resp_payload['index']
            except KeyError:
                # resp is not of expected form; set it to the complete
                # payload, so that it can be reported properly below.
                resp = resp_payload
        try:
            status = resp['status']
        except KeyError as e:
            assert not ok
            # resp has no status field; log the error and fall through
            # with a sentinel status so the accounting below still works.
            logger.error("{!r}", e)
            status = 999
        else:
            assert action['_id'] == resp['_id']
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                jsonstr = json.dumps(
                    {
                        "action": action,
                        "ok": ok,
                        "resp": resp,
                        "retry_count": retry_count,
                        "timestamp": tstos(_do_ts())
                    },
                    indent=4,
                    sort_keys=True)
                print(jsonstr, file=errorsfp)
                errorsfp.flush()
                failures += 1
            else:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                try:
                    error = resp['error']
                except KeyError:
                    error = ""
                if status == 403 and error.startswith("IndexClosedException"):
                    # Don't retry closed index exceptions
                    jsonstr = json.dumps(
                        {
                            "action": action,
                            "ok": ok,
                            "resp": resp,
                            "retry_count": retry_count,
                            "timestamp": tstos(_do_ts())
                        },
                        indent=4,
                        sort_keys=True)
                    print(jsonstr, file=errorsfp)
                    errorsfp.flush()
                    failures += 1
                else:
                    # Retry all other errors.
                    # Limit the length of the error message.
                    logger.warning("retrying action: {}",
                                   json.dumps(resp)[:_MAX_ERRMSG_LENGTH])
                    actions_retry_deque.append((retry_count + 1, action))

    end = _do_ts()

    assert len(actions_deque) == 0
    assert len(actions_retry_deque) == 0

    return (beg, end, successes, duplicates, failures,
            retries_tracker['retries'])
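
`_do_ts()` and `tstos()` are not defined in these examples; plausible
stand-ins consistent with how they are used above (epoch floats and an
ISO-8601 rendering are assumptions):

import time
from datetime import datetime, timezone

def _do_ts():
    # Plausible stand-in: the current time as an epoch float.
    return time.time()

def tstos(ts):
    # Plausible stand-in: render an epoch timestamp as ISO-8601 UTC.
    return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()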