def index(self, annotation_ids=None): """ Reindex annotations. :param annotation_ids: a list of ids to reindex, reindexes all when `None`. :type annotation_ids: collection :returns: a set of errored ids :rtype: set """ if not annotation_ids: annotations = _all_annotations(session=self.session, windowsize=PG_WINDOW_SIZE) else: annotations = _filtered_annotations(session=self.session, ids=annotation_ids) # Report indexing status as we go annotations = _log_status(annotations, log_every=PG_WINDOW_SIZE) indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations, chunk_size=ES_CHUNK_SIZE, raise_on_error=False, expand_action_callback=self._prepare) errored = set() for ok, item in indexing: if not ok: status = item[self.op_type] was_doc_exists_err = 'document already exists' in status['error'] if self.op_type == 'create' and was_doc_exists_err: continue errored.add(status['_id']) return errored
def index(self, annotation_ids=None, windowsize=PG_WINDOW_SIZE,
          chunk_size=ES_CHUNK_SIZE):
    """
    Reindex annotations.

    :param annotation_ids: a list of ids to reindex, reindexes all when `None`.
    :type annotation_ids: collection
    :param windowsize: the number of annotations to index in between progress
        log statements
    :type windowsize: integer
    :param chunk_size: the number of docs in one chunk sent to ES
    :type chunk_size: integer

    :returns: a set of errored ids
    :rtype: set
    """
    if not annotation_ids:
        annotations = _all_annotations(session=self.session,
                                       windowsize=windowsize)
    else:
        annotations = _filtered_annotations(session=self.session,
                                            ids=annotation_ids)

    # Report indexing status as we go
    annotations = _log_status(annotations, log_every=windowsize)

    indexing = es_helpers.streaming_bulk(self.es_client.conn, annotations,
                                         chunk_size=chunk_size,
                                         raise_on_error=False,
                                         expand_action_callback=self._prepare)
    errored = set()
    for ok, item in indexing:
        if not ok:
            status = item[self.op_type]
            was_doc_exists_err = 'document already exists' in status['error']
            if self.op_type == 'create' and was_doc_exists_err:
                continue
            errored.add(status['_id'])
    return errored
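# Both index() variants above wrap the annotation stream in _log_status() so
# that progress is reported every `windowsize` annotations. That helper is not
# defined in this file; the sketch below is a minimal assumption of its shape
# (logger name, default interval, and message are illustrative only): a
# pass-through generator that counts items and emits a log line at a fixed
# interval.
import logging

log = logging.getLogger(__name__)


def _log_status(annotations, log_every=1000):
    """Yield annotations unchanged, logging a progress marker periodically."""
    count = 0
    for annotation in annotations:
        count += 1
        if count % log_every == 0:
            log.info('indexed %d annotations so far', count)
        yield annotation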
def es_index(es, actions, errorsfp, logger, _dbg=0):
    """
    Now do the indexing specified by the actions.
    """
    # These need to be defined before the closure below. These work because
    # a closure remembers the binding of a name to an object. If integer
    # objects were used, the name would be bound to that integer value only
    # so for the retries, incrementing the integer would change the outer
    # scope's view of the name. By using a Counter object, the name to
    # object binding is maintained, but the object contents are changed.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            for field in ('_id', '_index', '_type'):
                assert field in cl_action, "Action missing '{}' field:" \
                        " {!r}".format(field, cl_action)
            assert _op_type == cl_action['_op_type'], "Unexpected _op_type" \
                    " value '{}' in action {!r}".format(
                        cl_action['_op_type'], cl_action)

            actions_deque.append((0, cl_action))  # Append to the right side ...
            yield cl_action
            # if after yielding an action some actions appear on the retry deque
            # start yielding those actions until we drain the retry queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                time.sleep(_calc_backoff_sleep(backoff))
                retries_tracker['retries'] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # if after yielding all the actions to be retried, some show up
                # on the retry deque again, we extend our sleep backoff to avoid
                # pounding on the ES instance.
                backoff += 1

    beg, end = _do_ts(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
        es, generator, raise_on_error=False,
        raise_on_exception=False, request_timeout=_request_timeout)

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
        except KeyError as e:
            assert not ok, "{!r}".format(ok)
            assert e.args[0] == _op_type, "e.args = {!r}, _op_type = {!r}".format(
                e.args, _op_type)
            # For whatever reason, some errors are always returned using
            # the "index" operation type instead of _op_type (e.g. "create"
            # op type still comes back as an "index" response).
            try:
                resp = resp_payload['index']
            except KeyError:
                # resp is not of expected form; set it to the complete
                # payload, so that it can be reported properly below.
                resp = resp_payload
        try:
            status = resp['status']
        except KeyError as e:
            assert not ok
            # Limit the length of the error message.
            logger.error("{!r}", e)
            status = 999
        else:
            assert action['_id'] == resp['_id']
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                jsonstr = json.dumps({
                    "action": action,
                    "ok": ok,
                    "resp": resp,
                    "retry_count": retry_count,
                    "timestamp": tstos(_do_ts())
                }, indent=4, sort_keys=True)
                print(jsonstr, file=errorsfp)
                errorsfp.flush()
                failures += 1
            else:
                try:
                    exc_payload = resp['exception']
                except KeyError:
                    pass
                else:
                    resp['exception'] = repr(exc_payload)
                try:
                    error = resp['error']
                except KeyError:
                    error = ""
                if status == 403 and error.startswith("IndexClosedException"):
                    # Don't retry closed index exceptions
                    jsonstr = json.dumps({
                        "action": action,
                        "ok": ok,
                        "resp": resp,
                        "retry_count": retry_count,
                        "timestamp": tstos(_do_ts())
                    }, indent=4, sort_keys=True)
                    print(jsonstr, file=errorsfp)
                    errorsfp.flush()
                    failures += 1
                else:
                    # Retry all other errors.
                    # Limit the length of the error message.
                    logger.warning("retrying action: {}",
                                   json.dumps(resp)[:_MAX_ERRMSG_LENGTH])
                    actions_retry_deque.append((retry_count + 1, action))

    end = _do_ts()

    assert len(actions_deque) == 0
    assert len(actions_retry_deque) == 0

    return (beg, end, successes, duplicates, failures,
            retries_tracker['retries'])
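# es_index() above sleeps for _calc_backoff_sleep(backoff) seconds before
# replaying its retry deque (the streaming_bulk() variants below do the same,
# either directly or via a _sleep_w_backoff() wrapper). Neither helper is
# defined in this file; the sketch below is only an assumed shape: capped
# exponential backoff with random jitter so concurrent indexers do not retry
# in lockstep. _MAX_SLEEP_TIME is an illustrative name and value.
import math
import random

_MAX_SLEEP_TIME = 120  # assumed cap on a single backoff sleep, in seconds


def _calc_backoff_sleep(backoff):
    """Return a jittered, capped exponential delay for the given retry round."""
    delay = min(math.pow(2, backoff), _MAX_SLEEP_TIME)
    return random.uniform(0, delay)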
def streaming_bulk(es, actions):
    """
    streaming_bulk(es, actions)

    Arguments:
        es - An Elasticsearch client object already constructed
        actions - An iterable for the documents to be indexed

    Returns:
        A tuple with the start and end times, the # of successfully indexed,
        duplicate, and failed documents, along with number of times a bulk
        request was retried.
    """
    # These need to be defined before the closure below. These work because
    # a closure remembers the binding of a name to an object. If integer
    # objects were used, the name would be bound to that integer value only
    # so for the retries, incrementing the integer would change the outer
    # scope's view of the name. By using a Counter object, the name to
    # object binding is maintained, but the object contents are changed.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            assert '_id' in cl_action
            assert '_index' in cl_action
            assert '_type' in cl_action
            assert _op_type == cl_action['_op_type']

            actions_deque.append((0, cl_action))  # Append to the right side ...
            yield cl_action
            # if after yielding an action some actions appear on the retry deque
            # start yielding those actions until we drain the retry queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                time.sleep(_calc_backoff_sleep(backoff))
                retries_tracker['retries'] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # if after yielding all the actions to be retried, some show up
                # on the retry deque again, we extend our sleep backoff to avoid
                # pounding on the ES instance.
                backoff += 1

    beg, end = time.time(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
        es, generator, raise_on_error=False,
        raise_on_exception=False, request_timeout=_request_timeout)

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
            status = resp['status']
        except KeyError:
            assert not ok
            # resp is not of expected form; fall back to the whole payload so
            # it can still be reported below.
            resp = resp_payload
            print(resp)
            status = 999
        else:
            assert action['_id'] == resp['_id']
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                doc = {
                    "action": action,
                    "ok": ok,
                    "resp": resp,
                    "retry_count": retry_count,
                    "timestamp": _tstos(time.time())
                }
                jsonstr = json.dumps(doc, indent=4, sort_keys=True)
                print(jsonstr)
                # errorsfp.flush()
                failures += 1
            else:
                # Retry all other errors
                print(resp)
                actions_retry_deque.append((retry_count + 1, action))

    end = time.time()

    assert len(actions_deque) == 0
    assert len(actions_retry_deque) == 0

    return (beg, end, successes, duplicates, failures,
            retries_tracker['retries'])
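# A minimal sketch of driving the streaming_bulk() wrapper above, under the
# assumption that the module-level _op_type constant is a create-style op; the
# index name, mapping type, and document shape are illustrative only. Every
# action must carry _op_type, _id, _index, and _type to satisfy the closure's
# asserts.
def index_documents(es, docs):
    actions = (
        {
            "_op_type": _op_type,       # must equal the module-level constant
            "_index": "example-index",  # assumed index name
            "_type": "_doc",            # assumed mapping type
            "_id": doc["id"],
            "_source": doc,
        }
        for doc in docs
    )
    beg, end, successes, duplicates, failures, retries = streaming_bulk(es, actions)
    print(
        "indexed in %.1fs: %d ok, %d duplicates, %d failed, %d bulk retries"
        % (end - beg, successes, duplicates, failures, retries)
    )
    return failures == 0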
def streaming_bulk(es, actions, errorsfp, logger):
    """
    streaming_bulk(es, actions, errorsfp, logger)

    Arguments:
        es - An Elasticsearch client object already constructed
        actions - An iterable for the documents to be indexed
        errorsfp - A file pointer for where to write 400 errors
        logger - A python logging object to use to report behaviors;
                 (the logger is expected to handle {} formatting)

    Returns:
        A tuple with the start and end times, the # of successfully indexed,
        duplicate, and failed documents, along with number of times a bulk
        request was retried.
    """
    # These need to be defined before the closure below. These work because
    # a closure remembers the binding of a name to an object. If integer
    # objects were used, the name would be bound to that integer value only
    # so for the retries, incrementing the integer would change the outer
    # scope's view of the name. By using a Counter object, the name to
    # object binding is maintained, but the object contents are changed.
    actions_deque = deque()
    actions_retry_deque = deque()
    retries_tracker = Counter()

    def actions_tracking_closure(cl_actions):
        for cl_action in cl_actions:
            for field in ("_id", "_index", "_type"):
                assert (
                    field in cl_action
                ), f"Action missing '{field}' field: {cl_action!r}"
            assert _op_type == cl_action["_op_type"], (
                "Unexpected _op_type"
                f" value \"{cl_action['_op_type']}\" in action {cl_action!r}"
            )

            # Append to the right side ...
            actions_deque.append((0, cl_action))
            yield cl_action
            # If after yielding an action some actions appear on the retry
            # deque, start yielding those actions until we drain the retry
            # queue.
            backoff = 1
            while len(actions_retry_deque) > 0:
                _sleep_w_backoff(backoff)
                retries_tracker["retries"] += 1
                retry_actions = []
                # First drain the retry deque entirely so that we know when we
                # have cycled through the entire list to be retried.
                while len(actions_retry_deque) > 0:
                    retry_actions.append(actions_retry_deque.popleft())
                for retry_count, retry_action in retry_actions:
                    # Append to the right side ...
                    actions_deque.append((retry_count, retry_action))
                    yield retry_action
                # If after yielding all the actions to be retried, some show
                # up on the retry deque again, we extend our sleep backoff to
                # avoid pounding on the ES instance.
                backoff += 1

    beg, end = time.time(), None
    successes = 0
    duplicates = 0
    failures = 0

    # Create the generator that closes over the external generator, "actions"
    generator = actions_tracking_closure(actions)

    streaming_bulk_generator = helpers.streaming_bulk(
        es,
        generator,
        raise_on_error=False,
        raise_on_exception=False,
        request_timeout=_request_timeout,
    )

    for ok, resp_payload in streaming_bulk_generator:
        retry_count, action = actions_deque.popleft()
        try:
            resp = resp_payload[_op_type]
        except KeyError as e:
            assert not ok, f"ok = {ok!r}, e = {e!r}"
            assert (
                e.args[0] == _op_type
            ), f"e.args = {e.args!r}, _op_type = {_op_type!r}"
            # For whatever reason, some errors are always returned using
            # the "index" operation type instead of _op_type (e.g. "create"
            # op type still comes back as an "index" response).
            try:
                resp = resp_payload["index"]
            except KeyError:
                # resp is not of expected form; set it to the complete
                # payload, so that it can be reported properly below.
                resp = resp_payload
        try:
            status = resp["status"]
        except KeyError as e:
            assert not ok, f"ok = {ok!r}, e = {e!r}"
            logger.error("{!r}", e)
            status = 999
        else:
            assert action["_id"] == resp["_id"], (
                "Response encountered out of order from actions, "
                f"action = {action!r}, response = {resp!r}"
            )
        if ok:
            successes += 1
        else:
            if status == 409:
                if retry_count == 0:
                    # Only count duplicates if the retry count is 0 ...
                    duplicates += 1
                else:
                    # ... otherwise consider it successful.
                    successes += 1
            elif status == 400:
                try:
                    exc_payload = resp["exception"]
                except KeyError:
                    pass
                else:
                    # We have an exception object in the response object
                    # which is not always JSON serializable, so we use
                    # `repr` to turn that exception into a serializable
                    # string while maintaining as much information about
                    # the exception as possible.
                    resp["exception"] = repr(exc_payload)
                jsonstr = json.dumps(
                    {
                        "action": action,
                        "ok": ok,
                        "resp": resp,
                        "retry_count": retry_count,
                        "timestamp": _tstos(),
                    },
                    indent=4,
                    sort_keys=True,
                )
                print(jsonstr, file=errorsfp)
                errorsfp.flush()
                failures += 1
            else:
                try:
                    exc_payload = resp["exception"]
                except KeyError:
                    pass
                else:
                    resp["exception"] = repr(exc_payload)
                try:
                    error = resp["error"]
                except KeyError:
                    error = ""
                if status == 403 and error.startswith("IndexClosedException"):
                    # Don't retry closed index exceptions
                    jsonstr = json.dumps(
                        {
                            "action": action,
                            "ok": ok,
                            "resp": resp,
                            "retry_count": retry_count,
                            "timestamp": _tstos(),
                        },
                        indent=4,
                        sort_keys=True,
                    )
                    print(jsonstr, file=errorsfp)
                    errorsfp.flush()
                    failures += 1
                else:
                    # Retry all other errors.
                    # Limit the length of the warning message.
                    logger.warning(
                        "retrying action: {}",
                        json.dumps(resp)[:_MAX_ERRMSG_LENGTH],
                    )
                    actions_retry_deque.append((retry_count + 1, action))

    end = time.time()

    if len(actions_deque) > 0:
        logger.error("We still have {:d} actions in the deque", len(actions_deque))
    if len(actions_retry_deque) > 0:
        logger.error(
            "We still have {:d} retry actions in the deque",
            len(actions_retry_deque),
        )

    return (beg, end, successes, duplicates, failures, retries_tracker["retries"])
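# The streaming_bulk() variant above formats its log calls with "{}"
# placeholders (e.g. logger.warning("retrying action: {}", ...)), which the
# stdlib logging module does not support natively. One common way to supply
# such a logger is a LoggerAdapter that defers str.format(), following the
# logging cookbook's brace-style pattern; this is an assumption about how the
# expected logger might be built, not the original project's code.
import logging


class BraceMessage:
    """Defer str.format() until the record is actually rendered."""

    def __init__(self, fmt, args):
        self.fmt = fmt
        self.args = args

    def __str__(self):
        return self.fmt.format(*self.args)


class BraceStyleAdapter(logging.LoggerAdapter):
    """Wrap a stdlib logger so calls like logger.warning("x: {}", y) work."""

    def __init__(self, logger, extra=None):
        super().__init__(logger, extra or {})

    def log(self, level, msg, *args, **kwargs):
        if self.isEnabledFor(level):
            msg, kwargs = self.process(msg, kwargs)
            self.logger._log(level, BraceMessage(msg, args), (), **kwargs)


# Example: logger = BraceStyleAdapter(logging.getLogger("es-indexer"))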