def transform_data(
    worker: TaskSpec, records: List[dict], converters: Dict[str, Callable], routing_field: Optional[str] = None
) -> List[dict]:
    logger.info(format_log("Transforming data", name=worker.name, action="Transform"))
    start = perf_counter()

    for record in records:
        for field, converter in converters.items():
            record[field] = converter(record[field])

        # Route all documents with the same recipient to the same shard.
        # This allows for accurate and early-terminating "top N" recipient category aggregation queries.
        # Recipient is the highest-cardinality category, with over 2M unique values to aggregate against,
        # and this is needed for performance.
        # The ES helper will pop any "meta" fields like "routing" from the provided data dict and use them
        # in the bulk action.
        if routing_field:
            record["routing"] = record[routing_field]

        # Explicitly setting the ES _id field to match the postgres PK value allows
        # bulk index operations to be upserts without creating duplicate documents,
        # IF and ONLY IF a routing meta field is not also provided (one whose value differs
        # from the doc _id field). If explicit routing is done, upserts may cause duplicates,
        # so docs must be deleted before being upserted. (More info in streaming_post_to_es(...))
        record["_id"] = record[worker.field_for_es_id]

    duration = perf_counter() - start
    logger.info(format_log(f"Transformation operation took {duration:.2f}s", name=worker.name, action="Transform"))
    return records
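# Illustrative sketch (hypothetical field names and converters, not taken from this module): how a caller
# might drive transform_data. Each converter coerces one field's value; routing_field optionally copies a
# field into the "routing" meta key so related docs land on the same shard.
def _example_transform_usage(task: TaskSpec, records: List[dict]) -> List[dict]:
    converters: Dict[str, Callable] = {
        "action_date": str,         # hypothetical: normalize dates to strings
        "total_obligation": float,  # hypothetical: ensure a numeric type for aggregations
    }
    return transform_data(task, records, converters, routing_field="recipient_hash")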
def delete_docs_by_unique_key(client: Elasticsearch, key: str, value_list: list, task_id: str, index) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``value_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of the field in the targeted elasticsearch index that should have a unique value for
            every doc in the index. Ideally the field or sub-field provided is of ``keyword`` type.
        value_list (list): if the key field has any of these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.

            NOTE: This delete routine looks at just the index name given. If there are duplicate records across
            multiple indexes, an alias or wildcard should be provided for the ``index`` param that covers multiple
            indices, or this will need to be run once per index.

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(format_log(f"Deleting up to {len(value_list):,} document(s)", action="Delete", name=task_id))

    if not index:
        raise RuntimeError("index name must be provided")

    deleted = 0
    is_error = False
    try:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        values_generator = chunks(value_list, 50000)
        for chunk_of_values in values_generator:
            # Creates the Elasticsearch query criteria for the _delete_by_query call
            q = ES_Q("terms", **{key: chunk_of_values})
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            response = Search(using=client, index=index).filter(q).delete()
            chunk_deletes = response["deleted"]
            deleted += chunk_deletes
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise SystemExit(1)
    finally:
        error_text = " before encountering an error" if is_error else ""
        duration = perf_counter() - start
        docs = f"document{'s' if deleted != 1 else ''}"
        msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} {docs}{error_text}"
        logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
def deleted_awards(client: Elasticsearch, config: dict) -> None:
    """Delete all awards in the Elasticsearch awards index that were deleted in the source database.

    This performs the deletes of award documents in ES in a series of batches, as there could be many. Deleting
    millions of awards may take a prohibitively long time, and it could be better to just re-index all documents
    from the DB instead.

    This requires looking up the awards-to-delete by finding the unique key of the parent award of each deleted
    transaction, and then getting the distinct list of unique award keys that are NOT present in the database;
    those are then deleted from the ES awards index.
        - The deleted transactions are recorded in a CSV delete log file in S3.
        - NOTE!! This order of operations therefore requires that ES award deletes be processed
          BEFORE transaction ES deletes are (both deletes cannot run in parallel).

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task

    Returns: None
    """
    deleted_ids = _gather_deleted_ids(config)

    # While extracting unique award keys, the lookup is on transactions and must match against the unique
    # transaction id
    id_list = [{"key": deleted_id, "col": ES_TRANSACTIONS_UNIQUE_KEY_FIELD} for deleted_id in deleted_ids]
    award_ids = _lookup_deleted_award_ids(client, id_list, config, settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*")
    if len(award_ids) == 0:
        logger.info(format_log("No related awards require deletion", action="Delete"))
        return

    deleted_award_ids = _check_awards_for_deletes(award_ids)
    if len(deleted_award_ids) == 0:
        logger.info(format_log("No related awards require deletion", action="Delete"))
        return

    award_id_list = [
        {"key": deleted_award[config["unique_key_field"]], "col": config["unique_key_field"]}
        for deleted_award in deleted_award_ids
    ]
    _delete_from_es(
        client,
        award_id_list,
        index=config["query_alias_prefix"],
        max_query_size=config["max_query_size"],
        use_aliases=True,
    )
    return
def transform_covid19_faba_data(worker: TaskSpec, records: List[dict]) -> List[dict]:
    logger.info(format_log("Transforming data", name=worker.name, action="Transform"))
    start = perf_counter()
    results = {}

    for record in records:
        es_id_field = record[worker.field_for_es_id]
        distinct_award_key = record.pop("financial_account_distinct_award_key")
        award_id = record.pop("award_id")
        award_type = record.pop("type")
        generated_unique_award_id = record.pop("generated_unique_award_id")
        total_loan_value = record.pop("total_loan_value")
        obligated_sum = record.get("transaction_obligated_amount") or 0  # record value for key may be None
        outlay_sum = (
            (record.get("gross_outlay_amount_by_award_cpe") or 0)
            + (record.get("ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe") or 0)
            + (record.get("ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe") or 0)
        )  # record value for any key may be None
        temp_key = distinct_award_key
        if temp_key not in results:
            results[temp_key] = {
                "financial_account_distinct_award_key": distinct_award_key,
                "award_id": award_id,
                "type": award_type,
                "generated_unique_award_id": generated_unique_award_id,
                "total_loan_value": total_loan_value,
                "financial_accounts_by_award": list(),
                "obligated_sum": 0,
                "outlay_sum": 0,
                "_id": es_id_field,
            }
        results[temp_key]["obligated_sum"] += obligated_sum
        if record.get("is_final_balances_for_fy"):
            results[temp_key]["outlay_sum"] += outlay_sum
        results[temp_key]["financial_accounts_by_award"].append(record)

    if len(results) != len(records):
        msg = f"Transformed {len(records)} database records into {len(results)} documents for ingest"
        logger.info(format_log(msg, name=worker.name, action="Transform"))

    msg = f"Transformation operation took {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, name=worker.name, action="Transform"))

    # Don't need the dict keys; return a list of the dict values
    return list(results.values())
def extract_records(task: TaskSpec) -> List[dict]:
    start = perf_counter()
    logger.info(format_log("Extracting data from source", name=task.name, action="Extract"))

    try:
        records = task.execute_sql_func(task.sql, True)
    except Exception as e:
        logger.exception(f"Failed on partition {task.name} with '{task.sql}'")
        raise e

    msg = f"{len(records):,} records extracted in {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, name=task.name, action="Extract"))
    return records
def count_of_records_to_process(config: dict) -> Tuple[int, int, int]:
    start = perf_counter()
    results = execute_sql_statement(obtain_min_max_count_sql(config), True, config["verbose"])[0]
    min_id, max_id, count = results["min"], results["max"], results["count"]
    msg = f"Found {count:,} {config['data_type']} DB records, took {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, action="Extract"))
    return count, min_id, max_id
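# Illustrative sketch (hypothetical helper, not part of this module): how the (count, min_id, max_id) triple
# could drive partitioning of the extract step, assuming partitions are contiguous ID ranges of size
# config["partition_size"]. The exact partitioning logic used by the ETL may differ.
def _example_partition_count(min_id: int, max_id: int, partition_size: int) -> int:
    # ceil((max_id - min_id + 1) / partition_size) without importing math
    id_span = max_id - min_id + 1
    return (id_span + partition_size - 1) // partition_size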
def create_load_alias(client, config):
    # Ensure the new index is added to the alias used for incremental loads.
    # If the alias is on multiple indexes, the loads will fail!
    logger.info(
        format_log(f"Putting alias '{config['write_alias']}' on {config['index_name']}", action="ES Alias")
    )
    put_alias(client, config["index_name"], config["write_alias"], {})
def create_award_type_aliases(client, config):
    for award_type, award_type_codes in INDEX_ALIASES_TO_AWARD_TYPES.items():
        alias_name = f"{config['query_alias_prefix']}-{award_type}"
        if config["verbose"]:
            msg = f"Putting alias '{alias_name}' on {config['index_name']} with award codes {award_type_codes}"
            logger.info(format_log(msg, action="ES Alias"))
        alias_body = {"filter": {"terms": {"type": award_type_codes}}}
        put_alias(client, config["index_name"], alias_name, alias_body)
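# Illustrative sketch: a filtered alias restricts queries made through the alias to documents whose "type"
# field matches the alias's terms filter. The award-type codes below are hypothetical examples; the real
# mapping lives in INDEX_ALIASES_TO_AWARD_TYPES.
_EXAMPLE_AWARD_TYPE_ALIAS_BODY = {"filter": {"terms": {"type": ["A", "B", "C", "D"]}}}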
def load_data(worker: TaskSpec, records: List[dict], client: Elasticsearch) -> Tuple[int, int]:
    start = perf_counter()
    logger.info(format_log("Starting Index operation", name=worker.name, action="Index"))
    success, failed = streaming_post_to_es(
        client, records, worker.index, worker.name, delete_before_index=worker.is_incremental
    )
    logger.info(format_log(f"Index operation took {perf_counter() - start:.2f}s", name=worker.name, action="Index"))
    return success, failed
def create_index(index, client):
    try:
        does_index_exist = client.indices.exists(index)
    except Exception:
        logger.exception("Unable to query cluster for indices")
        raise SystemExit(1)
    if not does_index_exist:
        logger.info(format_log(f"Creating index '{index}'", action="Index"))
        client.indices.create(index=index)
        client.indices.refresh(index)
def toggle_refresh_on(client, index):
    response = client.indices.get(index)
    aliased_index_name = list(response.keys())[0]
    current_refresh_interval = response[aliased_index_name]["settings"]["index"]["refresh_interval"]

    es_settingsfile = str(settings.APP_DIR / "etl" / "es_config_objects.json")
    with open(es_settingsfile) as f:
        settings_dict = json.load(f)
    final_refresh_interval = settings_dict["final_index_settings"]["refresh_interval"]

    client.indices.put_settings({"refresh_interval": final_refresh_interval}, index)
    message = f'Changed "refresh_interval" from {current_refresh_interval} to {final_refresh_interval}'
    logger.info(format_log(message, action="ES Settings"))
def swap_aliases(client, config):
    if client.indices.get_alias(config["index_name"], "*"):
        logger.info(format_log(f"Removing old aliases for index '{config['index_name']}'", action="ES Alias"))
        client.indices.delete_alias(config["index_name"], "_all")

    alias_patterns = config["query_alias_prefix"] + "*"
    old_indexes = []

    try:
        old_indexes = list(client.indices.get_alias("*", alias_patterns).keys())
        for old_index in old_indexes:
            client.indices.delete_alias(old_index, "_all")
            logger.info(format_log(f"Removing aliases from '{old_index}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"No aliases found for {alias_patterns}", action="ES Alias"))

    if config["create_award_type_aliases"]:
        create_award_type_aliases(client, config)
    else:
        create_read_alias(client, config)

    create_load_alias(client, config)

    try:
        if old_indexes:
            max_wait_time = 15  # in minutes
            start_wait_time = time.time()
            is_snapshot_conflict = is_snapshot_running(client, old_indexes)
            if is_snapshot_conflict:
                logger.info(
                    format_log(
                        f"Snapshot in-progress prevents delete; waiting up to {max_wait_time} minutes",
                        action="ES Alias",
                    )
                )
                while (time.time() - start_wait_time) < (max_wait_time * 60) and is_snapshot_conflict:
                    logger.info(format_log("Waiting while snapshot is in-progress", action="ES Alias"))
                    time.sleep(90)
                    is_snapshot_conflict = is_snapshot_running(client, old_indexes)
            if is_snapshot_conflict:
                config["raise_status_code_3"] = True
                logger.error(
                    format_log(
                        f"Unable to delete index(es) '{old_indexes}' due to in-progress snapshot", action="ES Alias"
                    )
                )
            else:
                client.indices.delete(index=old_indexes, ignore_unavailable=False)
                logger.info(format_log(f"Deleted index(es) '{old_indexes}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"Unable to delete indexes: {old_indexes}", action="ES Alias"))
def set_final_index_config(client, index):
    es_settingsfile = str(settings.APP_DIR / "etl" / "es_config_objects.json")
    with open(es_settingsfile) as f:
        settings_dict = json.load(f)
    final_index_settings = settings_dict["final_index_settings"]

    current_settings = client.indices.get(index)[index]["settings"]["index"]

    client.indices.put_settings(final_index_settings, index)
    client.indices.refresh(index)
    for setting, value in final_index_settings.items():
        message = f'Changing "{setting}" from {current_settings.get(setting)} to {value}'
        logger.info(format_log(message, action="ES Settings"))
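# Illustrative sketch of the shape of the "final_index_settings" object read from es_config_objects.json.
# Only "refresh_interval" is known to exist from this module (toggle_refresh_on reads it); the other key is a
# hypothetical example of a setting commonly switched after a bulk load, not a claim about the file's contents.
_EXAMPLE_FINAL_INDEX_SETTINGS = {
    "refresh_interval": "1s",   # re-enable near-real-time refresh after the bulk load
    "number_of_replicas": 1,    # hypothetical: restore replicas that may be reduced during the load
}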
def swap_aliases(client, config):
    if client.indices.get_alias(config["index_name"], "*"):
        logger.info(format_log(f"Removing old aliases for index '{config['index_name']}'", action="ES Alias"))
        client.indices.delete_alias(config["index_name"], "_all")

    alias_patterns = config["query_alias_prefix"] + "*"
    old_indexes = []

    try:
        old_indexes = list(client.indices.get_alias("*", alias_patterns).keys())
        for old_index in old_indexes:
            client.indices.delete_alias(old_index, "_all")
            logger.info(format_log(f"Removing aliases from '{old_index}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"No aliases found for {alias_patterns}", action="ES Alias"))

    if config["create_award_type_aliases"]:
        create_award_type_aliases(client, config)
    else:
        create_read_alias(client, config)

    create_load_alias(client, config)

    try:
        if old_indexes:
            client.indices.delete(index=old_indexes, ignore_unavailable=False)
            logger.info(format_log(f"Deleted index(es) '{old_indexes}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"Unable to delete indexes: {old_indexes}", action="ES Alias"))
def deleted_awards(client: Elasticsearch, config: dict) -> None:
    """
    Find all awards connected to the deleted transactions; any of those awards that can no longer be found
    in the database must be deleted from ES as well.
    """
    deleted_ids = gather_deleted_ids(config)
    id_list = [{"key": deleted_id, "col": config["unique_key_field"]} for deleted_id in deleted_ids]
    award_ids = get_deleted_award_ids(client, id_list, config, settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*")
    if len(award_ids) == 0:
        logger.info(format_log("No related awards require deletion", action="Delete"))
        return

    deleted_award_ids = check_awards_for_deletes(award_ids)
    if len(deleted_award_ids) == 0:
        logger.info(format_log("No related awards require deletion", action="Delete"))
        return

    award_id_list = [
        {"key": deleted_award["generated_unique_award_id"], "col": config["unique_key_field"]}
        for deleted_award in deleted_award_ids
    ]
    delete_from_es(
        client,
        award_id_list,
        index=config["query_alias_prefix"],
        max_query_size=config["max_query_size"],
        use_aliases=True,
    )
    return
def delete_from_es(
    client: Elasticsearch,
    id_list: List[dict],
    index: str,
    max_query_size: int,
    use_aliases: bool = False,
    task_id: Optional[Tuple[int, str]] = None,
) -> None:
    """
    id_list = [
        {key: 'key1', col: 'transaction_id'},
        {key: 'key2', col: 'generated_unique_transaction_id'},
        ...
    ]
    - or -
    id_list = [
        {key: 'key1', col: 'award_id'},
        {key: 'key2', col: 'generated_unique_award_id'},
        ...
    ]
    """
    start = perf_counter()
    msg = f"Deleting up to {len(id_list):,} document{'s' if len(id_list) != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))

    if use_aliases:
        index = f"{index}-*"
    start_ = client.count(index=index)["count"]
    logger.info(format_log(f"Starting document count: {start_:,}", name=task_id, action="Delete"))

    col_to_items_dict = defaultdict(list)
    for item in id_list:
        col_to_items_dict[item["col"]].append(item["key"])

    for column, values in col_to_items_dict.items():
        logger.info(format_log(f"Deleting {len(values):,} of '{column}'", name=task_id, action="Delete"))
        values_generator = chunks(values, 1000)
        for v in values_generator:
            # IMPORTANT: This delete routine looks at just 1 index at a time. If there are duplicate records across
            # multiple indexes, those duplicates will not be caught by this routine. It is left as is because at the
            # time of this comment, we are migrating to using a single index.
            body = filter_query(column, v)
            response = client.search(index=index, body=json.dumps(body), size=max_query_size)
            delete_body = delete_query(response)
            try:
                client.delete_by_query(index=index, body=json.dumps(delete_body), refresh=True, size=max_query_size)
            except Exception:
                logger.exception(format_log("", name=task_id, action="Delete"))
                raise SystemExit(1)

    end_ = client.count(index=index)["count"]
    record_count = start_ - end_
    duration = perf_counter() - start
    msg = f"Delete operation took {duration:.2f}s. Removed {record_count:,} document{'s' if record_count != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))
    return
def gather_deleted_ids(config: dict) -> list:
    """
    Connect to S3 and gather all of the transaction ids stored in CSV files generated by the broker
    when transactions are removed from the DB.
    """
    if not config["process_deletes"]:
        logger.info(format_log("Skipping the S3 CSV fetch for deleted transactions", action="Delete"))
        return

    logger.info(format_log("Gathering all deleted transactions from S3", action="Delete"))
    start = perf_counter()

    bucket_objects = retrieve_s3_bucket_object_list(bucket_name=config["s3_bucket"])
    logger.info(format_log(f"{len(bucket_objects):,} files found in bucket '{config['s3_bucket']}'", action="Delete"))

    if config["verbose"]:
        logger.info(format_log(f"CSV data from {config['starting_date']} to now", action="Delete"))

    filtered_csv_list = [
        x
        for x in bucket_objects
        if (x.key.endswith(".csv") and not x.key.startswith("staging") and x.last_modified >= config["starting_date"])
    ]

    if config["verbose"]:
        logger.info(format_log(f"Found {len(filtered_csv_list)} csv files", action="Delete"))

    deleted_ids = {}

    for obj in filtered_csv_list:
        object_data = access_s3_object(bucket_name=config["s3_bucket"], obj=obj)

        # Ingest the CSV into a dataframe. pandas thinks some ids are dates, so disable parsing by reading
        # all columns as strings
        data = pd.read_csv(object_data, dtype=str)

        if "detached_award_proc_unique" in data:
            new_ids = ["CONT_TX_" + x.upper() for x in data["detached_award_proc_unique"].values]
        elif "afa_generated_unique" in data:
            new_ids = ["ASST_TX_" + x.upper() for x in data["afa_generated_unique"].values]
        else:
            logger.info(format_log(f"[Missing valid col] in {obj.key}", action="Delete"))
            continue  # skip files that lack a recognized unique-key column

        for uid in new_ids:
            if uid in deleted_ids:
                if deleted_ids[uid]["timestamp"] < obj.last_modified:
                    deleted_ids[uid]["timestamp"] = obj.last_modified
            else:
                deleted_ids[uid] = {"timestamp": obj.last_modified}

    if config["verbose"]:
        for uid, deleted_dict in deleted_ids.items():
            logger.info(format_log(f"id: {uid} last modified: {deleted_dict['timestamp']}", action="Delete"))

    logger.info(
        format_log(
            f"Gathering {len(deleted_ids):,} deleted transactions took {perf_counter() - start:.2f}s",
            action="Delete",
        )
    )
    return deleted_ids
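# Illustrative sketch (made-up values): gather_deleted_ids returns a dict keyed by the prefixed unique
# transaction key ("CONT_TX_..." / "ASST_TX_..."), where each value records the latest S3 delete-file
# timestamp observed for that key.
from datetime import datetime, timezone  # used only by the example below

_EXAMPLE_GATHERED_DELETED_IDS = {
    "CONT_TX_ABC123": {"timestamp": datetime(2021, 1, 5, tzinfo=timezone.utc)},
    "ASST_TX_XYZ789": {"timestamp": datetime(2021, 1, 7, tzinfo=timezone.utc)},
}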
def toggle_refresh_off(client, index):
    client.indices.put_settings({"refresh_interval": "-1"}, index)
    message = 'Set "refresh_interval": "-1" to turn auto refresh off'
    logger.info(format_log(message, action="ES Settings"))
def create_read_alias(client, config):
    alias_name = config["query_alias_prefix"]
    logger.info(format_log(f"Putting alias '{alias_name}' on {config['index_name']}", action="ES Alias"))
    put_alias(client, config["index_name"], alias_name, {})
def delete_docs_by_unique_key(
    client: Elasticsearch,
    key: str,
    value_list: list,
    task_id: str,
    index,
    refresh_after: bool = True,
    delete_chunk_size: int = 1000,
) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``value_list``.

    NOTE: This delete routine looks at just the index name given. If there are duplicate records across
    multiple indexes, an alias or wildcard should be provided for the ``index`` param that covers multiple
    indices, or this will need to be run once per index.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of the field in the targeted elasticsearch index that should have a unique value for
            every doc in the index. The field or sub-field provided MUST be of ``keyword`` type
            (or the ``_id`` meta field)
        value_list (list): if the key field has any of these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.
        refresh_after (bool): Whether to call ``_refresh`` on the index when all of the provided values in
            ``value_list`` have been processed for delete; defaults to ``True``. If many small deletes happen at a
            rapid rate, it may be best to set this ``False`` and await a deferred refresh afterward in the calling
            code. NOTE: This param will be ignored and a refresh will be attempted if this function errors-out
            during execution, in order to not leave un-refreshed deletes in the index.
        delete_chunk_size (int): the batch-size of the terms value-array given to each _delete_by_query call.
            Needs to be less than 65,536 (max values for any terms query), and less than the
            index.max_result_window setting. Ideally use ``config["partition_size"]`` (derived from
            --partition-size) to set this to a calibrated value. If not provided, uses 1000 as a safe default
            (10,000 resulted in some timeouts on a busy cluster).

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(format_log(f"Deleting up to {len(value_list):,} document(s)", action="Delete", name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    if not _is_allowed_key_field_type(client, key, index):
        msg = (
            f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    if delete_chunk_size > 65536:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        msg = (
            f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an "
            f"ES terms filter query"
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    chunks_processed = 0
    deleted = 0
    is_error = False
    try:
        values_generator = chunks(value_list, delete_chunk_size)
        for chunk_of_values in values_generator:
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            # _refresh is deferred until the end of chunk processing
            q = Search(using=client, index=index).filter("terms", **{key: chunk_of_values})  # type: Search
            # params:
            # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once
            # slices="auto": Will create parallel delete batches per shard
            q = q.params(conflicts="proceed", slices="auto")
            response = q.delete()
            # Some subtle errors come back on the response
            if response["timed_out"]:
                msg = f"Delete request timed out on cluster after {int(response['took']) / 1000:.2f}s"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)
            if response["failures"]:
                fail_snippet = "\n\t\t" + "\n\t\t".join(map(str, response["failures"][0:4])) + "\n\t\t" + "..."
                msg = f"Some docs failed to delete on cluster:{fail_snippet}"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)
            logger.info(
                format_log(
                    f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} "
                    f"in {int(response['took']) / 1000:.2f}s, "
                    f"and ignored {response['version_conflicts']:,} version conflicts",
                    action="Delete",
                    name=task_id,
                )
            )
            deleted += response["deleted"]
            chunks_processed += 1
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise
    finally:
        if deleted > 0 and (refresh_after or is_error):
            if not is_error:
                refresh_msg = "Refreshing index so deletes take effect"
            else:
                refresh_msg = "Attempting index refresh while handling error so deletes take effect"
            logger.info(format_log(refresh_msg, action="Delete", name=task_id))
            client.indices.refresh(index=index)
        if chunks_processed > 1 or is_error:
            # This log becomes redundant unless to log the sum of multiple chunks' deletes (or error)
            error_text = " before encountering an error" if is_error else ""
            duration = perf_counter() - start
            docs = f"document{'s' if deleted != 1 else ''}"
            msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}"
            logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
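# Illustrative sketch (hypothetical wiring, not taken from this module): a typical call deletes documents by
# their "_id" meta field before re-indexing them, mirroring what streaming_post_to_es does when
# delete_before_index=True. The task, index, and chunk-size values below are placeholders.
def _example_delete_before_reindex(client: Elasticsearch, docs: List[dict]) -> int:
    ids_to_delete = [doc["_id"] for doc in docs]  # "_id" is set on each doc during transform_data
    return delete_docs_by_unique_key(
        client,
        key="_id",
        value_list=ids_to_delete,
        task_id="example-task",
        index="example-index",
        refresh_after=False,  # defer the refresh when many small deletes happen back-to-back
        delete_chunk_size=1000,
    )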
def delete_awards(client: Elasticsearch, config: dict, task_id: str = "Sync DB Deletes") -> int:
    """Delete all awards in the Elasticsearch awards index that were deleted in the source database.

    This performs the deletes of award documents in ES in a series of batches, as there could be many. Deleting
    millions of awards may take a prohibitively long time, and it could be better to just re-index all documents
    from the DB instead.

    This requires looking up the awards-to-delete by finding the unique key of the parent award of each deleted
    transaction, and then getting the distinct list of unique award keys that are NOT present in the database;
    those are then deleted from the ES awards index.
        - The deleted transactions are recorded in a CSV delete log file in S3.
        - NOTE!! This order of operations therefore requires that ES award deletes be processed
          BEFORE transaction ES deletes are (both deletes cannot run in parallel).

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        task_id (str): label for this sub-step of the ETL

    Returns: Number of ES docs deleted in the index
    """
    deleted_tx_keys = _gather_deleted_transaction_keys(config)

    # While extracting unique award keys, the lookup is on transactions and must match against the unique
    # transaction id
    award_keys = _lookup_deleted_award_keys(
        client,
        ES_TRANSACTIONS_UNIQUE_KEY_FIELD,
        [*deleted_tx_keys],
        config,
        settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*",
    )
    award_keys = list(set(award_keys))  # get unique list of keys
    award_keys_len = len(award_keys)
    if award_keys_len == 0:
        logger.info(
            format_log(
                "No related awards found for deletion. Zero transaction docs found from which to derive awards.",
                action="Delete",
                name=task_id,
            )
        )
        return 0
    logger.info(
        format_log(f"Derived {award_keys_len} award keys from transactions in ES", action="Delete", name=task_id)
    )

    deleted_award_kvs = _check_awards_for_deletes(award_keys)
    deleted_award_kvs_len = len(deleted_award_kvs)
    if deleted_award_kvs_len == 0:
        # In this case it could be that an award's transaction was deleted, but not THE LAST transaction of that
        # award, i.e. the deleted transaction's "siblings" are still in the DB and therefore the parent award
        # should remain
        logger.info(
            format_log(
                "No related awards will be deleted. All derived awards are still in the DB.",
                action="Delete",
                name=task_id,
            )
        )
        return 0
    logger.info(
        format_log(
            f"{deleted_award_kvs_len} awards no longer in the DB will be removed from ES",
            action="Delete",
            name=task_id,
        )
    )

    values_list = [v for d in deleted_award_kvs for v in d.values()]
    return delete_docs_by_unique_key(
        client,
        key=config["unique_key_field"],
        value_list=values_list,
        task_id=task_id,
        index=config["index_name"],
        delete_chunk_size=config["partition_size"],
    )
def _lookup_deleted_award_keys(
    client: Elasticsearch,
    lookup_key: str,
    value_list: list,
    config: dict,
    index: Optional[str] = None,
    lookup_chunk_size: int = 50000,
) -> list:
    """Derive a list of award keys given a target index, a lookup field, and lookup values

    This returns a list of all unique award keys, which are compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of
    any document in the given ``index`` that matches the query. The matching query is a terms query that will return
    the doc if its ``lookup_key`` field has any value provided in ``value_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        lookup_key (str): name of the field in the targeted elasticsearch index by which we are looking up docs.
            The field or sub-field provided MUST be of ``keyword`` type (or the ``_id`` meta field)
        value_list (list): if the lookup_key field has any of these values, the document will be returned from the
            lookup
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        index (str): Optional name, alias, or pattern of index this query will target. Looked up via config if not
            provided
        lookup_chunk_size (int): the batch-size of the terms value-array to be looked up. Needs to be less than
            65,536 (max values for any terms query), and less than config["max_query_size"]

    Returns: list of values from the ES_AWARDS_UNIQUE_KEY_FIELD field of the looked-up documents
    """
    if index is None:
        index = f"{config['query_alias_prefix']}-*"

    if not _is_allowed_key_field_type(client, lookup_key, index):
        msg = (
            f'Cannot perform lookups in index "{index}" with key field "{lookup_key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > 65536:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        msg = (
            f"{lookup_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an "
            f"ES terms filter query"
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > config["max_query_size"]:
        # Some keys would be left undiscovered if our chunk was cut short by the query only returning a lesser subset
        msg = (
            f"{lookup_chunk_size} is greater than {config['max_query_size']}, which is the max number of query "
            f"results returnable from this index. Use a smaller chunk or increase max_result_window for this index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    award_key_list = []
    values_generator = chunks(value_list, lookup_chunk_size)
    for chunk_of_values in values_generator:
        q = Search(using=client, index=index).filter("terms", **{lookup_key: chunk_of_values})  # type: Search
        q.update_from_dict({"size": config["max_query_size"]})
        response = q.execute()
        if response["hits"]["total"]["value"] != 0:
            award_key_list += [x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD] for x in response["hits"]["hits"]]
    return award_key_list
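# Illustrative sketch: the terms-filter lookup above is roughly equivalent to this raw query body
# (elasticsearch-dsl places .filter() clauses in the bool "filter" context, so no relevance scoring is done).
# The field name and values here are placeholder examples, not taken from a real index.
_EXAMPLE_LOOKUP_QUERY_BODY = {
    "query": {"bool": {"filter": [{"terms": {"generated_unique_transaction_id": ["CONT_TX_ABC123"]}}]}},
    "size": 10000,  # corresponds to config["max_query_size"]; must not exceed index.max_result_window
}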
def streaming_post_to_es(
    client: Elasticsearch,
    chunk: list,
    index_name: str,
    job_name: str = None,
    delete_before_index: bool = True,
    delete_key: str = "_id",
) -> Tuple[int, int]:
    """
    Pump data into an Elasticsearch index.

    Args:
        client: Elasticsearch client
        chunk (List[dict]): list of dictionary objects holding field_name:value data
        index_name (str): name of targeted index
        job_name (str): name of ES ETL job being run, used in logging
        delete_before_index (bool): When true, attempts to delete given documents by a unique key before indexing
            them.

            NOTE: For incremental loads, we must "delete-before-index" due to the fact that on many of our indices,
            we have different values for _id and the routing key. Not doing this exposed a bug in our approach to
            expedite incremental UPSERTs aimed at allowing ES to overwrite documents when it encountered one already
            existing by a given _id. The problem is that the index operation uses the routing key to target only 1
            shard for its index/overwrite. If the routing key value changes between two incremental loads of the
            same doc with the same _id, it may get routed to a different shard and won't overwrite the original doc,
            leaving duplicates across all shards in the index.
        delete_key (str): The column (field) name used for value lookup in the given chunk to derive documents to be
            deleted, if delete_before_index is True. Currently defaulting to "_id", taking advantage of the fact
            that we are explicitly setting "_id" in the documents to be indexed, which is a unique key for each doc
            (e.g. the PK of the DB row)

    Returns: (succeeded, failed) tuple, which counts successful index doc writes vs. failed doc writes
    """
    success, failed = 0, 0
    try:
        if delete_before_index:
            value_list = [doc[delete_key] for doc in chunk]
            delete_docs_by_unique_key(
                client,
                delete_key,
                value_list,
                job_name,
                index_name,
                refresh_after=False,
            )
        for ok, item in helpers.streaming_bulk(
            client,
            actions=chunk,
            chunk_size=ES_BATCH_ENTRIES,
            max_chunk_bytes=ES_MAX_BATCH_BYTES,
            max_retries=10,
            index=index_name,
        ):
            if ok:
                success += 1
            else:
                failed += 1

    except Exception as e:
        logger.error(f"Error on partition {job_name}:\n\n{str(e)[:2000]}\n...\n{str(e)[-2000:]}\n")
        raise RuntimeError(f"{job_name}")

    logger.info(format_log(f"Success: {success:,} | Fail: {failed:,}", name=job_name, action="Index"))
    return success, failed
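# Illustrative sketch (field names are placeholders): the shape of a single document in ``chunk`` after
# transform_data has run. The "_id" and "routing" keys are meta fields consumed by the ES bulk helper
# (routing decides the target shard; _id lets a re-index overwrite the existing document).
_EXAMPLE_BULK_ACTION_DOC = {
    "_id": 12345,                       # set from the DB primary key (worker.field_for_es_id)
    "routing": "recipient_hash_value",  # only present when a routing_field was provided
    "award_id": 12345,
    "recipient_name": "EXAMPLE RECIPIENT",
}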