Code example #1
def transform_data(worker: TaskSpec,
                   records: List[dict],
                   converters: Dict[str, Callable],
                   routing_field: Optional[str] = None) -> List[dict]:
    logger.info(
        format_log(f"Transforming data", name=worker.name, action="Transform"))
    start = perf_counter()

    for record in records:
        for field, converter in converters.items():
            record[field] = converter(record[field])

        # Route all documents with the same recipient to the same shard
        # This allows accurate results and early termination for "top N" recipient category aggregation queries.
        # Recipient is the highest-cardinality category (over 2M unique values to aggregate against),
        # so this routing is needed for performance.
        # ES helper will pop any "meta" fields like "routing" from provided data dict and use them in the action
        if routing_field:
            record["routing"] = record[routing_field]

        # Explicitly setting the ES _id field to match the postgres PK value allows
        # bulk index operations to be upserts without creating duplicate documents
        # IF and ONLY IF a routing meta field is not also provided (one whose value differs
        # from the doc _id field). If explicit routing is done, UPSERTs may cause duplicates,
        # so docs must be deleted before UPSERTed. (More info in streaming_post_to_es(...))
        record["_id"] = record[worker.field_for_es_id]

    duration = perf_counter() - start
    logger.info(
        format_log(f"Transformation operation took {duration:.2f}s",
                   name=worker.name,
                   action="Transform"))
    return records
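
A minimal usage sketch of transform_data, assuming a hypothetical stand-in for TaskSpec (only the name and field_for_es_id attributes are needed); logger, format_log, and the real TaskSpec live in the surrounding module, and the sample field names and values below are made up for illustration:

from dataclasses import dataclass

@dataclass
class FakeTaskSpec:  # hypothetical stand-in for the real TaskSpec
    name: str
    field_for_es_id: str

sample_records = [
    {"award_id": 1, "amount": "100.50", "recipient_hash": "abc"},
    {"award_id": 2, "amount": "7.25", "recipient_hash": "def"},
]
docs = transform_data(
    FakeTaskSpec(name="partition-1", field_for_es_id="award_id"),
    sample_records,
    converters={"amount": float},    # cast the "amount" field from str to float
    routing_field="recipient_hash",  # copies recipient_hash into the "routing" meta field
)
# each doc now also carries "_id" (copied from award_id) for upsert-style bulk indexing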
Code example #2
def delete_docs_by_unique_key(client: Elasticsearch, key: str,
                              value_list: list, task_id: str, index) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in
    ``value_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in the targeted elasticsearch index that should have a unique value for
            every doc in the index. Ideally the field or sub-field provided is of ``keyword`` type.
        value_list (list): if key field has these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.

            NOTE: This delete routine looks at just the index name given. If there are duplicate records across
            multiple indexes, an alias or wildcard should be provided for ``index`` param that covers multiple
            indices, or this will need to be run once per index.

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(
            format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(
        format_log(f"Deleting up to {len(value_list):,} document(s)",
                   action="Delete",
                   name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    deleted = 0
    is_error = False
    try:
        # 65,536 is max number of terms that can be added to an ES terms filter query
        values_generator = chunks(value_list, 50000)
        for chunk_of_values in values_generator:
            # Creates an Elasticsearch query criteria for the _delete_by_query call
            q = ES_Q("terms", **{key: chunk_of_values})
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            #   https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            response = Search(using=client, index=index).filter(q).delete()
            chunk_deletes = response["deleted"]
            deleted += chunk_deletes
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise SystemExit(1)
    finally:
        error_text = " before encountering an error" if is_error else ""
        duration = perf_counter() - start
        docs = f"document{'s' if deleted != 1 else ''}"
        msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} {docs}{error_text}"
        logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
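
The chunks helper used above is defined elsewhere in the project; a minimal sketch of what such a generator presumably does:

from typing import Any, Generator, List

def chunks(items: List[Any], size: int) -> Generator[List[Any], None, None]:
    """Yield successive slices of ``items``, each at most ``size`` elements long."""
    for i in range(0, len(items), size):
        yield items[i:i + size]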
Code example #3
def deleted_awards(client: Elasticsearch, config: dict) -> None:
    """Delete all awards in the Elasticsearch awards index that were deleted in the source database.

    This performs the deletes of award documents in ES in a series of batches, as there could be many. Millions of
    awards deleted may take a prohibitively long time, and it could be better to just re-index all documents from
    the DB instead.

    This requires looking-up the awards-to-delete by finding the unique-key of each parent award to any deleted
    transaction, and then getting the distinct list of unique-award-keys that are NOT present in the database; then
    deleting those in the ES awards index.
    - The deleted transactions are recorded in a CSV delete log file in S3.
    - NOTE!! This order of operations therefore requires that ES award deletes be processed BEFORE transaction
      ES deletes are (both deletes cannot run in parallel).

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task

    Returns: None
    """
    deleted_ids = _gather_deleted_ids(config)
    # While extracting unique award keys, the lookup is on transactions and must match against the unique transaction id
    id_list = [{
        "key": deleted_id,
        "col": ES_TRANSACTIONS_UNIQUE_KEY_FIELD
    } for deleted_id in deleted_ids]
    award_ids = _lookup_deleted_award_ids(
        client, id_list, config,
        settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*")
    if (len(award_ids)) == 0:
        logger.info(
            format_log(f"No related awards require deletion", action="Delete"))
        return

    deleted_award_ids = _check_awards_for_deletes(award_ids)
    if len(deleted_award_ids) == 0:
        logger.info(
            format_log(f"No related awards require deletion", action="Delete"))
        return

    award_id_list = [{
        "key": deleted_award[config["unique_key_field"]],
        "col": config["unique_key_field"]
    } for deleted_award in deleted_award_ids]
    _delete_from_es(
        client,
        award_id_list,
        index=config["query_alias_prefix"],
        max_query_size=config["max_query_size"],
        use_aliases=True,
    )

    return
Code example #4
def transform_covid19_faba_data(worker: TaskSpec,
                                records: List[dict]) -> List[dict]:
    logger.info(
        format_log(f"Transforming data", name=worker.name, action="Transform"))
    start = perf_counter()
    results = {}

    for record in records:
        es_id_field = record[worker.field_for_es_id]
        distinct_award_key = record.pop("financial_account_distinct_award_key")
        award_id = record.pop("award_id")
        award_type = record.pop("type")
        generated_unique_award_id = record.pop("generated_unique_award_id")
        total_loan_value = record.pop("total_loan_value")
        obligated_sum = record.get("transaction_obligated_amount") or 0  # record value for key may be None
        outlay_sum = (  # record value for any key may be None
            (record.get("gross_outlay_amount_by_award_cpe") or 0)
            + (record.get("ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe") or 0)
            + (record.get("ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe") or 0)
        )
        temp_key = distinct_award_key
        if temp_key not in results:
            results[temp_key] = {
                "financial_account_distinct_award_key": disinct_award_key,
                "award_id": award_id,
                "type": award_type,
                "generated_unique_award_id": generated_unique_award_id,
                "total_loan_value": total_loan_value,
                "financial_accounts_by_award": list(),
                "obligated_sum": 0,
                "outlay_sum": 0,
                "_id": es_id_field,
            }
        results[temp_key]["obligated_sum"] += obligated_sum
        if record.get("is_final_balances_for_fy"):
            results[temp_key]["outlay_sum"] += outlay_sum
        results[temp_key]["financial_accounts_by_award"].append(record)

    if len(results) != len(records):
        msg = f"Transformed {len(records)} database records into {len(results)} documents for ingest"
        logger.info(format_log(msg, name=worker.name, action="Transform"))

    msg = f"Transformation operation took {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, name=worker.name, action="Transform"))
    return list(results.values())  # don't need the dict keys; return a list of the dict values
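
The accumulate-by-key pattern used above, shown in isolation with toy data (the field names and values here are illustrative only):

grouped = {}
for rec in [{"key": "A", "amt": 10}, {"key": "A", "amt": 5}, {"key": "B", "amt": 1}]:
    bucket = grouped.setdefault(rec["key"], {"key": rec["key"], "amt_sum": 0, "children": []})
    bucket["amt_sum"] += rec["amt"]   # running total per key, like obligated_sum above
    bucket["children"].append(rec)    # nested child records, like financial_accounts_by_award
# grouped now holds two documents: "A" (amt_sum 15, two children) and "B" (amt_sum 1, one child)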
Code example #5
def extract_records(task: TaskSpec) -> List[dict]:
    start = perf_counter()
    logger.info(
        format_log(f"Extracting data from source",
                   name=task.name,
                   action="Extract"))

    try:
        records = task.execute_sql_func(task.sql, True)
    except Exception as e:
        logger.exception(f"Failed on partition {task.name} with '{task.sql}'")
        raise e

    msg = f"{len(records):,} records extracted in {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, name=task.name, action="Extract"))
    return records
Code example #6
def count_of_records_to_process(config: dict) -> Tuple[int, int, int]:
    start = perf_counter()
    results = execute_sql_statement(obtain_min_max_count_sql(config), True,
                                    config["verbose"])[0]
    min_id, max_id, count = results["min"], results["max"], results["count"]
    msg = f"Found {count:,} {config['data_type']} DB records, took {perf_counter() - start:.2f}s"
    logger.info(format_log(msg, action="Extract"))
    return count, min_id, max_id
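
obtain_min_max_count_sql is not included in this snippet; a hedged sketch of the kind of statement it presumably builds, using the column aliases the caller expects (the config keys used for the table and id column below are assumptions, not the project's actual keys):

def obtain_min_max_count_sql_sketch(config: dict) -> str:
    # Hypothetical shape only; the real helper likely also applies date/predicate filters from config
    return (
        f"SELECT MIN({config['primary_key']}) AS min, "
        f"MAX({config['primary_key']}) AS max, "
        f"COUNT(*) AS count "
        f"FROM {config['sql_view']}"
    )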
Code example #7
def create_load_alias(client, config):
    # ensure the new index is added to the alias used for incremental loads.
    # If the alias is on multiple indexes, the loads will fail!
    logger.info(
        format_log(
            f"Putting alias '{config['write_alias']}' on {config['index_name']}",
            action="ES Alias"))
    put_alias(client, config["index_name"], config["write_alias"], {})
Code example #8
def create_award_type_aliases(client, config):
    for award_type, award_type_codes in INDEX_ALIASES_TO_AWARD_TYPES.items():

        alias_name = f"{config['query_alias_prefix']}-{award_type}"
        if config["verbose"]:
            msg = f"Putting alias '{alias_name}' on {config['index_name']} with award codes {award_type_codes}"
            logger.info(format_log(msg, action="ES Alias"))
        alias_body = {"filter": {"terms": {"type": award_type_codes}}}
        put_alias(client, config["index_name"], alias_name, alias_body)
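put_alias is a small project helper that is not shown here; its effect presumably corresponds to the low-level elasticsearch-py call below, which attaches a filtered alias so that queries against the alias only see documents whose type is in that award group:

# Roughly equivalent low-level call (a sketch, not necessarily how put_alias is implemented)
client.indices.put_alias(
    index=config["index_name"],
    name=alias_name,
    body={"filter": {"terms": {"type": award_type_codes}}},
)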
Code example #9
def load_data(worker: TaskSpec, records: List[dict],
              client: Elasticsearch) -> Tuple[int, int]:
    start = perf_counter()
    logger.info(
        format_log(f"Starting Index operation",
                   name=worker.name,
                   action="Index"))
    success, failed = streaming_post_to_es(
        client,
        records,
        worker.index,
        worker.name,
        delete_before_index=worker.is_incremental)
    logger.info(
        format_log(f"Index operation took {perf_counter() - start:.2f}s",
                   name=worker.name,
                   action="Index"))
    return success, failed
Code example #10
def create_index(index, client):
    try:
        does_index_exist = client.indices.exists(index)
    except Exception:
        logger.exception("Unable to query cluster for indices")
        raise SystemExit(1)
    if not does_index_exist:
        logger.info(format_log(f"Creating index '{index}'", action="Index"))
        client.indices.create(index=index)
        client.indices.refresh(index)
Code example #11
def toggle_refresh_on(client, index):
    response = client.indices.get(index)
    aliased_index_name = list(response.keys())[0]
    current_refresh_interval = response[aliased_index_name]["settings"]["index"]["refresh_interval"]
    es_settingsfile = str(settings.APP_DIR / "etl" / "es_config_objects.json")
    with open(es_settingsfile) as f:
        settings_dict = json.load(f)
    final_refresh_interval = settings_dict["final_index_settings"]["refresh_interval"]
    client.indices.put_settings({"refresh_interval": final_refresh_interval}, index)
    message = f'Changed "refresh_interval" from {current_refresh_interval} to {final_refresh_interval}'
    logger.info(format_log(message, action="ES Settings"))
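
es_config_objects.json itself is not included; toggle_refresh_on above (and set_final_index_config shown later) only require that json.load(f) returns something shaped roughly like this, so the values below are assumptions:

# Approximate expected shape of the parsed settings file
settings_dict = {
    "final_index_settings": {
        "refresh_interval": "1s",  # only this key is read by toggle_refresh_on
        # ... other index-level settings applied wholesale by set_final_index_config ...
    }
}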
Code example #12
def swap_aliases(client, config):
    if client.indices.get_alias(config["index_name"], "*"):
        logger.info(format_log(f"Removing old aliases for index '{config['index_name']}'", action="ES Alias"))
        client.indices.delete_alias(config["index_name"], "_all")

    alias_patterns = config["query_alias_prefix"] + "*"
    old_indexes = []

    try:
        old_indexes = list(client.indices.get_alias("*", alias_patterns).keys())
        for old_index in old_indexes:
            client.indices.delete_alias(old_index, "_all")
            logger.info(format_log(f"Removing aliases from '{old_index}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"No aliases found for {alias_patterns}", action="ES Alias"))

    if config["create_award_type_aliases"]:
        create_award_type_aliases(client, config)
    else:
        create_read_alias(client, config)

    create_load_alias(client, config)

    try:
        if old_indexes:
            max_wait_time = 15  # in minutes
            start_wait_time = time.time()
            is_snapshot_conflict = is_snapshot_running(client, old_indexes)
            if is_snapshot_conflict:
                logger.info(
                    format_log(
                        f"Snapshot in-progress prevents delete; waiting up to {max_wait_time} minutes",
                        action="ES Alias",
                    )
                )
            while (time.time() - start_wait_time) < (max_wait_time * 60) and is_snapshot_conflict:
                logger.info(format_log("Waiting while snapshot is in-progress", action="ES Alias"))
                time.sleep(90)
                is_snapshot_conflict = is_snapshot_running(client, old_indexes)
            if is_snapshot_conflict:
                config["raise_status_code_3"] = True
                logger.error(
                    format_log(
                        f"Unable to delete index(es) '{old_indexes}' due to in-progress snapshot", action="ES Alias"
                    )
                )
            else:
                client.indices.delete(index=old_indexes, ignore_unavailable=False)
                logger.info(format_log(f"Deleted index(es) '{old_indexes}'", action="ES Alias"))
    except Exception:
        logger.exception(format_log(f"Unable to delete indexes: {old_indexes}", action="ES Alias"))
Code example #13
def set_final_index_config(client, index):
    es_settingsfile = str(settings.APP_DIR / "etl" / "es_config_objects.json")
    with open(es_settingsfile) as f:
        settings_dict = json.load(f)
    final_index_settings = settings_dict["final_index_settings"]

    current_settings = client.indices.get(index)[index]["settings"]["index"]

    client.indices.put_settings(final_index_settings, index)
    client.indices.refresh(index)
    for setting, value in final_index_settings.items():
        message = f'Changing "{setting}" from {current_settings.get(setting)} to {value}'
        logger.info(format_log(message, action="ES Settings"))
Code example #14
def swap_aliases(client, config):
    if client.indices.get_alias(config["index_name"], "*"):
        logger.info(
            format_log(
                f"Removing old aliases for index '{config['index_name']}'",
                action="ES Alias"))
        client.indices.delete_alias(config["index_name"], "_all")

    alias_patterns = config["query_alias_prefix"] + "*"
    old_indexes = []

    try:
        old_indexes = list(
            client.indices.get_alias("*", alias_patterns).keys())
        for old_index in old_indexes:
            client.indices.delete_alias(old_index, "_all")
            logger.info(
                format_log(f"Removing aliases from '{old_index}'",
                           action="ES Alias"))
    except Exception:
        logger.exception(f"No aliases found for {alias_patterns}",
                         action="ES Alias")

    if config["create_award_type_aliases"]:
        create_award_type_aliases(client, config)
    else:
        create_read_alias(client, config)

    create_load_alias(client, config)

    try:
        if old_indexes:
            client.indices.delete(index=old_indexes, ignore_unavailable=False)
            logger.info(
                format_log(f"Deleted index(es) '{old_indexes}'",
                           action="ES Alias"))
    except Exception:
        logger.exception(f"Unable to delete indexes: {old_indexes}",
                         action="ES Alias")
Code example #15
def deleted_awards(client: Elasticsearch, config: dict) -> None:
    """
    so we have to find all the awards connected to these transactions,
    if we can't find the awards in the database, then we have to delete them from es
    """
    deleted_ids = gather_deleted_ids(config)
    id_list = [{
        "key": deleted_id,
        "col": config["unique_key_field"]
    } for deleted_id in deleted_ids]
    award_ids = get_deleted_award_ids(
        client, id_list, config,
        settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*")
    if (len(award_ids)) == 0:
        logger.info(
            format_log(f"No related awards require deletion", action="Delete"))
        return

    deleted_award_ids = check_awards_for_deletes(award_ids)
    if len(deleted_award_ids) == 0:
        logger.info(
            format_log(f"No related awards require deletion", action="Delete"))
        return

    award_id_list = [{
        "key": deleted_award["generated_unique_award_id"],
        "col": config["unique_key_field"]
    } for deleted_award in deleted_award_ids]
    delete_from_es(
        client,
        award_id_list,
        index=config["query_alias_prefix"],
        max_query_size=config["max_query_size"],
        use_aliases=True,
    )

    return
Code example #16
def delete_from_es(
    client: Elasticsearch,
    id_list: List[dict],
    index: str,
    max_query_size: int,
    use_aliases: bool = False,
    task_id: Optional[Tuple[int, str]] = None,
) -> None:
    """
        id_list = [
            {key: 'key1', col: 'transaction_id'},
            {key: 'key2', col: 'generated_unique_transaction_id'},
            ...
        ]
        - or -
        id_list = [
            {key: 'key1', col: 'award_id'},
            {key: 'key2', col: 'generated_unique_award_id'},
            ...
        ]

    """
    start = perf_counter()
    msg = f"Deleting up to {len(id_list):,} document{'s' if len(id_list) != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))

    if use_aliases:
        index = f"{index}-*"
    start_ = client.count(index=index)["count"]
    logger.info(
        format_log(f"Starting document count in index: {start_:,}",
                   name=task_id,
                   action="Delete"))
    col_to_items_dict = defaultdict(list)
    for l in id_list:
        col_to_items_dict[l["col"]].append(l["key"])

    for column, values in col_to_items_dict.items():
        logger.info(
            format_log(f"Deleting {len(values):,} of '{column}'",
                       name=task_id,
                       action="Delete"))
        values_generator = chunks(values, 1000)
        for v in values_generator:
            # IMPORTANT: This delete routine looks at just 1 index at a time. If there are duplicate records across
            # multiple indexes, those duplicates will not be caught by this routine. It is left as is because at the
            # time of this comment, we are migrating to using a single index.
            body = filter_query(column, v)
            response = client.search(index=index,
                                     body=json.dumps(body),
                                     size=max_query_size)
            delete_body = delete_query(response)
            try:
                client.delete_by_query(index=index,
                                       body=json.dumps(delete_body),
                                       refresh=True,
                                       size=max_query_size)
            except Exception:
                logger.exception(format_log("", name=task_id, action="Delete"))
                raise SystemExit(1)

    end_ = client.count(index=index)["count"]
    record_count = start_ - end_
    duration = perf_counter() - start
    msg = f"Delete operation took {duration:.2f}s. Removed {record_count:,} document{'s' if record_count != 1 else ''}"
    logger.info(format_log(msg, name=task_id, action="Delete"))
    return
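
filter_query and delete_query are helpers defined elsewhere; judging only from how they are used above (search on one column's values, then delete the returned hits), they might look roughly like this (sketches under that assumption, not the project's code):

def filter_query_sketch(column: str, values: list) -> dict:
    # Match any document whose <column> equals one of <values>
    return {"query": {"terms": {column: values}}}

def delete_query_sketch(response: dict) -> dict:
    # Convert the hits of the preceding search into an ids query for _delete_by_query
    doc_ids = [hit["_id"] for hit in response["hits"]["hits"]]
    return {"query": {"ids": {"values": doc_ids}}}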
Code example #17
def gather_deleted_ids(config: dict) -> dict:
    """
    Connect to S3 and gather all of the transaction ids stored in CSV files
    generated by the broker when transactions are removed from the DB.
    """

    if not config["process_deletes"]:
        logger.info(
            format_log(f"Skipping the S3 CSV fetch for deleted transactions",
                       action="Delete"))
        return {}

    logger.info(
        format_log(f"Gathering all deleted transactions from S3",
                   action="Delete"))
    start = perf_counter()

    bucket_objects = retrieve_s3_bucket_object_list(
        bucket_name=config["s3_bucket"])
    logger.info(
        format_log(
            f"{len(bucket_objects):,} files found in bucket '{config['s3_bucket']}'",
            action="Delete"))

    if config["verbose"]:
        logger.info(
            format_log(f"CSV data from {config['starting_date']} to now",
                       action="Delete"))

    filtered_csv_list = [
        x for x in bucket_objects
        if (x.key.endswith(".csv") and not x.key.startswith("staging")
            and x.last_modified >= config["starting_date"])
    ]

    if config["verbose"]:
        logger.info(
            format_log(f"Found {len(filtered_csv_list)} csv files",
                       action="Delete"))

    deleted_ids = {}

    for obj in filtered_csv_list:
        object_data = access_s3_object(bucket_name=config["s3_bucket"],
                                       obj=obj)

        # Ingest the CSV into a dataframe; force string dtype so pandas doesn't misinterpret id values
        data = pd.read_csv(object_data, dtype=str)

        if "detached_award_proc_unique" in data:
            new_ids = [
                "CONT_TX_" + x.upper()
                for x in data["detached_award_proc_unique"].values
            ]
        elif "afa_generated_unique" in data:
            new_ids = [
                "ASST_TX_" + x.upper()
                for x in data["afa_generated_unique"].values
            ]
        else:
            logger.info(
                format_log(f"[Missing valid col] in {obj.key}",
                           action="Delete"))
            continue  # neither expected id column is present; skip this CSV

        for uid in new_ids:
            if uid in deleted_ids:
                if deleted_ids[uid]["timestamp"] < obj.last_modified:
                    deleted_ids[uid]["timestamp"] = obj.last_modified
            else:
                deleted_ids[uid] = {"timestamp": obj.last_modified}

    if config["verbose"]:
        for uid, deleted_dict in deleted_ids.items():
            logger.info(
                format_log(
                    f"id: {uid} last modified: {deleted_dict['timestamp']}",
                    action="Delete"))

    logger.info(
        format_log(
            f"Gathering {len(deleted_ids):,} deleted transactions took {perf_counter() - start:.2f}s",
            action="Delete",
        ))
    return deleted_ids
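
retrieve_s3_bucket_object_list and access_s3_object are project helpers not shown here; hedged sketches of how they could be written with boto3, based only on how they are used above:

import io
import boto3

def retrieve_s3_bucket_object_list_sketch(bucket_name: str) -> list:
    # List every object summary in the bucket; summaries expose .key and .last_modified as used above
    bucket = boto3.resource("s3").Bucket(bucket_name)
    return list(bucket.objects.all())

def access_s3_object_sketch(bucket_name: str, obj) -> io.BytesIO:
    # Download a single object into memory so pandas can read it like a local file
    data = io.BytesIO()
    boto3.resource("s3").Bucket(bucket_name).download_fileobj(obj.key, data)
    data.seek(0)
    return data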
Code example #18
def toggle_refresh_off(client, index):
    client.indices.put_settings({"refresh_interval": "-1"}, index)
    message = f'Set "refresh_interval": "-1" to turn auto refresh off'
    logger.info(format_log(message, action="ES Settings"))
Code example #19
def create_read_alias(client, config):
    alias_name = config["query_alias_prefix"]
    logger.info(
        format_log(f"Putting alias '{alias_name}' on {config['index_name']}",
                   action="ES Alias"))
    put_alias(client, config["index_name"], alias_name, {})
Code example #20
def delete_docs_by_unique_key(
    client: Elasticsearch,
    key: str,
    value_list: list,
    task_id: str,
    index,
    refresh_after: bool = True,
    delete_chunk_size: int = 1000,
) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in
    ``value_list``.

    NOTE: This delete routine looks at just the index name given. If there are duplicate records across
    multiple indexes, an alias or wildcard should be provided for ``index`` param that covers multiple
    indices, or this will need to be run once per index.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of field in targeted elasticsearch index that should have a unique value for
            every doc in the index. The field or sub-field provided MUST be of ``keyword`` type (or ``_id`` meta field)
        value_list (list): if key field has these values, the document will be deleted
        task_id (str): name of ES ETL job being run, used in logging
        index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation.
        refresh_after (bool): Whether to call ``_refresh`` on the index when all of the provided values in
            ``value_list`` have been processed for delete; defaults to ``True``. If many small deletes happen at a
            rapid rate, it may be best to set this ``False`` and await a deferred refresh afterward in the calling
            code. NOTE: This param will be ignored and a refresh will be attempted if this function
            errors-out during execution, in order to not leave un-refreshed deletes in the index.
        delete_chunk_size (int): the batch-size of terms value-array given to each _delete_by_query call. Needs to be
            less than 65536 (max values for any terms query), and less than index.max_results_window setting. Ideally
            use ``config["partition_size"]`` (derived from --partition-size) to set this to a calibrated value. If not
            provided, uses 1000 as a safe default (10,000 resulted in some timeouts on a busy cluster).

    Returns: Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(
            format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(
        format_log(f"Deleting up to {len(value_list):,} document(s)",
                   action="Delete",
                   name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    if not _is_allowed_key_field_type(client, key, index):
        msg = (
            f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    if delete_chunk_size > 65536:
        # 65,536 is max number of terms that can be added to an ES terms filter query
        msg = (
            f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an ES "
            f"terms filter query")
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    chunks_processed = 0
    deleted = 0
    is_error = False
    try:
        values_generator = chunks(value_list, delete_chunk_size)
        for chunk_of_values in values_generator:
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            #   https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            # _refresh is deferred until the end of chunk processing
            q = Search(using=client, index=index).filter("terms", **{key: chunk_of_values})  # type: Search
            # params:
            # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once
            # slices="auto": Will create parallel delete batches per shard
            q = q.params(conflicts="proceed", slices="auto")
            response = q.delete()
            # Some subtle errors come back on the response
            if response["timed_out"]:
                msg = f"Delete request timed out on cluster after {int(response['took'])/1000:.2f}s"
                logger.error(format_log(msg=msg, action="Delete",
                                        name=task_id))
                raise RuntimeError(msg)
            if response["failures"]:
                fail_snippet = "\n\t\t" + "\n\t\t".join(
                    map(str, response["failures"][0:4])) + "\n\t\t" + "..."
                msg = f"Some docs failed to delete on cluster:{fail_snippet}"
                logger.error(format_log(msg=msg, action="Delete",
                                        name=task_id))
                raise RuntimeError(msg)
            logger.info(
                format_log(
                    f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} "
                    f"in {int(response['took'])/1000:.2f}s, "
                    f"and ignored {response['version_conflicts']:,} version conflicts",
                    action="Delete",
                    name=task_id,
                ))
            deleted += response["deleted"]
            chunks_processed += 1
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise
    finally:
        if deleted > 0 and (refresh_after or is_error):
            if not is_error:
                refresh_msg = "Refreshing index so deletes take effect"
            else:
                refresh_msg = "Attempting index refresh while handling error so deletes take effect"
            logger.info(format_log(refresh_msg, action="Delete", name=task_id))
            client.indices.refresh(index=index)
        if chunks_processed > 1 or is_error:
            # This log becomes redundant unless to log the sum of multiple chunks' deletes (or error)
            error_text = " before encountering an error" if is_error else ""
            duration = perf_counter() - start
            docs = f"document{'s' if deleted != 1 else ''}"
            msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}"
            logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
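
_is_allowed_key_field_type is not part of this snippet; based on the documented contract (the key must be the _id meta field or a keyword-type field), one hedged way to implement such a check with the field capabilities API:

def _is_allowed_key_field_type_sketch(client, key: str, index: str) -> bool:
    """True if ``key`` is the ``_id`` meta field or maps to ``keyword`` everywhere in ``index``."""
    if key == "_id":
        return True
    caps = client.field_caps(index=index, fields=key)
    field_info = caps.get("fields", {}).get(key, {})
    # field_caps groups results by mapped type; require the field to exist and be keyword-only
    return bool(field_info) and set(field_info.keys()) == {"keyword"}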
Code example #21
def delete_awards(client: Elasticsearch,
                  config: dict,
                  task_id: str = "Sync DB Deletes") -> int:
    """Delete all awards in the Elasticsearch awards index that were deleted in the source database.

    This performs the deletes of award documents in ES in a series of batches, as there could be many. Millions of
    awards deleted may take a prohibitively long time, and it could be better to just re-index all documents from
    the DB instead.

    This requires looking-up the awards-to-delete by finding the unique-key of each parent award to any deleted
    transaction, and then getting the distinct list of unique-award-keys that are NOT present in the database; then
    deleting those in the ES awards index.
    - The deleted transactions are recorded in a CSV delete log file in S3.
    - NOTE!! This order of operations therefore requires that ES award deletes be processed BEFORE transaction
      ES deletes are (both deletes cannot run in parallel).

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        task_id (str): label for this sub-step of the ETL

    Returns: Number of ES docs deleted in the index
    """
    deleted_tx_keys = _gather_deleted_transaction_keys(config)
    # While extracting unique award keys, the lookup is on transactions and must match against the unique transaction id
    award_keys = _lookup_deleted_award_keys(
        client,
        ES_TRANSACTIONS_UNIQUE_KEY_FIELD,
        [*deleted_tx_keys],
        config,
        settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*",
    )
    award_keys = list(set(award_keys))  # get unique list of keys
    award_keys_len = len(award_keys)
    if award_keys_len == 0:
        logger.info(
            format_log(
                f"No related awards found for deletion. Zero transaction docs found from which to derive awards.",
                action="Delete",
                name=task_id,
            ))
        return 0
    logger.info(
        format_log(
            f"Derived {award_keys_len} award keys from transactions in ES",
            action="Delete",
            name=task_id))

    deleted_award_kvs = _check_awards_for_deletes(award_keys)
    deleted_award_kvs_len = len(deleted_award_kvs)
    if deleted_award_kvs_len == 0:
        # In this case it could be an award's transaction was deleted, but not THE LAST transaction of that award.
        # i.e. the deleted transaction's "siblings" are still in the DB and therefore the parent award should remain
        logger.info(
            format_log(
                f"No related awards found will be deleted. All derived awards are still in the DB.",
                action="Delete",
                name=task_id,
            ))
        return 0
    logger.info(
        format_log(
            f"{deleted_award_kvs_len} awards no longer in the DB will be removed from ES",
            action="Delete",
            name=task_id))

    values_list = [v for d in deleted_award_kvs for v in d.values()]
    return delete_docs_by_unique_key(
        client,
        key=config["unique_key_field"],
        value_list=values_list,
        task_id=task_id,
        index=config["index_name"],
        delete_chunk_size=config["partition_size"],
    )
Code example #22
def _lookup_deleted_award_keys(
    client: Elasticsearch,
    lookup_key: str,
    value_list: list,
    config: dict,
    index: Optional[str] = None,
    lookup_chunk_size: int = 50000,
) -> list:
    """Derive a list of award keys given a target index, Lookup field, and lookup values

    This returns a list of all unique award keys, which are compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of
    any document in the given ``index`` that matches the query. The matching query is a terms query that will return
    the doc if its ``lookup_key`` field has any value provided in ``value_list``.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        lookup_key (str): name of field in targeted elasticsearch index by which we are looking up docs. The field or
            sub-field provided MUST be of ``keyword`` type (or ``_id`` meta field)
        value_list (list): if lookup_key field has any of these values, the document will be returned from the lookup
        config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task
        index (str): Optional name, alias, or pattern of index this query will target. Looks up via config if not
            provided
        lookup_chunk_size (int): the batch-size of terms value-array to be looked-up. Needs to be less
            than 65536 (max values for any terms query), and less than config["max_query_size"]

    Returns: list of values for the ES_AWARDS_UNIQUE_KEY_FIELD fields in the looked-up documents.
    """
    if index is None:
        index = f"{config['query_alias_prefix']}-*"

    if not _is_allowed_key_field_type(client, lookup_key, index):
        msg = (
            f'Cannot perform lookups in index "{index}" with key field "{lookup_key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > 65536:
        # 65,536 is max number of terms that can be added to an ES terms filter query
        msg = (
            f"{lookup_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an ES "
            f"terms filter query")
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    if lookup_chunk_size > config["max_query_size"]:
        # Some keys would be left undiscovered if our chunk was cut short by the query only returning a lesser subset
        msg = (
            f"{lookup_chunk_size} is greater {config['max_query_size']}, which is the max number of query "
            f"results returnable from this index. Use a smaller chunk or increase max_result_window for this index."
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    award_key_list = []
    values_generator = chunks(value_list, lookup_chunk_size)
    for chunk_of_values in values_generator:
        q = Search(using=client, index=index).filter("terms", **{lookup_key: chunk_of_values})  # type: Search
        q.update_from_dict({"size": config["max_query_size"]})
        response = q.execute()
        if response["hits"]["total"]["value"] != 0:
            award_key_list += [
                x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD]
                for x in response["hits"]["hits"]
            ]
    return award_key_list
Code example #23
def streaming_post_to_es(
    client: Elasticsearch,
    chunk: list,
    index_name: str,
    job_name: str = None,
    delete_before_index: bool = True,
    delete_key: str = "_id",
) -> Tuple[int, int]:
    """
    Pump data into an Elasticsearch index.

    Args:
        client: Elasticsearch client
        chunk (List[dict]): list of dictionary objects holding field_name:value data
        index_name (str): name of the targeted index
        job_name (str): name of ES ETL job being run, used in logging
        delete_before_index (bool): When true, attempts to delete given documents by a unique key before indexing them.
            NOTE: For incremental loads, we must "delete-before-index" due to the fact that on many of our indices,
                we have different values for _id and routing key.
                Not doing this exposed a bug in our approach to expedite incremental UPSERTS aimed at allowing ES to
                overwrite documents when it encountered one already existing by a given _id. The problem is that the
                index operation uses the routing key to target only 1 shard for its index/overwrite. If the routing key
                value changes between two incremental loads of the same doc with the same _id, it may get routed to a
                different shard and won't overwrite the original doc, leaving duplicates across all shards in the index.
        delete_key (str): The column (field) name used for value lookup in the given chunk to derive documents to be
            deleted, if delete_before_index is True. Currently defaulting to "_id", taking advantage of the fact
            that we are explicitly setting "_id" in the documents to-be-indexed, which is a unique key for each doc
            (e.g. the PK of the DB row)

    Returns: (succeeded, failed) tuple, which counts successful index doc writes vs. failed doc writes
    """

    success, failed = 0, 0
    try:
        if delete_before_index:
            value_list = [doc[delete_key] for doc in chunk]
            delete_docs_by_unique_key(
                client,
                delete_key,
                value_list,
                job_name,
                index_name,
                refresh_after=False,
            )
        for ok, item in helpers.streaming_bulk(
                client,
                actions=chunk,
                chunk_size=ES_BATCH_ENTRIES,
                max_chunk_bytes=ES_MAX_BATCH_BYTES,
                max_retries=10,
                index=index_name,
        ):
            if ok:
                success += 1
            else:
                failed += 1

    except Exception as e:
        logger.error(
            f"Error on partition {job_name}:\n\n{str(e)[:2000]}\n...\n{str(e)[-2000:]}\n"
        )
        raise RuntimeError(f"{job_name}")

    logger.info(
        format_log(f"Success: {success:,} | Fail: {failed:,}",
                   name=job_name,
                   action="Index"))
    return success, failed
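
Taken together, the extract/transform/load helpers above compose into a per-partition pipeline roughly like the sketch below; the real job wires converters, routing, and TaskSpec fields from its configuration, which is not shown here:

def process_partition_sketch(task, client):
    # Roughly how one partition flows through the functions shown above
    records = extract_records(task)                       # pull rows from the source DB
    documents = transform_data(task, records,
                               converters={},             # real converters come from the job config
                               routing_field=None)        # set when recipient-based routing is wanted
    success, failed = load_data(task, documents, client)  # bulk index, delete-before-index if incremental
    return success, failed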