def step_impl(context, module_name):
    """Asserts that the Hive results stored in S3 for the given kickstart module match the generated input files."""
    schema_config = context.kickstart_schema_config[module_name]

    if schema_config["record_layout"].lower() == "csv":
        for collection in schema_config["schema"].keys():
            s3_result_key = os.path.join(context.kickstart_hive_result_path,
                                         f"e2e_{collection}.csv")
            console_printer.print_info(f"S3 Request Location: {s3_result_key}")
            file_content = aws_helper.get_s3_object(
                None, context.published_bucket, s3_result_key).decode("utf-8")
            actual_content = (file_content.replace("\t", ",").replace(
                "NULL", "None").strip().splitlines())
            expected_file_name = [
                file for file in context.kickstart_current_run_input_files
                if collection in file
            ][0]
            console_printer.print_info(
                f"Expected File Name: {expected_file_name}")
            expected_content = file_helper.get_contents_of_file(
                expected_file_name, False).splitlines()[1:]

            for actual_line, expected_line in zip(actual_content,
                                                  expected_content):
                assert (
                    actual_line.lower() == expected_line.lower()
                ), f"Expected result of '{expected_line}' does not match actual '{actual_line}' for collection {collection}"

    elif schema_config["record_layout"].lower() == "json":
        for collection in schema_config["schema"].keys():
            s3_result_key = os.path.join(context.kickstart_hive_result_path,
                                         f"e2e_{collection}.csv")
            console_printer.print_info(f"S3 Request Location: {s3_result_key}")
            file_content = aws_helper.get_s3_object(
                None, context.published_bucket, s3_result_key).decode("utf-8")
            actual_content = file_content.replace("NULL",
                                                  "None").strip().splitlines()
            console_printer.print_info(
                f"These are the local input files for the current run: {context.kickstart_current_run_input_files}"
            )
            expected_file_name = [
                file for file in context.kickstart_current_run_input_files
                if f"{module_name}-{collection}" in file
            ][0]
            console_printer.print_info(
                f"Expected File Name: {expected_file_name}")
            expected_json = json.loads(
                file_helper.get_contents_of_file(expected_file_name,
                                                 False))["data"]
            expected_content = "\n".join([
                "\t".join([str(record[field]) for field in record])
                for record in expected_json
            ]).splitlines()

            for actual_line, expected_line in zip(actual_content,
                                                  expected_content):
                assert (
                    actual_line.lower() == expected_line.lower()
                ), f"Expected result of '{expected_line}' does not match actual '{actual_line}' for collection {collection}"
def step_(context, input_json):
    """Uploads the given local PDM input JSON file to the published S3 bucket under the PDM test input prefix."""
    expected_file_name = os.path.join(
        context.fixture_path_local, "pdm_data", "input_data", input_json
    )
    input_file = file_helper.get_contents_of_file(expected_file_name, False)
    inputs_s3_key = os.path.join(context.pdm_test_input_s3_prefix, input_json)
    aws_helper.put_object_in_s3(input_file, context.published_bucket, inputs_s3_key)
def step_impl(context, dlq_file_template):
    """Asserts that, for each topic under test, the DLQ message generated from the given template arrives in the ingest S3 bucket and matches the expected content."""
    for topic in context.topics_for_test:
        dlq_file = None
        for dlq_files_and_topic_tuple in context.kafka_generated_dlq_output_files:
            if topic["topic"] == dlq_files_and_topic_tuple[0]:
                for dlq_file_for_topic in dlq_files_and_topic_tuple[1]:
                    if dlq_file_template in dlq_file_for_topic:
                        dlq_file = dlq_file_for_topic

        if dlq_file is None:
            raise AssertionError(
                f"No generated dlq file could be found for dlq template of {dlq_file_template}"
            )

        expected_file_content = file_helper.get_contents_of_file(
            dlq_file, True)
        id_object = file_helper.get_id_object_from_json_file(dlq_file)

        dlq_full_file_path_s3 = os.path.join(
            context.s3_dlq_path_and_date_prefix, id_object)

        try:
            console_printer.print_info(
                f"Waiting for dlq file in s3 with prefix of '{dlq_full_file_path_s3}' in bucket '{context.s3_ingest_bucket}'"
            )
            aws_helper.wait_for_file_to_be_in_s3(context.s3_ingest_bucket,
                                                 dlq_full_file_path_s3,
                                                 context.timeout)
        except Exception as ex:
            raise AssertionError(ex)

        file_data = aws_helper.retrieve_files_from_s3(context.s3_ingest_bucket,
                                                      dlq_full_file_path_s3)
        number_files = len(file_data)
        if number_files != 1:
            raise AssertionError(
                f"There should be a single dlq file for {dlq_full_file_path_s3} but {number_files} were found"
            )

        input_json = json.loads(expected_file_content)
        output_json = json.loads(file_data[0])

        console_printer.print_info(
            f"Asserting expected dlq message content '{input_json}' is contained in actual '{output_json}'"
        )

        assert input_json["key"] in output_json["body"]
        assert input_json["reason"] in output_json["reason"]

        input_json_formatted = file_helper.get_json_with_replaced_values(
            expected_file_content)
        output_json_formatted = file_helper.get_json_with_replaced_values(
            file_data[0])

        console_printer.print_info(
            f"Asserting formatted dlq message actual '{output_json_formatted}' matches expected '{input_json_formatted}'"
        )

        assert input_json_formatted == output_json_formatted
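# Hedged sketch (assumption): the formatted comparison above relies on
# file_helper.get_json_with_replaced_values to blank out volatile fields before
# an exact match. The helper below only illustrates that idea; the field names
# and placeholder value are invented, and the real logic lives in file_helper.
def _replace_volatile_values_sketch(raw_json_text, volatile_fields=("timestamp",)):
    parsed = json.loads(raw_json_text)
    for field in volatile_fields:
        if field in parsed:
            parsed[field] = "REPLACED"
    return json.dumps(parsed, sort_keys=True)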
def step_(context, expected_result_file_name):
    """Asserts that the PDM results file in S3 matches the expected local results file."""
    console_printer.print_info(f"S3 Request Location: {context.pdm_results_s3_file}")
    actual = aws_helper.get_s3_object(
        None, context.published_bucket, context.pdm_results_s3_file
    ).decode("ascii")
    actual_comma_delimited = actual.replace("\t", ",").strip()

    expected_file_name = os.path.join(
        context.fixture_path_local, "pdm_data", "expected", expected_result_file_name
    )
    expected = file_helper.get_contents_of_file(expected_file_name, False)
    expected_comma_delimited = expected.replace("\t", ",").strip()

    assert (
        expected_comma_delimited == actual_comma_delimited
    ), f"Expected result '{expected_comma_delimited}' does not match actual '{actual_comma_delimited}'"
def get_locally_generated_snapshot_file_records(snapshot_file):
    """Returns the sorted contents of the local snapshot file as an array.

    Keyword arguments:
    snapshot_file -- full path to the file
    """
    expected_snapshot_file_s3_contents = file_helper.get_contents_of_file(
        snapshot_file, False)

    snapshot_records = []
    for line in expected_snapshot_file_s3_contents.splitlines():
        unsorted_json_record = json.loads(line)
        snapshot_records.append(
            json.dumps(unsorted_json_record, sort_keys=True))

    return snapshot_records
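# Hedged usage sketch for the helper above: asserting that every locally
# generated snapshot record also appears in a set of records retrieved from S3.
# The s3_snapshot_records argument is a hypothetical list of JSON strings; only
# the helper above is taken from the source.
def assert_local_snapshot_records_in_s3(snapshot_file, s3_snapshot_records):
    normalised_s3_records = [
        json.dumps(json.loads(record), sort_keys=True)
        for record in s3_snapshot_records
    ]
    for record in get_locally_generated_snapshot_file_records(snapshot_file):
        assert record in normalised_s3_records, (
            f"Record '{record}' not found in S3 snapshot records")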
def step_(context, expected_result_file_name):
    """Asserts that the ADG results file in S3 matches the expected local snapshot results file, ignoring whitespace."""
    console_printer.print_info(
        f"S3 Request Location: {context.adg_results_s3_file}")
    actual = (aws_helper.get_s3_object(
        None, context.published_bucket,
        context.adg_results_s3_file).decode("ascii").replace("\t", "").replace(
            " ", "").strip())

    expected_file_name = os.path.join(
        context.fixture_path_local,
        "snapshot_data",
        "expected",
        expected_result_file_name,
    )
    expected = (file_helper.get_contents_of_file(
        expected_file_name, False).replace("\t", "").replace(" ", "").strip())

    assert (expected == actual
            ), f"Expected result '{expected}' does not match actual '{actual}'"
def step_impl(context, dlq_file_template):
    """Asserts that, for each topic under test, the record matching the given DLQ template is not present in HBase."""
    for topic in context.topics_for_test:
        dlq_file = None
        for dlq_files_and_topic_tuple in context.kafka_generated_dlq_output_files:
            if topic["topic"] == dlq_files_and_topic_tuple[0]:
                for dlq_file_for_topic in dlq_files_and_topic_tuple[1]:
                    if dlq_file_template in dlq_file_for_topic:
                        dlq_file = dlq_file_for_topic

        if dlq_file is None:
            raise AssertionError(
                f"No generated dlq file could be found for dlq template of {dlq_file_template}"
            )

        expected_file_content = file_helper.get_contents_of_file(
            dlq_file, True)
        id_object = file_helper.get_id_object_from_json_file(dlq_file)

        test_run_topic_name = template_helper.get_topic_name(topic["topic"])
        file_comparer.assert_specific_id_missing_in_hbase(
            test_run_topic_name, id_object, 5, True)
def files_upload_to_s3(context, local_file_list, folder_name, upload_method):
    """Uploads the given local files to the published S3 bucket, either as-is or encrypted with client-side encryption metadata."""
    for file in local_file_list:
        if upload_method.lower() == "unencrypted":
            console_printer.print_info(
                f"Data will be uploaded in {upload_method} format to s3 bucket"
            )
            console_printer.print_info(f"The file name is {file}")
            file_name = os.path.basename(file)
            input_file = file_helper.get_contents_of_file(file, False)
            inputs_s3_key = os.path.join(folder_name, file_name)
            console_printer.print_info(
                f"Uploading the local file {file} with basename as {file_name} into s3 bucket {context.published_bucket} using key name as {inputs_s3_key}"
            )
            aws_helper.put_object_in_s3(
                input_file, context.published_bucket, inputs_s3_key
            )
        elif upload_method.lower() == "encrypted":
            console_printer.print_info(
                f"Data will be uploaded in {upload_method} format to s3 bucket"
            )
            console_printer.print_info(f"The input file name is {file}")

            file_name = os.path.basename(file)
            encrypted_key = context.encryption_encrypted_key
            master_key = context.encryption_master_key_id
            plaintext_key = context.encryption_plaintext_key
            [
                file_iv_int,
                file_iv_whole,
            ] = historic_data_load_generator.generate_initialisation_vector()

            console_printer.print_info(f"Extracting the raw data from local directory")
            data = file_helper.get_contents_of_file(file, False).encode("utf-8")

            console_printer.print_info(f"Applying encryption to the raw data")
            input_data = historic_data_load_generator.encrypt(
                file_iv_whole, plaintext_key, data
            )
            inputs_s3_key = os.path.join(folder_name, file_name + ".enc")

            all_metadata = json.loads(
                historic_data_load_generator.generate_encryption_metadata_for_metadata_file(
                    encrypted_key, master_key, plaintext_key, file_iv_int
                )
            )

            console_printer.print_info("Metadata of for encrypted file is \n")
            console_printer.print_info(f"{json.dumps(all_metadata)}")

            metadata = {
                "iv": all_metadata["initialisationVector"],
                "ciphertext": all_metadata["encryptedEncryptionKey"],
                "datakeyencryptionkeyid": all_metadata["keyEncryptionKeyId"],
            }
            console_printer.print_info(
                f"Uploading the local file {file} with basename as {file_name} into s3 bucket {context.published_bucket} using key name as {inputs_s3_key} and along with metadata"
            )

            aws_helper.put_object_in_s3_with_metadata(
                input_data, context.published_bucket, inputs_s3_key, metadata=metadata
            )
def _generate_input_data_files_threaded(
    s3_bucket,
    s3_prefix,
    file_count,
    record_count,
    short_topic,
    input_template,
    encrypted_key,
    plaintext_key,
    master_key,
    input_folder,
    max_worker_count,
):
    """Generates required historic data files from the files in the given folder using multiple threads.

    Keyword arguments:
    s3_bucket -- the s3 bucket to send input files to
    s3_prefix -- the s3 prefix to send input files to
    file_count -- the number of files to create for the collection
    record_count -- the number of records per file
    short_topic -- the short topic
    input_template -- the name and location for the input template json file
    encrypted_key -- the encrypted version of the plaintext key
    plaintext_key -- the plaintext data key for encrypting the data file
    master_key -- the master key used to encrypt the data key
    input_folder -- the folder to store the generated input files in
    max_worker_count -- max thread number
    """
    input_base_content = file_helper.get_contents_of_file(
        input_template, False)
    [file_iv_int, file_iv_whole] = generate_initialisation_vector()
    encryption_json_text = generate_encryption_metadata_for_metadata_file(
        encrypted_key, master_key, plaintext_key, file_iv_int)

    with ThreadPoolExecutor(max_workers=max_worker_count) as executor_input:
        future_results_input = []

        for file_number in range(1, int(file_count) + 1):
            future_results_input.append(
                executor_input.submit(
                    _generate_input_files_with_different_timestamps,
                    s3_bucket,
                    s3_prefix,
                    file_number,
                    record_count,
                    file_count,
                    short_topic,
                    file_iv_whole,
                    input_base_content,
                    encrypted_key,
                    plaintext_key,
                    master_key,
                    input_folder,
                    encryption_json_text,
                ))

        wait(future_results_input)
        for future in future_results_input:
            try:
                yield future.result()
            except Exception as ex:
                console_printer.print_error_text(
                    f"Individual file generation failed with error: '{ex}'")
def assert_specific_file_stored_in_hbase(topic_name,
                                         file_full_path,
                                         timeout,
                                         record_expected_in_hbase=True,
                                         wrap_id=False):
    """Checks the specific file in stored in HBase and raises assertion error if not.

    Keyword arguments:
    topic_name -- full topic name
    file_full_path -- the full path and name of the file to check
    timeout -- the timeout in seconds
    record_expected_in_hbase -- true if the record should be in HBase and false if it should not (default True)
    wrap_id -- True if the id format should be wrapped with an "id" object (default False)
    """
    file_contents = file_helper.get_contents_of_file(file_full_path, True)
    id_object = file_helper.get_id_object_from_json_file(file_full_path)

    input_formatted = file_helper.get_json_with_replaced_values(file_contents)

    console_printer.print_info(
        f"Retrieving specific file from HBase using topic name '{topic_name}', "
        +
        f"id of '{id_object}', expected setting of '{record_expected_in_hbase}' "
        + f"and wrap id setting of '{wrap_id}'")

    file_found = False
    file_matches = False
    count = 1
    while (not file_found or (record_expected_in_hbase
                              and not file_matches)) and count <= timeout:
        hbase_data = aws_helper.retrieve_data_from_hbase(
            id_object, topic_name, wrap_id)
        if hbase_data:
            file_found = True
            hbase_data_json = json.loads(hbase_data)
            hbase_contents = json.dumps(hbase_data_json, sort_keys=True)
            hbase_formatted = file_helper.get_json_with_replaced_values(
                hbase_contents)
            if input_formatted == hbase_formatted:
                file_matches = True
        time.sleep(1)
        count += 1

    if record_expected_in_hbase:
        if not file_found:
            raise AssertionError(
                f"File {file_full_path}: content '{input_formatted}' not found in HBase"
            )
        elif not file_matches:
            raise AssertionError(
                f"Input mismatch for file '{file_full_path}': content '{input_formatted}' not matched by HBase content '{hbase_formatted}'"
            )
    elif file_found:
        raise AssertionError(
            f"File '{file_full_path}': content '{input_formatted}' found in HBase"
        )
    elif file_matches:
        raise AssertionError(
            f"Input mismatch for file '{file_full_path}': content '{input_formatted}' matched by HBase content '{hbase_formatted}'"
        )

    return topic_name
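# Hedged convenience sketch built on the helper above (not in the source):
# applies the single-file assertion to every file generated for a topic.
def assert_all_files_stored_in_hbase(topic_name,
                                     file_full_paths,
                                     timeout,
                                     record_expected_in_hbase=True,
                                     wrap_id=False):
    for file_full_path in file_full_paths:
        assert_specific_file_stored_in_hbase(topic_name, file_full_path, timeout,
                                             record_expected_in_hbase, wrap_id)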