Example #1
def test_score_file_transforms_to_single_values(spark_session):
    input_rows = [
        (10, 1),
        (20, 2),
        (30, 3),
        (40, 4),
        (50, 5),
        (60, 6),
        (70, 7),
        (80, 8),
        (90, 9)]
    input_df = spark_session.createDataFrame(input_rows, scoring_input_schema())
    subcategory_df = define_classification_subcategory_df(spark_session)
    result_df = score_file(subcategory_df, input_df)
    expected_results = [
        [10, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [20, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [30, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [40, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [50, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [60, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [70, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [80, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [90, 0, 0, 0, 0, 0, 0, 0, 0, 1],
    ]
    col_names = [InputColumnNames.RECORD_ID, ClassificationCategoryAbbreviations.AUTO_SALES,
                 ClassificationCategoryAbbreviations.EDUCATION,
                 ClassificationCategoryAbbreviations.INSURANCE, ClassificationCategoryAbbreviations.FINANCIAL_SERVICES,
                 ClassificationCategoryAbbreviations.REAL_ESTATE, ClassificationCategoryAbbreviations.JOBS,
                 ClassificationCategoryAbbreviations.LEGAL, ClassificationCategoryAbbreviations.HOME_SERVICES,
                 ClassificationCategoryAbbreviations.OTHER]
    extracted_row_values = extract_rows_for_col(result_df, col_names, InputColumnNames.RECORD_ID)
    assert expected_results == extracted_row_values
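
The assertion above compares against plain Python lists, so the extract_rows_for_col helper presumably selects the named columns, orders by the record id, and collects the values. Its real implementation is not shown in these examples; the body below is a minimal sketch under that assumption:

def extract_rows_for_col(data_frame, col_names, sort_col):
    # Hypothetical helper: select only the requested columns, order by the
    # record id column so results are deterministic, and return lists of
    # values that can be compared directly against expected_results.
    rows = data_frame.select(col_names).orderBy(sort_col).collect()
    return [[row[name] for name in col_names] for row in rows]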
Example #2
def test_score_file_returns_category_abbrev_column_values(spark_session):
    input_rows = [(10, 1)]
    input_df = spark_session.createDataFrame(input_rows, scoring_input_schema())
    subcategory_df = define_classification_subcategory_df(spark_session)
    result_df = score_file(subcategory_df, input_df)
    col_names = [InputColumnNames.RECORD_ID, ClassificationCategoryAbbreviations.AUTO_SALES,
                 ClassificationCategoryAbbreviations.EDUCATION,
                 ClassificationCategoryAbbreviations.INSURANCE, ClassificationCategoryAbbreviations.FINANCIAL_SERVICES,
                 ClassificationCategoryAbbreviations.REAL_ESTATE, ClassificationCategoryAbbreviations.JOBS,
                 ClassificationCategoryAbbreviations.LEGAL, ClassificationCategoryAbbreviations.HOME_SERVICES,
                 ClassificationCategoryAbbreviations.OTHER]
    assert sorted(result_df.schema.names) == sorted(col_names)
Example #3
def analyze(spark, logger, **job_args):
    """
    Takes the spark context launched in main.py and runs the AIDA Insights application. The application will
    take an input file, get the canonical hash values for phones, emails, and devices, retrieve associated
    lead ids, classify those leads, attempt to score the leads, and finally write the result to a CSV location
    that is based on the client name and environment.
    :param spark: The spark context
    :param logger: The underlying JVM logger
    :param job_args: A Dict of job arguments, currently client_name and environment
    """
    client_name = job_args["client_name"]
    environment = job_args["environment"]

    time_stamp = datetime.datetime.utcnow()

    logger_prefix = "AIDA_INSIGHTS: "

    logger.info(logger_prefix + "STARTING UP APPLICATION")
    logger.info(logger_prefix + "USING THE FOLLOWING JOB ARGUMENTS")
    logger.info(logger_prefix + "CLIENT NAME: " + client_name)
    logger.info(logger_prefix + "ENVIRONMENT: " + environment)

    logger.info(logger_prefix + "READING INPUT FILE")
    input_data_frame = process_input_file(spark, logger, client_name,
                                          environment)
    logger.info("INPUT_DATA_FRAME PARTITION SIZE: {size}".format(
        size=input_data_frame.rdd.getNumPartitions()))

    logger.info(logger_prefix + "CLASSIFYING FILE INPUTS")
    classification_data_frame = classify(spark, logger, input_data_frame,
                                         environment)
    logger.info("CLASSIFICATION_DATA_FRAME PARTITION SIZE: {size}".format(
        size=classification_data_frame.rdd.getNumPartitions()))

    logger.info(logger_prefix + "SCORING RESULTS")
    classify_subcategory_df = get_classification_subcategory_df(
        spark, environment, logger)
    scored_data_frame = score_file(classify_subcategory_df,
                                   classification_data_frame)
    logger.info("CLASSIFY_SUBCATEGORY_DF PARTITION SIZE: {size}".format(
        size=classify_subcategory_df.rdd.getNumPartitions()))
    logger.info("SCORED_DATA_FRAME PARTITION SIZE: {size}".format(
        size=scored_data_frame.rdd.getNumPartitions()))

    output_path = build_output_csv_folder_name(environment, client_name,
                                               time_stamp)
    logger.info(logger_prefix +
                "WRITING OUTPUT FILE TO {path}".format(path=output_path))
    write_output(output_path, classify_subcategory_df, scored_data_frame)
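
The docstring says analyze receives the Spark session and the underlying JVM logger launched in main.py. That entry point is not shown here; the sketch below is one plausible wiring, with the import path, app name, and argument values all assumptions:

from pyspark.sql import SparkSession

# from aida_insights.main import analyze  # hypothetical import path

def main(client_name, environment):
    # Hypothetical entry point mirroring the analyze() docstring.
    spark = SparkSession.builder.appName("aida_insights").getOrCreate()
    # The "underlying JVM logger": log4j reached through py4j.
    logger = spark._jvm.org.apache.log4j.LogManager.getLogger("AIDA_INSIGHTS")
    analyze(spark, logger, client_name=client_name, environment=environment)
    spark.stop()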
Example #4
def test_score_file_with_no_classification_in_row(spark_session):
    input_rows = [
        (10, 1),
        (20, None),  # no classification found
        (30, 3)]
    input_df = spark_session.createDataFrame(input_rows, scoring_input_schema())
    subcategory_df = define_classification_subcategory_df(spark_session)
    result_df = score_file(subcategory_df, input_df)
    expected_results = [
        [10, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [20, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # row should be all zeros
        [30, 0, 0, 1, 0, 0, 0, 0, 0, 0]
    ]
    col_names = [InputColumnNames.RECORD_ID, ClassificationCategoryAbbreviations.AUTO_SALES,
                 ClassificationCategoryAbbreviations.EDUCATION,
                 ClassificationCategoryAbbreviations.INSURANCE, ClassificationCategoryAbbreviations.FINANCIAL_SERVICES,
                 ClassificationCategoryAbbreviations.REAL_ESTATE, ClassificationCategoryAbbreviations.JOBS,
                 ClassificationCategoryAbbreviations.LEGAL, ClassificationCategoryAbbreviations.HOME_SERVICES,
                 ClassificationCategoryAbbreviations.OTHER]
    extracted_row_values = extract_rows_for_col(result_df, col_names, InputColumnNames.RECORD_ID)
    assert expected_results == extracted_row_values
Example #5
def test_score_file_with_empty_input_file(spark_session):
    input_rows = []
    input_df = spark_session.createDataFrame(input_rows, scoring_input_schema())
    subcategory_df = define_classification_subcategory_df(spark_session)
    result_df = score_file(subcategory_df, input_df)
    assert result_df.count() == 0
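
Example #5 passes an empty row list, which is why every test supplies an explicit schema: createDataFrame cannot infer column types from zero rows. The scoring_input_schema() helper is not shown in these examples; a minimal sketch of what it might return, with field names and types as assumptions, is:

from pyspark.sql.types import StructType, StructField, LongType

def scoring_input_schema():
    # Hypothetical two-column schema matching the (record_id, subcategory_key)
    # tuples used as input rows in these tests; the second field is nullable
    # to cover rows with no classification, as in Example #4.
    return StructType([
        StructField("record_id", LongType(), True),
        StructField("classif_subcategory_key", LongType(), True),
    ])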