def lambda_handler(event, context):
    """Return the input JSON data augmented with IQR columns and values.

    :param event: JSON payload that contains: json_data, questions_list,
                  distinct_values. Type: JSON.
    :param context: N/A.
    :return: Success - {"success": True/False,
                        "data"/"error": "JSON String"/"Message"}
    """
    current_module = "IQRS - Method"
    error_message = ""
    # Bound before the try block because exception handling reports it.
    run_id = 0
    # Status-message variable, initialised in case validation fails early.
    bpm_queue_url = None

    try:
        # run_id is extracted ahead of schema validation so the exception
        # handler below can include it in its report.
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        input_data = pd.DataFrame(runtime_variables["data"])
        questions_list = runtime_variables["questions_list"]
        survey = runtime_variables["survey"]
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        # Column names the calculation reads from and writes to.
        movement_columns = produce_columns("movement_", questions_list)
        iqrs_columns = produce_columns("iqrs_", questions_list)

        iqrs_df = calc_iqrs(
            input_data, movement_columns, iqrs_columns, distinct_values)
        logger.info("Successfully finished calculations of IQRS.")

        final_output = {"data": iqrs_df.to_json(orient="records")}
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
def lambda_handler(event, context):
    """Prepare the data so that the IQRS method can be applied to it.

    :param event: Contains all the variables which are required for the
                  specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation IQRS - Wrangler."
    error_message = ""
    # Bound before the try block because exception handling reports it.
    run_id = 0
    # Status-message variable, initialised in case validation fails early.
    bpm_queue_url = None

    try:
        # run_id is extracted ahead of schema validation so the exception
        # handler below can include it in its report.
        run_id = event["RuntimeVariables"]["run_id"]

        # Clients.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment variables.
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data.")

        # The method expects a (zeroed) iqrs column per question.
        for col in imp_func.produce_columns("iqrs_", questions_list):
            data[col] = 0
        logger.info("IQRS columns successfully added")

        data_json = data.to_json(orient="records")
        logger.info("Dataframe converted to JSON")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data_json),
                "distinct_values": distinct_values,
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "survey": survey
            }
        }

        method_response = lambda_client.invoke(
            FunctionName=method_name, Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            method_response.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        aws_functions.save_to_s3(bucket_name, out_file_name,
                                 json_response["data"])
        logger.info("Successfully sent data to s3.")

        # Input data is only retained in development environments.
        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data from s3.")

        aws_functions.send_sns_message(sns_topic_arn, "Imputation - IQRs.")
        logger.info("Successfully sent message to sns.")
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)
    return {"success": True}
def lambda_handler(event, context):
    """
    This wrangler is used to prepare data for the apply factors
    statistical method.
    The method requires a column per question to store the factors.
    :param event: Contains all the variables which are required for the
        specific run.
    :param context: N/A
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Apply Factors - Wrangler."
    error_message = ""
    # Define run_id outside of try block
    run_id = 0
    # Set-up variables for status message
    bpm_queue_url = None
    # Step number reported to BPM on successful completion.
    current_step_num = 4
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        # Set up clients
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment Variables
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        response_type = environment_variables["response_type"]
        run_environment = environment_variables["run_environment"]

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        current_data = runtime_variables["current_data"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        previous_data = runtime_variables["previous_data"]
        questions_list = runtime_variables["questions_list"]
        reference = runtime_variables["unique_identifier"][0]
        region_column = factors_parameters["region_column"]
        regionless_code = factors_parameters["regionless_code"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        sum_columns = runtime_variables["sum_columns"]
        survey = runtime_variables["survey"]
        total_steps = runtime_variables["total_steps"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id,
                                                           context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get factors data from calculate_factors
        factors_dataframe = aws_functions.read_dataframe_from_s3(
            bucket_name, in_file_name)
        logger.info("Successfully retrieved factors data from s3")

        # Get data from module that preceded imputation
        input_data = aws_functions.read_dataframe_from_s3(
            bucket_name, current_data)

        # Split out non responder data from input
        # (rows where the response_type column equals 1).
        non_responder_dataframe = input_data[input_data[response_type] == 1]
        logger.info("Successfully retrieved raw-input data from s3")

        # Read in previous period data for current period non-responders
        prev_period_data = aws_functions.read_dataframe_from_s3(
            bucket_name, previous_data)
        logger.info("Successfully retrieved previous period data from s3")

        # Filter so we only have those that responded in prev
        # (rows where the response_type column equals 2).
        prev_period_data = prev_period_data[
            prev_period_data[response_type] == 2]

        prev_questions_list = produce_columns("prev_", questions_list,
                                              [reference])

        # Prefix previous-period question columns so they can sit alongside
        # the current-period columns after the merge.
        for question in questions_list:
            prev_period_data = prev_period_data.rename(
                index=str, columns={question: "prev_" + question})
        logger.info("Successfully renamed previous period data")

        non_responder_dataframe_with_prev = pd.merge(
            non_responder_dataframe,
            prev_period_data[prev_questions_list],
            on=reference,
        )
        logger.info(
            "Successfully merged previous period data with non-responder df")

        # Merge the factors onto the non responders
        non_responders_with_factors = pd.merge(
            non_responder_dataframe_with_prev,
            factors_dataframe[produce_columns("imputation_factor_",
                                              questions_list,
                                              distinct_values)],
            on=distinct_values,
            how="inner",
        )
        logger.info("Successfully merged non-responders with factors")

        # Collects all rows where an imputation factor doesn't exist.
        dropped_rows = non_responder_dataframe_with_prev[
            ~non_responder_dataframe_with_prev[reference].
            isin(non_responders_with_factors[reference])].dropna()

        if len(dropped_rows) > 0:
            # NOTE(review): merge_values aliases distinct_values, so the
            # remove() below also drops region_column from distinct_values;
            # the produce_columns call just after relies on that mutated
            # list — confirm the aliasing is intentional.
            merge_values = distinct_values
            merge_values.remove(region_column)

            # Collect the GB region imputation factors if they exist.
            regionless_factors = \
                factors_dataframe[
                    produce_columns("imputation_factor_",
                                    questions_list,
                                    distinct_values)
                ][factors_dataframe[region_column] == regionless_code]

            if len(merge_values) != 0:
                # Basic merge where we have values to merge on.
                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on=merge_values, how="inner")
            else:
                # Added a column to both dataframes to use for the merge
                # (a constant key produces a cross join of the two frames).
                dropped_rows["Temp_Key"] = 0
                regionless_factors["Temp_Key"] = 0

                dropped_rows_with_factors = \
                    pd.merge(dropped_rows, regionless_factors,
                             on="Temp_Key", how="inner")
                dropped_rows_with_factors = dropped_rows_with_factors.drop(
                    "Temp_Key", axis=1)

            non_responders_with_factors = \
                pd.concat([non_responders_with_factors,
                           dropped_rows_with_factors])
            logger.info("Successfully merged missing rows with non_responders")

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(
                    non_responders_with_factors.to_json(orient="records")),
                "environment": environment,
                "questions_list": questions_list,
                "run_id": run_id,
                "sum_columns": sum_columns,
                "survey": survey
            }
        }

        # Non responder data should now contain all previous values
        # and the imputation columns
        imputed_data = lambda_client.invoke(
            FunctionName=method_name,
            Payload=json.dumps(payload),
        )
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            imputed_data.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        imputed_non_responders = pd.read_json(json_response["data"],
                                              dtype=False)

        # retrieve current responders from input data..
        current_responders = input_data[input_data[response_type] == 2]

        # Joining Datasets Together.
        final_imputed = pd.concat([current_responders, imputed_non_responders])
        logger.info("Successfully joined imputed data with responder data")

        # Create A List Of Factor Columns To Drop
        cols_to_drop = produce_columns(
            "imputation_factor_", questions_list,
            produce_columns("prev_", questions_list))

        filtered_data = final_imputed.drop(cols_to_drop, axis=1)

        message = filtered_data.to_json(orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name, message)
        logger.info("Successfully sent data to s3.")

        # Input data is only retained in development environments.
        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, current_data))
            logger.info(aws_functions.delete_data(bucket_name, previous_data))
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Apply Factors.")
        logger.info("Successfully sent message to sns.")
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if (len(error_message)) > 0:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)

    # Send end status to BPM.
    status = "DONE"
    aws_functions.send_bpm_status(bpm_queue_url, current_module, status,
                                  run_id, current_step_num, total_steps)

    return {"success": True}
def lambda_handler(event, context):
    """Prepare data for and call the Calculate imputation factors method.

    Adds on the columns required by the method before invoking it.
    :param event: Contains all the variables which are required for the
                  specific run.
    :param context: lambda context
    :return: Success & None/Error - Type: JSON
    """
    current_module = "Imputation Calculate Factors - Wrangler."
    error_message = ""
    # Bound before the try block because exception handling reports it.
    run_id = 0
    # Status-message variable, initialised in case validation fails early.
    bpm_queue_url = None

    try:
        # run_id is extracted ahead of schema validation so the exception
        # handler below can include it in its report.
        run_id = event["RuntimeVariables"]["run_id"]

        # Clients.
        lambda_client = boto3.client("lambda", region_name="eu-west-2")

        environment_variables = EnvironmentSchema().load(os.environ)
        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Environment variables.
        bucket_name = environment_variables["bucket_name"]
        method_name = environment_variables["method_name"]
        run_environment = environment_variables["run_environment"]

        # Runtime variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        factors_parameters = runtime_variables["factors_parameters"]
        in_file_name = runtime_variables["in_file_name"]
        out_file_name = runtime_variables["out_file_name"]
        period_column = runtime_variables["period_column"]
        questions_list = runtime_variables["questions_list"]
        sns_topic_arn = runtime_variables["sns_topic_arn"]
        survey = runtime_variables["survey"]
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        raise exception_classes.LambdaFailure(error_message)

    try:
        logger.info("Started - retrieved configuration variables.")

        data = aws_functions.read_dataframe_from_s3(bucket_name, in_file_name)
        logger.info("Successfully retrieved data")

        # The method expects a (zeroed) factor column per question.
        for factor_column in imp_func.produce_columns(
                "imputation_factor_", questions_list):
            data[factor_column] = 0

        payload = {
            "RuntimeVariables": {
                "bpm_queue_url": bpm_queue_url,
                "data": json.loads(data.to_json(orient="records")),
                "environment": environment,
                "questions_list": questions_list,
                "distinct_values": distinct_values,
                "factors_parameters": factors_parameters,
                "run_id": run_id,
                "survey": survey
            }
        }

        # Invoke the method that calculates the factors.
        method_response = lambda_client.invoke(
            FunctionName=method_name, Payload=json.dumps(payload))
        logger.info("Successfully invoked method.")

        json_response = json.loads(
            method_response.get("Payload").read().decode("UTF-8"))
        logger.info("JSON extracted from method response.")

        if not json_response["success"]:
            raise exception_classes.MethodFailure(json_response["error"])

        output_df = pd.read_json(json_response["data"], dtype=False)

        # Keep only the factor columns plus the grouping keys and period.
        distinct_values.append(period_column)
        columns_to_keep = imp_func.produce_columns(
            "imputation_factor_", questions_list, distinct_values)

        final_df = output_df[columns_to_keep].drop_duplicates().to_json(
            orient="records")

        aws_functions.save_to_s3(bucket_name, out_file_name, final_df)
        logger.info("Successfully sent data to s3.")

        # Input data is only retained in development environments.
        if run_environment != "development":
            logger.info(aws_functions.delete_data(bucket_name, in_file_name))
            logger.info("Successfully deleted input data.")

        aws_functions.send_sns_message(sns_topic_arn,
                                       "Imputation - Calculate Factors.")
        logger.info("Successfully sent message to sns.")
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if error_message:
            logger.error(error_message)
            raise exception_classes.LambdaFailure(error_message)

    logger.info("Successfully completed module: " + current_module)
    return {"success": True}
def lambda_handler(event, context):
    """Build an aggregated DataFrame of mean period-on-period movements.

    The mean of each percentage movement is calculated per group, grouped
    by region and strata, and joined back onto the input rows.
    :param event: JSON payload that contains: json_data, questions_list
                  Type: JSON.
    :param context: Context object
    :return: Success - {"success": True/False,
                        "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Means - Method"
    error_message = ""
    # Bound before the try block because exception handling reports it.
    run_id = 0
    # Status-message variable, initialised in case validation fails early.
    bpm_queue_url = None

    try:
        # run_id is extracted ahead of schema validation so the exception
        # handler below can include it in its report.
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Runtime variables.
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        json_data = runtime_variables["data"]
        questions_list = runtime_variables["questions_list"]
        survey = runtime_variables["survey"]
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(
            survey, current_module, environment, run_id)
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        movement_columns = imp_func.produce_columns(
            "movement_", questions_list)

        df = pd.DataFrame(json_data)
        logger.info("Successfully retrieved data from event.")

        workingdf = df[movement_columns + distinct_values]
        grouped = workingdf.groupby(distinct_values)

        # Per-group movement counts, renamed to fit naming standards.
        counts = grouped.count()
        counts.rename(
            columns={col: col + "_count" for col in movement_columns},
            inplace=True)

        # Per-group movement sums, renamed to fit naming standards.
        sums = grouped.sum()
        sums.rename(
            columns={col: col + "_sum" for col in movement_columns},
            inplace=True)

        counts = counts.reset_index(level=distinct_values)
        sums = sums.reset_index(level=distinct_values)
        moves = sums.merge(
            counts,
            left_on=distinct_values,
            right_on=distinct_values,
            how="left",
        )

        # Join the grouped sums and counts back onto every input row.
        df = pd.merge(df, moves, on=distinct_values, how="left")

        for question in questions_list:
            sum_col = "movement_" + question + "_sum"
            count_col = "movement_" + question + "_count"
            # Zero count means an empty group: emit 0 instead of dividing.
            df["mean_" + question] = df.apply(
                lambda row, s=sum_col, c=count_col:
                    row[s] / row[c] if row[c] > 0 else 0,
                axis=1,
            )

        logger.info("Successfully finished calculations of means.")

        final_output = {"data": df.to_json(orient="records")}
    except Exception as err:
        error_message = general_functions.handle_exception(
            err, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if error_message:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output
def lambda_handler(event, context):
    """
    Calculates imputation factor for each question, in each aggregated group.
    :param event: JSON payload that contains: factors_type, json_data,
        questions_list - Type: JSON.
    :param context: lambda context
    :return: Success - {"success": True/False,
        "data"/"error": "JSON String"/"Message"}
    """
    current_module = "Calculate Factors - Method"
    error_message = ""
    # Define run_id outside of try block
    run_id = 0
    # Set-up variables for status message
    bpm_queue_url = None
    try:
        # Retrieve run_id before input validation
        # Because it is used in exception handling
        run_id = event["RuntimeVariables"]["run_id"]

        runtime_variables = RuntimeSchema().load(event["RuntimeVariables"])

        # Pick Correct Schema
        factors_parameters = runtime_variables["factors_parameters"][
            "RuntimeVariables"]
        factors_type = factors_parameters["factors_type"]
        # Convert the snake_case factors_type into PascalCase and append
        # "Schema" to resolve the matching schema class on imp_func.
        factors_name = ''.join(word.title() for word in factors_type.split('_'))
        factors_schema = getattr(imp_func, factors_name + "Schema")
        factors = factors_schema().load(factors_parameters)

        # Runtime Variables
        bpm_queue_url = runtime_variables["bpm_queue_url"]
        df = pd.DataFrame(runtime_variables["data"])
        distinct_values = runtime_variables["distinct_values"]
        environment = runtime_variables["environment"]
        questions_list = runtime_variables["questions_list"]
        survey = runtime_variables["survey"]
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger = general_functions.get_logger(survey, current_module,
                                              environment, run_id)
    except Exception as e:
        error_message = general_functions.handle_exception(e, current_module,
                                                           run_id,
                                                           context=context)
        return {"success": False, "error": error_message}

    try:
        logger.info("Started - retrieved configuration variables.")

        # Get relative calculation function
        # (imp_func exposes one callable per factors_type).
        calculation = getattr(imp_func, factors_type)

        # Pass the distinct values to the factors function in its parameters
        factors["distinct_values"] = distinct_values

        # Some surveys will need to use the regional mean,
        # extract them ahead of time
        if "regional_mean" in factors:
            region_column = factors["region_column"]
            regional_mean = factors["regional_mean"]
            regionless_code = factors["regionless_code"]
            survey_column = factors["survey_column"]

            # split to get only regionless data
            gb_rows = df.loc[df[region_column] == regionless_code]

            # produce column names
            means_columns = imp_func.produce_columns("mean_", questions_list)
            counts_columns = imp_func.\
                produce_columns("movement_", questions_list, suffix="_count")
            gb_columns = \
                means_columns +\
                counts_columns +\
                distinct_values +\
                [survey_column]
            factor_columns = imp_func.\
                produce_columns("imputation_factor_",
                                questions_list,
                                distinct_values+[survey_column])

            # select only gb columns and then drop duplicates,
            # leaving one row per strata
            gb_rows = gb_rows[gb_columns].drop_duplicates()

            # Placeholder value so calculation can run on the GB rows
            # before the real regional mean exists.
            factors[regional_mean] = ""

            # calculate gb factors ahead of time
            gb_rows = gb_rows.apply(
                lambda x: calculation(x, questions_list, **factors), axis=1)

            # reduce gb_rows to distinct_values, survey, and the factors
            gb_factors = gb_rows[factor_columns]

            # add gb_factors to factors parameters to send to calculation
            factors[regional_mean] = gb_factors

        # Apply the chosen calculation row-by-row across the full dataset.
        df = df.apply(lambda x: calculation(x, questions_list, **factors),
                      axis=1)
        logger.info("Calculated Factors for " + str(questions_list))

        factors_dataframe = df
        logger.info("Successfully finished calculations of factors")

        final_output = {"data": factors_dataframe.to_json(orient="records")}
    except Exception as e:
        error_message = general_functions.handle_exception(
            e, current_module, run_id, context=context,
            bpm_queue_url=bpm_queue_url)
    finally:
        # Any failure above leaves error_message non-empty; report and bail.
        if (len(error_message)) > 0:
            logger.error(error_message)
            return {"success": False, "error": error_message}

    logger.info("Successfully completed module: " + current_module)
    final_output["success"] = True
    return final_output