def filter_messages(cls, data, project_start_date, project_end_date, filter_test_messages=True):
    """
    Applies the standard pipeline message filters to `data`.

    Drops AVF test messages (unless disabled), runs with no answer to any
    week's radio question, and runs received outside the project dates.

    :param data: Message objects to filter.
    :param project_start_date: Inclusive start of the project's time range.
    :param project_end_date: Exclusive end of the project's time range.
    :param filter_test_messages: Whether to drop test messages sent by AVF.
    :return: The filtered data.
    """
    # Drop test messages sent by AVF, unless the configuration disabled this.
    if not filter_test_messages:
        log.debug(
            "Not filtering out test messages (because the pipeline configuration json key "
            "'FilterTestMessages' was set to false)")
    else:
        data = MessageFilters.filter_test_messages(data)

    # Drop runs which don't contain a response to any week's question.
    rqa_raw_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_raw_fields.append(plan.raw_field)
    data = MessageFilters.filter_empty_messages(data, rqa_raw_fields)

    # Drop runs sent outwith the project start and end dates.
    rqa_time_fields = set()
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_time_fields.add(plan.time_field)
    return MessageFilters.filter_time_range(data, rqa_time_fields, project_start_date, project_end_date)
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters the show messages, then exports them to Coda for manual labelling
    and samples them for inter-coder reliability (ICR) checks.

    :param user: Identifier of the user running this program, for TracedData metadata.
    :param data: Message objects to auto-code.
    :param icr_output_dir: Directory to write the ICR CSV samples to.
    :param coda_output_dir: Directory to write the Coda export files to.
    :return: The filtered data.
    """
    # Drop test messages sent by AVF, unless running in dev mode.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Drop runs which don't contain a response to any week's question.
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Drop runs sent outwith the project start and end dates.
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE,
        PipelineConfiguration.PROJECT_END_DATE)

    # Label each message with channel keys.
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    all_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS

    # Export RQA and follow up survey messages to Coda.
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in all_plans:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)
        coda_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Export a deterministic random sample of RQA and follow up messages for ICR.
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in all_plans:
        # Membership of plan.raw_field is a sufficient test here because the only
        # codes applied at this point are TRUE_MISSING. If any other coding is
        # done above, this test will need to change.
        relevant_messages = [td for td in data if plan.raw_field in td]

        sample = ICRTools.generate_sample_for_icr(
            relevant_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))
        icr_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                sample, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def filter_rqa_noise_other_project(messages):
    """
    Filters out RQA messages which have been labelled as Noise_Other_Project(NOP).

    :param messages: List of message objects to filter.
    :type messages: list of TracedData
    :return: Filtered list.
    :rtype: list of TracedData
    """
    # Delegate directly to the shared MessageFilters implementation.
    return MessageFilters.filter_rqa_noise_other_project(messages)
def generate(data, production_csv_output_path):
    """
    Exports a production CSV containing the raw RQA and survey responses,
    excluding messages labelled as noise.

    :param data: Message objects to export.
    :param production_csv_output_path: Path to write the production CSV to.
    :return: The input data, unchanged (the noise filter applies to the export only).
    """
    # Build the CSV header: uid first, then each distinct raw field, in plan order.
    header_keys = ["uid"]
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in header_keys:
            header_keys.append(plan.raw_field)

    # Exclude messages labelled as noise from the production file.
    not_noise = MessageFilters.filter_noise(data, "noise", lambda x: x)

    with open(production_csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(not_noise, f, headers=header_keys)

    return data
# Pipeline stage sequence: load raw data, translate keys, optionally move
# wrong-scheme (WS) messages, then auto-code.
log.info("Loading the raw data...")
data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

log.info("Translating source Keys...")
data = TranslateSourceKeys.translate_source_keys(user, data, pipeline_configuration)

if pipeline_configuration.move_ws_messages:
    log.info("Pre-filtering empty message objects...")
    # This is a performance optimisation to save execution time + memory when moving WS messages, by removing
    # the need to mark and process a high volume of empty message objects as 'NR' in WS correction.
    # Empty message objects represent flow runs where the participants never sent a message e.g. from an advert
    # flow run where we asked someone a question but didn't receive a response.
    data = MessageFilters.filter_empty_messages(data, [
        plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
    ])

    log.info("Moving WS messages...")
    data = WSCorrection.move_wrong_scheme_messages(user, data, prev_coded_dir_path)
else:
    log.info(
        "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
        "json was set to 'false')")

log.info("Auto Coding...")
data = AutoCode.auto_code(user, data, pipeline_configuration, icr_output_dir, coded_dir_path)

log.info("Exporting production CSV...")
    # NOTE(review): this chunk begins inside an if-branch whose header is not
    # visible here — presumably `if pipeline_configuration.move_ws_messages:`
    # (inferred from the 'MoveWSMessages' else-message below); confirm against
    # the preceding part of the file.
    data = WSCorrection.move_wrong_scheme_messages(user, data, prev_coded_dir_path)
else:
    log.info(
        "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
        "json was set to 'false')")

# Auto-code, apply manual Coda labels, drop Noise_Other_Channel messages,
# then export the production and analysis files.
log.info("Auto Coding...")
data = AutoCode.auto_code(user, data, pipeline_configuration, icr_output_dir, coded_dir_path)

log.info("Applying Manual Codes from Coda...")
data = ApplyManualCodes.apply_manual_codes(user, data, prev_coded_dir_path)

log.info("Filtering out Messages labelled as Noise_Other_Channel...")
data = MessageFilters.filter_noise_other_channel(data)

log.info("Exporting production CSV...")
data = ProductionFile.generate(data, production_csv_output_path)

log.info(
    "Tagging listening group participants & Generating Analysis CSVs...")
messages_data, individuals_data = AnalysisFile.generate(
    user, data, pipeline_configuration, raw_data_dir, csv_by_message_output_path,
    csv_by_individual_output_path)

# Persist the per-message TracedData as JSONL.
log.info("Writing messages TracedData to file...")
IOUtils.ensure_dirs_exist_for_file(messages_json_output_path)
with open(messages_json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_jsonl(messages_data, f)
data = CombineRawDatasets.combine_raw_datasets(user, messages_datasets, coalesced_surveys_datasets) # Infer which RQA coding plans to use from the operator. # This 'hack' is necessary because the rqa coding plans are still not being set in the configuration json. if pipeline_configuration.filter_operator == "golis": log.info("Running in Bossaso mode") PipelineConfiguration.RQA_CODING_PLANS = PipelineConfiguration.BOSSASO_RQA_CODING_PLANS else: assert pipeline_configuration.filter_operator == "hormud", "FilterOperator must be either 'golis' or 'hormud'" log.info("Running in Baidoa mode") PipelineConfiguration.RQA_CODING_PLANS = PipelineConfiguration.BAIDOA_RQA_CODING_PLANS if pipeline_configuration.filter_operator is not None: data = MessageFilters.filter_operator( data, "operator_coded", CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value( pipeline_configuration.filter_operator)) log.info("Translating Rapid Pro Keys...") data = TranslateRapidProKeys.translate_rapid_pro_keys( user, data, pipeline_configuration, prev_coded_dir_path) log.info("Redirecting WS messages...") data = WSCorrection.move_wrong_scheme_messages(user, data, prev_coded_dir_path) log.info("Auto Coding Messages...") data = AutoCodeShowMessages.auto_code_show_messages( user, data, pipeline_configuration, icr_output_dir, coded_dir_path) log.info("Exporting production CSV...")
def auto_code_show_messages(cls, user, data, pipeline_configuration, icr_output_dir, coda_output_dir):
    """
    Filters the show messages, logs empty-string statistics, then exports the
    messages to Coda for manual labelling and samples them for ICR.

    :param user: Identifier of the user running this program, for TracedData metadata.
    :param data: Message objects to auto-code.
    :param pipeline_configuration: Pipeline configuration (supplies the test-message
                                   filter flag and the project start/end dates).
    :param icr_output_dir: Directory to write the ICR CSV samples to.
    :param coda_output_dir: Directory to write the Coda export files to.
    :return: The filtered data.
    """
    # Filter out test messages sent by AVF.
    if pipeline_configuration.filter_test_messages:
        data = MessageFilters.filter_test_messages(data)
    else:
        log.debug(
            "Not filtering out test messages (because the pipeline configuration json key "
            "'FilterTestMessages' was set to false)")

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, [
        plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
    ])

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date,
        pipeline_configuration.project_end_date)

    # Skipping auto-assigning noise, as an experiment on this project.
    # If it turns out we need this, uncomment this block.
    # for td in data:
    #     is_noise = True
    #     for rqa_key in cls.RQA_KEYS:
    #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
    #             is_noise = False
    #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # TODO: Label each message with channel keys
    # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
    #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    # NOTE(review): cls.NOISE_KEY is never set above (the labelling block is
    # commented out), so this presumably passes everything through — confirm
    # MessageFilters.filter_noise's behaviour for messages missing the key.
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Compute the number of RQA messages that were the empty string
    log.debug(
        "Counting the number of empty string messages for each raw radio show field..."
    )
    raw_rqa_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        if plan.raw_field not in raw_rqa_fields:
            raw_rqa_fields.append(plan.raw_field)
    cls.log_empty_string_stats(data, raw_rqa_fields)

    # Compute the number of survey messages that were the empty string
    log.debug(
        "Counting the number of empty string messages for each survey field..."
    )
    raw_survey_fields = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in raw_survey_fields:
            raw_survey_fields.append(plan.raw_field)
    # De-duplicate by uid so each participant's survey answers are counted once
    # (later objects with the same uid overwrite earlier ones).
    survey_data = dict()
    for td in data:
        survey_data[td["uid"]] = td
    cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR (a deterministic random sample per coding plan)
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
# Pipeline stage sequence: load raw data, translate Rapid Pro keys, optionally
# move wrong-scheme (WS) messages, then auto-code.
log.info("Loading the raw data...")
data = LoadData.load_raw_data(user, raw_data_dir, pipeline_configuration)

log.info("Translating Rapid Pro Keys...")
data = TranslateRapidProKeys.translate_rapid_pro_keys(
    user, data, pipeline_configuration)

if pipeline_configuration.move_ws_messages:
    log.info("Pre-filtering empty message objects...")
    # This is a performance optimisation to save execution time + memory when moving WS messages, by removing
    # the need to mark and process a high volume of empty message objects as 'NR' in WS correction.
    # Empty message objects represent flow runs where the participants never sent a message e.g. from an advert
    # flow run where we asked someone a question but didn't receive a response.
    data = MessageFilters.filter_empty_messages(data, [
        plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
    ])

    log.info("Moving WS messages...")
    data = WSCorrection.move_wrong_scheme_messages(user, data, prev_coded_dir_path)
else:
    log.info(
        "Not moving WS messages (because the 'MoveWSMessages' key in the pipeline configuration "
        "json was set to 'false')")

log.info("Auto Coding...")
data = AutoCode.auto_code(user, data, pipeline_configuration, icr_output_dir, coded_dir_path)

log.info("Exporting production CSV...")