def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters radio-show/follow-up messages and exports them for manual coding (Coda)
    and inter-coder-reliability (ICR) checks.

    The filter steps run in a fixed order: test-message filter, empty-message
    filter, then time-range filter.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: Iterable of TracedData message objects to process.
    :param icr_output_dir: Directory to write one ICR sample CSV per coding plan to.
    :param coda_output_dir: Directory to write one Coda file per coding plan to.
    :return: The filtered TracedData.
    """
    # Filter out test messages sent by AVF
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE,
        PipelineConfiguration.PROJECT_END_DATE)

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output RQA and follow up surveys messages to Coda.
    # The empty dict means no scheme/auto-code mappings are exported alongside the messages.
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output RQA and follow up messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        rqa_and_follow_up_messages = []
        # This test works because the only codes which have been applied at this point are TRUE_MISSING.
        # If any other coding is done above, this test will need to change
        for td in data:
            if plan.raw_field in td:
                rqa_and_follow_up_messages.append(td)

        # Fixed seed so the ICR sample is reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_and_follow_up_messages, cls.ICR_MESSAGES_COUNT,
            random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def generate(data, production_csv_output_path):
    """
    Writes the raw production fields of all non-noise messages to a CSV.

    :param data: TracedData items to export.
    :param production_csv_output_path: Path to write the production CSV to.
    :return: The input data, unchanged.
    """
    # Header order: uid first, then each plan's raw field in first-seen order,
    # de-duplicated across the RQA and survey plan lists.
    header_keys = ["uid"]
    all_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS
    for coding_plan in all_plans:
        if coding_plan.raw_field not in header_keys:
            header_keys.append(coding_plan.raw_field)

    # Drop messages tagged as noise before exporting.
    non_noise_data = MessageFilters.filter_noise(data, "noise", lambda x: x)

    with open(production_csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(non_noise_data, f, headers=header_keys)

    return data
def generate(data, production_csv_output_path):
    """
    Writes the raw production fields for the test pipeline to a CSV.

    No message filtering is performed at this stage for this test-pipeline.

    :param data: TracedData items to export.
    :param production_csv_output_path: Path to write the production CSV to.
    :return: The input data, unchanged.
    """
    # Header order: uid first, then each plan's raw field in first-seen order,
    # de-duplicated across the test-show and survey plan lists.
    header_keys = ["uid"]
    all_plans = PipelineConfiguration.TEST_SHOWS_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS
    for coding_plan in all_plans:
        if coding_plan.raw_field not in header_keys:
            header_keys.append(coding_plan.raw_field)

    with open(production_csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=header_keys)

    return data
def generate(data, production_csv_output_path):
    """
    Writes the raw production fields of all messages to a CSV.

    :param data: TracedData items to export.
    :param production_csv_output_path: Path to write the production CSV to.
    :return: The input data, unchanged.
    """
    # Header order: uid first, then each plan's raw field in first-seen order,
    # de-duplicated across the RQA, follow-up and demog plan lists.
    header_keys = ["uid"]
    all_plans = (PipelineConfiguration.RQA_CODING_PLANS
                 + PipelineConfiguration.FOLLOW_UP_CODING_PLANS
                 + PipelineConfiguration.DEMOGS_CODING_PLANS)
    for coding_plan in all_plans:
        if coding_plan.raw_field not in header_keys:
            header_keys.append(coding_plan.raw_field)

    with open(production_csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=header_keys)

    return data
def test_export_traced_data_iterable_to_csv(self): file_path = path.join(self.test_dir, "csv_test.csv") # Test exporting wrong data type data = list(generate_traced_data_iterable()) with open(file_path, "w") as f: try: TracedDataCSVIO.export_traced_data_iterable_to_csv(data[0], f) self.fail("Exporting the wrong data type did not raise an assertion error") except AssertionError as e: self.assertEqual(str(e), _td_type_error_string) # Test exporting normal data, including requesting an unknown header. data = generate_traced_data_iterable() with open(file_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=["URN", "Gender", "Non-Existent"]) self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/csv_export_expected.csv"))
def export_to_csv(user, data, csv_path, export_keys, consent_withdrawn_key):
    """
    Converts coded fields to their analysis string/matrix values, applies consent
    handling, and writes the data to a CSV.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: TracedData items to export.
    :param csv_path: Path to write the CSV to.
    :param export_keys: Keys to export, used as the CSV headers.
    :param consent_withdrawn_key: Key of the consent-withdrawn flag in each item.
    """
    # Convert codes to their string/matrix values
    for td in data:
        analysis_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    # Single-coded: export the applied label's string value directly.
                    analysis_dict[cc.analysis_file_key] = \
                        cc.code_scheme.get_code_with_code_id(td[cc.coded_field]["CodeID"]).string_value
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    # Multi-coded: one matrix column per code in the scheme, set to
                    # MATRIX_1 iff that code was applied to this item, MATRIX_0 otherwise.
                    show_matrix_keys = []
                    for code in cc.code_scheme.codes:
                        show_matrix_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")

                    for label in td[cc.coded_field]:
                        code_string_value = cc.code_scheme.get_code_with_code_id(
                            label["CodeID"]).string_value
                        analysis_dict[
                            f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                    # Any matrix column not set above defaults to MATRIX_0.
                    for key in show_matrix_keys:
                        if key not in analysis_dict:
                            analysis_dict[key] = Codes.MATRIX_0
        td.append_data(
            analysis_dict,
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Hide data from participants who opted out
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)

    with open(csv_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data, f, headers=export_keys)
def export_icr(cls, data, icr_output_dir):
    """
    Exports a reproducible random sample of RQA messages per coding plan to CSV,
    for inter-coder-reliability (ICR) checks.

    :param data: TracedData items to sample from.
    :param icr_output_dir: Directory to write one ICR CSV per RQA coding plan to.
    """
    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for coding_plan in PipelineConfiguration.RQA_CODING_PLANS:
        # Only consider messages that actually contain a response for this plan.
        relevant_messages = [td for td in data if coding_plan.raw_field in td]

        # Seeded Random keeps the sample stable across pipeline runs.
        sampled_messages = ICRTools.generate_sample_for_icr(
            relevant_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        sample_path = path.join(icr_output_dir, coding_plan.icr_filename)
        with open(sample_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                sampled_messages, f,
                headers=[coding_plan.run_id_field, coding_plan.raw_field])
def test_import_csv_to_traced_data_iterable(self):
    """Importing the CSV fixture must reproduce the generated TracedData items exactly."""
    file_path = "tests/traced_data/resources/csv_import_data.csv"

    with open(file_path, "r") as f:
        expected_items = list(generate_traced_data_iterable())
        imported_items = list(TracedDataCSVIO.import_csv_to_traced_data_iterable("test_user", f))

        # Same number of items, and each item holds exactly the same key/value pairs.
        self.assertEqual(len(expected_items), len(imported_items))
        for expected_td, imported_td in zip(expected_items, imported_items):
            self.assertSetEqual(set(expected_td.items()), set(imported_td.items()))
def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Auto-codes survey responses with each plan's cleaner, then exports them to Coda
    for manual verification/coding and to CSVs for ICR.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: TracedData items to auto-code and export.
    :param icr_output_dir: Directory to write one ICR sample CSV per survey coding plan to.
    :param coda_output_dir: Directory to write one Coda file per survey coding plan to.
    :return: The data, with auto-coded labels appended.
    """
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner,
                plan.code_scheme)

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            # Unlike the show-message export, this passes the plan's code scheme so
            # auto-coded labels are included for verification in Coda.
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_messages.append(td)

        # Fixed seed keeps the ICR sample reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    """
    Generates the analysis CSVs: one with a message per row, and one folded to an
    individual per row.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: TracedData items (one per message) to analyse and export.
    :param csv_by_message_output_path: Path to write the one-message-per-row CSV to.
    :param csv_by_individual_output_path: Path to write the one-individual-per-row CSV to.
    :return: The (unfolded) data.
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    # Every item starts with consent not withdrawn; flipped to TRUE below if a
    # stop code is found.
    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Set the list of raw/coded keys which
    survey_keys = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
            survey_keys.append(plan.analysis_file_key)
        if plan.raw_field not in survey_keys:
            survey_keys.append(plan.raw_field)

    # Convert survey codes to their string values
    for td in data:
        td.append_data(
            {
                plan.analysis_file_key: plan.code_scheme.get_code_with_id(
                    td[plan.coded_field]["CodeID"]).string_value
                for plan in PipelineConfiguration.SURVEY_CODING_PLANS
                if plan.analysis_file_key is not None
            },
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Convert RQA binary codes to their string values
    for td in data:
        td.append_data(
            {
                plan.binary_analysis_file_key: plan.binary_code_scheme.get_code_with_id(
                    td[plan.binary_coded_field]["CodeID"]).string_value
                for plan in PipelineConfiguration.RQA_CODING_PLANS
                if plan.binary_code_scheme is not None
            },
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Translate the RQA reason codes to matrix values (one column per code,
    # named <analysis_file_key><code_string_value>).
    matrix_keys = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        show_matrix_keys = list()
        for code in plan.code_scheme.codes:
            show_matrix_keys.append(
                f"{plan.analysis_file_key}{code.string_value}")

        AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys,
                                     plan.code_scheme, plan.coded_field,
                                     plan.analysis_file_key)

        matrix_keys.extend(show_matrix_keys)

    binary_keys = [
        plan.binary_analysis_file_key
        for plan in PipelineConfiguration.RQA_CODING_PLANS
        if plan.binary_analysis_file_key is not None
    ]

    # Keys grouped by how they are combined when folding to one row per respondent.
    equal_keys = ["uid"]
    equal_keys.extend(survey_keys)
    concat_keys = [
        plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
    ]
    bool_keys = [
        consent_withdrawn_key,

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]

    # Export to CSV
    export_keys = ["uid"]
    export_keys.extend(bool_keys)
    export_keys.extend(matrix_keys)
    export_keys.extend(binary_keys)
    export_keys.extend(concat_keys)
    export_keys.extend(survey_keys)

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(
        user, data, PipelineConfiguration.SURVEY_CODING_PLANS,
        consent_withdrawn_key)

    # Set consent withdrawn based on stop codes from radio question answers
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                td.append_data({consent_withdrawn_key: Codes.TRUE},
                               Metadata(user, Metadata.get_call_location(),
                                        time.time()))
            if plan.binary_code_scheme is not None:
                if td[plan.binary_coded_field]["CodeID"] == \
                        plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

    # Fold data to have one respondent per row
    to_be_folded = []
    for td in data:
        to_be_folded.append(td.copy())

    # NOTE(review): `to_be_folded` is built above but `data` is what gets folded —
    # confirm whether the copies were intended to be passed here instead.
    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, data, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys,
        matrix_keys=matrix_keys, bool_keys=bool_keys, binary_keys=binary_keys)

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if td.get(plan.raw_field, "") != "":
                # Respondent answered this show, so it cannot be "true missing".
                td.append_data(
                    {
                        f"{plan.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0
                    },
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))

                # If no non-NC matrix column is set for this plan, mark it NOT_CODED.
                contains_non_nc_key = False
                for key in matrix_keys:
                    if key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                            and td.get(key) == Codes.MATRIX_1:
                        contains_non_nc_key = True
                if not contains_non_nc_key:
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key,
                             additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key,
                             additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data, f, headers=export_keys)

    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            folded_data, f, headers=export_keys)

    return data
help="CSV file containing demographics of CAPYEI students. ") parser.add_argument("json_output_path", metavar="json-output-path", help="Path to serialized TracedData JSON file") args = parser.parse_args() user = args.user phone_uuid_path = args.phone_uuid_table_path demog_dataset_path = args.demog_dataset_path json_output_path = args.json_output_path with open(phone_uuid_path, "r") as f: phone_uuids = PhoneNumberUuidTable.load(f) with open(demog_dataset_path, "r") as f: traced_demog = TracedDataCSVIO.import_csv_to_traced_data_iterable( user, f) traced_demog = list(traced_demog) for td in traced_demog: uuid_dict = { "avf_phone_id": phone_uuids.add_phone(td["final_phone"]) } td.append_data( uuid_dict, Metadata(user, Metadata.get_call_location(), time.time())) # Write the UUIDs out to a file with open(phone_uuid_path, "w") as f: phone_uuids.dump(f) # Output TracedData to JSON. IOUtils.ensure_dirs_exist(json_output_path)
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters test-show messages, labels missing responses and channels, then exports
    the messages to Coda for manual coding and to CSVs for ICR.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: Iterable of TracedData message objects to process.
    :param icr_output_dir: Directory to write one ICR sample CSV per coding plan to.
    :param coda_output_dir: Directory to write one Coda file per coding plan to.
    :return: The filtered, labelled TracedData.
    """
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

    # Filter out runs sent outside the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY,
                                            cls.PROJECT_START_DATE,
                                            cls.PROJECT_END_DATE)

    # Label missing data: any plan with no raw response gets a TRUE_MISSING label.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    # NOTE(review): stored as a single dict, unlike the list used for
                    # coded_field above — confirm downstream consumers expect this shape.
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()

        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output messages for Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
            )
    print("Coda message files successfully exported")

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        test_pipeline_messages = []
        for td in data:
            # This test works because the only codes applied at this point are
            # TRUE_MISSING (set in the missing-data labelling above). If any other
            # coding is done earlier, this test will need to change.
            if plan.coded_field not in td:
                test_pipeline_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        # Fixed seed keeps the ICR sample reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
            )
    print("ICR files successfully exported")

    return data
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    """
    Generates the analysis CSVs: one with a message per row, and one folded to an
    individual per row.

    Fix: the SINGLE-coding fallback assert referenced `plan.folding_mode`, but the
    attribute being validated in that branch is `cc.folding_mode` (plan has no such
    attribute, so the failure path would raise AttributeError instead of the message).

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: TracedData items (one per message) to analyse and export.
    :param csv_by_message_output_path: Path to write the one-message-per-row CSV to.
    :param csv_by_individual_output_path: Path to write the one-individual-per-row CSV to.
    :return: Tuple of (message-per-row data, individual-per-row folded data).
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    # Every item starts with consent not withdrawn; updated below from stop codes.
    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Set the list of keys to be exported and how they are to be handled when folding
    export_keys = ["uid", consent_withdrawn_key]
    bool_keys = [
        consent_withdrawn_key

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]
    equal_keys = ["uid"]
    concat_keys = []
    matrix_keys = []
    binary_keys = []

    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.analysis_file_key is None:
                continue

            if cc.coding_mode == CodingModes.SINGLE:
                export_keys.append(cc.analysis_file_key)
                if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                    equal_keys.append(cc.analysis_file_key)
                elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                    binary_keys.append(cc.analysis_file_key)
                else:
                    # Was f"...{plan.folding_mode}", which would AttributeError;
                    # cc.folding_mode is the value actually being dispatched on.
                    assert False, f"Incompatible folding_mode {cc.folding_mode}"
            else:
                assert cc.folding_mode == FoldingModes.MATRIX
                for code in cc.code_scheme.codes:
                    export_keys.append(
                        f"{cc.analysis_file_key}{code.string_value}")
                    matrix_keys.append(
                        f"{cc.analysis_file_key}{code.string_value}")

        export_keys.append(plan.raw_field)
        if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
            concat_keys.append(plan.raw_field)
        elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
            equal_keys.append(plan.raw_field)
        else:
            assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

    # Convert codes to their string/matrix values
    for td in data:
        analysis_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    analysis_dict[cc.analysis_file_key] = \
                        cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    # One matrix column per code; MATRIX_1 iff the code was applied.
                    show_matrix_keys = []
                    for code in cc.code_scheme.codes:
                        show_matrix_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")

                    for label in td.get(cc.coded_field, []):
                        code_string_value = cc.code_scheme.get_code_with_id(
                            label['CodeID']).string_value
                        analysis_dict[
                            f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                    for key in show_matrix_keys:
                        if key not in analysis_dict:
                            analysis_dict[key] = Codes.MATRIX_0
        td.append_data(
            analysis_dict,
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(
        user, data,
        PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS,
        consent_withdrawn_key)

    # Fold data to have one respondent per row
    to_be_folded = []
    for td in data:
        to_be_folded.append(td.copy())

    # NOTE(review): `to_be_folded` is built above but `data` is what gets folded —
    # confirm whether the copies were intended to be passed here instead.
    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, data, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys, matrix_keys=matrix_keys,
        bool_keys=bool_keys, binary_keys=binary_keys)

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue
                if cc.coding_mode == CodingModes.MULTIPLE:
                    if td.get(plan.raw_field, "") != "":
                        # Respondent answered, so it cannot be "true missing".
                        td.append_data(
                            {
                                f"{cc.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0
                            },
                            Metadata(user, Metadata.get_call_location(),
                                     TimeUtils.utc_now_as_iso_string()))

                        # If no non-NC matrix column is set, mark as NOT_CODED.
                        contains_non_nc_key = False
                        for key in matrix_keys:
                            if key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                                    and td.get(key) == Codes.MATRIX_1:
                                contains_non_nc_key = True
                        if not contains_non_nc_key:
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key,
                             additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key,
                             additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data, f, headers=export_keys)

    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            folded_data, f, headers=export_keys)

    return data, folded_data
TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, key_of_raw, key_of_clean, f, True)) else: assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'" # Merge manually coded CSV files into the cleaned dataset with open(path.join(coded_input_path, "{}.csv".format(key_of_raw)), "r") as f: data = list( TracedDataCodingCSVIO. import_coding_csv_to_traced_data_iterable( user, data, key_of_raw, key_of_clean, key_of_raw, key_of_clean, f, True)) # Write coded data back out to disk if os.path.dirname(json_output_path) is not "" and not os.path.exists( os.path.dirname(json_output_path)): os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # Export coded data to CSV for analysis if os.path.dirname(csv_output_path) is not "" and not os.path.exists( os.path.dirname(csv_output_path)): os.makedirs(os.path.dirname(csv_output_path)) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( data, f, headers=["avf_phone_id", key_of_raw, key_of_clean])
os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # Output to a more human-friendly CSV. if os.path.dirname(csv_output_path) is not "" and not os.path.exists( os.path.dirname(csv_output_path)): os.makedirs(os.path.dirname(csv_output_path)) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( data, f, headers=[ "avf_phone_id", "{} (Run ID) - {}".format(variable_name, flow_name), "{} (Time) - {}".format(variable_name, flow_name), "{} (Text) - {}".format(variable_name, flow_name) ]) # Output messages to Coda IOUtils.ensure_dirs_exist_for_file(coda_output_path) with open(coda_output_path, "w") as f: TracedDataCodaIO.export_traced_data_iterable_to_coda( data, "{} (Text) - {}".format(variable_name, flow_name), f) # Get 200 non-noise messages and output to CSVs for ICR. print("Noise items:") show_message_key = "{} (Text) - {}".format(variable_name, flow_name) not_noise = []
for td in folded: d = dict() for key in folded_column_keys: if key not in td: d[key] = Codes.TRUE_MISSING td.append_data( d, Metadata(user, Metadata.get_call_location(), time.time())) # Export to CSV export_keys = ["avf_phone_id", "Group"] export_keys.extend(list(folded_column_keys)) export_keys.extend(survey_keys) export_keys.extend(matrix_keys) export_keys.sort() print("Writing 1/2") with open(csv_by_individual_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv(folded, f, headers=export_keys) print("Writing 2/2") # Hack an unused output field to write traced data to (for debug) # FIXME with open(csv_by_message_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)
print(" Stopped Respondents:") stopped_ids = set() for td in all_messages: stop_d = dict() for output_key in output_keys: if td[output_key] == "stop": stopped_ids.add(td["phone_uuid"]) for k in output_keys: stop_d[k] = "stop" stop_d["consent_clean"] = CodeBooks.yes_no[Codes.NO] if "consent_clean" not in stop_d: stop_d["consent_clean"] = CodeBooks.yes_no[Codes.YES] td.append_data( stop_d, Metadata(user, Metadata.get_call_location(), time.time())) print(" " + str(len(stopped_ids))) output_keys.insert(2, "consent_clean") # Output analysis TracedData to JSON IOUtils.ensure_dirs_exist_for_file(json_output_path) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(all_messages, f, pretty_print=True) # Output analysis file as CSV IOUtils.ensure_dirs_exist_for_file(csv_output_path) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( all_messages, f, output_keys)
def auto_code_show_messages(cls, user, data, pipeline_configuration, icr_output_dir,
                            coda_output_dir):
    """
    Filters radio-show messages, logs empty-string statistics, and exports the
    non-noise messages to Coda and to CSVs for ICR.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: Iterable of TracedData message objects to process.
    :param pipeline_configuration: Pipeline configuration providing the test-message
                                   filter flag and the project start/end dates.
    :param icr_output_dir: Directory to write one ICR sample CSV per RQA coding plan to.
    :param coda_output_dir: Directory to write one Coda file per RQA coding plan to.
    :return: The filtered TracedData.
    """
    # Filter out test messages sent by AVF.
    if pipeline_configuration.filter_test_messages:
        data = MessageFilters.filter_test_messages(data)
    else:
        log.debug(
            "Not filtering out test messages (because the pipeline configuration json key "
            "'FilterTestMessages' was set to false)")

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, [
        plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
    ])

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date,
        pipeline_configuration.project_end_date)

    # Skipping auto-assigning noise, as an experiment on this project.
    # If it turns out we need this, uncomment this block.
    # for td in data:
    #     is_noise = True
    #     for rqa_key in cls.RQA_KEYS:
    #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
    #             is_noise = False
    #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # TODO: Label each message with channel keys
    # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
    #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR).
    # NOTE(review): noise auto-assignment above is disabled, so NOISE_KEY is presumably
    # set elsewhere (or absent) — confirm filter_noise behaves as intended here.
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Compute the number of RQA messages that were the empty string
    log.debug(
        "Counting the number of empty string messages for each raw radio show field..."
    )
    raw_rqa_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        if plan.raw_field not in raw_rqa_fields:
            raw_rqa_fields.append(plan.raw_field)
    cls.log_empty_string_stats(data, raw_rqa_fields)

    # Compute the number of survey messages that were the empty string
    log.debug(
        "Counting the number of empty string messages for each survey field..."
    )
    raw_survey_fields = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in raw_survey_fields:
            raw_survey_fields.append(plan.raw_field)
    # De-duplicate to one item per respondent (later items win) before counting.
    survey_data = dict()
    for td in data:
        survey_data[td["uid"]] = td
    cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field,
                                               plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            if plan.raw_field in td:
                rqa_messages.append(td)

        # Fixed seed keeps the ICR sample reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
args = parser.parse_args() user = args.user[0] input_path = args.input[0] json_output_path = args.json_output[0] csv_output_path = args.csv_output[0] # Load data from JSON file with open(input_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Filter out messages which are only 1 character long data = list(filter(lambda td: len(td["Message"]) > 1, data)) # Write json output if os.path.dirname(json_output_path) is not "" and not os.path.exists( os.path.dirname(json_output_path)): os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # Write CSV output if os.path.dirname(csv_output_path) is not "" and not os.path.exists( os.path.dirname(csv_output_path)): os.makedirs(os.path.dirname(csv_output_path)) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( data, f, ["avf_phone_id", "avf_message_id", "Date", "Message"])
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters radio-show messages, tags noise, labels missing responses and channels,
    then exports the non-noise messages to Coda and to CSVs for ICR.

    :param user: Identifier of the user running this program, used in TracedData metadata.
    :param data: Iterable of TracedData message objects to process.
    :param icr_output_dir: Directory to write one ICR sample CSV per RQA coding plan to.
    :param coda_output_dir: Directory to write one Coda file per RQA coding plan to.
    :return: The filtered, labelled TracedData (including noise messages).
    """
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY,
                                            cls.PROJECT_START_DATE,
                                            cls.PROJECT_END_DATE)

    # Tag messages which are noise as being noise: a message is noise iff every
    # present RQA response looks like noise to the cleaner.
    for td in data:
        is_noise = True
        for rqa_key in cls.RQA_KEYS:
            if rqa_key in td and not somali.DemographicCleaner.is_noise(
                    td[rqa_key], min_length=10):
                is_noise = False
        td.append_data({cls.NOISE_KEY: is_noise},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Label missing data: any plan with no raw response gets a TRUE_MISSING label.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(
                        Codes.TRUE_MISSING), Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    # NOTE(review): stored as a single dict, unlike the list used for
                    # coded_field above — confirm downstream consumers expect this shape.
                    missing_dict[
                        plan.binary_coded_field] = na_label.to_dict()
        td.append_data(
            missing_dict,
            Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise,
                                               plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change.
            if plan.coded_field not in td:
                rqa_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        # Fixed seed keeps the ICR sample reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
if START_TIME <= utc_time <= END_TIME: inside_time_window.append(td) else: print("Dropping: {}".format(utc_time)) print("{}:{} Dropped as outside time/Total".format( len(show_messages) - len(inside_time_window), len(show_messages))) show_messages = inside_time_window # Output messages to a CSV file IOUtils.ensure_dirs_exist_for_file(csv_output_path) run_id_key = "{} (Run ID) - {}".format(variable_name, flow_name) raw_text_key = "{} (Text) - {}".format(variable_name, flow_name) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( show_messages, f, headers=["avf_phone_id", run_id_key, raw_text_key]) # Output messages to Coda IOUtils.ensure_dirs_exist_for_file(coda_output_path) if os.path.exists(prev_coda_path): # TODO: Modifying this line once the coding frame has been developed to include lots of Nones feels a bit # TODO: cumbersome. We could instead modify export_traced_data_iterable_to_coda to support a prev_f argument. # TODO: Modify by adding code scheme keys once they are ready scheme_keys = { "Relevance": None, "Code 1": None, "Code 2": None, "Code 3": None, "Code 4": None }
TOTAL_FGD_CONTACTS = 100 # Load phone uuid table with open(phone_uuid_table_path, "r") as f: phone_uuids = PhoneNumberUuidTable.load(f) # Load FGD/CC survey responses with open(fgd_cc_input_path, "r") as f: fgd_cc_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Load the previous export prev_exports = [] if prev_exports_path is not None: with open(prev_exports_path, "r") as f: prev_exports = list( TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f)) # Load coded demog surveys with open(demog_surveys_input_path, "r") as f: surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Filter out people who haven't answered the fgd_cc consent question fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc" fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td] # Filter out people that we have exported in the past prev_contacts = {td["Phone Number"] for td in prev_exports} fgd_cc_data = [ td for td in fgd_cc_data if "+{}".format( phone_uuids.get_phone(td["avf_phone_id"])) not in prev_contacts ]