def set_matrix_keys(user, td, show_keys, coded_shows_prefix, radio_q_prefix):
    """
    Copies each coded-show value of `td` into a matrix key named after the radio question prefix,
    and appends the resulting matrix dict to `td`.

    Keys starting with `coded_shows_prefix` are renamed to start with `radio_q_prefix`; every
    renamed key is also added to `show_keys` (mutated in place).

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param td: TracedData object to read coded show keys from and write matrix keys to.
    :type td: TracedData
    :param show_keys: Set which all generated matrix keys are added to (mutated in place).
    :type show_keys: set of str
    :param coded_shows_prefix: Prefix of the coded show keys to read.
    :type coded_shows_prefix: str
    :param radio_q_prefix: Prefix to use for the generated matrix keys.
    :type radio_q_prefix: str
    """
    matrix_d = dict()

    # If the respondent was not coded (NC) or requested stop, every matrix cell is overridden with
    # that special value instead of the per-key coded value. Note "_stop" takes precedence over "_NC".
    special = None
    if td.get("{}_NC".format(coded_shows_prefix)) == "1":
        special = "0"
    if td.get("{}_stop".format(coded_shows_prefix)) == "1":
        special = "stop"

    for output_key in td:
        if output_key.startswith(coded_shows_prefix):
            code_key = output_key.replace(coded_shows_prefix, radio_q_prefix)
            # Skip the NC/stop control keys themselves; they only drive `special` above.
            if code_key.endswith("_NC") or code_key.endswith("_stop"):
                continue
            show_keys.add(code_key)
            if special is not None:
                matrix_d[code_key] = special
            else:
                matrix_d[code_key] = td[output_key]

    td.append_data(matrix_d, Metadata(user, Metadata.get_call_location(), time.time()))
def test_hide_keys(self):
    """Tests that hide_keys removes keys from all read APIs and that a hidden key can be re-set."""
    td = self.generate_test_data()
    self.assertEqual(td["id"], "0")
    self.assertEqual(td["phone"], "+441632000001")
    self.assertEqual(td["gender"], "man")

    # Hiding a key which does not exist should fail with a KeyError naming that key.
    with self.assertRaisesRegex(KeyError, "age"):
        td.hide_keys({"age"}, Metadata("test_user", "hide_keys", time.time()))

    td.hide_keys({"gender", "phone"}, Metadata("test_user", "hide_keys", time.time()))

    # Hidden keys must be invisible to membership tests, lookups, and all iteration APIs.
    self.assertTrue("id" in td)
    self.assertFalse("phone" in td)
    self.assertFalse("gender" in td)
    self.assertRaises(KeyError, lambda: td["gender"])
    self.assertRaises(KeyError, lambda: td["phone"])
    self.assertEqual(td.get("gender"), None)
    self.assertEqual(td.get("id"), "0")
    self.assertSetEqual(set(td.keys()), {"id"})
    self.assertDictEqual(dict(td.items()), {"id": "0"})
    self.assertSetEqual(set(td.values()), {"0"})
    self.assertEqual(len(td), 1)

    # Hiding an already-hidden key should also fail.
    with self.assertRaisesRegex(KeyError, "gender"):
        td.hide_keys({"gender"}, Metadata("test_user", "hide_keys", time.time()))

    # A hidden key can be written again via append_data, making it visible once more.
    td.append_data({"gender": "female"}, Metadata("test_user", "add_gender", time.time()))
    self.assertTrue("gender" in td)
    self.assertEqual(td["gender"], "female")
def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    """
    Appends an operator code label to each traced run, derived from the run's phone number prefix.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param traced_runs: TracedData objects (with an "avf_phone_id" key) to label.
    :type traced_runs: iterable of TracedData
    :param phone_number_uuid_table: Table used to batch-convert phone uuids back to phone numbers.
    """
    # Batch-convert every uuid to its phone number up-front.
    phone_lut = phone_number_uuid_table.uuid_to_data_batch({td["avf_phone_id"] for td in traced_runs})

    for td in traced_runs:
        operator_raw = phone_lut[td["avf_phone_id"]][:5]  # Returns the country code 252 and the next two digits
        operator_code = PhoneCleaner.clean_operator(operator_raw)

        # Choose the scheme code first, then build a single label from it.
        if operator_code == Codes.NOT_CODED:
            scheme_code = CodeSchemes.SOMALIA_OPERATOR.get_code_with_control_code(Codes.NOT_CODED)
        else:
            scheme_code = CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(operator_code)
        operator_label = CleaningUtils.make_label_from_cleaner_code(
            CodeSchemes.SOMALIA_OPERATOR, scheme_code, Metadata.get_call_location()
        )

        td.append_data(
            {"operator_raw": operator_raw, "operator_coded": operator_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
def compute_message_ids(cls, user, data, message_key, message_id_key_to_write):
    """
    Appends a message id to each object in the provided iterable of TracedData.

    Message ids are computed as the SHA of the value at `message_key`, so are guaranteed to be
    stable. TracedData objects which do not contain `message_key` are left unmodified.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the message ids of.
    :type data: iterable of TracedData
    :param message_key: Key in TracedData objects of the raw text to generate message ids from.
    :type message_key: str
    :param message_id_key_to_write: Key in TracedData objects to write the message id to.
    :type message_id_key_to_write: str
    """
    for td in data:
        # No raw message means no id is assigned.
        if message_key not in td:
            continue
        message_id = SHAUtils.sha_string(td[message_key])
        td.append_data({message_id_key_to_write: message_id},
                       Metadata(user, Metadata.get_call_location(), time.time()))
def determine_consent_withdrawn(cls, user, data, coding_plans, withdrawn_key="consent_withdrawn"):
    """
    Determines whether consent has been withdrawn, by searching the given coding plans for Codes.STOP.

    TracedData objects in which a stop code is found get the pair <withdrawn_key>: Codes.TRUE
    appended; all other objects are left untouched. Note that this does not set any other keys to
    Codes.STOP — use Consent.set_stopped for that.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to determine consent for.
    :type data: iterable of TracedData
    :param coding_plans:
    :type coding_plans: iterable of CodingPlan
    :param withdrawn_key: Name of key to use for the consent withdrawn field.
    :type withdrawn_key: str
    """
    for td in data:
        # Objects without a stop code are not modified at all.
        if not cls.td_has_stop_code(td, coding_plans):
            continue
        td.append_data({withdrawn_key: Codes.TRUE},
                       Metadata(user, Metadata.get_call_location(), time.time()))
def remap_key_names(cls, user, data, pipeline_configuration):
    """
    Remaps key names from their Rapid Pro names to their pipeline names.

    Activation-message remappings are skipped, as are remappings whose source key is absent
    (or None) and remappings whose target key already exists in the TracedData.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to remap the key names of.
    :type data: iterable of TracedData
    :param pipeline_configuration: Pipeline configuration.
    :type pipeline_configuration: PipelineConfiguration
    """
    for td in data:
        remapped = {
            remapping.pipeline_key: td[remapping.rapid_pro_key]
            for remapping in pipeline_configuration.rapid_pro_key_remappings
            if not remapping.is_activation_message
            and td.get(remapping.rapid_pro_key) is not None
            and remapping.pipeline_key not in td
        }
        td.append_data(remapped, Metadata(user, Metadata.get_call_location(),
                                          TimeUtils.utc_now_as_iso_string()))
def tag_beneficiary_participants(user, data, pipeline_configuration, raw_data_dir):
    """
    This tags uids who are our partners beneficiaries.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to tag listening group participation to.
    :type data: iterable of TracedData
    :param raw_data_dir: Directory containing de-identified beneficiary contacts CSVs containing
                         beneficiary data stored as `avf-phone-uuid` and `location` columns.
    :type raw_data_dir: str
    :param pipeline_configuration: Pipeline configuration.
    :type pipeline_configuration: PipelineConfiguration
    """
    beneficiary_uids = set()  # Contains avf-phone ids of partner's beneficiaries.

    # Read beneficiary file CSVs data
    for beneficiary_file_url in pipeline_configuration.beneficiary_file_urls:
        with open(f'{raw_data_dir}/{beneficiary_file_url.split("/")[-1]}', "r", encoding='utf-8-sig') as f:
            beneficiary_data = list(csv.DictReader(f))
            for row in beneficiary_data:
                beneficiary_uids.add(row['avf-phone-uuid'])

    # 1.Check if a participant is part of the beneficiary contacts then tag true or false otherwise
    # Example - "beneficiary": true
    for td in data:
        beneficiary_data = dict()  # of uid repeat and weekly listening group participation data
        beneficiary_data["beneficiary"] = td["uid"] in beneficiary_uids
        td.append_data(beneficiary_data, Metadata(user, Metadata.get_call_location(), time.time()))
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    """
    Auto-codes demographic survey answers: labels missing answers, applies automatic cleaners,
    then exports the answers to Coda files for manual verification and coding.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to auto-code.
    :type data: iterable of TracedData
    :param phone_uuid_table: NOTE(review): unused in this function body — confirm whether it can be removed.
    :param coda_output_dir: Directory to write the Coda files to.
    :type coda_output_dir: str
    :return: The input data, modified in place.
    :rtype: iterable of TracedData
    """
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                plan.cleaner, plan.code_scheme)

    # Output survey answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)
        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, 'w') as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f
            )
    print("Coda demogs files successfully exported")

    return data
def set_stopped(user, data, withdrawn_key="consent_withdrawn", additional_keys=None):
    """
    For each TracedData object in an iterable whose 'withdrawn_key' is Codes.TRUE, sets every
    other key to Codes.STOP. If there is no withdrawn_key or the value is not Codes.TRUE,
    that TracedData object is not modified.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set to stopped if consent has been withdrawn.
    :type data: iterable of TracedData
    :param withdrawn_key: Key in each TracedData object which indicates whether consent has been withdrawn.
    :type withdrawn_key: str
    :param additional_keys: Additional keys to set to 'STOP' (e.g. keys not already in some TracedData objects)
    :type additional_keys: list of str | None
    """
    if additional_keys is None:
        additional_keys = []

    for td in data:
        if td.get(withdrawn_key) == Codes.TRUE:
            # Stop every key except the withdrawal flag itself, so the flag remains readable.
            stop_dict = {
                key: Codes.STOP
                for key in list(td.keys()) + additional_keys
                if key != withdrawn_key
            }
            td.append_data(stop_dict, Metadata(user, Metadata.get_call_location(), time.time()))
def set_channel_keys(cls, user, data, time_key):
    """
    Sets a Codes.TRUE/Codes.FALSE channel key on each message for every configured channel and
    show time range, plus a NON_LOGICAL key for messages matching no channel range.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the channel keys of.
    :type data: iterable of TracedData
    :param time_key: Key in each TracedData of an ISO 8601 datetime string to test against the ranges.
    :type time_key: str
    """
    for td in data:
        timestamp = isoparse(td[time_key])
        channel_dict = dict()

        # Set channel ranges
        time_range_matches = 0
        matching_ranges = []  # Accumulates the matched ranges; used only in the assert message below.
        for key, ranges in cls.CHANNEL_RANGES.items():
            if cls.timestamp_is_in_ranges(timestamp, ranges, matching_ranges):
                time_range_matches += 1
                channel_dict[key] = Codes.TRUE
            else:
                channel_dict[key] = Codes.FALSE

        # Set time as NON_LOGICAL if it doesn't fall in range of the **sms ad/radio promo/radio_show**
        if time_range_matches == 0:
            # Assert in range of project
            assert PipelineConfiguration.PROJECT_START_DATE <= timestamp < PipelineConfiguration.PROJECT_END_DATE, \
                f"Timestamp {td[time_key]} out of range of project"
            channel_dict[cls.NON_LOGICAL_KEY] = Codes.TRUE
        else:
            # Channel ranges are expected to be disjoint, so at most one may match.
            assert time_range_matches == 1, f"Time '{td[time_key]}' matches multiple time ranges {matching_ranges}"
            channel_dict[cls.NON_LOGICAL_KEY] = Codes.FALSE

        # Set show ranges
        for key, ranges in cls.SHOW_RANGES.items():
            if cls.timestamp_is_in_ranges(timestamp, ranges, matching_ranges):
                channel_dict[key] = Codes.TRUE
            else:
                channel_dict[key] = Codes.FALSE

        td.append_data(channel_dict, Metadata(user, Metadata.get_call_location(), time.time()))
def test_fold_groups(self):
    """Tests that fold_groups reduces each group of TracedData to one object using the fold function."""
    data = [TracedData({"x": c}, Metadata("test_user", Metadata.get_call_location(), i))
            for i, c in enumerate(["a", "b", "c", "d", "e"])]

    groups = [
        [data[0]],
        [data[1], data[2], data[3]],
        [data[4]]
    ]

    def fold_fn(td_1, td_2):
        # Work on copies so the input objects are not modified by folding.
        td_1 = td_1.copy()
        td_2 = td_2.copy()

        folded_dict = {"x": "{}{}".format(td_1["x"], td_2["x"])}

        td_1.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 10))
        td_2.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 11))

        folded = td_1
        td_1.append_traced_data("folded_with", td_2, Metadata("test_user", Metadata.get_call_location(), 12))

        return folded

    folded_data = FoldTracedData.fold_groups(groups, fold_fn)

    # Singleton groups fold to themselves; the middle group concatenates its "x" values.
    self.assertDictEqual(dict(folded_data[0].items()), {"x": "a"})
    self.assertDictEqual(dict(folded_data[1].items()), {"x": "bcd"})
    self.assertDictEqual(dict(folded_data[2].items()), {"x": "e"})
def set_show_ids(cls, user, data, pipeline_configuration):
    """
    Sets a show pipeline key for each message, using the presence of Rapid Pro value keys to
    determine which show each message belongs to.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the show ids of.
    :type data: iterable of TracedData
    :param pipeline_configuration: Pipeline configuration.
    :type pipeline_configuration: PipelineConfiguration
    """
    for td in data:
        show_dict = dict()
        for remapping in pipeline_configuration.rapid_pro_key_remappings:
            # Only activation-message remappings determine show membership.
            if not remapping.is_activation_message:
                continue
            if td.get(remapping.rapid_pro_key) is None:
                continue
            # Each message must belong to at most one show.
            assert "rqa_message" not in show_dict
            show_dict["rqa_message"] = td[remapping.rapid_pro_key]
            show_dict["show_pipeline_key"] = remapping.pipeline_key
        td.append_data(show_dict, Metadata(user, Metadata.get_call_location(),
                                           TimeUtils.utc_now_as_iso_string()))
def test_update_iterable(self):
    """Tests that update_iterable joins update objects onto data objects by a shared id key."""
    data_dicts = [
        {"id": "A", "message": "hello"},
        {"id": "B", "message": "hello"},
        {"id": "A", "message": "hi"}
    ]
    data = [
        TracedData(d, Metadata("test_user", "data_generator", time.time()))
        for d in data_dicts
    ]

    updates_dicts = [
        {"id": "A", "gender": "male"},
        {"id": "B", "gender": "female", "age": 20}
    ]
    updates = [
        TracedData(d, Metadata("test_user", "data_generator", time.time()))
        for d in updates_dicts
    ]

    TracedData.update_iterable("test_user", "id", data, updates, "demographics")

    # Every data object with a matching "id" receives the update's fields; both "A" rows update.
    expected_dicts = [
        {"id": "A", "message": "hello", "gender": "male"},
        {"id": "B", "message": "hello", "gender": "female", "age": 20},
        {"id": "A", "message": "hi", "gender": "male"}
    ]

    for td, expected_dict in zip(data, expected_dicts):
        self.assertDictEqual(dict(td.items()), expected_dict)
def set_show_ids(cls, user, data, show_id_map):
    """
    Sets a show_id for each message, using the presence of Rapid Pro value keys to determine
    which show each message belongs to.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the show ids of.
    :type data: iterable of TracedData
    :param show_id_map: Dictionary of Rapid Pro value key to show id.
    :type show_id_map: dict of str -> int
    """
    for td in data:
        show_dict = dict()
        for message_key, show_id in show_id_map.items():
            if message_key not in td:
                continue
            # Each message must match at most one show.
            assert "rqa_message" not in show_dict
            show_dict["rqa_message"] = td[message_key]
            show_dict["show_id"] = show_id
        td.append_data(show_dict, Metadata(user, Metadata.get_call_location(),
                                           TimeUtils.utc_now_as_iso_string()))
def _impute_coding_error_codes(user, data):
    """
    For every coding plan whose WS-correct-dataset label is CODING_ERROR, overwrites that plan's
    coded fields with a CODING_ERROR label (a single label dict in SINGLE mode, a one-element
    list of label dicts in MULTIPLE mode).

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to impute coding error codes on.
    :type data: iterable of TracedData
    """
    for td in data:
        coding_error_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                # Compare the stored label's CodeID against the scheme's CODING_ERROR code id.
                if td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    for cc in plan.coding_configurations:
                        if cc.coding_mode == CodingModes.SINGLE:
                            coding_error_dict[cc.coded_field] = \
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()
                        else:
                            assert cc.coding_mode == CodingModes.MULTIPLE
                            # MULTIPLE-mode fields store a list of label dicts.
                            coding_error_dict[cc.coded_field] = [
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()
                            ]
        td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))
def set_channel_keys(cls, user, data, time_key):
    """
    Sets a Codes.TRUE/Codes.FALSE channel key on each message for every configured channel and
    show time range, plus a NON_LOGICAL key for messages matching no channel range.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the channel keys of.
    :type data: iterable of TracedData
    :param time_key: Key in each TracedData of an ISO 8601 datetime string to test against the ranges.
    :type time_key: str
    """
    for td in data:
        timestamp = isoparse(td[time_key])
        channel_dict = dict()

        # Set channel ranges
        time_range_matches = 0
        for key, ranges in cls.CHANNEL_RANGES.items():
            if cls.timestamp_is_in_ranges(timestamp, ranges):
                time_range_matches += 1
                channel_dict[key] = Codes.TRUE
            else:
                channel_dict[key] = Codes.FALSE

        # Set time as NON_LOGICAL if it doesn't fall in range of the **sms ad/radio promo/radio_show**
        if time_range_matches == 0:
            # Assert in range of project (project dates hard-coded for December 2018).
            assert isoparse("2018-12-02T00:00:00+03:00") <= timestamp < isoparse("2018-12-31T00:00:00+03:00"), \
                f"Timestamp {td[time_key]} out of range of project"
            channel_dict[cls.NON_LOGICAL_KEY] = Codes.TRUE
        else:
            # Channel ranges are expected to be disjoint, so at most one may match.
            assert time_range_matches == 1, f"Time '{td[time_key]}' matches multiple time ranges"
            channel_dict[cls.NON_LOGICAL_KEY] = Codes.FALSE

        # Set show ranges
        for key, ranges in cls.SHOW_RANGES.items():
            if cls.timestamp_is_in_ranges(timestamp, ranges):
                channel_dict[key] = Codes.TRUE
            else:
                channel_dict[key] = Codes.FALSE

        td.append_data(channel_dict, Metadata(user, Metadata.get_call_location(), time.time()))
def convert_facebook_comments_to_traced_data(user, dataset_name, raw_comments, facebook_uuid_table):
    """
    Converts raw Facebook comment dicts to TracedData, de-identifying the commenter's Facebook id
    and prefixing every comment field with the dataset name.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param dataset_name: Name used to prefix each comment key in the output TracedData.
    :type dataset_name: str
    :param raw_comments: Raw Facebook comment dicts to convert.
    :param facebook_uuid_table: Table used to batch-convert Facebook user ids to uuids.
    :return: The converted TracedData objects.
    :rtype: list of TracedData
    """
    log.info(f"Converting {len(raw_comments)} Facebook comments to TracedData...")

    uuid_lut = facebook_uuid_table.data_to_uuid_batch({c["from"]["id"] for c in raw_comments})

    traced_comments = []
    # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
    # to see Facebook user ids or not.
    for comment in raw_comments:
        # Normalise created_time to a validated ISO 8601 UTC string before copying.
        comment["created_time"] = isoparse(comment["created_time"]).isoformat()
        validators.validate_utc_iso_string(comment["created_time"])

        comment_dict = {"avf_facebook_id": uuid_lut[comment["from"]["id"]]}
        comment_dict.update({f"{dataset_name}.{k}": v for k, v in comment.items()})

        traced_comments.append(TracedData(
            comment_dict, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_comments)} Facebook comments to TracedData")

    return traced_comments
def _remap_radio_show_by_time_range(cls, user, data, time_key, show_pipeline_key_to_remap_to,
                                    range_start=None, range_end=None, time_to_adjust_to=None):
    """
    Remaps radio show messages received in the given time range to another radio show.
    Optionally adjusts the datetime of re-mapped messages to a constant.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the show ids of.
    :type data: iterable of TracedData
    :param time_key: Key in each TracedData of an ISO 8601-formatted datetime string to read the
                     message sent on time from.
    :type time_key: str
    :param show_pipeline_key_to_remap_to: Pipeline key to assign to messages received within the
                                          given time range.
    :type show_pipeline_key_to_remap_to: str
    :param range_start: Start datetime for the time range to remap radio show messages from,
                        inclusive. If None, defaults to the beginning of time.
    :type range_start: datetime | None
    :param range_end: End datetime for the time range to remap radio show messages from, exclusive.
                      If None, defaults to the end of time.
    :type range_end: datetime | None
    :param time_to_adjust_to: Datetime to assign to the `time_key` field of re-mapped shows.
                              If None, re-mapped shows will not have timestamps re-adjusted.
    :type time_to_adjust_to: datetime | None
    """
    # An unset bound defaults to the widest possible timezone-aware datetime.
    if range_start is None:
        range_start = pytz.utc.localize(datetime.min)
    if range_end is None:
        range_end = pytz.utc.localize(datetime.max)

    log.info(
        f"Remapping messages in time range {range_start.isoformat()} to {range_end.isoformat()} "
        f"to show {show_pipeline_key_to_remap_to}...")

    remapped_count = 0
    for td in data:
        # Remap only messages whose timestamp falls inside [range_start, range_end).
        if time_key in td and range_start <= isoparse(td[time_key]) < range_end:
            remapped_count += 1
            remapped = {"show_pipeline_key": show_pipeline_key_to_remap_to}
            if time_to_adjust_to is not None:
                remapped[time_key] = time_to_adjust_to.isoformat()
            td.append_data(remapped, Metadata(user, Metadata.get_call_location(),
                                              TimeUtils.utc_now_as_iso_string()))

    log.info(
        f"Remapped {remapped_count} messages to show {show_pipeline_key_to_remap_to}"
    )
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    """
    Downloads recovery CSVs from Google Cloud Storage, converts each row to TracedData, and exports
    each flow's results as a JSONL file under `raw_data_dir`, skipping blobs already downloaded.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param google_cloud_credentials_file_path: Path to the Google Cloud service account credentials file.
    :type google_cloud_credentials_file_path: str
    :param raw_data_dir: Directory to write the converted TracedData JSONL files to.
    :type raw_data_dir: str
    :param phone_number_uuid_table: NOTE(review): unused in this function body — confirm whether it can be removed.
    :param recovery_csv_source: Source listing the activation/survey flow blob URLs to fetch.
    """
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            # Received dates come in two formats: with and without a seconds component.
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                # Hash of the whole row gives the run a stable, unique id.
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, f)
        log.info(f"Exported TracedData")
def apply(cls, user, code_books, td):
    """
    Applies each code book to the value at its coded key in `td`, appending the translated values.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param code_books: Dictionary of coded key in `td` to the code book to apply to that key.
    :param td: TracedData object to apply the code books to.
    :type td: TracedData
    """
    translated = {
        coded_key: cls.apply_code_book(code_book, td[coded_key])
        for coded_key, code_book in code_books.items()
    }
    td.append_data(translated, Metadata(user, Metadata.get_call_location(), time.time()))
def tag_listening_groups_participants(cls, user, data, pipeline_configuration, raw_data_dir):
    """
    This tags uids who participated in repeat listening groups and/or weekly listening group sessions.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to tag listening group participation to.
    :type data: iterable of TracedData
    :param raw_data_dir: Directory containing de-identified listening groups contacts CSVs containing
                         listening groups data stored as `Name` and `avf-phone-uuid` columns.
    :type raw_data_dir: str
    :param pipeline_configuration: Pipeline configuration.
    :type pipeline_configuration: PipelineConfiguration
    """
    repeat_listening_group_participants = []  # Contains uids of listening group participants who will participate
                                              # in all listening group sessions.
    listening_group_participants = dict()  # Contains lists of weekly listening group participants. The participants
                                           # will change each week.

    # Read repeat listening group participants CSV and add their uids to repeat_listening_group_participants lists
    if os.path.exists(f'{raw_data_dir}/repeat_listening_group.csv'):
        with open(f'{raw_data_dir}/repeat_listening_group.csv', "r", encoding='utf-8-sig') as f:
            repeat_listening_group_data = list(csv.DictReader(f))
            for row in repeat_listening_group_data:
                repeat_listening_group_participants.append(row['avf-phone-uuid'])
        log.info(f'Loaded {len(repeat_listening_group_participants)} repeat listening group participants')
    else:
        log.warning(f'Skipping loading {raw_data_dir}/repeat_listening_group.csv, file not found!')

    # Read weekly listening group participants CSVs and add their uids to the respective radio-show
    # listening_group_participants lists
    listening_group_csvs = []
    for listening_group_csv_url in pipeline_configuration.listening_group_csv_urls:
        listening_group_csvs.append(listening_group_csv_url.split("/")[-1])

    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        listening_group_participants[plan.dataset_name] = set()
        if plan.listening_group_filename in listening_group_csvs:
            with open(f'{raw_data_dir}/{plan.listening_group_filename}', "r", encoding='utf-8-sig') as f:
                plan_listening_group_data = list(csv.DictReader(f))
                for row in plan_listening_group_data:
                    listening_group_participants[plan.dataset_name].add(row['avf-phone-uuid'])
            log.info(f'Loaded {len(listening_group_participants[f"{plan.dataset_name}"])} '
                     f'{plan.dataset_name} listening group participants')
        else:
            log.warning(f'Skipping loading {plan.listening_group_filename},file not found!')

    # 1.Check if a participant is part of the repeat listening groups contacts then tag true or false otherwise
    # Example - "repeat_listening_group_participant": true
    # 2.Check if a participant participated in a radio show listening group then tag true or false otherwise
    # Example - "kakuma_s01e01_listening_group_participant": false
    for td in data:
        listening_group_participation = dict()  # of uid repeat and weekly listening group participation data
        listening_group_participation["repeat_listening_group_participant"] = \
            td["uid"] in repeat_listening_group_participants
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            listening_group_participation[f'{plan.dataset_name}_listening_group_participant'] = \
                td['uid'] in listening_group_participants[plan.dataset_name]

        td.append_data(listening_group_participation, Metadata(user, Metadata.get_call_location(), time.time()))
def test_append_traced_data(self):
    """Tests that appending TracedData with conflicting values fails with an AssertionError."""
    # Note that this only tests failing appends. Successful appends are tested by the other methods in this suite.
    message_td = self.generate_message_td()
    demog_1_td = self.generate_demog_1_td()

    # Give the demog object a "message" value that conflicts with the one already in message_td.
    demog_1_td.append_data({"message": "should-fail"},
                           Metadata("test_user", "conflicting_message", time.time()))

    self.assertRaises(AssertionError,
                      lambda: message_td.append_traced_data(
                          "demog_1", demog_1_td,
                          Metadata("test_user", "demog_1_append", time.time())))
def generate_test_data():
    """
    Returns a list of test TracedData objects, with coded gender data appended to the second
    object and age data appended (as nested TracedData) to the third.

    :return: Test TracedData objects.
    :rtype: list of TracedData
    """
    test_data = list(generate_traced_data_iterable())
    test_data[1].append_data({"Gender": "f", "Gender_Coded": "Female"},
                             Metadata("test_user", "gender_coder", 10))
    test_data[2].append_traced_data("Age_Data",
                                    TracedData({"age": 4}, Metadata("test_user", "age_generator", 11)),
                                    Metadata("test_user", "age_merger", 12))
    return test_data
def generate_test_data(cls):
    """Returns a new TracedData object with example id, phone, and gender fields"""
    message_td = cls.generate_message_td()
    demog_1_td = cls.generate_demog_1_td()
    demog_2_td = cls.generate_demog_2_td()

    # Merge the two demog objects into the message object via traced appends.
    message_td.append_traced_data("demog_1", demog_1_td,
                                  Metadata("test_user", "demog_1_append", time.time()))
    message_td.append_traced_data("demog_2", demog_2_td,
                                  Metadata("test_user", "demog_2_append", time.time()))

    return message_td
def set_matrix_keys(user, data, all_matrix_keys, plan, code_ids, coded_key, matrix_prefix=""):
    """
    Converts the labels at `coded_key` of each TracedData into matrix columns: keys for codes
    present are set to Codes.MATRIX_1, all remaining keys in `all_matrix_keys` to Codes.MATRIX_0.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the matrix keys of.
    :type data: iterable of TracedData
    :param all_matrix_keys: Every matrix key which must be present in the output.
    :param plan: Coding plan whose code scheme name selects the code id lookup.
    :param code_ids: Nested dict of code scheme name -> code id -> matrix key suffix.
    :param coded_key: Key in each TracedData of the list of labels to convert.
    :type coded_key: str
    :param matrix_prefix: Prefix to prepend to each generated matrix key.
    :type matrix_prefix: str
    """
    # Hoist the per-scheme lookup out of the per-object loop.
    scheme_code_ids = code_ids[plan.code_scheme['Name']]

    for td in data:
        matrix_d = dict()
        for label in td.get(coded_key, []):
            matrix_d[f"{matrix_prefix}{scheme_code_ids[label['CodeID']]}"] = Codes.MATRIX_1
        for matrix_key in all_matrix_keys:
            matrix_d.setdefault(matrix_key, Codes.MATRIX_0)
        td.append_data(matrix_d, Metadata(user, Metadata.get_call_location(), time.time()))
def set_matrix_keys(user, data, all_matrix_keys, scheme, coded_key, matrix_prefix=""):
    """
    Converts the labels at `coded_key` of each TracedData into matrix columns: keys for codes
    present are set to Codes.MATRIX_1, all remaining keys in `all_matrix_keys` to Codes.MATRIX_0.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param data: TracedData objects to set the matrix keys of.
    :type data: iterable of TracedData
    :param all_matrix_keys: Every matrix key which must be present in the output.
    :param scheme: Code scheme used to translate each label's CodeID into a matrix key suffix.
    :param coded_key: Key in each TracedData of the list of labels to convert.
    :type coded_key: str
    :param matrix_prefix: Prefix to prepend to each generated matrix key.
    :type matrix_prefix: str
    """
    for td in data:
        matrix_d = dict()
        for label in td.get(coded_key, []):
            code_string = scheme.get_code_with_id(label['CodeID']).string_value
            matrix_d[f"{matrix_prefix}{code_string}"] = Codes.MATRIX_1
        for matrix_key in all_matrix_keys:
            matrix_d.setdefault(matrix_key, Codes.MATRIX_0)
        td.append_data(matrix_d, Metadata(user, Metadata.get_call_location(), time.time()))
def coalesce_traced_runs_by_key(user, traced_runs, coalesce_key):
    """
    Merges traced runs which share the same value at `coalesce_key` into a single TracedData each.

    The first run seen for a key becomes the base object; the data of every subsequent run with
    that key is appended onto it.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :type user: str
    :param traced_runs: TracedData objects to coalesce.
    :type traced_runs: iterable of TracedData
    :param coalesce_key: Key to group the runs by.
    :type coalesce_key: str
    :return: One coalesced TracedData per distinct value of `coalesce_key`.
    :rtype: list of TracedData
    """
    coalesced_runs = dict()

    for run in traced_runs:
        group_id = run[coalesce_key]
        existing = coalesced_runs.get(group_id)
        if existing is None:
            coalesced_runs[group_id] = run
        else:
            existing.append_data(dict(run.items()),
                                 Metadata(user, Metadata.get_call_location(),
                                          TimeUtils.utc_now_as_iso_string()))

    return list(coalesced_runs.values())
def test_fold_traced_data(self):
    """Tests that fold_traced_data applies each fold strategy per key without mutating the inputs."""
    td_1_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "abc",
        "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
        "other_1": "other 1", "other_2": "other 2"
    }

    td_2_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "def",
        "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
        "other_1": "other",
    }

    td_1 = TracedData(td_1_dict, Metadata("test_user", Metadata.get_call_location(), 0))
    td_2 = TracedData(td_2_dict, Metadata("test_user", Metadata.get_call_location(), 1))

    # Keys with no strategy ("other_1"/"other_2") are expected to be dropped from the folded result.
    fold_strategies = {
        "equal_1": FoldStrategies.assert_equal,
        "equal_2": FoldStrategies.assert_equal,
        "concat": FoldStrategies.concatenate,
        "bool_1": FoldStrategies.boolean_or,
        "bool_2": FoldStrategies.boolean_or,
        "matrix_1": FoldStrategies.matrix,
        "matrix_2": FoldStrategies.matrix,
        "yes_no_1": FoldStrategies.yes_no_amb,
        "yes_no_2": FoldStrategies.yes_no_amb
    }
    folded_td = FoldTracedData.fold_traced_data("test_user", td_1, td_2, fold_strategies)

    # Test input tds unchanged
    self.assertDictEqual(dict(td_1.items()), td_1_dict)
    self.assertDictEqual(dict(td_2.items()), td_2_dict)

    # Test folded td has expected values
    self.assertDictEqual(
        dict(folded_td.items()),
        {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "abc;def",
            "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
        }
    )
def import_csv_to_traced_data_iterable(user, f):
    """
    Loads a CSV into new TracedData objects, one per row.

    :param user: Identifier of user running this program.
    :type user: str
    :param f: File to import from.
    :type f: file-like
    :return: TracedData objects imported from the provided file.
    :rtype: generator of TracedData
    """
    reader = csv.DictReader(f)
    for row in reader:
        yield TracedData(dict(row),
                         Metadata(user, Metadata.get_call_location(), time.time()))
def apply_cleaner_to_text(cls, cleaner, text, scheme, set_checked=False):
    """
    Applies a cleaning function to a text, and returns a label if the cleaned value wasn't NC.

    :param cleaner: Cleaning function to apply.
    :type cleaner: function of str -> str
    :param text: Text to apply the cleaner to.
    :type text: str
    :param scheme: Scheme containing codes which the string returned from the `cleaner` can be
                   matched against.
    :type scheme: core_data_modules.data_models.CodeScheme
    :param set_checked: Whether to set the `checked` property of the applied Label.
    :type set_checked: bool
    :return: Label for the matched code, or None if the cleaner could not code the text.
    """
    clean_value = cleaner(text)

    # Data which the cleaner couldn't code gets no label.
    if clean_value == Codes.NOT_CODED:
        return None

    # Build a label from the scheme code matching the cleaned value, attributing it to the cleaner.
    matched_code = scheme.get_code_with_match_value(clean_value)
    origin_id = Metadata.get_function_location(cleaner)
    return cls.make_label_from_cleaner_code(scheme, matched_code, origin_id, set_checked=set_checked)