def make_label_from_cleaner_code(scheme, code, origin_id, origin_name="Pipeline Auto-Coder",
                                 date_time_utc=None, set_checked=False):
    """
    Constructs a new Label object from a code determined by a pipeline cleaner.

    :param scheme: Scheme which the `code` argument belongs to.
    :type scheme: core_data_modules.data_models.CodeScheme
    :param code: Code to construct the label from.
    :type code: Code
    :param origin_id: Identifier of the origin of this label.
    :type origin_id: str
    :param origin_name: Name of the origin of this label.
    :type origin_name: str
    :param date_time_utc: Date to set in the label as an ISO string in UTC, or None.
                          If None, uses the current system time in UTC.
    :type date_time_utc: str | None
    :param set_checked: Whether to set the `checked` property of the returned Label.
    :type set_checked: bool
    :return: A new label.
    :rtype: Label
    """
    if date_time_utc is None:
        date_time_utc = TimeUtils.utc_now_as_iso_string()

    origin = Origin(origin_id, origin_name, "External")
    return Label(scheme.scheme_id, code.code_id, date_time_utc, origin, checked=set_checked)
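# Hedged usage sketch for make_label_from_cleaner_code. Assumes `gender_scheme` is a loaded
# CodeScheme, `td` is a TracedData object, `user` is the pipeline user, and "gender_coded"
# is the coded key; all of these names are illustrative, not part of this module.
na_code = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING)
na_label = CleaningUtils.make_label_from_cleaner_code(
    gender_scheme, na_code, Metadata.get_call_location()
)
td.append_data(
    {"gender_coded": [na_label.to_dict()]},
    Metadata(user, Metadata.get_call_location(), time.time())
)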
def make_label(scheme, code, origin_id, origin_name="Pipeline Auto-Coder", date_time_utc=None):
    if date_time_utc is None:
        date_time_utc = datetime.now().astimezone(pytz.utc).isoformat()

    origin = Origin(origin_id, origin_name, "External")
    return Label(scheme, code, date_time_utc, origin, checked=False)
def test_fold_list_of_labels(self):
    na_code = Code("code-NA", "Control", "NA", -10, "NA", True, control_code=Codes.TRUE_MISSING)
    nr_code = Code("code-NR", "Control", "NR", -20, "NR", True, control_code=Codes.NOT_REVIEWED)
    nc_code = Code("code-NC", "Control", "NC", -30, "NC", True, control_code=Codes.NOT_CODED)
    normal_1_code = Code("code-normal-1", "Normal", "Normal 1", 1, "normal_1", True)
    normal_2_code = Code("code-normal-2", "Normal", "Normal 2", 2, "normal_2", True)

    scheme_1 = CodeScheme("scheme-1", "Scheme 1", "1", [na_code, nr_code, nc_code, normal_1_code, normal_2_code])
    scheme_2 = CodeScheme("scheme-2", "Scheme 2", "2", [])

    na_label = Label("scheme-1", "code-NA", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
    nr_label = Label("scheme-1", "code-NR", "2019-10-01T12:25:18Z", Origin("x", "test", "automatic")).to_dict()
    nc_label = Label("scheme-1", "code-NC", "2019-10-01T12:30:00Z", Origin("x", "test", "automatic")).to_dict()
    na_label_2 = Label("scheme-1", "code-NA", "2019-10-01T13:00:00Z", Origin("x", "test", "automatic")).to_dict()
    normal_1_label = Label("scheme-1", "code-normal-1", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
    normal_1_label_2 = Label("scheme-1", "code-normal-1", "2019-10-03T00:00:00Z", Origin("x", "test", "automatic")).to_dict()
    normal_2_label = Label("scheme-1", "code-normal-2", "2019-10-01T15:00:00Z", Origin("x", "test", "automatic")).to_dict()

    # Test empty lists are rejected
    self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [], []))
    self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [na_label], []))

    # Test lists containing only NA labels return a single NA label
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [na_label]), [na_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [na_label_2]), [na_label])

    # Test lists containing an NA label and another label (including another NA label) are rejected
    self.assertRaises(AssertionError,
                      lambda: FoldStrategies.list_of_labels(scheme_1, [na_label, na_label], [na_label]))
    self.assertRaises(AssertionError,
                      lambda: FoldStrategies.list_of_labels(scheme_1, [na_label, normal_1_label], [na_label]))

    # Test folding a normal label with an NA label
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [normal_1_label]), [normal_1_label])

    # Test folding various combinations of only normal labels
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [normal_1_label]), [normal_1_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [normal_1_label]),
                     [normal_1_label, normal_2_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [normal_1_label_2]),
                     [normal_1_label, normal_2_label])

    # Test folding normal labels with a control code that isn't NA or NC
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [nr_label]),
                     [normal_1_label, normal_2_label, nr_label])

    # Test folding a label from a different code scheme
    self.assertRaises(AssertionError,
                      lambda: FoldStrategies.list_of_labels(scheme_2, [normal_1_label], [na_label]))
    # (make sure that test would have been ok with the correct code scheme)
    FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [na_label])

    # Test folding normal codes with NC codes
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [nc_label], [nc_label]), [nc_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [nc_label]), [nc_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [nc_label]), [normal_1_label])
    self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [normal_2_label, nc_label]),
                     [normal_1_label, normal_2_label])
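# The tests above pin down the expected fold behaviour: each input list must be non-empty,
# belong to the given scheme, and may only contain an NA (TRUE_MISSING) label on its own;
# the fold takes the union of labels by code id, drops NA/NC labels whenever any other label
# survives, and otherwise prefers NC over NA. The sketch below is consistent with those tests
# and is an illustration of that behaviour only, not the library's implementation.
def fold_list_of_labels_sketch(scheme, labels_1, labels_2):
    def control_code(label):
        return scheme.get_code_with_code_id(label["CodeID"]).control_code

    def check(labels):
        # Each input must be non-empty, belong to `scheme`, and may only contain an NA label alone.
        assert len(labels) > 0
        for label in labels:
            assert label["SchemeID"] == scheme.scheme_id
            if control_code(label) == Codes.TRUE_MISSING:
                assert len(labels) == 1

    check(labels_1)
    check(labels_2)

    # Union of both lists, keeping the first label seen for each code id.
    folded, seen_code_ids = [], set()
    for label in labels_1 + labels_2:
        if label["CodeID"] not in seen_code_ids:
            folded.append(label)
            seen_code_ids.add(label["CodeID"])

    # Drop NA/NC labels whenever any other label survived the fold; if only missing labels
    # remain, prefer NC over NA.
    non_missing = [l for l in folded if control_code(l) not in {Codes.TRUE_MISSING, Codes.NOT_CODED}]
    if len(non_missing) > 0:
        return non_missing
    nc_labels = [l for l in folded if control_code(l) == Codes.NOT_CODED]
    return nc_labels if len(nc_labels) > 0 else folded[:1]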
def test_assert_label_ids_equal(self):
    self.assertEqual(
        FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        ),
        Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
    )

    self.assertEqual(
        FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-1", "code-2", "2019-10-14T12:20:14Z", Origin("y", "test-2", "manual")).to_dict()
        ),
        Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
    )

    try:
        FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-1", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        )
        self.fail("No AssertionError raised")
    except AssertionError as e:
        if str(e) == "No AssertionError raised":
            raise e
        self.assertEqual(str(e),
                         "Labels should have the same SchemeID and CodeID, but at least one of those is different "
                         "(differing values were {'SchemeID': 'scheme-1', 'CodeID': 'code-1'} "
                         "and {'SchemeID': 'scheme-1', 'CodeID': 'code-2'})")

    try:
        FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-2", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        )
        self.fail("No AssertionError raised")
    except AssertionError as e:
        if str(e) == "No AssertionError raised":
            raise e
        self.assertEqual(str(e),
                         "Labels should have the same SchemeID and CodeID, but at least one of those is different "
                         "(differing values were {'SchemeID': 'scheme-1', 'CodeID': 'code-2'} "
                         "and {'SchemeID': 'scheme-2', 'CodeID': 'code-2'})")
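# These tests pin down both the return value (the first label) and the exact assertion message.
# A minimal sketch consistent with them, shown only as an illustration of the expected behaviour,
# not the library's implementation:
def assert_label_ids_equal_sketch(label_1, label_2):
    ids_1 = {"SchemeID": label_1["SchemeID"], "CodeID": label_1["CodeID"]}
    ids_2 = {"SchemeID": label_2["SchemeID"], "CodeID": label_2["CodeID"]}
    assert ids_1 == ids_2, \
        f"Labels should have the same SchemeID and CodeID, but at least one of those is different " \
        f"(differing values were {ids_1} and {ids_2})"
    return label_1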
def import_coda_2_to_traced_data_iterable_multi_coded(cls, user, data, message_id_key, scheme_key_map, f=None):
    """
    Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

    Data which has not been checked in the Coda file is coded as NOT_REVIEWED
    (irrespective of whether there was an automatic code there before).

    Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the
    duplicate_scheme tool in CodaV2/data_tools will be detected as being associated with the primary
    scheme automatically.

    TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC, not NR.
    TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

    :param user: Identifier of user running this program.
    :type user: str
    :param data: TracedData objects to be coded using the Coda file.
    :type data: iterable of TracedData
    :param message_id_key: Key in TracedData objects of the message ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                           (Scheme in the Coda messages file to retrieve the labels from)
    :type scheme_key_map: dict of str -> Scheme
    :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
    :type f: file-like | None
    """
    if f is None:
        f = cls._make_empty_file()

    # Build a lookup table of MessageID -> SchemeID -> Labels
    coda_dataset = cls._dataset_lut_from_messages_file(f, scheme_key_map.values())

    # Filter out TracedData objects that do not contain a message id key
    data = [td for td in data if message_id_key in td]

    # Apply the labels from Coda to each TracedData item in data
    for td in data:
        for coded_key, scheme in scheme_key_map.items():
            # Get labels for this (message id, scheme id) from the look-up table
            labels = coda_dataset.get(td[message_id_key], dict()).get(scheme.scheme_id, [])

            # Get the currently assigned list of labels for this multi-coded scheme,
            # and construct a look-up table of scheme id -> label
            td_labels = td.get(coded_key, [])
            td_labels_lut = {label["SchemeID"]: Label.from_dict(label) for label in td_labels}

            for label in reversed(labels):
                # Update the relevant label in this traced data's list of labels with the new label,
                # and append the whole new list to the traced data.
                td_labels_lut[label.scheme_id] = label

                td_labels = list(td_labels_lut.values())
                td.append_data(
                    {coded_key: [label.to_dict() for label in td_labels]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

            # Delete any labels that are SPECIAL-MANUALLY_UNCODED
            for scheme_id, label in list(td_labels_lut.items()):
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    del td_labels_lut[scheme_id]

            td_labels = list(td_labels_lut.values())
            td.append_data(
                {coded_key: [label.to_dict() for label in td_labels]},
                Metadata(user, Metadata.get_call_location(), time.time()))

            # If no labels have been manually set and checked, set a code for NOT_REVIEWED
            checked_codes_count = 0
            labels = td.get(coded_key)
            if labels is not None:
                for label in labels:
                    if label["Checked"]:
                        checked_codes_count += 1
            if checked_codes_count == 0:
                nr_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme, scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                    Metadata.get_call_location()
                )
                td.append_data({coded_key: [nr_label.to_dict()]},
                               Metadata(user, Metadata.get_call_location(), time.time()))

            # Normalise the scheme ids of all the imported labels
            labels = [Label.from_dict(d) for d in td[coded_key]]
            for label in labels:
                assert label.scheme_id.startswith(scheme.scheme_id)
                label.scheme_id = scheme.scheme_id

            # De-duplicate the imported labels by selecting the first label with each code id.
            # This is required in cases where the same label was applied to this message under different columns
            # of the same code scheme, and is possible now that we have normalised the scheme ids.
            unique_labels_by_code_id = []
            seen_code_ids = set()
            for label in labels:
                if label.code_id not in seen_code_ids:
                    unique_labels_by_code_id.append(label)
                    seen_code_ids.add(label.code_id)

            td.append_data(
                {coded_key: [label.to_dict() for label in unique_labels_by_code_id]},
                Metadata(user, Metadata.get_call_location(), time.time()))
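# Hedged usage sketch for the multi-coded import. Assumes `data` is an iterable of TracedData
# whose message ids were set earlier (e.g. via TracedDataCoda2IO.add_message_ids); the file
# paths, key names, and scheme variable are illustrative only.
with open("coda_scheme.json") as scheme_file:
    s01e01_scheme = Scheme.from_firebase_map(json.load(scheme_file))

with open("coda_messages.json") as messages_file:
    TracedDataCoda2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        user, data, "s01e01_message_id",
        {"s01e01_coded": s01e01_scheme},
        messages_file
    )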
def export_traced_data_iterable_to_coda_2(cls, data, raw_key, creation_date_time_key, message_id_key,
                                          scheme_key_map, f):
    """
    Exports an iterable of TracedData to a messages json file suitable for upload into Coda V2.

    Data is de-duplicated on export.

    This function will not export data objects which do not contain the raw_key, or for which the value
    at the raw_key is an empty string.

    Data which has been coded as NOT_CODED will be exported but without the NOT_CODED label.

    TracedData objects with the same message id must have the same labels applied, otherwise this exporter will fail.

    :param data: Data to export to Coda V2.
    :type data: iterable of TracedData
    :param raw_key: Key in TracedData objects of the raw messages.
    :type raw_key: str
    :param creation_date_time_key: Key in TracedData objects of when the message was created.
    :type creation_date_time_key: str
    :param message_id_key: Key in TracedData objects of the message id.
                           Message Ids may be set using TracedDataCoda2IO.add_message_ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects of coded data to export) -> (Scheme for that key)
    :type scheme_key_map: dict of str -> Scheme
    :param f: File to write exported JSON file to.
    :type f: file-like
    """
    # Filter data for elements which contain a value for the given raw key that isn't empty string
    filtered_data = [td for td in data if td.get(raw_key, "") != ""]

    cls._assert_uniquely_coded(filtered_data, message_id_key, scheme_key_map.keys())
    filtered_data = cls._filter_duplicates(filtered_data, message_id_key, creation_date_time_key)

    coda_messages = []  # List of Coda V2 Message objects to be exported
    for td in filtered_data:
        # Export labels for this row which are not Codes.NOT_CODED
        labels = []
        for coded_key, scheme in scheme_key_map.items():
            if coded_key in td and \
                    scheme.get_code_with_code_id(td[coded_key]["CodeID"]).control_code != Codes.NOT_CODED:
                labels.append(Label.from_firebase_map(td[coded_key]))

        # Create a Coda message object for this row
        message = Message(
            message_id=td[message_id_key],
            text=td[raw_key],
            creation_date_time_utc=isoparse(td[creation_date_time_key]).astimezone(pytz.utc).isoformat(),
            labels=labels
        )

        coda_messages.append(message)

    json.dump([m.to_firebase_map() for m in coda_messages], f, sort_keys=True, indent=2, separators=(", ", ": "))
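# Hedged usage sketch for the export direction, mirroring the import example above;
# the key names, scheme variable, and output path are illustrative only.
with open("coda_messages_to_upload.json", "w") as messages_file:
    TracedDataCoda2IO.export_traced_data_iterable_to_coda_2(
        data, "s01e01_raw", "s01e01_sent_on", "s01e01_message_id",
        {"s01e01_coded": s01e01_scheme},
        messages_file
    )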
messages = [Message.from_firebase_map(d) for d in json.load(f)]
log.info(f"Loaded {len(messages)} messages")

log.info(f"Performing merge ({code_ids_to_merge} -> '{merged_code_id}')...")
merged_count = 0  # A count of the number of labels that were remapped to the merged value, for sense-check logging
for msg in messages:
    processed_scheme_ids = set()
    for label in list(msg.labels):
        # Skip labels that are not the latest assignment under each scheme
        if label.scheme_id in processed_scheme_ids:
            continue
        processed_scheme_ids.add(label.scheme_id)

        if label.code_id in code_ids_to_merge:
            msg.labels.insert(
                0,
                Label(label.scheme_id, merged_code_id, TimeUtils.utc_now_as_iso_string(),
                      Origin(Metadata.get_call_location(), "Auto Code-Merge", "External"),
                      checked=label.checked)
            )
            merged_count += 1
log.info(f"Merged {merged_count} labels to '{merged_code_id}'")

log.info(f"Exporting code-merged Coda messages to '{messages_output_file_path}'...")
with open(messages_output_file_path, "w") as f:
    json.dump([msg.to_firebase_map() for msg in messages], f, indent=2)
log.info("Done")
def predict_labels_for_dataset(dataset_id):
    DATASET_ID = dataset_id

    fcw.set_dataset_autolabel_complete(DATASET_ID, 0.0)

    log(f"Predicting labels for: {DATASET_ID}")

    code_scheme_ids = fcw.get_code_scheme_ids(DATASET_ID)
    log(f"Code_Scheme_IDs for: {code_scheme_ids}")

    code_schemes = {}
    for code_scheme_id in code_scheme_ids:
        fb_map_scheme = fcw.get_code_scheme(DATASET_ID, code_scheme_id)
        code_schemes[code_scheme_id] = Scheme.from_firebase_map(fb_map_scheme)
    log(f"Code_schemes: {len(code_schemes)}")

    messages_fb = fcw.get_all_messages(DATASET_ID)
    messages = []
    seq_num_map = {}
    for message_fb in messages_fb:
        seq_num_map[message_fb["MessageID"]] = message_fb["SequenceNumber"]

        # Work around Firebase rewriting '1.0' as '1' by coercing Confidence back to a float
        for label_map in message_fb["Labels"]:
            if "Confidence" in label_map:
                label_map["Confidence"] = float(label_map["Confidence"])

        messages.append(Message.from_firebase_map(message_fb))
    log(f"Messages: {len(messages)}")

    for scheme_id in code_scheme_ids:
        log(f"Processing scheme: {scheme_id}")

        # Collect the manually checked messages for this scheme as training data
        messages_for_model = []
        labels_for_model = []
        for message in messages:
            for label in message.labels:
                if label.scheme_id != scheme_id:
                    continue
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    continue
                if not label.checked:
                    continue
                messages_for_model.append(message.text)
                labels_for_model.append(label.code_id)
                break
        log(f"Messages for model: {len(labels_for_model)}")

        model, scores = model_utils.build_and_evaluate(messages_for_model, labels_for_model)
        log("Model built")
        log(f"Scores: {str(scores)}")

        dt_time = pytz.utc.localize(datetime.utcnow()).isoformat(timespec="microseconds")
        origin = Origin("label_predictor", "Label Predictor", "Automatic")

        message_update_batch = []
        for i, message in enumerate(messages, start=1):
            if i % 100 == 0:
                fcw.set_dataset_autolabel_complete(DATASET_ID, i / len(messages))
                print(f"{i} messages / {len(messages)} processed")

            # Don't overwrite messages whose latest label has been manually checked
            if len(message.labels) != 0 and message.labels[0].checked:
                continue

            msg = message.text
            pred_label = model.predict([msg])[0]
            pred_distance = model.decision_function([msg])[0]
            max_confidence = max(model.predict_proba([msg])[0])

            if max_confidence > 0.8:
                label = Label(scheme_id, pred_label, dt_time, origin, confidence=max_confidence)
                message.labels = [label]
                firebase_map = message.to_firebase_map()
                firebase_map["SequenceNumber"] = seq_num_map[message.message_id]
                message_update_batch.append(firebase_map)

            if len(message_update_batch) > 100:
                fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
                log(f"Messages updated {len(message_update_batch)}")
                message_update_batch.clear()

        # Write any messages remaining in the final partial batch
        fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
        log(f"Messages updated {len(message_update_batch)}")

    fcw.set_dataset_autolabel_complete(DATASET_ID, 1.0)
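# model_utils.build_and_evaluate is only called above, not defined in this section. The sketch
# below shows one hypothetical way such a helper could be written so that the returned model
# supports the predict, decision_function and predict_proba calls used by the caller; it is an
# assumption for illustration, not the project's actual implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline


def build_and_evaluate(texts, code_ids):
    # Tf-idf features over unigrams/bigrams feeding a logistic regression classifier,
    # which exposes decision_function and predict_proba through the pipeline.
    model = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
        ("clf", LogisticRegression(max_iter=1000)),
    ])

    # Cross-validated accuracy as a simple quality signal, then refit on all the data.
    scores = cross_val_score(model, texts, code_ids, cv=3)
    model.fit(texts, code_ids)
    return model, scores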