    def make_label_from_cleaner_code(scheme,
                                     code,
                                     origin_id,
                                     origin_name="Pipeline Auto-Coder",
                                     date_time_utc=None,
                                     set_checked=False):
        """
        Constructs a new Label object from a code determined by a pipeline cleaner.

        :param scheme: Scheme which the `code` argument belongs to.
        :type scheme: core_data_modules.data_models.CodeScheme
        :param code: Code to construct the label from.
        :type code: Code
        :param origin_id: Identifier of the origin of this label.
        :type origin_id: str
        :param origin_name: Name of the origin of this label.
        :type origin_name: str
        :param date_time_utc: Date to set in the label as an ISO string in UTC, or None.
                              If None, uses the current system time in UTC.
        :type date_time_utc: str | None
        :param set_checked: Whether to set the `checked` property of the returned Label.
        :type set_checked: bool
        :return: A new label.
        :rtype: Label
        """
        if date_time_utc is None:
            date_time_utc = TimeUtils.utc_now_as_iso_string()

        origin = Origin(origin_id, origin_name, "External")

        return Label(scheme.scheme_id,
                     code.code_id,
                     date_time_utc,
                     origin,
                     checked=set_checked)
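For context, a usage sketch of the helper above. The scheme object (gender_scheme) and the resulting label's destination are hypothetical; CleaningUtils, Codes and Metadata are the names used elsewhere in these examples.

# Hypothetical usage: label a message as TRUE_MISSING using the helper above.
na_code = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING)
na_label = CleaningUtils.make_label_from_cleaner_code(
    gender_scheme,
    na_code,
    origin_id=Metadata.get_call_location()
)
label_dict = na_label.to_dict()  # serialisable form, ready to store on a TracedData object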
Example #2
    def make_label(scheme,
                   code,
                   origin_id,
                   origin_name="Pipeline Auto-Coder",
                   date_time_utc=None):
        if date_time_utc is None:
            date_time_utc = datetime.now().astimezone(pytz.utc).isoformat()

        origin = Origin(origin_id, origin_name, "External")

        return Label(scheme, code, date_time_utc, origin, checked=False)
    def test_fold_list_of_labels(self):
        na_code = Code("code-NA", "Control", "NA", -10, "NA", True, control_code=Codes.TRUE_MISSING)
        nr_code = Code("code-NR", "Control", "NR", -20, "NR", True, control_code=Codes.NOT_REVIEWED)
        nc_code = Code("code-NC", "Control", "NC", -30, "NC", True, control_code=Codes.NOT_CODED)
        normal_1_code = Code("code-normal-1", "Normal", "Normal 1", 1, "normal_1", True)
        normal_2_code = Code("code-normal-2", "Normal", "Normal 2", 2, "normal_2", True)
        scheme_1 = CodeScheme("scheme-1", "Scheme 1", "1", [na_code, nr_code, nc_code, normal_1_code, normal_2_code])

        scheme_2 = CodeScheme("scheme-2", "Scheme 2", "2", [])

        na_label = Label("scheme-1", "code-NA", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        nr_label = Label("scheme-1", "code-NR", "2019-10-01T12:25:18Z", Origin("x", "test", "automatic")).to_dict()
        nc_label = Label("scheme-1", "code-NC", "2019-10-01T12:30:00Z", Origin("x", "test", "automatic")).to_dict()
        na_label_2 = Label("scheme-1", "code-NA", "2019-10-01T13:00:00Z", Origin("x", "test", "automatic")).to_dict()
        normal_1_label = Label("scheme-1", "code-normal-1", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        normal_1_label_2 = Label("scheme-1", "code-normal-1", "2019-10-03T00:00:00Z", Origin("x", "test", "automatic")).to_dict()
        normal_2_label = Label("scheme-1", "code-normal-2", "2019-10-01T15:00:00Z", Origin("x", "test", "automatic")).to_dict()

        # Test empty lists are rejected
        self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [], []))
        self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [na_label], []))

        # Test lists containing only NA labels return a single NA label
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [na_label]), [na_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [na_label_2]), [na_label])

        # Test lists containing an NA label and another label (including another NA label) are rejected
        self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [na_label, na_label], [na_label]))
        self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_1, [na_label, normal_1_label], [na_label]))

        # Test folding a normal label with an NA label
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [normal_1_label]), [normal_1_label])
        
        # Test folding various combinations of only normal labels
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [normal_1_label]), [normal_1_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [normal_1_label]),
                         [normal_1_label, normal_2_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [normal_1_label_2]),
                         [normal_1_label, normal_2_label])

        # Test folding normal labels with a control code that isn't NA or NC
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label, normal_2_label], [nr_label]),
                         [normal_1_label, normal_2_label, nr_label])

        # Test folding a label from a different code scheme
        self.assertRaises(AssertionError, lambda: FoldStrategies.list_of_labels(scheme_2, [normal_1_label], [na_label]))
        # (make sure that test would have been ok with the correct code scheme)
        FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [na_label])

        # Test folding normal codes with NC codes
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [nc_label], [nc_label]), [nc_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [na_label], [nc_label]), [nc_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [nc_label]), [normal_1_label])
        self.assertEqual(FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [normal_2_label, nc_label]),
                         [normal_1_label, normal_2_label])
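The assertions above pin down the fold semantics without showing the implementation. Below is a minimal sketch that re-derives those semantics from the assertions; it is illustrative only, not the library's actual FoldStrategies.list_of_labels.

# Illustrative sketch only: reproduces the fold behaviour asserted in the test above.
def fold_list_of_labels_sketch(scheme, labels_1, labels_2):
    assert len(labels_1) > 0 and len(labels_2) > 0  # empty label lists are rejected

    def control_code(label):
        return scheme.get_code_with_code_id(label["CodeID"]).control_code

    for labels in (labels_1, labels_2):
        for label in labels:
            assert label["SchemeID"] == scheme.scheme_id  # labels must belong to this scheme
            if control_code(label) == Codes.TRUE_MISSING:
                assert len(labels) == 1  # an NA label may not be combined with any other label

    # Merge the two lists, dropping NA/NC labels and keeping the first label seen per code id
    folded = []
    seen_code_ids = set()
    for label in labels_1 + labels_2:
        if control_code(label) in (Codes.TRUE_MISSING, Codes.NOT_CODED):
            continue
        if label["CodeID"] not in seen_code_ids:
            folded.append(label)
            seen_code_ids.add(label["CodeID"])

    if len(folded) > 0:
        return folded

    # Only NA/NC labels were present: prefer NC over NA, keeping the first matching label
    for code in (Codes.NOT_CODED, Codes.TRUE_MISSING):
        for label in labels_1 + labels_2:
            if control_code(label) == code:
                return [label]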
    def test_assert_label_ids_equal(self):
        self.assertEqual(FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
        ), Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict())

        self.assertEqual(FoldStrategies.assert_label_ids_equal(
            Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
            Label("scheme-1", "code-2", "2019-10-14T12:20:14Z", Origin("y", "test-2", "manual")).to_dict()
        ), Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict())

        try:
            FoldStrategies.assert_label_ids_equal(
                Label("scheme-1", "code-1", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
                Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
            )
            self.fail("No AssertionError raised")
        except AssertionError as e:
            if str(e) == "No AssertionError raised":
                raise e

            self.assertEqual(str(e),
                             "Labels should have the same SchemeID and CodeID, but at least one of those is different "
                             "(differing values were {'SchemeID': 'scheme-1', 'CodeID': 'code-1'} "
                             "and {'SchemeID': 'scheme-1', 'CodeID': 'code-2'})")

        try:
            FoldStrategies.assert_label_ids_equal(
                Label("scheme-1", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict(),
                Label("scheme-2", "code-2", "2019-10-01T12:20:14Z", Origin("x", "test", "automatic")).to_dict()
            )
            self.fail("No AssertionError raised")
        except AssertionError as e:
            if str(e) == "No AssertionError raised":
                raise e

            self.assertEqual(str(e),
                             "Labels should have the same SchemeID and CodeID, but at least one of those is different "
                             "(differing values were {'SchemeID': 'scheme-1', 'CodeID': 'code-2'} "
                             "and {'SchemeID': 'scheme-2', 'CodeID': 'code-2'})")
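For comparison, a sketch consistent with the behaviour asserted above: the two labels must share a SchemeID and CodeID, and the first label is returned. This is illustrative only, not necessarily the library's actual assert_label_ids_equal.

# Illustrative sketch only, consistent with the assertions above.
def assert_label_ids_equal_sketch(label_1, label_2):
    ids_1 = {"SchemeID": label_1["SchemeID"], "CodeID": label_1["CodeID"]}
    ids_2 = {"SchemeID": label_2["SchemeID"], "CodeID": label_2["CodeID"]}
    assert ids_1 == ids_2, \
        "Labels should have the same SchemeID and CodeID, but at least one of those is different " \
        f"(differing values were {ids_1} and {ids_2})"
    return label_1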
Example #5
    def import_coda_2_to_traced_data_iterable_multi_coded(
            cls, user, data, message_id_key, scheme_key_map, f=None):
        """
        Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

        Data which has not been checked in the Coda file is coded as NOT_REVIEWED
        (irrespective of whether there was an automatic code there before).
        
        Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the duplicate_scheme
        tool in CodaV2/data_tools will be detected as being associated with the primary scheme automatically.

        TODO: Data which has been assigned a code under one scheme but none of the others needs to be coded as NC, not NR
        TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

        :param user: Identifier of user running this program.
        :type user: str
        :param data: TracedData objects to be coded using the Coda file.
        :type data: iterable of TracedData
        :param message_id_key: Key in TracedData objects of the message ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                               (Scheme in the Coda messages file to retrieve the labels from)
        :type scheme_key_map: dict of str -> Scheme
        :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
        :type f: file-like | None
        """
        if f is None:
            f = cls._make_empty_file()

        # Build a lookup table of MessageID -> SchemeID -> Labels
        coda_dataset = cls._dataset_lut_from_messages_file(
            f, scheme_key_map.values())

        # Filter out TracedData objects that do not contain a message id key
        data = [td for td in data if message_id_key in td]

        # Apply the labels from Coda to each TracedData item in data
        for td in data:
            for coded_key, scheme in scheme_key_map.items():
                # Get labels for this (message id, scheme id) from the look-up table
                labels = coda_dataset.get(td[message_id_key],
                                          dict()).get(scheme.scheme_id, [])

                # Get the currently assigned list of labels for this multi-coded scheme,
                # and construct a look-up table of scheme id -> label
                td_labels = td.get(coded_key, [])
                td_labels_lut = {
                    label["SchemeID"]: Label.from_dict(label)
                    for label in td_labels
                }

                for label in reversed(labels):
                    # Update the relevant label in this traced data's list of labels with the new label,
                    # and append the whole new list to the traced data.
                    td_labels_lut[label.scheme_id] = label

                    td_labels = list(td_labels_lut.values())
                    td.append_data(
                        {coded_key: [label.to_dict() for label in td_labels]},
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                # Delete any labels that are SPECIAL-MANUALLY_UNCODED
                for scheme_id, label in list(td_labels_lut.items()):
                    if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                        del td_labels_lut[scheme_id]
                        td_labels = list(td_labels_lut.values())
                        td.append_data(
                            {
                                coded_key:
                                [label.to_dict() for label in td_labels]
                            },
                            Metadata(user, Metadata.get_call_location(),
                                     time.time()))

                # If no labels have been manually set and checked, assign a NOT_REVIEWED code
                checked_codes_count = 0
                labels = td.get(coded_key)
                if labels is not None:
                    for label in labels:
                        if label["Checked"]:
                            checked_codes_count += 1

                if checked_codes_count == 0:
                    nr_label = CleaningUtils.make_label_from_cleaner_code(
                        scheme,
                        scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                        Metadata.get_call_location())

                    td.append_data({coded_key: [nr_label.to_dict()]},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                # Normalise the scheme ids of all the imported labels
                labels = [Label.from_dict(d) for d in td[coded_key]]
                for label in labels:
                    assert label.scheme_id.startswith(scheme.scheme_id)
                    label.scheme_id = scheme.scheme_id

                # De-duplicate the imported labels by selecting the first label with each code id.
                # This is required in cases where the same label was applied to this message under different columns
                # of the same code scheme, and is possible now that we have normalised the scheme ids.
                unique_labels_by_code_id = []
                seen_code_ids = set()
                for label in labels:
                    if label.code_id not in seen_code_ids:
                        unique_labels_by_code_id.append(label)
                        seen_code_ids.add(label.code_id)

                td.append_data(
                    {
                        coded_key: [
                            label.to_dict()
                            for label in unique_labels_by_code_id
                        ]
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))
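A hypothetical call to the importer above; the user id, the TracedData iterable (data), the reasons_scheme object and the file path are illustrative only.

# Hypothetical usage of the multi-coded importer above.
with open("coda_exports/reasons_messages.json") as coda_file:
    TracedDataCoda2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", data,
        message_id_key="reasons_message_id",
        scheme_key_map={"reasons_coded": reasons_scheme},
        f=coda_file
    )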
Example #6
    def export_traced_data_iterable_to_coda_2(cls, data, raw_key,
                                              creation_date_time_key,
                                              message_id_key, scheme_key_map,
                                              f):
        """
        Exports an iterable of TracedData to a messages json file suitable for upload into Coda V2.

        Data is de-duplicated on export.

        This function will not export data objects which do not contain the raw_key, or for which the value at the
        raw_key is an empty string.
        Data which has been coded as NOT_CODED will be exported but without the NOT_CODED label.
        TracedData objects with the same message id must have the same labels applied, otherwise this exporter will
        fail.

        :param data: Data to export to Coda V2.
        :type data: iterable of TracedData
        :param raw_key: Key in TracedData objects of the raw messages.
        :type raw_key: str
        :param creation_date_time_key: Key in TracedData objects of when the message was created.
        :type creation_date_time_key: str
        :param message_id_key: Key in TracedData objects of the message id.
                               Message Ids may be set using TracedDataCoda2IO.add_message_ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects of coded data to export) ->
                               (Scheme for that key)
        :type scheme_key_map: dict of str -> Scheme
        :param f: File to write exported JSON file to.
        :type f: file-like
        """
        # Filter data for elements which contain a value for the given raw key that isn't the empty string
        filtered_data = [td for td in data if td.get(raw_key, "") != ""]

        cls._assert_uniquely_coded(filtered_data, message_id_key,
                                   scheme_key_map.keys())
        filtered_data = cls._filter_duplicates(filtered_data, message_id_key,
                                               creation_date_time_key)

        coda_messages = []  # List of Coda V2 Message objects to be exported
        for td in filtered_data:
            # Export labels for this row which are not Codes.NOT_CODED
            labels = []
            for coded_key, scheme in scheme_key_map.items():
                if coded_key in td and \
                        scheme.get_code_with_code_id(td[coded_key]["CodeID"]).control_code != Codes.NOT_CODED:
                    labels.append(Label.from_firebase_map(td[coded_key]))

            # Create a Coda message object for this row
            message = Message(message_id=td[message_id_key],
                              text=td[raw_key],
                              creation_date_time_utc=isoparse(
                                  td[creation_date_time_key]).astimezone(
                                      pytz.utc).isoformat(),
                              labels=labels)

            coda_messages.append(message)

        json.dump([m.to_firebase_map() for m in coda_messages],
                  f,
                  sort_keys=True,
                  indent=2,
                  separators=(", ", ": "))
Example #7
# `f` is assumed to be an already-opened Coda messages JSON file
messages = [Message.from_firebase_map(d) for d in json.load(f)]
log.info(f"Loaded {len(messages)} messages")

log.info(f"Performing merge ({code_ids_to_merge} -> '{merged_code_id}')...")
merged_count = 0  # A count of the number of labels that were remapped to the merged value, for sense-check logging
for msg in messages:
    processed_scheme_ids = set()
    for label in list(msg.labels):
        # Skip labels that are not the latest assignment under each scheme
        if label.scheme_id in processed_scheme_ids:
            continue
        processed_scheme_ids.add(label.scheme_id)

        if label.code_id in code_ids_to_merge:
            msg.labels.insert(
                0,
                Label(label.scheme_id,
                      merged_code_id,
                      TimeUtils.utc_now_as_iso_string(),
                      Origin(Metadata.get_call_location(), "Auto Code-Merge",
                             "External"),
                      checked=label.checked))
            merged_count += 1
log.info(f"Merged {merged_count} labels to '{merged_code_id}'")

log.info(
    f"Exporting code-merged Coda messages to '{messages_output_file_path}'...")
with open(messages_output_file_path, "w") as f:
    json.dump([msg.to_firebase_map() for msg in messages], f, indent=2)
log.info("Done")
Example #8
def predict_labels_for_dataset(dataset_id):
    DATASET_ID = dataset_id
    fcw.set_dataset_autolabel_complete(DATASET_ID, 0.0)
    log(f"Predicting labels for: {DATASET_ID}")
    code_scheme_ids = fcw.get_code_scheme_ids(DATASET_ID)
    log(f"Code_Scheme_IDs for: {code_scheme_ids}")

    code_schemes = {}
    for code_scheme_id in code_scheme_ids:
        fb_map_scheme = fcw.get_code_scheme(DATASET_ID, code_scheme_id)
        code_schemes[code_scheme_id] = Scheme.from_firebase_map(fb_map_scheme)

    log(f"Code_schemes: {len(code_schemes)}")

    messages_fb = fcw.get_all_messages(DATASET_ID)
    messages = []
    seq_num_map = {}
    for message_fb in messages_fb:
        seq_num_map[message_fb["MessageID"]] = message_fb["SequenceNumber"]
        # Work around Firebase rewriting '1.0' as '1' by coercing Confidence back to a float
        for label_map in message_fb["Labels"]:
            if "Confidence" in label_map:
                label_map["Confidence"] = float(label_map["Confidence"])

        messages.append(Message.from_firebase_map(message_fb))

    log(f"Messages: {len(messages)}")

    for scheme_id in code_scheme_ids:
        log(f"Processing scheme: {scheme_id}")

        messages_for_model = []
        labels_for_model = []
        for message in messages:
            for label in message.labels:
                if label.scheme_id != scheme_id:
                    continue
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    continue
                if not label.checked:
                    continue

                messages_for_model.append(message.text)
                labels_for_model.append(label.code_id)
                break

        log(f"Messages for model: {len(labels_for_model)}")

        model, scores = model_utils.build_and_evaluate(messages_for_model,
                                                       labels_for_model)

        log(f"Model built")
        log(f"Scores: {str(scores)}")

        dt_time = pytz.utc.localize(
            datetime.utcnow()).isoformat(timespec="microseconds")
        origin = Origin("label_predictor", "Label Predictor", "Automatic")

        messages_to_predict = []
        message_update_batch = []
        i = 0
        for message in messages:
            i = i + 1
            if i % 100 == 0:
                fcw.set_dataset_autolabel_complete(DATASET_ID,
                                                   i / len(messages))
                print(f"{i} messages / {len(messages)} processed")

            if len(message.labels) != 0 and message.labels[0].checked:
                continue
            msg = message.text

            pred_label = model.predict([msg])[0]
            pred_distance = model.decision_function([msg])[0]
            max_confidence = max(model.predict_proba([msg])[0])

            if max_confidence > 0.8:
                label = Label(scheme_id,
                              pred_label,
                              dt_time,
                              origin,
                              confidence=max_confidence)
                message.labels = [label]
                firebase_map = message.to_firebase_map()
                firebase_map["SequenceNumber"] = seq_num_map[
                    message.message_id]
                message_update_batch.append(firebase_map)

                if len(message_update_batch) > 100:
                    fcw.set_messages_content_batch(DATASET_ID,
                                                   message_update_batch)
                    log(f"Messages updated {len(message_update_batch)}")
                    message_update_batch.clear()

        fcw.set_messages_content_batch(DATASET_ID, message_update_batch)
        log(f"Messages updated {len(message_update_batch)}")
        fcw.set_dataset_autolabel_complete(DATASET_ID, 1.0)
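model_utils.build_and_evaluate is not shown in this example. Below is a minimal sketch of a plausible implementation that satisfies the calls made above (predict, decision_function and predict_proba), using a scikit-learn TF-IDF + logistic regression pipeline; the estimator choice and evaluation split are assumptions, not the actual implementation.

# Sketch only: a plausible build_and_evaluate compatible with the calls above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


def build_and_evaluate(texts, code_ids, test_size=0.2):
    # Hold out part of the labelled data to report a simple accuracy score
    train_x, test_x, train_y, test_y = train_test_split(texts, code_ids, test_size=test_size)

    # TF-IDF features + logistic regression support predict, decision_function and predict_proba
    model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
    model.fit(train_x, train_y)

    scores = {"accuracy": model.score(test_x, test_y)}
    return model, scores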