def _add_target_prob_to_res(self, res, row_data):
    """Copy the target probability, label and logit columns into ``res``.

    Each of the three columns is read from ``row_data``, decoded with
    ``parse_json_array`` and stored under the matching ``Target`` field.

    Args:
        res: Mapping being assembled for the row; it is updated in place.
        row_data: Row keyed by column name. ``DFColumn.TARGET_PROBS``,
            ``DFColumn.TARGET_LABELS`` and ``DFColumn.TARGET_LOGITS`` are
            indexed directly, so all three must be present.

    Returns:
        None. The result is written into ``res``.
    """
    # (destination field in res, source column in row_data), in write order.
    field_and_column = (
        (Target.TARGET_PROB_FIELD, DFColumn.TARGET_PROBS),
        (Target.TARGET_LABEL_FIELD, DFColumn.TARGET_LABELS),
        (Target.TARGET_LOGITS_FIELD, DFColumn.TARGET_LOGITS),
    )
    for target_field, source_column in field_and_column:
        res[target_field] = parse_json_array(row_data[source_column])
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Featurize every utterance of a row and assemble its dataset fields.

    The value under ``DFColumn.UTTERANCE`` is decoded with
    ``data.parse_json_array`` and each decoded utterance is passed through
    ``self.featurizer``.

    Args:
        row_data (Dict[str, Any]): One row keyed by column name.
            ``DFColumn.UTTERANCE`` and ``DFColumn.DOC_LABEL`` are indexed
            directly, so both must be present.

    Returns:
        Dict[str, Any]: A dict with three entries, in this order:
            ``DatasetFieldName.TEXT_FIELD``: the ``tokens`` of each
                featurized utterance, one list entry per utterance.
            ``DatasetFieldName.DOC_LABEL_FIELD``: the row's doc label,
                passed through unchanged.
            ``DatasetFieldName.UTTERANCE_FIELD``: the row's utterance
                column exactly as given (not the decoded sequence).
    """
    raw_utterances = row_data[DFColumn.UTTERANCE]

    # Featurize all utterances first, then pull the tokens out.
    featurized = []
    for text in data.parse_json_array(raw_utterances):
        featurized.append(self.featurizer.featurize(InputRecord(raw_text=text)))

    row = {}
    # features
    row[DatasetFieldName.TEXT_FIELD] = [feats.tokens for feats in featurized]
    # labels
    row[DatasetFieldName.DOC_LABEL_FIELD] = row_data[DFColumn.DOC_LABEL]
    row[DatasetFieldName.UTTERANCE_FIELD] = raw_utterances
    return row
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Featurize every utterance of a row, keeping optional dense features.

    The value under ``DFColumn.UTTERANCE`` is decoded with
    ``data.parse_json_array`` and each decoded utterance is passed through
    ``self.featurizer``.

    Args:
        row_data (Dict[str, Any]): One row keyed by column name.
            ``DFColumn.UTTERANCE`` and ``DFColumn.DOC_LABEL`` are indexed
            directly, so both must be present. ``DFColumn.DENSE_FEAT`` is
            optional.

    Returns:
        Dict[str, Any]: A dict with these entries:
            ``ModelInput.WORD_FEAT``: the ``tokens`` of each featurized
                utterance, one list entry per utterance.
            ``DatasetFieldName.DOC_LABEL_FIELD``: the row's doc label,
                passed through unchanged.
            ``DatasetFieldName.UTTERANCE_FIELD``: the row's utterance
                column exactly as given (not the decoded sequence).
            ``ModelInput.DENSE_FEAT``: the row's dense feature value,
                only when ``DFColumn.DENSE_FEAT`` is a key of ``row_data``.
    """
    utterances = data.parse_json_array(row_data[DFColumn.UTTERANCE])

    # Featurize all utterances first, then pull the tokens out.
    featurized = []
    for text in utterances:
        record = InputRecord(raw_text=text)
        featurized.append(self.featurizer.featurize(record))

    res = {}
    # features
    res[ModelInput.WORD_FEAT] = [feats.tokens for feats in featurized]
    # labels
    res[DatasetFieldName.DOC_LABEL_FIELD] = row_data[DFColumn.DOC_LABEL]
    res[DatasetFieldName.UTTERANCE_FIELD] = row_data[DFColumn.UTTERANCE]

    # Dense features are optional: rows without the column are returned as is.
    if DFColumn.DENSE_FEAT not in row_data:
        return res
    res[ModelInput.DENSE_FEAT] = row_data[DFColumn.DENSE_FEAT]
    return res
def preprocess_row(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
    """Featurize a row that holds a sequence of utterances.

    Steps:
        1. Decode ``row_data[RawData.TEXT]`` with ``data.parse_json_array``
           and featurize every utterance with ``self.featurizer``.
        2. Attach the row's raw dictionary features to the last (current)
           utterance only; context utterances are featurized without them.
        3. When word labels are configured, align them with the token
           ranges of the last utterance.

    Args:
        row_data (Dict[str, Any]): One row keyed by column name.
            ``RawData.TEXT``, ``RawData.DOC_LABEL`` and
            ``RawData.WORD_LABEL`` are indexed directly, so they must be
            present. ``ModelInput.DICT``, ``RawData.DOC_WEIGHT`` and
            ``RawData.WORD_WEIGHT`` are optional. The decoded utterance
            sequence must be non-empty because its last element is indexed.

    Returns:
        Dict[str, Any]: The preprocessed row with these entries:
            ``ModelInput.SEQ``: the ``tokens`` of every featurized
                utterance, one list entry per utterance.
            ``ModelInput.TEXT``: the ``tokens`` of the last utterance.
            ``ModelInput.DICT``: tuple of the last utterance's
                ``gazetteer_feats``, ``gazetteer_feat_weights`` and
                ``gazetteer_feat_lengths``. It is always set.
            ``ModelInput.CHAR``: the last utterance's ``characters``.
            ``ModelInput.PRETRAINED``: the last utterance's
                ``pretrained_token_embedding``.
            ``DocLabelConfig._name``: the row's doc label, unchanged.
            ``ExtraField.DOC_WEIGHT`` / ``ExtraField.WORD_WEIGHT``: the
                row's weights, or ``1.0`` when the value is missing or
                falsy.
            ``ExtraField.RAW_WORD_LABEL``: the row's raw word label.
            ``ExtraField.UTTERANCE``: the row's text column exactly as
                given (not the decoded sequence).
            ``ExtraField.TOKEN_RANGE``: the last utterance's
                ``token_ranges``.
            ``WordLabelConfig._name``: the output of
                ``data.align_slot_labels`` for the last utterance, only
                when that name is a key of ``self.labels``.
    """
    utterances = data.parse_json_array(row_data[RawData.TEXT])

    # ignore dictionary feature for context sentences other than the last one
    featurized = []
    for context_text in utterances[:-1]:
        featurized.append(
            self.featurizer.featurize(InputRecord(raw_text=context_text))
        )

    # adding dictionary feature for the last (current) message
    current_record = InputRecord(
        raw_text=utterances[-1],
        raw_gazetteer_feats=row_data.get(ModelInput.DICT, ""),
    )
    current = self.featurizer.featurize(current_record)
    featurized.append(current)

    res = {}

    # features
    res[ModelInput.SEQ] = [feats.tokens for feats in featurized]
    res[ModelInput.TEXT] = current.tokens
    res[ModelInput.DICT] = (
        current.gazetteer_feats,
        current.gazetteer_feat_weights,
        current.gazetteer_feat_lengths,
    )
    res[ModelInput.CHAR] = current.characters
    res[ModelInput.PRETRAINED] = current.pretrained_token_embedding

    # labels
    res[DocLabelConfig._name] = row_data[RawData.DOC_LABEL]

    # extra data
    # TODO move the logic to FloatField
    # Any falsy weight (missing, None, 0, "") falls back to 1.0.
    doc_weight = row_data.get(RawData.DOC_WEIGHT)
    res[ExtraField.DOC_WEIGHT] = doc_weight if doc_weight else 1.0
    word_weight = row_data.get(RawData.WORD_WEIGHT)
    res[ExtraField.WORD_WEIGHT] = word_weight if word_weight else 1.0
    raw_word_label = row_data[RawData.WORD_LABEL]
    res[ExtraField.RAW_WORD_LABEL] = raw_word_label
    res[ExtraField.UTTERANCE] = row_data[RawData.TEXT]
    res[ExtraField.TOKEN_RANGE] = current.token_ranges

    # Word labels are only aligned when a word label config is registered.
    if WordLabelConfig._name not in self.labels:
        return res

    # TODO move it into word label field
    word_label_config = self.labels[WordLabelConfig._name]
    res[WordLabelConfig._name] = data.align_slot_labels(
        current.token_ranges,
        raw_word_label,
        word_label_config.use_bio_labels,
    )
    return res