Beispiel #1
0
def create_examples(dialog_filename, slot_list, set_type):
    """Build one ``util.InputExample`` per turn of every dialogue in a JSON file.

    Args:
        dialog_filename: Path of a JSON file that decodes to an iterable of
            dialogues, each providing 'dialogue_id' and a list of 'turns'.
        slot_list: Passed through unchanged to ``get_turn_label``.
        set_type: Prefix of every guid; a guid has the form
            ``<set_type>-<dialogue_id>-<turn index>``.

    Returns:
        A list of ``util.InputExample`` objects, in file order.
    """
    with open(dialog_filename) as handle:
        dialogues = json.load(handle)

    collected = []
    for dialogue in dialogues:
        dialogue_id = dialogue['dialogue_id']
        # 'dialogue_state' of the preceding turn; an empty list before the
        # first turn of each dialogue.
        previous_state = []
        for turn_index, turn in enumerate(dialogue['turns']):
            guid = '%s-%s-%s' % (set_type, dialogue_id, turn_index)
            turn_labels = get_turn_label(turn,
                                         previous_state,
                                         slot_list,
                                         dialogue_id,
                                         turn_index,
                                         slot_last_occurrence=True)
            # get_turn_label must yield exactly these five items, in order.
            (system_tokens, system_token_labels, user_tokens,
             user_token_labels, class_types) = turn_labels
            example = util.InputExample(guid=guid,
                                        text_a=system_tokens,
                                        text_b=user_tokens,
                                        text_a_label=system_token_labels,
                                        text_b_label=user_token_labels,
                                        class_label=class_types)
            collected.append(example)
            previous_state = turn['dialogue_state']
    return collected
Beispiel #2
0
def create_examples(dialog_filename,
                    slot_list,
                    set_type,
                    use_asr_hyp=0,
                    exclude_unpointable=True):
    """Turn the dialogues of a JSON file into ``util.InputExample`` objects.

    Every turn yields one example per user utterance variant. All variants
    of the same turn share one guid of the form
    ``<set_type>-<dialogue_idx>-<turn_idx>``.

    Args:
        dialog_filename: Path of a JSON file that decodes to an iterable of
            dialogues; each has 'dialogue_idx' and a 'dialogue' list of turns.
            A turn supplies 'turn_idx', 'system_transcript', 'turn_label' and
            either 'transcript' or 'asr'.
        slot_list: Slots for which labels are produced in every example.
        set_type: Prefix used in the guid.
        use_asr_hyp: 0 uses the turn's 'transcript'; any other value uses the
            first ``use_asr_hyp`` entries of the turn's 'asr' list (the first
            element of each entry is taken as the utterance).
        exclude_unpointable: When true, an example is dropped if any slot's
            class type is 'unpointable'.

    Returns:
        A list of ``util.InputExample`` objects, in file order.
    """
    with open(dialog_filename) as handle:
        dialogues = json.load(handle)

    collected = []
    for dialogue in dialogues:
        for turn in dialogue['dialogue']:
            dialogue_key = str(dialogue['dialogue_idx'])
            turn_key = str(turn['turn_idx'])
            guid = '%s-%s-%s' % (set_type, dialogue_key, turn_key)

            system_tokens = tokenize(turn['system_transcript'])

            if use_asr_hyp == 0:
                user_token_variants = [tokenize(turn['transcript'])]
            else:
                user_token_variants = [
                    tokenize(hypothesis)
                    for hypothesis, _ in turn['asr'][:use_asr_hyp]
                ]

            # Strip each slot/value and map it through FIX when FIX knows it.
            normalized_pairs = []
            for raw_slot, raw_value in turn['turn_label']:
                slot_name = raw_slot.strip()
                slot_name = FIX.get(slot_name, slot_name)
                slot_value = raw_value.strip()
                slot_value = FIX.get(slot_value, slot_value)
                normalized_pairs.append([slot_name, slot_value])

            for user_tokens in user_token_variants:
                system_labels = {}
                user_labels = {}
                class_types = {}
                for slot in slot_list:
                    # First pair naming this slot wins; 'none' if absent.
                    label = next(
                        (value for name, value in normalized_pairs
                         if name == slot),
                        'none')
                    slot_result = get_turn_label(label,
                                                 system_tokens,
                                                 user_tokens,
                                                 slot_last_occurrence=True)
                    system_label, user_label, class_type = slot_result
                    system_labels[slot] = system_label
                    user_labels[slot] = user_label
                    class_types[slot] = class_type
                    if class_type == 'unpointable':
                        tf.logging.info(
                            'Unpointable: guid=%s, slot=%s, label=%s, usr_utt=%s, sys_utt=%s'
                            % (guid, slot, label, user_tokens, system_tokens))

                has_unpointable = 'unpointable' in class_types.values()
                if has_unpointable and exclude_unpointable:
                    continue
                collected.append(
                    util.InputExample(guid=guid,
                                      text_a=system_tokens,
                                      text_b=user_tokens,
                                      text_a_label=system_labels,
                                      text_b_label=user_labels,
                                      class_label=class_types))
    return collected
Beispiel #3
0
def create_examples(dialog_filename, slot_list, set_type, use_asr_hyp=0,
                    exclude_unpointable=True):
    """Turn the dialogues of a JSON file into ``util.InputExample`` objects.

    The first three dialogues are also dumped to stdout as a preview. Every
    turn yields one example per user utterance variant; all variants of the
    same turn share one guid ``<set_type>-<dialogue_idx>-<turn_idx>``.

    Args:
        dialog_filename: Path of a JSON file that decodes to a sequence of
            dialogues; each has 'dialogue_idx' and a 'dialogue' list of turns.
            A turn supplies 'turn_idx', 'system_transcript', 'turn_label' and
            either 'transcript' or 'asr'.
        slot_list: Slots for which labels are produced in every example.
        set_type: Prefix used in the guid.
        use_asr_hyp: 0 uses the turn's 'transcript'; any other value uses the
            first ``use_asr_hyp`` entries of the turn's 'asr' list (the first
            element of each entry is taken as the utterance).
        exclude_unpointable: When true, an example is dropped if any slot's
            class type is 'unpointable'.

    Returns:
        A list of ``util.InputExample`` objects, in file order.
    """
    with open(dialog_filename) as handle:
        dialogues = json.load(handle)

    collected = []
    for position, dialogue in enumerate(dialogues):
        # Preview of the raw data: only the first three dialogues are shown.
        if position < 3:
            print('[DIAL][{}]'.format(position))
            pprint(dialogue)

        for turn in dialogue['dialogue']:
            dialogue_key = str(dialogue['dialogue_idx'])
            turn_key = str(turn['turn_idx'])
            guid = '%s-%s-%s' % (set_type, dialogue_key, turn_key)

            # Tokenized system utterance.
            system_tokens = tokenize(turn['system_transcript'])

            # Tokenized user utterance(s): the transcript, or ASR hypotheses.
            if use_asr_hyp == 0:
                user_token_variants = [tokenize(turn['transcript'])]
            else:
                user_token_variants = [
                    tokenize(hypothesis)
                    for hypothesis, _ in turn['asr'][:use_asr_hyp]
                ]

            # Slot/value pairs of the turn, stripped and mapped through FIX
            # when FIX has an entry for them.
            normalized_pairs = []
            for raw_slot, raw_value in turn['turn_label']:
                slot_name = raw_slot.strip()
                slot_name = FIX.get(slot_name, slot_name)
                slot_value = raw_value.strip()
                slot_value = FIX.get(slot_value, slot_value)
                normalized_pairs.append([slot_name, slot_value])

            for user_tokens in user_token_variants:
                system_labels = {}
                user_labels = {}
                class_types = {}
                for slot in slot_list:
                    # First pair naming this slot wins; 'none' if absent.
                    label = next(
                        (value for name, value in normalized_pairs
                         if name == slot),
                        'none')
                    slot_result = get_turn_label(
                        label, system_tokens, user_tokens,
                        slot_last_occurrence=True)
                    system_label, user_label, class_type = slot_result
                    system_labels[slot] = system_label
                    user_labels[slot] = user_label
                    class_types[slot] = class_type
                    if class_type == 'unpointable':
                        tf.logging.info(
                            'Unpointable: guid=%s, slot=%s, label=%s, usr_utt=%s, sys_utt=%s' % (
                                guid, slot, label, user_tokens, system_tokens))

                has_unpointable = 'unpointable' in class_types.values()
                if has_unpointable and exclude_unpointable:
                    continue
                collected.append(util.InputExample(
                    guid=guid,
                    text_a=system_tokens,
                    text_b=user_tokens,
                    text_a_label=system_labels,
                    text_b_label=user_labels,
                    class_label=class_types))
    return collected