Python init_test_data Examples, external.jjuraska_slug2slug.data_loader.init_test_data Python Examples

Example #1

0

Show file

File: data_analysis.py Project: varunSabnis/datatuner

def score_slot_realizations(path, filename):
    """Analyzes unrealized and hallucinated slot mentions in the utterances."""

    errors = []
    incorrect_slots = []
    # slot_cnt = 0

    # print('Analyzing missing slot realizations and hallucinations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(path, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    # DEBUG PRINT
    # print('\n'.join([mr1 + '\n' + mr2 + '\n' for mr1, mr2 in zip(mrs_orig, mrs)]))

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value

            # Count auxiliary slots
            # if not re.match(r'<!.*?>', slot):
            #     slot_cnt += 1

        # TODO: get rid of this hack
        # Move the food-slot to the end of the dict (because of delexing)
        if 'food' in mr_dict:
            food_val = mr_dict['food']
            del(mr_dict['food'])
            mr_dict['food'] = food_val

        # Delexicalize the MR and the utterance
        utterances[i] = data_loader.delex_sample(mr_dict, utterances[i], dataset=dataset_name)

        # Count the missing and hallucinated slots in the utterance
        cur_errors, cur_incorrect_slots = count_errors(utterances[i], mr_dict)
        errors.append(cur_errors)
        incorrect_slots.append(', '.join(cur_incorrect_slots))

    # DEBUG PRINT
    # print(slot_cnt)

    new_df = pd.DataFrame(columns=['mr', 'ref', 'errors', 'incorrect slots'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['errors'] = errors
    new_df['incorrect slots'] = incorrect_slots

    filename_out = os.path.splitext(filename)[0] + ' [errors].csv'
    new_df.to_csv(os.path.join(path, filename_out), index=False, encoding='utf8')

Example #2

0

Show file

File: data_analysis.py Project: varunSabnis/datatuner

def align_slots(dataset, filename):
    """Aligns slots of the MRs with their mentions in the corresponding utterances."""

    alignments = []
    alignment_strings = []

    print('Aligning slots in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value

        alignments.append(find_alignment(utterances[i], mr_dict))

    for i in range(len(utterances)):
        alignment_strings.append(' '.join(['({0}: {1})'.format(pos, slot) for pos, slot, _ in alignments[i]]))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'alignment'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['alignment'] = alignment_strings

    filename_out = os.path.splitext(filename)[0] + ' [aligned].csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')

Example #3

0

Show file

File: data_analysis.py Project: varunSabnis/datatuner

def analyze_contrast_relations(dataset, filename):
    """Identifies the slots involved in a contrast relation."""

    contrast_connectors = ['but', 'however', 'yet']
    slots_before = []
    slots_after = []

    print('Analyzing contrast relations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]

    for mr, utt in zip(mrs, utterances_orig):
        mr_dict = OrderedDict()
        mr_list_augm = []

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, slot_orig, value_orig = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value
            mr_list_augm.append((slot, value_orig))

        # Find the slot alignment
        alignment = find_alignment(utt, mr_dict)

        slot_before = None
        slot_after = None

        for contrast_conn in contrast_connectors:
            contrast_pos = utt.find(contrast_conn)
            if contrast_pos >= 0:
                slot_before = None
                slot_after = None

                for pos, slot, value in alignment:
                    slot_before = slot_after
                    slot_after = slot

                    if pos > contrast_pos:
                        break

                break

        slots_before.append(slot_before if slot_before is not None else '')
        slots_after.append(slot_after if slot_after is not None else '')

    # Calculate the frequency distribution of slots involved in a contrast relation
    contrast_slot_cnt = Counter()
    contrast_slot_cnt.update(slots_before + slots_after)
    del contrast_slot_cnt['']
    print('\n---- Slot distribution in contrast relations ----\n')
    print('\n'.join(slot + ': ' + str(freq) for slot, freq in contrast_slot_cnt.most_common()))

    # Calculate the frequency distribution of slot pairs involved in a contrast relation
    contrast_slot_cnt = Counter()
    slot_pairs = [tuple(sorted(slot_pair)) for slot_pair in zip(slots_before, slots_after) if slot_pair != ('', '')]
    contrast_slot_cnt.update(slot_pairs)
    print('\n---- Slot pair distribution in contrast relations ----\n')
    print('\n'.join(slot_pair[0] + ', ' + slot_pair[1] + ': ' + str(freq) for slot_pair, freq in contrast_slot_cnt.most_common()))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'slot before contrast', 'slot after contrast'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['slot before contrast'] = slots_before
    new_df['slot after contrast'] = slots_after

    filename_out = os.path.splitext(filename)[0] + ' [contrast relations].csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')

Example #4

0

Show file

File: data_analysis.py Project: varunSabnis/datatuner

def score_contrast(dataset, filename):
    """Determines whether the indicated contrast relation is correctly realized in the utterance."""

    contrast_connectors = ['but', 'however', 'yet']
    contrast_missed = []
    contrast_incorrectness = []
    contrast_total = []

    print('Analyzing contrast realizations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.EVAL_DIR, dataset, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for i, mr in enumerate(mrs):
        contrast_found = False
        contrast_correct = False
        contrast_slots = []
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)

            # Extract slots to be contrasted
            if slot in [config.CONTRAST_TOKEN, config.CONCESSION_TOKEN]:
                contrast_slots.extend(value.split())
            else:
                mr_dict[slot] = value

        # Delexicalize the MR and the utterance
        utterances[i] = data_loader.delex_sample(mr_dict, utterances[i], dataset=dataset_name)

        # Determine the slot alignment in the utterance
        alignment = find_alignment(utterances[i], mr_dict)

        contrast_total.append(1 if len(contrast_slots) > 0 else 0)

        if len(contrast_slots) > 0:
            for contrast_conn in contrast_connectors:
                contrast_pos = utterances[i].find(contrast_conn)
                if contrast_pos < 0:
                    continue

                slot_left_pos = -1
                slot_right_pos = -1
                dist = 0

                contrast_found = True

                # Check whether the correct pair of slots was contrasted
                for pos, slot, _ in alignment:
                    # DEBUG PRINT
                    # print(alignment)
                    # print(contrast_slots)
                    # print()

                    if slot_left_pos > -1:
                        dist += 1

                    if slot in contrast_slots:
                        if slot_left_pos == -1:
                            slot_left_pos = pos
                        else:
                            slot_right_pos = pos
                            break

                if slot_left_pos > -1 and slot_right_pos > -1:
                    if slot_left_pos < contrast_pos < slot_right_pos and dist <= 2:
                        contrast_correct = True
                        break
        else:
            contrast_found = True
            contrast_correct = True

        contrast_missed.append(0 if contrast_found else 1)
        contrast_incorrectness.append(0 if contrast_correct else 1)

    new_df = pd.DataFrame(columns=['mr', 'ref', 'missed contrast', 'incorrect contrast', 'total contrast'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['missed contrast'] = contrast_missed
    new_df['incorrect contrast'] = contrast_incorrectness
    new_df['total contrast'] = contrast_total

    filename_out = os.path.splitext(filename)[0] + ' [contrast eval].csv'
    new_df.to_csv(os.path.join(config.EVAL_DIR, dataset, filename_out), index=False, encoding='utf8')

Example #5

0

Show file

File: data_analysis.py Project: varunSabnis/datatuner

def score_emphasis(dataset, filename):
    """Determines how many of the indicated emphasis instances are realized in the utterance."""

    emph_missed = []
    emph_total = []

    print('Analyzing emphasis realizations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.EVAL_DIR, dataset, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for i, mr in enumerate(mrs):
        expect_emph = False
        emph_slots = set()
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)

            # Extract slots to be emphasized
            if slot == config.EMPH_TOKEN:
                expect_emph = True
            else:
                mr_dict[slot] = value
                if expect_emph:
                    emph_slots.add(slot)
                    expect_emph = False

        # Delexicalize the MR and the utterance
        utterances[i] = data_loader.delex_sample(mr_dict, utterances[i], dataset=dataset_name)

        # Determine the slot alignment in the utterance
        alignment = find_alignment(utterances[i], mr_dict)

        emph_total.append(len(emph_slots))

        # Check how many emphasized slots were not realized before the name-slot
        for pos, slot, _ in alignment:
            # DEBUG PRINT
            # print(alignment)
            # print(emph_slots)
            # print()

            if slot == 'name':
                break

            if slot in emph_slots:
                emph_slots.remove(slot)

        emph_missed.append(len(emph_slots))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'missed emphasis', 'total emphasis'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['missed emphasis'] = emph_missed
    new_df['total emphasis'] = emph_total

    filename_out = os.path.splitext(filename)[0] + ' [emphasis eval].csv'
    new_df.to_csv(os.path.join(config.EVAL_DIR, dataset, filename_out), index=False, encoding='utf8')

Example #6

0

Show file

File: data_augmentation.py Project: varunSabnis/datatuner

def augment_with_aux_indicators(dataset,
                                filename,
                                indicators,
                                mode='all',
                                alt_contrast_mode=False):
    """Augment MRs in a dataset with auxiliary tokens indicating desired discourse phenomena in the corresponding
    utterances. Depending on the mode, the augmented dataset will only contain samples which exhibit 1.) at most one
    of the desired indicators ('single'), 2.) the one selected indicator only ('only'), or 3.) all the desired
    indicators at once ('combo'). The default mode ('all') keeps all samples in the dataset.
    """

    if indicators is None or len(indicators) == 0:
        return

    mrs_augm = []
    mrs_single = []
    utterances_single = []
    mrs_emph_only = []
    utterances_emph_only = []
    mrs_contrast_only = []
    utterances_contrast_only = []
    mrs_combo = []
    utterances_combo = []

    emph_only_ctr = 0
    contrast_only_ctr = 0
    combo_ctr = 0

    print('Augmenting MRs with ' + ' + '.join(indicators) + ' in ' +
          str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(
        os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    for mr, utt in zip(mrs, utterances):
        mr_dict = OrderedDict()
        mr_list_augm = []

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, slot_orig, value_orig = data_loader.parse_slot_and_value(
                slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value
            mr_list_augm.append((slot, value_orig))
            # mrs[i] = mrs[i].replace(slot_orig, slot)

        # Find the slot alignment
        alignment = find_alignment(utt, mr_dict)

        # Augment the MR with auxiliary tokens
        if 'emphasis' in indicators:
            __add_emphasis_tokens(mr_list_augm, alignment)
        if 'contrast' in indicators:
            __add_contrast_tokens(mr_list_augm,
                                  utt,
                                  alignment,
                                  alt_mode=alt_contrast_mode)

        # Convert augmented MR from list to string representation
        mr_augm = (slot_sep + ' ').join([
            s + val_sep + v + (val_sep_end if val_sep_end is not None else '')
            for s, v in mr_list_augm
        ])

        mrs_augm.append(mr_augm)

        # Count and separate the different augmentation instances
        slots_augm = set([s for s, v in mr_list_augm])
        if config.EMPH_TOKEN in slots_augm and (
                config.CONTRAST_TOKEN in slots_augm
                or config.CONCESSION_TOKEN in slots_augm):
            mrs_combo.append(mr_augm)
            utterances_combo.append(utt)
            combo_ctr += 1
        else:
            mrs_single.append(mr_augm)
            utterances_single.append(utt)
            if config.EMPH_TOKEN in slots_augm:
                mrs_emph_only.append(mr_augm)
                utterances_emph_only.append(utt)
                emph_only_ctr += 1
            elif config.CONTRAST_TOKEN in slots_augm or config.CONCESSION_TOKEN in slots_augm:
                mrs_contrast_only.append(mr_augm)
                utterances_contrast_only.append(utt)
                contrast_only_ctr += 1

    print('# of MRs with emphasis only:', emph_only_ctr)
    print('# of MRs with contrast/concession only:', contrast_only_ctr)
    print('# of MRs with emphasis & contrast/concession:', combo_ctr)

    new_df = pd.DataFrame(columns=['mr', 'ref'])
    if mode == 'single':
        new_df['mr'] = mrs_single
        new_df['ref'] = utterances_single
    elif mode == 'only':
        if 'emphasis' in indicators:
            new_df['mr'] = mrs_emph_only
            new_df['ref'] = utterances_emph_only
        elif 'contrast' in indicators:
            new_df['mr'] = mrs_contrast_only
            new_df['ref'] = utterances_contrast_only
    elif mode == 'combo':
        new_df['mr'] = mrs_combo
        new_df['ref'] = utterances_combo
    else:
        new_df['mr'] = mrs_augm
        new_df['ref'] = utterances

    # Store augmented dataset to a new file
    filename_out = os.path.splitext(filename)[0] + '_augm_' + '_'.join(indicators) \
        + (('_' + mode) if mode != 'all' else '') + ('_alt' if alt_contrast_mode else '') + '.csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out),
                  index=False,
                  encoding='utf8')

Example #7

0

Show file

File: data_augmentation.py Project: varunSabnis/datatuner

def augment_with_contrast_tgen(dataset, filename):
    """Augments the MRs with auxiliary tokens indicating a pair of slots that should be contrasted in the
    corresponding generated utterance. The output is in the format accepted by TGen.
    """

    contrast_connectors = ['but', 'however', 'yet']
    scalar_slots = get_scalar_slots()

    alignments = []
    contrasts = []

    print('Augmenting MRs with contrast in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(
        os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, slot_orig, _ = data_loader.parse_slot_and_value(
                slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value
            mrs[i] = mrs[i].replace(slot_orig, slot)

        alignments.append(find_alignment(utterances[i], mr_dict))

    for i in range(len(utterances)):
        contrasts.append(['none', 'none', 'none', 'none'])
        for contrast_conn in contrast_connectors:
            contrast_pos = utterances[i].find(contrast_conn)
            if contrast_pos >= 0:
                slot_before = None
                value_before = None
                slot_after = None
                value_after = None

                for pos, slot, value in alignments[i]:
                    if pos > contrast_pos:
                        if not slot_before:
                            break
                        if slot in scalar_slots:
                            slot_after = slot
                            value_after = value
                            break
                    else:
                        if slot in scalar_slots:
                            slot_before = slot
                            value_before = value

                if slot_before and slot_after:
                    if scalar_slots[slot_before][value_before] - scalar_slots[
                            slot_after][value_after] == 0:
                        contrasts[i][2] = slot_before
                        contrasts[i][3] = slot_after
                    else:
                        contrasts[i][0] = slot_before
                        contrasts[i][1] = slot_after

                break

    new_df = pd.DataFrame(columns=[
        'mr', 'ref', 'contrast1', 'contrast2', 'concession1', 'concession2'
    ])
    new_df['mr'] = mrs
    new_df['ref'] = utterances
    new_df['contrast1'] = [tup[0] for tup in contrasts]
    new_df['contrast2'] = [tup[1] for tup in contrasts]
    new_df['concession1'] = [tup[2] for tup in contrasts]
    new_df['concession2'] = [tup[3] for tup in contrasts]

    filename_out = ''.join(
        filename.split('.')[:-1]) + '_augm_contrast_tgen.csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out),
                  index=False,
                  encoding='utf8')

Example #8

0

Show file

File: data_augmentation.py Project: varunSabnis/datatuner

def augment_by_utterance_splitting(dataset, filename, denoise_only=False):
    """Performs utterance splitting and augments the dataset with new pseudo-samples whose utterances are
    one sentence long. The MR of each pseudo-sample contains only slots mentioned in the corresponding sentence.
    Assumes a CSV or JSON file as input.
    """

    if not filename.lower().endswith(('.csv', '.json')):
        raise ValueError(
            'Unexpected file type. Please provide a CSV or JSON file as input.'
        )

    mrs_dicts = []
    data_new = []

    print('Performing utterance splitting on ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(
        os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, _, _, value_orig = data_loader.parse_slot_and_value(
                slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value_orig

        mrs_dicts.append(mr_dict)

    new_mrs, new_utterances = split_content(mrs_dicts,
                                            utterances,
                                            filename,
                                            permute=False,
                                            denoise_only=denoise_only)

    suffix = ' [' + ('denoised' if denoise_only else 'utt. split') + ']'
    filename_out = os.path.splitext(filename)[0] + suffix + os.path.splitext(
        filename)[1]

    if filename.lower().endswith('.csv'):
        for row, mr in enumerate(new_mrs):
            if len(mr) == 0:
                continue

            mr_str = ', '.join(
                ['{0}[{1}]'.format(slot, value) for slot, value in mr.items()])

            data_new.append([mr_str, new_utterances[row]])

        # Write the augmented dataset to a new file
        pd.DataFrame(data_new).to_csv(os.path.join(config.DATA_DIR, dataset,
                                                   filename_out),
                                      header=['mr', 'ref'],
                                      index=False,
                                      encoding='utf8')
    elif filename.lower().endswith('.json'):
        for row, mr in enumerate(new_mrs):
            if len(mr) == 0:
                continue

            mr_str = mr.pop('da')
            mr_str += '(' + slot_sep.join([
                '{0}{1}{2}'.format(key.rstrip(string.digits), val_sep, value)
                for key, value in mr.items()
            ]) + ')'

            data_new.append([mr_str, new_utterances[row]])

        # Write the augmented dataset to a new file
        with io.open(os.path.join(config.DATA_DIR, dataset, filename_out),
                     'w',
                     encoding='utf8') as f_data_new:
            json.dump(data_new, f_data_new, indent=4)