Example #1
    def process_instance(da, conc, mr, multi_ref_id):
        original_da = deepcopy(da)
        # The DA gets sorted again inside delex_sent, so this sort looks
        # redundant; keeping it as-is until the sorting logic is revisited.
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        # Originally we didn't want to lower-case the text because that makes
        # things easier for udpipe later on, but we changed our mind: the
        # upper-case characters interfere with udpipe's sentence tokenization.
        # Underscores instead of dashes would also be needed to keep udpipe
        # from breaking the placeholders apart:
        # text = re.sub(r"X-", r"X_", text)
        # Even so, udpipe segments the capitalized placeholder inconsistently,
        # so we really need a more reliable sentence / word tokenizer here.
        text = text.lower().replace('x-', 'x')
        # We're testing out making xnear upper case to see if it reduces the
        # incorrect dropping of it by the deep parser
        text = text.replace('xnear', 'Xnear')

        # detokenize the apostrophes (i.e. remove the spaces between letters
        # and apostrophes) because udpipe handles them differently
        text = re.sub(find_apostrophes, r"\1\2", text)
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

        # now do our own (admittedly hacky) sentence tokenization and prepare
        # the per-sentence data needed for human evaluation
        this_conc_sents = sent_tokenize(conc)
        num_sents = len(this_conc_sents)
        this_delex_sents = []
        for i, this_conc_sent in enumerate(this_conc_sents):
            text, _, _ = delex_sent(original_da, tokenize(this_conc_sent), slots_to_abstract, args.slot_names, repeated=True)
            text = text.lower().replace('x-', 'x')
            # We're testing out making xnear upper case to see if it reduces the
            # incorrect dropping of it by the deep parser
            text = text.replace('xnear', 'Xnear')
            # detokenize the apostrophes (i.e. remove the spaces between
            # letters and apostrophes) because udpipe handles them differently
            text = re.sub(find_apostrophes, r"\1\2", text)
            this_delex_sents.append(text)

            # append the per-sentence IDs and MRs
            sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id),
                                      str(i)]))
            mrs_for_delex.append(mr)

        # store the sentence-split original and delexicalized texts
        original_sents.append('\n'.join(this_conc_sents))
        delexicalised_sents.append('\n'.join(this_delex_sents))
Example #2
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Example #3
    def process_instance(conc_da, conc):
        # sort the DA using the same slot order as in the E2E NLG data
        slot_order = ['name', 'eat_type', 'food', 'price_range',
                      'rating', 'area', 'family_friendly', 'near']
        conc_da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))
        conc_das.append(conc_da)

        text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))

        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Example #4
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA

        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again

        # fix capitalization in the delexicalized text and the original reference
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)

        # store the DA, texts, and abstraction instructions
        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Example #5
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA

        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again

        # fix capitalization in the delexicalized text and the original reference
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)

        # store the DA, texts, and abstraction instructions
        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Example #6
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da,
                                    tokenize(conc),
                                    slots_to_abstract,
                                    args.slot_names,
                                    repeated=True)
        text = text.lower().replace('x-',
                                    'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Example #7
def delexicalize_refs(mrs, refs, delex_slots, delex_output_file):
    delex_refs = []
    print('Delexicalizing...', end=' ', file=sys.stderr)
    for pos, (mr, ref) in enumerate(zip(mrs, refs)):
        delex_ref = []
        print(pos, end=' ', file=sys.stderr)
        sys.stderr.flush()
        for sent in ref:
            delex, _, absts = delex_sent(mr, [tok[0] for tok in sent],
                                         delex_slots,
                                         repeated=True)
            off = 0
            shift = 0
            delex_tagged = []
            for abst in absts:
                if abst.start == -1:  # skip delex instances not actually occurring in this sentence
                    continue
                delex_tagged.extend(sent[off + shift:abst.start + shift])
                # POS tag for all delex'd slots is usually NNP, except for phone numbers and counts
                delex_pos = {
                    'count': 'CD',
                    'phone': 'CD'
                }.get(abst.slot, 'NNP')
                delex_tagged.append(
                    [delex[abst.start], delex[abst.start], delex_pos])
                off = abst.end
                shift += abst.surface_form.count(' ')
            delex_tagged.extend(sent[off + shift:])
            delex_ref.append(delex_tagged)
        delex_refs.append(delex_ref)

    with codecs.open(delex_output_file + '.delex.txt', 'w', 'UTF-8') as fh:
        fh.write('\n'.join([
            ' '.join([tok[0] for sent in ref for tok in sent])
            for ref in delex_refs
        ]))

    write_output(delex_refs, 'lca', delex_output_file + '.delex.tag.lca.txt')
    write_output(delex_refs, 'collins',
                 delex_output_file + '.delex.tag.collins.txt')

    return delex_refs
Example #8
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references and
    distorting them. Will start from scores provided, or default to best possible score.
    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg'), HTER ('hter'), and relative ranking ('rank')
    @return: a fake data set, with the given columns, some of them empty
    """
    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)),
                             columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx]['orig_ref'] = row.orig_ref
        fake_data.loc[idx]['system_ref'] = row.orig_ref
        fake_data.loc[idx]['mr'] = row.mr
        fake_data.loc[idx]['is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx][quant] = (getattr(row, quant) if (
                hasattr(row, quant) and getattr(row, quant) is not None
                and not np.isnan(getattr(row, quant))) else best_score)

        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(),
                                  start=distort_step * len(real_data)):

            fake_data.loc[idx]['orig_ref'] = row.orig_ref
            fake_data.loc[idx]['mr'] = row.mr
            fake_data.loc[idx]['is_real'] = 0

            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da,
                                            tokenize(row.orig_ref).split(' '),
                                            DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent],
                                                 lex_instr)
            fake_data.loc[idx]['system_ref'] = ' '.join(sent)

            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant) if (
                    hasattr(row, quant) and getattr(row, quant) is not None
                    and not np.isnan(getattr(row, quant))) else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx][quant] = (((score / ref_len) *
                                              100) if normalize else score)

    return fake_data
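
To make the scoring in create_fake_data concrete, the snippet below restates its inner target_score helper in standalone form (copied from the example above, with score_type promoted to an explicit parameter); the print line at the end is added only for illustration and shows how a perfect Likert reference score degrades over the four distortion steps.

# Standalone restatement of target_score from create_fake_data above;
# score_type is made an explicit parameter purely for illustration.
def target_score(src_score, distort_step, score_type='nlg'):
    if score_type == 'hter':
        return src_score + distort_step               # HTER gets worse (higher) with each step
    elif score_type == 'rank':
        return 1.                                     # scores are ignored for ranks
    return max(1, min(4., src_score - distort_step))  # Likert score drops, clamped into [1, 4]

# A perfect reference (score 6.0) after distortion steps 1..4:
print([target_score(6., step) for step in range(1, 5)])  # [4.0, 4.0, 3.0, 2.0]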
Example #9
def preprocess_sent(da, sent, delex_slots, delex_slot_names):
    sent = tokenize(sent.lower()).split(' ')
    if delex_slots:
        return delex_sent(da, sent, delex_slots, not delex_slot_names,
                          delex_slot_names)[0]
    return sent
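
Finally, for readers who have not seen delexicalization before, the toy function below sketches the core idea behind all the delex_sent calls above: slot values from the MR are swapped for "X-slot" placeholders and their positions are recorded so the sentence can be re-lexicalized later. This is only a self-contained, single-token-value illustration with made-up data, not TGen's actual delex_sent.

# Toy illustration of delexicalization, NOT TGen's delex_sent: replace slot
# values from the MR with "X-slot" placeholders and record where they were.
def toy_delex(slot_values, tokens):
    """slot_values: dict slot -> value (single-token values only, for brevity)."""
    value_to_slot = {val.lower(): slot for slot, val in slot_values.items()}
    delexed, absts = [], []
    for pos, tok in enumerate(tokens):
        slot = value_to_slot.get(tok.lower())
        if slot is not None:
            delexed.append('X-' + slot)      # placeholder convention used in the examples above
            absts.append((slot, tok, pos))   # remember slot, surface form, and position
        else:
            delexed.append(tok)
    return delexed, absts

tokens = 'The Eagle is near Burger_King .'.split()
delexed, absts = toy_delex({'name': 'Eagle', 'near': 'Burger_King'}, tokens)
print(' '.join(delexed))  # The X-name is near X-near .
print(absts)              # [('name', 'Eagle', 1), ('near', 'Burger_King', 4)]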