Code example #1
    def process_instance(da, conc, mr, multi_ref_id):
        original_da = deepcopy(da)
        # The DA gets sorted inside delex_sent anyway, so the explicit sort
        # here is redundant; kept until that behaviour is cleaned up
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        # Originally we didn't want to lowercase, to make things easier for
        # udpipe later on, but we changed our mind: the upper-case characters
        # were breaking udpipe's sentence tokenization.
        # we need underscores instead of dashes or else udpipe breaks it apart
        # text = re.sub(r"X-", r"X_", text)
        # Again running into problems with leaving X as a capital letter,
        # and with udpipe segmenting it only some of the time. We really
        # need a more reliable sentence / word tokenizer.
        text = text.lower().replace('x-', 'x')
        # We're testing out making xnear upper case to see if it reduces the
        # incorrect dropping of it by the deep parser
        text = text.replace('xnear', 'Xnear')

        # detokenize apostrophes, since udpipe handles them differently:
        # remove the spaces between letters and apostrophes
        text = re.sub(find_apostrophes, r"\1\2", text)
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

        # now our own ad-hoc sentence tokenization, plus everything needed
        # for human evaluation
        this_conc_sents = sent_tokenize(conc)
        num_sents = len(this_conc_sents)
        this_delex_sents = []
        for i, this_conc_sent in enumerate(this_conc_sents):
            text, _, _ = delex_sent(original_da, tokenize(this_conc_sent), slots_to_abstract, args.slot_names, repeated=True)
            text = text.lower().replace('x-', 'x')
            # We're testing out making xnear upper case to see if it reduces the
            # incorrect dropping of it by the deep parser
            text = text.replace('xnear', 'Xnear')
            # detokenize apostrophes, since udpipe handles them differently:
            # remove the spaces between letters and apostrophes
            text = re.sub(find_apostrophes, r"\1\2", text)
            this_delex_sents.append(text)

            # start appending the sentence specific ones
            sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id),
                                      str(i)]))
            mrs_for_delex.append(mr)

        # finally, store the sentence-split versions of this instance
        original_sents.append('\n'.join(this_conc_sents))
        delexicalised_sents.append('\n'.join(this_delex_sents))
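
Note that find_apostrophes and the list accumulators (conc_das, das, concs, absts, texts, ...) are closure variables defined outside this excerpt. The pattern itself is not shown; a minimal sketch of what it could look like, given that re.sub(find_apostrophes, r"\1\2", text) has to rejoin a letter with a following apostrophe token, might be:

import re

# Hypothetical reconstruction -- the real pattern in the repo may differ.
# Two capture groups, so that the r"\1\2" replacement glues the pieces back.
find_apostrophes = re.compile(r"(\w) (')")

print(re.sub(find_apostrophes, r"\1\2", "the restaurant 's menu"))
# -> "the restaurant's menu"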
Code example #2
File: delex.py  Project: UFAL-DSG/tgen
def tokenize_normalize(tokens):
    """Perform further tokenization, normalize, lowercase.
    Return subtokenized text + reverse mapping."""
    subtoks = [tokenize(unidecode(tok.lower())).split(' ') for tok in tokens]
    rev_map = [pos for pos, tok in enumerate(subtoks) for _ in xrange(len(tok))]
    subtoks = [subtok for tok in subtoks for subtok in tok]
    return subtoks, rev_map
Code example #3
def tokenize_normalize(tokens):
    """Perform further tokenization, normalize, lowercase.
    Return subtokenized text + reverse mapping."""
    subtoks = [tokenize(unidecode(tok.lower())).split(' ') for tok in tokens]
    rev_map = [pos for pos, tok in enumerate(subtoks) for _ in range(len(tok))]
    subtoks = [subtok for tok in subtoks for subtok in tok]
    return subtoks, rev_map
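
Examples #2 and #3 are the same function as Python 2 (xrange) and Python 3 (range) variants. A hypothetical run, assuming tgen's tokenize splits the apostrophe off as its own subtoken (the exact splits depend on its rules):

subtoks, rev_map = tokenize_normalize(["Don't", "worry!"])
# subtoks might be ['don', "'", 't', 'worry', '!']
# rev_map would then be [0, 0, 0, 1, 1]: rev_map[i] is the index of the
# original token that subtoken i came from, so matches found on subtokens
# can be projected back onto the original token sequence.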
Code example #4
File: futil.py  Project: tuetschek/ratpred
def read_outputs(filename):
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround to a strange bug that sometimes happens -- not sure how to get rid of it,
        # probably an error in Pandas
        print(
            '!!! Strangely need to remove an empty instance from the end of %s'
            % filename)
        data = data[:-1]
    das = [DA.parse_cambridge_da(da) for da in data['mr']]

    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None)
                   for tok in tokenize(sent.lower()).split(' ')] if isinstance(
                       sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2 in zip(
                  das, texts_ref, texts_hyp, texts_hyp2)]

    # find out which columns were used for ratings
    target_cols = [
        c[:-len('_system_rating')] for c in data.columns
        if c.endswith('_system_rating')
    ]
    assert target_cols
    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {
            subcol: list(data[target_col + '_' + subcol])
            for subcol in [
                'human_rating_raw', 'human_rating', 'system_rating_raw',
                'system_rating', 'rank_loss', 'rank_ok'
            ]
        }
    return (inputs, outputs)
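
A hypothetical usage sketch; the file name and the 'quality' rating column are illustrative assumptions, not part of the source:

inputs, outputs = read_outputs('ratings.tsv')
da, text_ref, text_hyp, text_hyp2 = inputs[0]
# if the TSV had a quality_system_rating column (plus the related
# sub-columns), the ratings for that target appear under outputs['quality']:
print(outputs['quality']['human_rating'][0],
      outputs['quality']['system_rating'][0])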
Code example #5
File: convert.py  Project: UFAL-DSG/tgen
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Code example #6
    def process_instance(conc_da, conc):
        # sort the DA using the same order as in E2E NLG data
        slot_order = ['name', 'eat_type', 'food', 'price_range', 'rating',
                      'area', 'family_friendly', 'near']
        conc_da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))
        conc_das.append(conc_da)

        text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))

        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
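
To see what this ordering does, a small stand-in demo using (slot, value) tuples in place of tgen's DAI objects:

slot_order = ['name', 'eat_type', 'food', 'price_range', 'rating',
              'area', 'family_friendly', 'near']
dais = [('near', 'Cafe Rouge'), ('food', 'Chinese'), ('name', 'Aromi')]
dais.sort(key=lambda d: (slot_order.index(d[0]), d[1]))
print(dais)
# -> [('name', 'Aromi'), ('food', 'Chinese'), ('near', 'Cafe Rouge')]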
Code example #7
    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)

        text, da, abst = delex_sent(da,
                                    tokenize(conc),
                                    slots_to_abstract,
                                    args.slot_names,
                                    repeated=True)
        text = text.lower().replace('x-',
                                    'X-')  # lowercase all but placeholders
        da.sort()

        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)
Code example #8
File: futil.py  Project: tuetschek/ratpred
def preprocess_sent(da, sent, delex_slots, delex_slot_names):
    sent = tokenize(sent.lower()).split(' ')
    if delex_slots:
        return delex_sent(da, sent, delex_slots, not delex_slot_names,
                          delex_slot_names)[0]
    return sent
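
A hypothetical call, assuming da is a DA object parsed as in example #4 and that delex_slots is a set of slot names to abstract:

toks = preprocess_sent(da, 'The Eagle serves Italian food.',
                       delex_slots={'name', 'food'}, delex_slot_names=False)
# -> the tokenized, lowercased sentence with the name and food values
#    replaced by placeholders (delex_sent returns the text as element 0)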