def process_instance(da, conc, mr, multi_ref_id):
    original_da = deepcopy(da)
    # Why do the DAs need to be sorted? This seems weird.
    # Anyway, we checked that they get sorted in delex_sent anyway, so nothing
    # to do about it until later.
    da.sort()
    conc_das.append(da)
    text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    # Originally we didn't want to lowercase because it would make things
    # easier for udpipe later on; however, we changed our mind because the
    # uppercase characters were messing with udpipe's ability to properly
    # sentence-tokenize.
    # We need underscores instead of dashes or else udpipe breaks it apart:
    # text = re.sub(r"X-", r"X_", text)
    # Again running into problems with leaving X as a capital letter, and with
    # udpipe sometimes segmenting it and sometimes not. We really need to find
    # a more reliable sentence tokenizer / word tokenizer.
    text = text.lower().replace('x-', 'x')
    # We're testing out making xnear upper case to see if it reduces the
    # incorrect dropping of it by the deep parser.
    text = text.replace('xnear', 'Xnear')
    # Detokenize some of the apostrophe constructions because udpipe handles
    # them differently, namely by removing spaces between letters and apostrophes.
    text = re.sub(find_apostrophes, r"\1\2", text)
    da.sort()
    da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
    # Now for our own makeshift sentence tokenization and the extra data
    # needed for human evaluation.
    this_conc_sents = sent_tokenize(conc)
    num_sents = len(this_conc_sents)
    this_delex_sents = []
    for i, this_conc_sent in enumerate(this_conc_sents):
        text, _, _ = delex_sent(original_da, tokenize(this_conc_sent),
                                slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'x')
        # We're testing out making xnear upper case to see if it reduces the
        # incorrect dropping of it by the deep parser.
        text = text.replace('xnear', 'Xnear')
        # Detokenize some of the apostrophe constructions because udpipe handles
        # them differently, namely by removing spaces between letters and apostrophes.
        text = re.sub(find_apostrophes, r"\1\2", text)
        this_delex_sents.append(text)
        # Append the sentence-specific entries.
        sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id), str(i)]))
        mrs_for_delex.append(mr)
    # Finally, store the joined original and delexicalized sentences for this instance.
    original_sents.append('\n'.join(this_conc_sents))
    delexicalised_sents.append('\n'.join(this_delex_sents))
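# Illustrative sketch (not part of the original script): how the per-sentence
# IDs built above come out. The MR string and multi_ref_id below are made up;
# only the '_'.join(...) construction mirrors the code above.
example_mr = 'name[Alimentum], area[city centre]'
example_multi_ref_id = 0
for i in range(2):  # pretend the reference had two sentences
    print('_'.join([example_mr.replace(' ', ''), str(example_multi_ref_id), str(i)]))
# -> name[Alimentum],area[citycentre]_0_0
# -> name[Alimentum],area[citycentre]_0_1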
def tokenize_normalize(tokens):
    """Perform further tokenization, normalize, lowercase.
    Return subtokenized text + reverse mapping."""
    subtoks = [tokenize(unidecode(tok.lower())).split(' ') for tok in tokens]
    rev_map = [pos for pos, tok in enumerate(subtoks) for _ in xrange(len(tok))]
    subtoks = [subtok for tok in subtoks for subtok in tok]
    return subtoks, rev_map
def tokenize_normalize(tokens):
    """Perform further tokenization, normalize, lowercase.
    Return subtokenized text + reverse mapping."""
    subtoks = [tokenize(unidecode(tok.lower())).split(' ') for tok in tokens]
    rev_map = [pos for pos, tok in enumerate(subtoks) for _ in range(len(tok))]
    subtoks = [subtok for tok in subtoks for subtok in tok]
    return subtoks, rev_map
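# Illustrative sketch of the reverse mapping returned above. The sub-tokenization
# is written out by hand here, since tokenize() and unidecode() come from the
# surrounding codebase: rev_map[j] gives the index of the original token that
# produced sub-token j.
subtoks = [['cafe'], ['is', "n't"], ['cheap']]   # from ["Café", "isn't", "cheap"]
rev_map = [pos for pos, tok in enumerate(subtoks) for _ in range(len(tok))]
subtoks = [subtok for tok in subtoks for subtok in tok]
print(subtoks)   # ['cafe', 'is', "n't", 'cheap']
print(rev_map)   # [0, 1, 1, 2] -- 'is' and "n't" both map back to token 1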
def read_outputs(filename):
    data = pd.read_csv(filename, sep="\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround for a strange bug that sometimes happens -- not sure how
        # to get rid of it, probably an error in Pandas
        print('!!! Strangely need to remove an empty instance from the end of %s' % filename)
        data = data[:-1]
    das = [DA.parse_cambridge_da(da) for da in data['mr']]
    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(
        lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                  if isinstance(sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2
              in zip(das, texts_ref, texts_hyp, texts_hyp2)]
    # find out which columns were used for ratings
    target_cols = [c[:-len('_system_rating')]
                   for c in data.columns if c.endswith('_system_rating')]
    assert target_cols
    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {subcol: list(data[target_col + '_' + subcol])
                               for subcol in ['human_rating_raw', 'human_rating',
                                              'system_rating_raw', 'system_rating',
                                              'rank_loss', 'rank_ok']}
    return (inputs, outputs)
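# Illustrative sketch of the column layout read_outputs() expects; the target
# name 'quality' is a made-up example. Any column ending in '_system_rating'
# marks a rating group, and the matching *_human_rating* / *_rank_* columns
# are collected for it.
columns = ['mr', 'orig_ref', 'system_output',
           'quality_human_rating_raw', 'quality_human_rating',
           'quality_system_rating_raw', 'quality_system_rating',
           'quality_rank_loss', 'quality_rank_ok']
target_cols = [c[:-len('_system_rating')] for c in columns if c.endswith('_system_rating')]
print(target_cols)   # ['quality']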
def process_instance(da, conc):
    da.sort()
    conc_das.append(da)
    text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
    da.sort()
    da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
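# Illustrative sketch of the lowercase-then-restore trick above: lowercasing
# the whole sentence would also lowercase the delexicalization placeholders
# (X-name, X-near, ...), so the 'x-' prefix is put back in upper case.
example = 'X-name is a Family-Friendly place near X-near .'
print(example.lower().replace('x-', 'X-'))
# -> 'X-name is a family-friendly place near X-near .'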
def process_instance(conc_da, conc):
    # sort the DA using the same order as in the E2E NLG data
    conc_da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range',
                                        'rating', 'area', 'family_friendly',
                                        'near'].index(dai.slot),
                                       dai.value))
    conc_das.append(conc_da)
    text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
    da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range',
                                   'rating', 'area', 'family_friendly',
                                   'near'].index(dai.slot),
                                  dai.value))
    da_keys[str(da)] = da_keys.get(str(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
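# Illustrative sketch of the sort key above, using a stand-in DAI with .slot
# and .value attributes (the real DAI class comes from the surrounding
# codebase): items are ordered by the canonical E2E slot order, then by value.
from collections import namedtuple

FakeDAI = namedtuple('FakeDAI', ['slot', 'value'])
slot_order = ['name', 'eat_type', 'food', 'price_range', 'rating',
              'area', 'family_friendly', 'near']
dais = [FakeDAI('near', 'Burger King'), FakeDAI('name', 'Alimentum'),
        FakeDAI('food', 'French')]
dais.sort(key=lambda dai: (slot_order.index(dai.slot), dai.value))
print([d.slot for d in dais])   # ['name', 'food', 'near']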
def preprocess_sent(da, sent, delex_slots, delex_slot_names):
    sent = tokenize(sent.lower()).split(' ')
    if delex_slots:
        return delex_sent(da, sent, delex_slots,
                          not delex_slot_names, delex_slot_names)[0]
    return sent