def process_dataset(self, input_data):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods).

    @param input_data: path to the input JSON file with the data
    @return: a list of Inst objects combining the original and delexicalized DAs and texts
    """
    # load data from JSON
    self._das = []
    self._texts = []
    with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
        data = json.load(fh)
        for inst in data:
            da = DA.parse_cambridge_da(inst['da'])
            da.sort()
            self._das.append(da)
            self._texts.append(self.analyze(inst['text']))
    # delexicalize DAs and sentences
    self._create_delex_texts()
    self._create_delex_das()
    # return the result
    out = []
    for da, text, delex_da, delex_text, abst in zip(self._das, self._texts, self._delex_das,
                                                    self._delex_texts, self._absts):
        out.append(Inst(da, text, delex_da, delex_text, abst))
    return out
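
# Usage sketch (hypothetical): process_dataset() is written as a method, so it is assumed
# here to live on some reader class, called Reader below; 'train.json' is a made-up path to
# a JSON file whose entries have the 'da' and 'text' fields read above.
def example_process_dataset():
    reader = Reader()  # hypothetical class providing process_dataset(), analyze() etc.
    insts = reader.process_dataset('train.json')
    # the returned Inst objects pair each DA and text with their delexicalized versions
    # and the corresponding abstraction instructions
    print 'Loaded %d instances' % len(insts)
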
def main(in_file, out_mrs, out_refs):
    """Convert a generator log with FULL_DA, ABSTRACT_DA and -> "..." output lines into two
    plain-text files: the concrete MRs (out_mrs) and the relexicalized references (out_refs)."""
    abst_das = []
    conc_das = []
    conc_da_texts = []
    abst_texts = []
    # read full DAs, abstract DAs, and abstract (delexicalized) output texts
    with codecs.open(in_file, 'r', 'UTF-8') as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('FULL_DA'):
                line = re.sub('^FULL_DA = ', '', line)
                conc_das.append(DA.parse_cambridge_da(line))
                conc_da_texts.append(line)
            elif line.startswith('ABSTRACT_DA'):
                line = re.sub('^ABSTRACT_DA = ', '', line)
                abst_das.append(DA.parse_cambridge_da(line))
            elif line.startswith('->'):
                line = re.sub('^-> "', '', line)
                line = re.sub(r'";\s*$', '', line)
                line = re.sub(r'\[([a-z]+)\+X\]X', r'X-\1', line)
                line = re.sub(r'\[[^\]]*\]', '', line)
                abst_texts.append(line)

    # relexicalize: replace X-<slot> placeholders with the values from the concrete DA
    conc_texts = []
    for abst_da, conc_da, abst_text in zip(abst_das, conc_das, abst_texts):
        text = abst_text
        for abst_dai, conc_dai in zip(abst_da.dais, conc_da.dais):
            assert abst_dai.slot == conc_dai.slot
            if abst_dai.value.startswith('X'):
                text = text.replace('X-' + abst_dai.slot, conc_dai.value, 1)
        text = re.sub(r'the The', 'The', text)
        conc_texts.append(text)

    with codecs.open(out_mrs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_da_texts))
    with codecs.open(out_refs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_texts))
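
# Usage sketch: main() above turns a generator log containing FULL_DA, ABSTRACT_DA and
# -> "..." lines into two plain-text files; the file names below are hypothetical.
def example_main():
    main('sfx_outputs.log', 'out-mrs.txt', 'out-refs.txt')
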
def interactive_input(das_type='cambridge', delex_slots=set(), delex_slot_names=False,
                      delex_das=False, input_da=True, input_ref=False):
    """Interactively read a DA, an optional human reference, and one or two system outputs
    from standard input; return None as soon as a required input is left empty."""
    da = None
    if input_da:
        da = raw_input('Enter DA : ').decode('utf-8').strip()
        if not da:
            return None
        if das_type == 'text':
            da = [(tok, None) for tok in preprocess_sent(None, da, False, False)]
        else:
            da = DA.parse_cambridge_da(da)
            if delex_das:
                da = da.get_delexicalized(delex_slots)

    ref = None
    if input_ref:
        ref = raw_input('Enter reference : ').decode('utf-8').strip()
        if not ref:
            return None
        ref = [(tok, None) for tok in preprocess_sent(da, ref, delex_slots, delex_slot_names)]

    hyp = raw_input('Enter system output 1: ').decode('utf-8').strip()
    if not hyp:
        return None
    hyp = [(tok, None) for tok in preprocess_sent(da, hyp, delex_slots, delex_slot_names)]

    hyp2 = raw_input('Enter system output 2: ').decode('utf-8').strip()
    if not hyp2:
        hyp2 = []
    else:
        hyp2 = [(tok, None) for tok in preprocess_sent(da, hyp2, delex_slots, delex_slot_names)]

    return (da, ref, hyp, hyp2)
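
# Usage sketch: collect instances interactively (with the default delexicalization settings)
# until the user enters an empty DA or an empty first system output.
def example_interactive_loop():
    collected = []
    while True:
        inp = interactive_input(input_ref=True)
        if inp is None:
            break
        collected.append(inp)
    print 'Collected %d instances' % len(collected)
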
def read_outputs(filename):
    """Read a TSV file with MRs, references, system outputs and ratings; return the
    preprocessed inputs and the rating columns grouped by rating target."""
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround for a strange bug that sometimes happens -- not sure how to get rid
        # of it, probably an error in Pandas
        print('!!! Strangely need to remove an empty instance from the end of %s' % filename)
        data = data[:-1]

    das = [DA.parse_cambridge_da(da) for da in data['mr']]
    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                  if isinstance(sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2 in zip(das, texts_ref, texts_hyp, texts_hyp2)]

    # find out which columns were used for ratings
    target_cols = [c[:-len('_system_rating')] for c in data.columns if c.endswith('_system_rating')]
    assert target_cols

    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {subcol: list(data[target_col + '_' + subcol])
                               for subcol in ['human_rating_raw', 'human_rating',
                                              'system_rating_raw', 'system_rating',
                                              'rank_loss', 'rank_ok']}
    return (inputs, outputs)
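
# Usage sketch: load a ratings TSV (hypothetical path) and report which rating targets
# (i.e. columns ending in '_system_rating') were found in it.
def example_read_outputs():
    inputs, outputs = read_outputs('ratings.tsv')
    print 'Loaded %d instances with rating targets: %s' % (len(inputs), ', '.join(outputs.keys()))
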
def convert(args):
    """Main function -- read in the JSON data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []  # concrete DAs
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    turns = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA
        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again
        # store the DA
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)
        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for dialogue in data:
            if isinstance(dialogue, dict):
                for turn in dialogue['dial']:
                    da = DA.parse_cambridge_da(turn['S']['dact'])
                    if args.skip_hello and len(da) == 1 and da[0].da_type == 'hello':
                        continue  # skip hello() DAs
                    conc = postprocess_sent(turn['S']['ref'])
                    process_instance(da, conc)
                    turns += 1
            else:
                da = DA.parse_cambridge_da(dialogue[0])
                conc = postprocess_sent(dialogue[1])
                process_instance(da, conc)
                turns += 1

    print 'Processed', turns, 'turns.'
    print '%d different DAs.' % len(da_keys)
    print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them up, as Wen does)
        total = float(sum(data_sizes))
        remain = turns
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(ceil(turns * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [turns]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        # create multiple lexicalized references for each instance by relexicalizing
        # sentences with the same DA from the same part
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            # group sentences with the same DA
            da_groups = {}
            for da, text, abst in zip(das[0:part_size], texts[0:part_size], absts[0:part_size]):
                da_groups[unicode(da)] = da_groups.get(unicode(da), [])
                da_groups[unicode(da)].append((text, filter_abst(abst, slots_to_abstract)))
            for da_str in da_groups.keys():
                seen = set()
                uniq = []
                for text, abst in da_groups[da_str]:
                    sig = text + "\n" + ' '.join([a.slot + str(a.start) for a in abst])
                    if sig not in seen:
                        seen.add(sig)
                        uniq.append((text, abst))
                da_groups[da_str] = uniq
            # relexicalize all abstract sentences for each DA
            relex = []
            for da, abst in zip(das[0:part_size], absts[0:part_size]):
                relex.append(relexicalize(da_groups[unicode(da)],
                                          filter_abst(abst, slots_to_abstract)))
            with open(part_name + '-ref.txt', 'w') as fh:
                for relex_pars in relex:
                    fh.write("\n".join(relex_pars).encode('utf-8') + "\n\n")

        with open(part_name + '-das.txt', 'w') as fh:
            for da in das[0:part_size]:
                fh.write(unicode(da).encode('utf-8') + "\n")
        del das[0:part_size]
        with open(part_name + '-conc_das.txt', 'w') as fh:
            for conc_da in conc_das[0:part_size]:
                fh.write(unicode(conc_da).encode('utf-8') + "\n")
        del conc_das[0:part_size]
        with open(part_name + '-conc.txt', 'w') as fh:
            for conc in concs[0:part_size]:
                fh.write(conc.encode('utf-8') + "\n")
        del concs[0:part_size]
        with open(part_name + '-abst.txt', 'w') as fh:
            for abst in absts[0:part_size]:
                fh.write("\t".join([unicode(a) for a in abst]).encode('utf-8') + "\n")
        del absts[0:part_size]
        with open(part_name + '-text.txt', 'w') as fh:
            for text in texts[0:part_size]:
                fh.write(text.encode('utf-8') + "\n")
        del texts[0:part_size]
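
# Usage sketch: convert() expects an argparse-style namespace; the values below are
# hypothetical and only illustrate the attributes the function actually reads
# (in_file, out_name, split, abstract, slot_names, skip_hello, multi_ref).
def example_convert():
    import argparse
    args = argparse.Namespace(in_file='sfxrestaurant.json',   # hypothetical input JSON
                              out_name='train,devel,test',    # one file prefix per part
                              split='3:1:1',                  # relative part sizes
                              abstract='name,near',           # slots to delexicalize
                              slot_names=False,
                              skip_hello=True,
                              multi_ref=True)
    convert(args)
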
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references and
    distorting them. Will start from scores provided, or default to best possible score.

    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg'), HTER ('hter'), and pairwise \
        ranking ('rank')
    @return: a fake data set, with the given columns, some of them empty
    """
    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)), columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx]['orig_ref'] = row.orig_ref
        fake_data.loc[idx]['system_ref'] = row.orig_ref
        fake_data.loc[idx]['mr'] = row.mr
        fake_data.loc[idx]['is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx][quant] = (getattr(row, quant)
                                         if (hasattr(row, quant)
                                             and getattr(row, quant) is not None
                                             and not np.isnan(getattr(row, quant)))
                                         else best_score)
        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(), start=distort_step * len(real_data)):
            fake_data.loc[idx]['orig_ref'] = row.orig_ref
            fake_data.loc[idx]['mr'] = row.mr
            fake_data.loc[idx]['is_real'] = 0
            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da, tokenize(row.orig_ref).split(' '), DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent], lex_instr)
            fake_data.loc[idx]['system_ref'] = ' '.join(sent)
            # set distorted target scores
            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant)
                         if (hasattr(row, quant)
                             and getattr(row, quant) is not None
                             and not np.isnan(getattr(row, quant)))
                         else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx][quant] = (((score / ref_len) * 100) if normalize else score)

    return fake_data
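
# Usage sketch: append synthetic instances to real rated data -- the references as perfect
# outputs plus progressively distorted versions with lowered scores. The column list mirrors
# the fields the function fills in; 'ratings.tsv' is a hypothetical path.
def example_create_fake_data():
    real_data = pd.read_csv('ratings.tsv', sep=b"\t", encoding='UTF-8')
    columns = ['mr', 'orig_ref', 'system_ref', 'is_real',
               'naturalness', 'quality', 'informativeness']
    fake_data = create_fake_data(real_data, columns, score_type='nlg')
    print 'Created %d fake instances from %d real ones' % (len(fake_data), len(real_data))
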
def read_bagel_data():
    """Read BAGEL MRs and tagged references from the data/ directory."""
    with codecs.open('data/bagel-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/bagel-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_cambridge_da(mr) for mr in fh.readlines()]
    return mrs, refs
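
# Usage sketch: the BAGEL file paths are hard-coded above, so the call takes no arguments;
# this just checks that MRs and references stay aligned.
def example_read_bagel_data():
    mrs, refs = read_bagel_data()
    assert len(mrs) == len(refs)
    print 'Read %d BAGEL MR-reference pairs' % len(mrs)
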
def read_data(filename, target_cols, das_type='cambridge',
              delex_slots=set(), delex_slot_names=False, delex_das=False):
    """Read the input data from a TSV file."""

    refs_cache = {}

    def cached_preprocess_sent(da, sent):
        """Cache preprocessed sentences -- with generated data, we're likely to parse
        the same sentence many times."""
        if (da, sent) not in refs_cache:
            refs_cache[(da, sent)] = preprocess_sent(da, sent, delex_slots, delex_slot_names)
        return list(refs_cache[(da, sent)])

    log_info("Reading %s..." % filename)
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    log_info("Loaded %d instances." % len(data))

    # force data type to string if the data set doesn't contain human references
    data['orig_ref'] = data['orig_ref'].apply(lambda x: '' if not isinstance(x, basestring) else x)
    log_info("Adapted refs data type.")

    if das_type == 'text':  # for MT output classification
        das = [[(tok, None) for tok in preprocess_sent(None, sent, False, False)]
               for sent in data['mr']]
    else:
        das = [DA.parse_cambridge_da(da) for da in data['mr']]
    log_info("Parsed DAs.")

    texts_ref = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['orig_ref'])]
    log_info("Preprocessed human refs.")
    texts_hyp = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['system_ref'])]
    log_info("Preprocessed system outputs.")

    # alternative reference with rating difference / use to compare
    if 'system_ref2' in data.columns:
        texts_hyp2 = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                      if isinstance(sent, basestring) else None
                      for da, sent in zip(das, data['system_ref2'])]
    else:
        texts_hyp2 = [None] * len(texts_hyp)
    log_info("Preprocessed 2nd system outputs.")

    # DA delexicalization must take place after text delexicalization
    if das_type != 'text' and delex_das:
        das = [da.get_delexicalized(delex_slots) for da in das]
    log_info("Delexicalized DAs.")

    # fake data indicator
    if 'is_real' in data.columns:
        real_indics = [0 if indic == 0 else 1 for indic in data['is_real']]
    else:
        real_indics = [1 for _ in xrange(len(data))]
    log_info("Retrieved is_real indications.")

    inputs = [(da, ref, hyp, hyp2, ri)
              for da, ref, hyp, hyp2, ri in zip(das, texts_ref, texts_hyp, texts_hyp2, real_indics)]
    log_info("Built inputs list.")
    targets = np.array(data[[target_cols] if not isinstance(target_cols, list) else target_cols],
                       dtype=np.float)
    log_info("Built targets list.")

    return inputs, targets
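
# Usage sketch: load a training TSV (hypothetical path) for a rating predictor, asking for
# a single target column; the delexicalized slot set is a made-up example.
def example_read_data():
    inputs, targets = read_data('train_ratings.tsv', 'quality',
                                das_type='cambridge',
                                delex_slots={'name', 'near'},  # hypothetical slot set
                                delex_slot_names=False, delex_das=False)
    print 'Read %d instances, targets shape: %s' % (len(inputs), str(targets.shape))
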