def read_system_training_data(filename):
    insts = []
    for inst in pd.read_csv(filename, index_col=None, encoding='UTF-8').to_dict('records'):
        insts.append({'dataset': 'E2E',
                      'mr': DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(),
                      'delex_mr': DA.parse_diligent_da(inst['mr']).get_delexicalized(
                          set(['name', 'near'])).to_cambridge_da_string(),
                      'system': 'HUMAN',
                      'system_ref': None,
                      'orig_ref': inst['ref'],
                      'informativeness': None,
                      'naturalness': None,
                      'quality': None,
                      'is_real': 0})
    log_info("Using %d different training human references to create fake pairs" % len(insts))
    return insts
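# Hedged usage sketch (illustration only, not part of the original script). Assumptions:
# DA is the dialogue-act class used throughout this file (in TGen it is defined in
# tgen.data), parse_diligent_da() accepts the E2E "slot[value]" MR format as its use on
# E2E CSV data above suggests, and the MR below is a made-up example.
def _demo_parse_diligent_da():
    mr = DA.parse_diligent_da(u'name[The Eagle], eatType[pub], near[Burger King]')
    print mr.to_cambridge_da_string()  # Cambridge-style DA string
    # delexicalize 'name' and 'near' the same way read_system_training_data() does
    print mr.get_delexicalized(set(['name', 'near'])).to_cambridge_da_string()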
def reclassify_mr(ref, gold_mr=DA()):
    """Classify the MR given a text. Can use a gold-standard MR to make the classification
    more precise (in case of ambiguity, goes with the gold-standard value).
    Returns the MR recovered from the text as a DA (built via a dict-based MR representation)."""
    # convert MR to dict for comparing & checking against
    mr_dict = {}
    for dai in gold_mr.dais:
        mr_dict[dai.slot] = mr_dict.get(dai.slot, {})
        val = CAPITALIZE[dai.slot][dai.value.lower()]
        mr_dict[dai.slot][val] = mr_dict[dai.slot].get(val, 0) + 1

    # create MR dict representation of the output text
    # first, collect all value matches
    matches = []
    for slot in REALIZATIONS.keys():
        # verbatim slot
        if not isinstance(REALIZATIONS[slot], dict):
            matches.extend([Match(slot, CAPITALIZE[slot][match.group(0).lower()], match)
                            for match in REALIZATIONS[slot].finditer(ref)])
        # slot with variable realizations
        else:
            # collect all matches for all values
            for value in REALIZATIONS[slot].keys():
                matches.extend([Match(slot, CAPITALIZE[slot][value.lower()], match)
                                for match in REALIZATIONS[slot][value].finditer(ref)])

    # then filter out those that are substrings/duplicates (let only one value match,
    # preferably the one indicated by the true MR -- check with the MR dict)
    filt_matches = []
    for match in matches:
        skip = False
        for other_match in matches:
            if match is other_match:
                continue
            if (match.is_substring(other_match)
                    or (match.is_same_string(other_match)
                        and (other_match.value in mr_dict.get(other_match.slot, {}).keys()
                             or other_match in filt_matches))):
                skip = True
                break
        if not skip:
            filt_matches.append(match)

    # now put it all into a dict
    out_dict = {}
    for match in filt_matches:
        out_dict[match.slot] = out_dict.get(match.slot, {})
        out_dict[match.slot][match.value] = out_dict[match.slot].get(match.value, 0) + 1

    return DA.parse_dict(out_dict)
def _delex_das(self):
    """Delexicalize DAs in the buffers, save them separately."""
    out = []
    for da in self._das:
        delex_da = DA()
        for dai in da:
            delex_dai = DAI(dai.da_type, dai.slot,
                            'X-' + dai.slot
                            if (dai.value not in [None, 'none', 'dont_care']
                                and dai.slot in self._abst_slots)
                            else dai.value)
            delex_da.append(delex_dai)
        out.append(delex_da)
    self._delexed_das = out
def parse_cambridge_da(da_text):
    """Parse a DA string into DAIs (DA types, slots, and values)."""
    da = DA()
    for dai_text in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text):
        da_type, svps = dai_text.groups()

        if not svps:  # no slots/values (e.g. 'hello()')
            da.append(DAI(da_type, None, None))
            continue

        # we have some slots/values -- split them into DAIs
        svps = re.split('(?<! )[,;]', svps)
        for svp in svps:

            if '=' not in svp:  # no value, e.g. '?request(near)'
                da.append(DAI(da_type, svp, None))
                continue

            # we have a value
            slot, value = svp.split('=', 1)
            if re.match(r'^\'.*\'$', value):
                value = value[1:-1]
            assert not re.match(r'^\'', value) and not re.search(r'\'$', value)

            da.append(DAI(da_type, slot, value))
    return da
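# Hedged usage sketch (illustration only). The DA string below is a made-up example of
# the Cambridge-style format accepted by the regex in parse_cambridge_da() above; DA and
# DAI are assumed to be imported in this module, as the function itself requires.
def _demo_parse_cambridge_da():
    da = parse_cambridge_da(u"inform(name='The Eagle',food=Italian)&?request(area)")
    for dai in da:
        print dai.da_type, dai.slot, dai.value  # e.g. inform / name / The Eagle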
def evaluate_file(self, das_file, ttree_file):
    """Evaluate the reranking classifier on a given pair of DA/tree files (show the total
    Hamming distance and total number of DAIs)

    @param das_file: DA file path
    @param ttree_file: trees/sentences file path
    @return: a tuple (total DAIs, distance)
    """
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees/tokens from ' + ttree_file + '...')
    trees = read_trees_or_tokens(ttree_file, self.mode, self.language, self.selector)

    if self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    tot_len = 0
    tot_dist = 0
    classif_das = []
    for da, tree in zip(das, trees):
        tot_len += len(da)
        dist, classif = self.dist_to_da(da, [tree], return_classif=True)
        tot_dist += dist[0]
        classif_das.append(DA.parse_features(classif[0]))

    return tot_len, tot_dist, classif_das
def _init_training(self, das_file, ttree_file, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.
    """
    # read input
    log_info('Reading DAs from ' + das_file + '...')
    das = read_das(das_file)
    log_info('Reading t-trees from ' + ttree_file + '...')
    ttree_doc = read_ttrees(ttree_file)
    trees = trees_from_doc(ttree_doc, self.language, self.selector)

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)

    # initialize I/O shapes
    self.input_shape = [list(self.X[0].shape)]
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
def process_dataset(self, input_data):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods).

    @param input_data: path to the input JSON file with the data
    """
    # load data from JSON
    self._das = []
    self._texts = []
    with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
        data = json.load(fh)
        for inst in data:
            da = DA.parse_cambridge_da(inst['da'])
            da.sort()
            self._das.append(da)
            self._texts.append(self.analyze(inst['text']))

    # delexicalize DAs and sentences
    self._create_delex_texts()
    self._create_delex_das()

    # return the result
    out = []
    for da, text, delex_da, delex_text, abst in zip(self._das, self._texts, self._delex_das,
                                                    self._delex_texts, self._absts):
        out.append(Inst(da, text, delex_da, delex_text, abst))
    return out
def process_files(self, input_text_file, input_da_file, skip_hello=False):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods).

    @param input_text_file: path to the input file with sentences
    @param input_da_file: path to the input file with DAs
    @param skip_hello: skip hello() DAs (remove them from the output?)
    """
    # load DAs
    self._das = []
    with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._das.append(DA.parse(line.strip()))

    # load & process sentences
    self._sents = []
    with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._sents.append(self.analyze(line.strip()))

    assert len(self._das) == len(self._sents)

    # skip hello() DAs, if required
    if skip_hello:
        pos = 0
        while pos < len(self._das):
            da = self._das[pos]
            if len(da) == 1 and da[0].da_type == 'hello':
                del self._das[pos]
                del self._sents[pos]
            else:
                pos += 1

    # delexicalize DAs and sentences
    self._delex_texts()
    self._delex_das()
def main(in_file, out_mrs, out_refs):
    abst_das = []
    conc_das = []
    conc_da_texts = []
    abst_texts = []
    with codecs.open(in_file, 'r', 'UTF-8') as fh:
        for line in fh:
            line = line.strip()
            if line.startswith('FULL_DA'):
                line = re.sub('^FULL_DA = ', '', line)
                conc_das.append(DA.parse_cambridge_da(line))
                conc_da_texts.append(line)
            elif line.startswith('ABSTRACT_DA'):
                line = re.sub('^ABSTRACT_DA = ', '', line)
                abst_das.append(DA.parse_cambridge_da(line))
            elif line.startswith('->'):
                line = re.sub('^-> "', '', line)
                line = re.sub(r'";\s*$', '', line)
                line = re.sub(r'\[([a-z]+)\+X\]X', r'X-\1', line)
                line = re.sub(r'\[[^\]]*\]', '', line)
                abst_texts.append(line)

    conc_texts = []
    for abst_da, conc_da, abst_text in zip(abst_das, conc_das, abst_texts):
        text = abst_text
        for abst_dai, conc_dai in zip(abst_da.dais, conc_da.dais):
            assert abst_dai.slot == conc_dai.slot
            if abst_dai.value.startswith('X'):
                text = text.replace('X-' + abst_dai.slot, conc_dai.value, 1)
        text = re.sub(r'the The', 'The', text)
        conc_texts.append(text)

    with codecs.open(out_mrs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_da_texts))
    with codecs.open(out_refs, 'w', 'UTF-8') as fh:
        fh.write("\n".join(conc_texts))
def process_files(self, input_text_file, input_da_file):
    """Load DAs & sentences, obtain abstraction instructions, and store it all in member
    variables (to be used later by writing methods)."""
    # load DAs
    self._das = []
    with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._das.append(DA.parse(line.strip()))

    # load & process sentences
    self._sents = []
    with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
        for line in fh:
            self._sents.append(self.analyze(line.strip()))

    assert len(self._das) == len(self._sents)

    # delexicalize DAs and sentences
    self._delex_texts()
    self._delex_das()
def interactive_input(das_type='cambridge', delex_slots=set(), delex_slot_names=False,
                      delex_das=False, input_da=True, input_ref=False):
    da = None
    if input_da:
        da = raw_input('Enter DA             : ').decode('utf-8').strip()
        if not da:
            return None
        if das_type == 'text':
            da = [(tok, None) for tok in preprocess_sent(None, da, False, False)]
        else:
            da = DA.parse_cambridge_da(da)
            if delex_das:
                da = da.get_delexicalized(delex_slots)

    ref = None
    if input_ref:
        ref = raw_input('Enter reference      : ').decode('utf-8').strip()
        if not ref:
            return None
        ref = [(tok, None) for tok in preprocess_sent(da, ref, delex_slots, delex_slot_names)]

    hyp = raw_input('Enter system output 1: ').decode('utf-8').strip()
    if not hyp:
        return None
    hyp = [(tok, None) for tok in preprocess_sent(da, hyp, delex_slots, delex_slot_names)]

    hyp2 = raw_input('Enter system output 2: ').decode('utf-8').strip()
    if not hyp2:
        hyp2 = []
    else:
        hyp2 = [(tok, None) for tok in preprocess_sent(da, hyp2, delex_slots, delex_slot_names)]

    return (da, ref, hyp, hyp2)
def read_outputs(filename):
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    if isinstance(data.iloc[len(data) - 1]['mr'], float):
        # XXX workaround to a strange bug that sometimes happens -- not sure how to get rid
        # of it, probably an error in Pandas
        print('!!!Strangely need to remove an empty instance from the end of %s' % filename)
        data = data[:-1]

    das = [DA.parse_cambridge_da(da) for da in data['mr']]
    # force string data type for empty human references
    data['orig_ref'] = data['orig_ref'].apply(lambda x: '' if not isinstance(x, basestring) else x)
    texts_ref = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['orig_ref']]
    texts_hyp = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                 for sent in data['system_output']]
    if 'system_output2' not in data:
        data['system_output2'] = [None] * len(data)
    texts_hyp2 = [[(tok, None) for tok in tokenize(sent.lower()).split(' ')]
                  if isinstance(sent, basestring) else None
                  for sent in data['system_output2']]
    inputs = [(da, text_ref, text_hyp, text_hyp2)
              for da, text_ref, text_hyp, text_hyp2 in zip(das, texts_ref, texts_hyp, texts_hyp2)]

    # find out which columns were used for ratings
    target_cols = [c[:-len('_system_rating')] for c in data.columns
                   if c.endswith('_system_rating')]
    assert target_cols

    # compile data from all these columns
    outputs = {}
    for target_col in target_cols:
        outputs[target_col] = {subcol: list(data[target_col + '_' + subcol])
                               for subcol in ['human_rating_raw', 'human_rating',
                                              'system_rating_raw', 'system_rating',
                                              'rank_loss', 'rank_ok']}
    return (inputs, outputs)
def process_file(tagger_model, input_file):
    detok = Detokenizer()
    df = pd.read_csv(input_file, sep="\t", encoding="UTF-8")
    raw_mrs = list(df['MR'])
    raw_refs = [detok.detokenize(text) for text in list(df['output'])]
    mrs = [DA.parse_diligent_da(mr) for mr in raw_mrs]
    tagger = MorphoTagger(tagger_model)
    tagged_refs = [tagger.tag(line) for line in raw_refs]
    for ff in ['ngram', 'lca', 'collins']:
        write_output(tagged_refs, ff, re.sub(r'\.tsv', '.tag.%s.txt' % ff, input_file))
    stats = data_stats(mrs, tagged_refs, {'name': [], 'near': []},
                       re.sub(r'\.tsv', '', input_file))
    return stats
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    df = pd.DataFrame(index=np.arange(len(src)), columns=COLUMNS)
    for src_col, trg_col in COLUMN_MAP.iteritems():
        if isinstance(trg_col, list):
            for trg_col_ in trg_col:
                df[trg_col_] = src[src_col]
        else:
            df[trg_col] = src[src_col]
    df['mr'] = [DA.parse_diligent_da(da).to_cambridge_da_string() for da in src['mr']]
    df['is_real'] = np.ones(len(src), dtype=np.int32)
    df['dataset'] = ['INLG'] * len(src)
    df['system'] = ['human'] * len(src)
    df.to_csv(args.out_file, columns=COLUMNS, sep=b"\t", index=False, encoding='UTF-8')
def convert(args):
    src = lines_to_list(args.src_file)
    if args.das:
        src = [DA.parse(da_text).to_cambridge_da_string() for da_text in src]
    ref = lines_to_list(args.ref_file)
    columns = ['mr', 'orig_ref']
    df = pd.DataFrame.from_dict({'mr': src, 'orig_ref': ref})
    if args.system_output:
        sys = lines_to_list(args.system_output)
        df['system_ref'] = sys
        columns.append('system_ref')
    if args.score:
        score = [float(score) for score in lines_to_list(args.score)]
        df['quality'] = score
        columns.append('quality')
    df.to_csv(args.out_file, columns=columns, sep=b"\t", index=False, encoding='UTF-8')
def read_sfx_data():
    with codecs.open('data/sfrest-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/sfrest-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse(mr.strip()) for mr in fh.readlines()]
    return mrs, refs
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on whitespace, or list of tokens)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(conc, basestring):
        toks = conc.split(' ')
        return_string = True
    else:
        toks = conc
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        pos = find_value(dai.value, toks, toks_mask)

        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value,
                              surface_form=' '.join(toks[pos[0]:pos[1]]),
                              start=pos[0], end=pos[1]))

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in abst_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
def read_bagel_data():
    with codecs.open('data/bagel-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/bagel-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_cambridge_da(mr) for mr in fh.readlines()]
    return mrs, refs
def parse_mr(mr_text):
    return DA.parse_diligent_da(mr_text).get_delexicalized(set(['name', 'near']))
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with X
    or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on whitespace, or list of tokens)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values to \
        leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent

    if isinstance(delex_slots, set):  # convert sets to dicts
        delex_slots = {slot: set() for slot in delex_slots}

    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalized, replace the value in the delexicalized DAI
            # and save abstraction instruction (even if not found in the sentence)
            if (dai.slot in delex_slots and dai.value not in delex_slots[dai.slot]
                    and dai.value != 'dont_care' and (found == 0 or pos != (-1, -1))):
                abst_da[-1].value = 'X-' + dai.slot
                # save the abstraction instruction
                absts.append(Abst(dai.slot, dai.value,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))
            found += 1

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots or abst.value in delex_slots[abst.slot]
                or abst.value == 'dont_care' or abst.start < 0):
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
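# Hedged usage sketch (illustration only). Assumptions: DA.parse_cambridge_da() is
# available as used elsewhere in this file, find_value() locates the value's surface form
# in the token list, and the MR/sentence below are made-up examples.
def _demo_delex_sent():
    da = DA.parse_cambridge_da(u"inform(name='The Eagle',food=Italian)")
    toks = u'The Eagle serves Italian food .'.split(' ')
    delex_toks, delex_da, absts = delex_sent(da, toks, {'name'})
    # if the surface form is found, this prints: X-name serves Italian food .
    print ' '.join(delex_toks)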
def read_data(filename, target_cols, das_type='cambridge',
              delex_slots=set(), delex_slot_names=False, delex_das=False):
    """Read the input data from a TSV file."""
    refs_cache = {}

    def cached_preprocess_sent(da, sent):
        """We're caching since with generated data, we're likely to parse the same sentence
        many times."""
        if (da, sent) not in refs_cache:
            refs_cache[(da, sent)] = preprocess_sent(da, sent, delex_slots, delex_slot_names)
        return list(refs_cache[(da, sent)])

    log_info("Reading %s..." % filename)
    data = pd.read_csv(filename, sep=b"\t", encoding='UTF-8')
    log_info("Loaded %d instances." % len(data))

    # force data type to string if the data set doesn't contain human references
    data['orig_ref'] = data['orig_ref'].apply(lambda x: '' if not isinstance(x, basestring) else x)
    log_info("Adapted refs data type.")

    if das_type == 'text':  # for MT output classification
        das = [[(tok, None) for tok in preprocess_sent(None, sent, False, False)]
               for sent in data['mr']]
    else:
        das = [DA.parse_cambridge_da(da) for da in data['mr']]
    log_info("Parsed DAs.")

    texts_ref = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['orig_ref'])]
    log_info("Preprocessed human refs.")
    texts_hyp = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                 for da, sent in zip(das, data['system_ref'])]
    log_info("Preprocessed system outputs.")

    # alternative reference with rating difference / use to compare
    if 'system_ref2' in data.columns:
        texts_hyp2 = [[(tok, None) for tok in cached_preprocess_sent(da, sent)]
                      if isinstance(sent, basestring) else None
                      for da, sent in zip(das, data['system_ref2'])]
    else:
        texts_hyp2 = [None] * len(texts_hyp)
    log_info("Preprocessed 2nd system outputs.")

    # DA delexicalization must take place after text delexicalization
    if das_type != 'text' and delex_das:
        das = [da.get_delexicalized(delex_slots) for da in das]
    log_info("Delexicalized DAs.")

    # fake data indicator
    if 'is_real' in data.columns:
        real_indics = [0 if indic == 0 else 1 for indic in data['is_real']]
    else:
        real_indics = [1 for _ in xrange(len(data))]
    log_info("Retrieved is_real indications.")

    inputs = [(da, ref, hyp, hyp2, ri)
              for da, ref, hyp, hyp2, ri in zip(das, texts_ref, texts_hyp,
                                                texts_hyp2, real_indics)]
    log_info("Built inputs list.")

    targets = np.array(data[[target_cols] if not isinstance(target_cols, list) else target_cols],
                       dtype=np.float)
    log_info("Built targets list.")

    return inputs, targets
def convert(args):
    """Main function – read in the JSON data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    turns = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)  # store the non-delexicalized version of the DA
        # delexicalize
        text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
        da.sort()  # delexicalization does not keep DAI order, need to sort again
        # store the DA
        text = fix_capitalization(text)
        conc = fix_capitalization(conc)
        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        data = json.load(fh, encoding='UTF-8')
        for dialogue in data:
            if isinstance(dialogue, dict):
                for turn in dialogue['dial']:
                    da = DA.parse_cambridge_da(turn['S']['dact'])
                    if args.skip_hello and len(da) == 1 and da[0].da_type == 'hello':
                        continue  # skip hello() DAs
                    conc = postprocess_sent(turn['S']['ref'])
                    process_instance(da, conc)
                    turns += 1
            else:
                da = DA.parse_cambridge_da(dialogue[0])
                conc = postprocess_sent(dialogue[1])
                process_instance(da, conc)
                turns += 1

    print 'Processed', turns, 'turns.'
    print '%d different DAs.' % len(da_keys)
    print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))

    if args.split:
        # get file name prefixes and compute data sizes for all the parts to be split
        out_names = re.split(r'[, ]+', args.out_name)
        data_sizes = [int(part_size) for part_size in args.split.split(':')]
        assert len(out_names) == len(data_sizes)
        # compute sizes for all but the 1st part (+ round them up, as Wen does)
        total = float(sum(data_sizes))
        remain = turns
        for part_no in xrange(len(data_sizes) - 1, 0, -1):
            part_size = int(ceil(turns * (data_sizes[part_no] / total)))
            data_sizes[part_no] = part_size
            remain -= part_size
        # put whatever remained into the 1st part
        data_sizes[0] = remain
    else:
        # use just one part -- containing all the data
        data_sizes = [turns]
        out_names = [args.out_name]

    # write all data parts
    for part_size, part_name in zip(data_sizes, out_names):

        # create multiple lexicalized references for each instance by relexicalizing
        # sentences with the same DA from the same part
        if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']:
            # group sentences with the same DA
            da_groups = {}
            for da, text, abst in zip(das[0:part_size], texts[0:part_size], absts[0:part_size]):
                da_groups[unicode(da)] = da_groups.get(unicode(da), [])
                da_groups[unicode(da)].append((text, filter_abst(abst, slots_to_abstract)))
            for da_str in da_groups.keys():
                seen = set()
                uniq = []
                for text, abst in da_groups[da_str]:
                    sig = text + "\n" + ' '.join([a.slot + str(a.start) for a in abst])
                    if sig not in seen:
                        seen.add(sig)
                        uniq.append((text, abst))
                da_groups[da_str] = uniq
            # relexicalize all abstract sentences for each DA
            relex = []
            for da, abst in zip(das[0:part_size], absts[0:part_size]):
                relex.append(relexicalize(da_groups[unicode(da)],
                                          filter_abst(abst, slots_to_abstract)))
            with open(part_name + '-ref.txt', 'w') as fh:
                for relex_pars in relex:
                    fh.write("\n".join(relex_pars).encode('utf-8') + "\n\n")

        with open(part_name + '-das.txt', 'w') as fh:
            for da in das[0:part_size]:
                fh.write(unicode(da).encode('utf-8') + "\n")
        del das[0:part_size]
        with open(part_name + '-conc_das.txt', 'w') as fh:
            for conc_da in conc_das[0:part_size]:
                fh.write(unicode(conc_da).encode('utf-8') + "\n")
        del conc_das[0:part_size]
        with open(part_name + '-conc.txt', 'w') as fh:
            for conc in concs[0:part_size]:
                fh.write(conc.encode('utf-8') + "\n")
        del concs[0:part_size]
        with open(part_name + '-abst.txt', 'w') as fh:
            for abst in absts[0:part_size]:
                fh.write("\t".join([unicode(a) for a in abst]).encode('utf-8') + "\n")
        del absts[0:part_size]
        with open(part_name + '-text.txt', 'w') as fh:
            for text in texts[0:part_size]:
                fh.write(text.encode('utf-8') + "\n")
        del texts[0:part_size]
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references and
    distorting them. Will start from scores provided, or default to best possible score.

    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg') and HTER ('hter')
    @return: a fake data set, with the given columns, some of them empty
    """

    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)), columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx]['orig_ref'] = row.orig_ref
        fake_data.loc[idx]['system_ref'] = row.orig_ref
        fake_data.loc[idx]['mr'] = row.mr
        fake_data.loc[idx]['is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx][quant] = (getattr(row, quant)
                                         if (hasattr(row, quant)
                                             and getattr(row, quant) is not None
                                             and not np.isnan(getattr(row, quant)))
                                         else best_score)
        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(), start=distort_step * len(real_data)):
            fake_data.loc[idx]['orig_ref'] = row.orig_ref
            fake_data.loc[idx]['mr'] = row.mr
            fake_data.loc[idx]['is_real'] = 0
            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da, tokenize(row.orig_ref).split(' '), DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent], lex_instr)
            fake_data.loc[idx]['system_ref'] = ' '.join(sent)
            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant)
                         if (hasattr(row, quant)
                             and getattr(row, quant) is not None
                             and not np.isnan(getattr(row, quant)))
                         else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx][quant] = ((score / ref_len) * 100) if normalize else score

    return fake_data
def _init_training(self, das, trees, data_portion):
    """Initialize training. Store input data, initialize 1-hot feature representations
    for input and output and transform training data accordingly, initialize the
    classification neural network.

    @param das: name of source file with training DAs, or list of DAs
    @param trees: name of source file with corresponding trees/sentences, or list of trees
    @param data_portion: portion of the training data to be used (0.0-1.0)
    """
    # read input from files or take it directly from parameters
    if not isinstance(das, list):
        log_info('Reading DAs from ' + das + '...')
        das = read_das(das)
    if not isinstance(trees, list):
        log_info('Reading t-trees from ' + trees + '...')
        ttree_doc = read_ttrees(trees)
        if self.mode == 'tokens':
            tokens = tokens_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tokens)
        elif self.mode == 'tagged_lemmas':
            tls = tagged_lemmas_from_doc(ttree_doc, self.language, self.selector)
            trees = self._tokens_to_flat_trees(tls, use_tags=True)
        else:
            trees = trees_from_doc(ttree_doc, self.language, self.selector)
    elif self.mode in ['tokens', 'tagged_lemmas']:
        trees = self._tokens_to_flat_trees(trees, use_tags=self.mode == 'tagged_lemmas')

    # make training data smaller if necessary
    train_size = int(round(data_portion * len(trees)))
    self.train_trees = trees[:train_size]
    self.train_das = das[:train_size]

    # ignore contexts, if they are contained in the DAs
    if isinstance(self.train_das[0], tuple):
        self.train_das = [da for (context, da) in self.train_das]
    # delexicalize if DAs are lexicalized and we don't want that
    if self.delex_slots:
        self.train_das = [da.get_delexicalized(self.delex_slots) for da in self.train_das]

    # add empty tree + empty DA to training data
    # (i.e. forbid the network to keep any of its outputs "always-on")
    train_size += 1
    self.train_trees.append(TreeData())
    empty_da = DA.parse('inform()')
    self.train_das.append(empty_da)

    self.train_order = range(len(self.train_trees))
    log_info('Using %d training instances.' % train_size)

    # initialize input features/embeddings
    if self.tree_embs:
        self.dict_size = self.tree_embs.init_dict(self.train_trees)
        self.X = np.array([self.tree_embs.get_embeddings(tree) for tree in self.train_trees])
    else:
        self.tree_feats = Features(['node: presence t_lemma formeme'])
        self.tree_vect = DictVectorizer(sparse=False, binarize_numeric=True)
        self.X = [self.tree_feats.get_features(tree, {}) for tree in self.train_trees]
        self.X = self.tree_vect.fit_transform(self.X)

    # initialize output features
    self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
    self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)
    self.y = [self.da_feats.get_features(None, {'da': da}) for da in self.train_das]
    self.y = self.da_vect.fit_transform(self.y)
    log_info('Number of binary classes: %d.' % len(self.da_vect.get_feature_names()))

    # initialize I/O shapes
    if not self.tree_embs:
        self.input_shape = list(self.X[0].shape)
    else:
        self.input_shape = self.tree_embs.get_embeddings_shape()
    self.num_outputs = len(self.da_vect.get_feature_names())

    # initialize NN classifier
    self._init_neural_network()
    # initialize the NN variables
    self.session.run(tf.global_variables_initializer())
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    toks = conc.split(' ')
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # try to find the value in the sentence (first exact, then fuzzy)
        # while masking tokens of previously found values
        val_toks = dai.value.split(' ')
        pos = find_substr(val_toks, [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is None:
            pos = find_substr_approx(val_toks, [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is not None:
            for idx in xrange(pos[0], pos[1]):  # mask found things so they're not found twice
                toks_mask[idx] = False
        if pos is None or pos == (0, 0):  # default to -1 for unknown positions
            pos = -1, -1

        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value, start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text
        if slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks), abst_da, absts
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(conc_da, conc):
        # sort the DA using the same order as in E2E NLG data
        conc_da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range', 'rating',
                                            'area', 'family_friendly', 'near'].index(dai.slot),
                                           dai.value))
        conc_das.append(conc_da)
        text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract,
                                    args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range', 'rating',
                                       'area', 'family_friendly', 'near'].index(dai.slot),
                                      dai.value))
        da_keys[str(da)] = da_keys.get(str(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    data = pd.read_csv(args.in_file, sep=',', encoding='UTF-8')
    data['mr'] = data['mr'].fillna('')
    for inst in data.itertuples():
        da = DA.parse_diligent_da(inst.mr)
        process_instance(da, inst.ref)
        insts += 1
        if insts % 100 == 0:
            print('%d...' % insts, end='', flush=True, file=sys.stderr)

    print('Processed', insts, 'instances.', file=sys.stderr)
    print('%d different DAs.' % len(da_keys), file=sys.stderr)
    print('%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))),
          file=sys.stderr)
    print('Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                max([text.count(' ') + 1 for text in texts])),
          file=sys.stderr)

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()  # keep the original order (by 1st occurrence of DA)
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(str(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[str(conc_da)] = group
        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.values():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([str(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([str(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(str(da) + "\n")
    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(str(conc_da) + "\n")
    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")
    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")
    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
def read_e2e_data():
    with codecs.open('data/e2e-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/e2e-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_diligent_da(mr) for mr in fh.readlines()]
    return mrs, refs
def convert(args):
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    data = []
    src_col = args.column
    trg_col = COLUMN_MAP[src_col[:3]]
    unique_mrs = set()

    for _, src_inst in src.iterrows():
        mr = DA.parse_diligent_da(src_inst['mr']).to_cambridge_da_string()
        delex_mr = DA.parse_diligent_da(src_inst['mr']).get_delexicalized(
            set(['name', 'near'])).to_cambridge_da_string()
        unique_mrs.add(delex_mr)
        syss = [{'sys': src_inst['sys%d' % i],
                 'ref': src_inst['ref%d' % i],
                 'val': src_inst['%s%d' % (src_col, i)]}
                for i in xrange(1, 6)]
        for sys1, sys2 in itertools.combinations(syss, 2):
            if sys1['val'] < sys2['val']:  # without loss of generality
                sys1, sys2 = sys2, sys1
            if sys1['val'] == sys2['val']:  # ignore those that are equal
                continue
            trg_inst = {'dataset': 'E2E',
                        'system': SYSTEMS_MAP[sys1['sys']],
                        'system2': SYSTEMS_MAP[sys2['sys']],
                        'orig_ref': None,
                        'mr': mr,
                        'delex_mr': delex_mr,
                        'system_ref': sys1['ref'],
                        'system_ref2': sys2['ref'],
                        'is_real': 1,
                        'informativeness': None,
                        'naturalness': None,
                        'quality': None}
            trg_inst[trg_col] = 1
            data.append(trg_inst)

    unique_mrs = sorted(list(unique_mrs))
    random.shuffle(unique_mrs)
    part_sizes = [int(p) for p in args.ratio.split(':')]
    part_sizes = [int(round(p * len(unique_mrs) / float(sum(part_sizes)))) for p in part_sizes]
    part_sizes[0] = len(unique_mrs) - sum(part_sizes[1:])
    part_labels = args.labels.split(':')
    part_start = 0
    log_info('Data sizes in MRs: %s' % ':'.join([str(p) for p in part_sizes]))

    # remove ambiguous instances
    if args.unambiguous:
        occs = Counter([(inst['mr'], inst['system'], inst['system2']) for inst in data])
        ambig = set()
        for mr, sys1, sys2 in occs.iterkeys():
            if occs.get((mr, sys2, sys1), 0) == occs[(mr, sys1, sys2)]:
                ambig.add((mr, sys1, sys2))
        uniq_data = []
        used_insts = set()
        for inst in data:
            mr, sys1, sys2 = inst['mr'], inst['system'], inst['system2']
            if (mr, sys1, sys2) in ambig or (mr, sys1, sys2) in used_insts:
                continue
            uniq_data.append(inst)
            used_insts.add((mr, sys1, sys2))
        data = uniq_data

    # mark down the configuration
    with codecs.open(os.path.join(args.out_path, 'config'), 'wb', encoding='UTF-8') as fh:
        fh.write(pprint.pformat(vars(args), indent=4, width=100))

    # split the output
    for part_no, (part_size, part_label) in enumerate(zip(part_sizes, part_labels)):
        part_mrs = set(unique_mrs[part_start:part_start + part_size])
        part_data = [inst for inst in data if inst['delex_mr'] in part_mrs]
        if args.shuffle:
            random.shuffle(part_data)
        part_df = pd.DataFrame(part_data)
        if part_no == 0 and args.fake_data:
            # create fake data
            indiv_sys_outputs = get_sys_outputs(part_data)
            if args.fake_data_from:
                indiv_sys_outputs.extend(read_system_training_data(args.fake_data_from))
            fake_insts = create_fake_data(pd.DataFrame.from_records(indiv_sys_outputs),
                                          part_df.columns, score_type='rank')
            fake_pairs = create_fake_pairs(fake_insts, len(indiv_sys_outputs))
            part_df = part_df.append(fake_pairs, sort=True)
        out_file = os.path.join(args.out_path, part_label + '.tsv')
        log_info('File: %s, total size %d' % (out_file, len(part_df)))
        part_df.to_csv(out_file, columns=COLUMNS, sep=b"\t", index=False, encoding='UTF-8')
        part_start += part_size
def convert(args):
    """Main function – read in the CSV data and output TGEN-specific files."""

    # find out which slots should be abstracted (from command-line argument)
    slots_to_abstract = set()
    if args.abstract is not None:
        slots_to_abstract.update(re.split(r'[, ]+', args.abstract))

    # initialize storage
    conc_das = []
    das = []  # abstracted DAs
    concs = []  # concrete sentences
    texts = []  # abstracted sentences
    absts = []  # abstraction descriptions

    # statistics about different DAs
    da_keys = {}
    insts = 0

    def process_instance(da, conc):
        da.sort()
        conc_das.append(da)
        text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract,
                                    args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
        da.sort()
        da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
        das.append(da)
        concs.append(conc)
        absts.append(abst)
        texts.append(text)

    # process the input data and store it in memory
    with open(args.in_file, 'r') as fh:
        csvread = csv.reader(fh, encoding='UTF-8')
        csvread.next()  # skip header
        for mr, text in csvread:
            da = DA.parse_diligent_da(mr)
            process_instance(da, text)
            insts += 1

    print 'Processed', insts, 'instances.'
    print '%d different DAs.' % len(da_keys)
    print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das)))
    print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]),
                                                max([text.count(' ') + 1 for text in texts]))

    # for multi-ref mode, group by the same conc DA
    if args.multi_ref:
        groups = OrderedDict()
        for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts):
            group = groups.get(unicode(conc_da), {})
            group['da'] = da
            group['conc_da'] = conc_da
            group['abst'] = group.get('abst', []) + [abst]
            group['conc'] = group.get('conc', []) + [conc]
            group['text'] = group.get('text', []) + [text]
            groups[unicode(conc_da)] = group
        conc_das, das, concs, texts, absts = [], [], [], [], []
        for group in groups.itervalues():
            conc_das.append(group['conc_da'])
            das.append(group['da'])
            concs.append("\n".join(group['conc']) + "\n")
            texts.append("\n".join(group['text']) + "\n")
            absts.append("\n".join(["\t".join([unicode(a) for a in absts_])
                                    for absts_ in group['abst']]) + "\n")
    else:
        # convert abstraction instruction to string (coordinate output with multi-ref mode)
        absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts]

    with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh:
        for da in das:
            fh.write(unicode(da) + "\n")
    with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh:
        for conc_da in conc_das:
            fh.write(unicode(conc_da) + "\n")
    with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh:
        for conc in concs:
            fh.write(conc + "\n")
    with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh:
        for abst in absts:
            fh.write(abst + "\n")
    with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh:
        for text in texts:
            fh.write(text + "\n")
def convert(args): """Main function – read in the JSON data and output TGEN-specific files.""" # initialize storage items = 0 das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions contexts = [] # abstracted contexts conc_contexts = [] # lexicalized contexts # process the input data and store it in memory with open(args.in_file, 'r') as fh: data = json.load(fh, encoding='UTF-8') for item in data: da = convert_abstr_da(DA.parse(item['response_da'])) context = convert_abstractions(item['context_utt']) context_l = item['context_utt_l'] conc_da = DA.parse(item['response_da_l']) concs_ = [tokenize(s) for s in item['response_nl_l']] absts_ = [] texts_ = [] for abst_text in item['response_nl']: text, abst = get_abstraction( abst_text, conc_da, args.slot_names) # convert *SLOT -> X absts_.append(abst) texts_.append(text) das.append(da) contexts.append(context) conc_contexts.append(context_l) concs.append(concs_) absts.append(absts_) texts.append(texts_) items += 1 print 'Processed', items, 'items.' if args.split: # get file name prefixes and compute data sizes for all the parts to be split out_names = re.split(r'[, ]+', args.out_name) data_sizes = [int(part_size) for part_size in args.split.split(':')] assert len(out_names) == len(data_sizes) # compute sizes for all but the 1st part (+ round them) total = float(sum(data_sizes)) remain = items for part_no in xrange(len(data_sizes) - 1, 0, -1): part_size = int(round(items * (data_sizes[part_no] / total))) data_sizes[part_no] = part_size remain -= part_size # put whatever remained into the 1st part data_sizes[0] = remain else: # use just one part -- containing all the data data_sizes = [items] out_names = [args.out_name] # write all data parts for part_size, part_name in zip(data_sizes, out_names): repeat_num = len(concs[0]) if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']: repeat_num = 1 # repeat DAs and contexts for synonymous paraphrases, unless for test data in multi-ref mode write_part(part_name + '-das.txt', das, part_size, repeat_num) write_part(part_name + '-context.txt', contexts, part_size, repeat_num) write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num) # write all other just once (here, each instance is a list, it will be unrolled) write_part(part_name + '-conc.txt', concs, part_size) write_part(part_name + '-abst.txt', absts, part_size) write_part(part_name + '-text.txt', texts, part_size)
def convert(args): """Main function – read in the CSV data and output TGEN-specific files.""" # find out which slots should be abstracted (from command-line argument) slots_to_abstract = set() if args.abstract is not None: slots_to_abstract.update(re.split(r'[, ]+', args.abstract)) # initialize storage conc_das = [] das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions # statistics about different DAs da_keys = {} insts = 0 def process_instance(da, conc): da.sort() conc_das.append(da) text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True) text = text.lower().replace('x-', 'X-') # lowercase all but placeholders da.sort() da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1 das.append(da) concs.append(conc) absts.append(abst) texts.append(text) # process the input data and store it in memory with open(args.in_file, 'r') as fh: csvread = csv.reader(fh, encoding='UTF-8') csvread.next() # skip header for mr, text, voice in csvread: da = DA.parse_diligent_da(mr, voice) process_instance(da, text) insts += 1 print 'Processed', insts, 'instances.' print '%d different DAs.' % len(da_keys) print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))) print 'Max DA len: %d, max text len: %d' % (max( [len(da) for da in das]), max([text.count(' ') + 1 for text in texts])) # for multi-ref mode, group by the same conc DA if args.multi_ref: groups = OrderedDict() for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts): group = groups.get(unicode(conc_da), {}) group['da'] = da group['conc_da'] = conc_da group['abst'] = group.get('abst', []) + [abst] group['conc'] = group.get('conc', []) + [conc] group['text'] = group.get('text', []) + [text] groups[unicode(conc_da)] = group conc_das, das, concs, texts, absts = [], [], [], [], [] for group in groups.itervalues(): conc_das.append(group['conc_da']) das.append(group['da']) concs.append("\n".join(group['conc']) + "\n") texts.append("\n".join(group['text']) + "\n") absts.append("\n".join([ "\t".join([unicode(a) for a in absts_]) for absts_ in group['abst'] ]) + "\n") else: # convert abstraction instruction to string (coordinate output with multi-ref mode) absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts] with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh: for da in das: fh.write(unicode(da) + "\n") with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh: for conc_da in conc_das: fh.write(unicode(conc_da) + "\n") with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh: for conc in concs: fh.write(conc + "\n") with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh: for abst in absts: fh.write(abst + "\n") with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh: for text in texts: fh.write(text + "\n")
def convert(args): """Main function – read in the JSON data and output TGEN-specific files.""" # initialize storage items = 0 conc_das = [] # concrete DAs das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions contexts = [] # abstracted contexts conc_contexts = [] # lexicalized contexts # process the input data and store it in memory with open(args.in_file, 'r') as fh: data = json.load(fh, encoding='UTF-8') for item in data: da = convert_abstr_da(DA.parse(item['response_da'])) context = convert_abstractions(item['context_utt']) context_l = item['context_utt_l'] conc_da = DA.parse(item['response_da_l']) concs_ = [tokenize(s) for s in item['response_nl_l']] absts_ = [] texts_ = [] for abst_text in item['response_nl']: text, abst = get_abstraction(abst_text, conc_da, args.slot_names) # convert *SLOT -> X absts_.append(abst) texts_.append(text) das.append(da) conc_das.append(conc_da) contexts.append(context) conc_contexts.append(context_l) concs.append(concs_) absts.append(absts_) texts.append(texts_) items += 1 print 'Processed', items, 'items.' if args.split: # get file name prefixes and compute data sizes for all the parts to be split out_names = re.split(r'[, ]+', args.out_name) data_sizes = [int(part_size) for part_size in args.split.split(':')] assert len(out_names) == len(data_sizes) # compute sizes for all but the 1st part (+ round them) total = float(sum(data_sizes)) remain = items for part_no in xrange(len(data_sizes) - 1, 0, -1): part_size = int(round(items * (data_sizes[part_no] / total))) data_sizes[part_no] = part_size remain -= part_size # put whatever remained into the 1st part data_sizes[0] = remain else: # use just one part -- containing all the data data_sizes = [items] out_names = [args.out_name] # write all data parts for part_size, part_name in zip(data_sizes, out_names): repeat_num = len(concs[0]) if args.multi_ref and part_name in ['devel', 'test', 'dtest', 'etest']: repeat_num = 1 # repeat DAs and contexts for synonymous paraphrases, unless for test data in multi-ref mode write_part(part_name + '-das.txt', das, part_size, repeat_num) write_part(part_name + '-conc_das.txt', conc_das, part_size, repeat_num) write_part(part_name + '-context.txt', contexts, part_size, repeat_num) write_part(part_name + '-conc_context.txt', conc_contexts, part_size, repeat_num) # write all other just once (here, each instance is a list, it will be unrolled) write_part(part_name + '-ref.txt', concs, part_size, trunc=False, separate=True) write_part(part_name + '-conc.txt', concs, part_size) write_part(part_name + '-abst.txt', absts, part_size) write_part(part_name + '-text.txt', texts, part_size)