def _delex_das(self):
    """Build delexicalized copies of the DAs in self._das and store them in self._delexed_das.

    A DAI value is replaced by the placeholder 'X-<slot>' if its slot is listed
    in self._abst_slots and the value is not None, 'none' or 'dont_care'; all other
    values are copied unchanged. The DAs in self._das themselves are not modified.
    """
    kept_values = [None, 'none', 'dont_care']
    delexed = []
    for orig_da in self._das:
        new_da = DA()
        for item in orig_da:
            if item.value not in kept_values and item.slot in self._abst_slots:
                new_value = 'X-' + item.slot
            else:
                new_value = item.value
            new_da.append(DAI(item.da_type, item.slot, new_value))
        delexed.append(new_da)
    # assigned only once all DAs have been processed
    self._delexed_das = delexed
def _delex_das(self):
    """Create delexicalized versions of all DAs in self._das, saving them in self._delexed_das.

    Values of slots found in self._abst_slots become 'X-<slot>', except for the
    special values None, 'none' and 'dont_care', which are always kept as they are.
    The source DAs are left untouched (new DA/DAI objects are built).
    """
    special_values = [None, 'none', 'dont_care']

    def delex_value(dai):
        # special values and slots that are not abstracted keep the original value
        if dai.value in special_values:
            return dai.value
        if dai.slot not in self._abst_slots:
            return dai.value
        return 'X-' + dai.slot

    result = []
    for da in self._das:
        delex_da = DA()
        for dai in da:
            delex_da.append(DAI(dai.da_type, dai.slot, delex_value(dai)))
        result.append(delex_da)
    self._delexed_das = result
def parse_cambridge_da(da_text):
    """Parse a DA string into DAIs (DA types, slots, and values).

    The text is scanned for items of the form 'da_type(slot=value;slot=value,...)',
    where the DA type consists of lowercase letters and underscores with an optional
    leading '?'. Slot-value pairs are separated by ',' or ';' (a separator directly
    preceded by a space is not treated as one). A value fully enclosed in single
    quotes has the quotes stripped.

    @param da_text: the DA string to parse
    @return: a DA object with one DAI per slot-value pair (or one slot-less, value-less \
    DAI for a DA type with empty parentheses)
    @raise AssertionError: if a value still starts or ends with a single quote after \
    stripping the enclosing quotes
    """
    da = DA()

    for da_match in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text):
        da_type, svps = da_match.groups()

        if not svps:  # no slots/values (e.g. 'hello()')
            da.append(DAI(da_type, None, None))
            continue

        # we have some slots/values -- split them into DAIs
        for svp in re.split('(?<! )[,;]', svps):

            if '=' not in svp:  # no value, e.g. '?request(near)'
                da.append(DAI(da_type, svp, None))
                continue

            # we have a value
            slot, value = svp.split('=', 1)
            if re.match(r'^\'.*\'$', value):
                value = value[1:-1]
            # sanity check for stray quotes; the trailing-quote test needs re.search --
            # re.match only matches at the start of the string, so it could never
            # detect a quote at the end of a longer value
            assert not re.match(r'^\'', value) and not re.search(r'\'$', value), \
                'Unbalanced quotes in value: %s' % svp

            da.append(DAI(da_type, slot, value))

    return da
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Abstract the given slots in the given sentence (replace their values with placeholders).

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on single spaces, or list of
        tokens; a token list is edited in place)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @param delex_slot_names: boolean -- also search the sentence for the slot names themselves
        (underscores replaced by spaces) and replace found names of abstracted slots with
        'X-slot'; with use_slot_names off, value placeholders are then 'X-value' instead of 'X'
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA,
        and abstraction instructions
    """
    is_text = isinstance(conc, basestring)
    tokens = conc.split(' ') if is_text else conc

    instructions = []
    out_da = DA()
    # handed to find_value on every call; presumably marks tokens that are still
    # free to be matched -- confirm against find_value
    free_mask = [True] * len(tokens)

    def value_len(item):
        return len(item.value) if item.value is not None else 0

    # longer values are looked up first, so that their substrings don't block them
    for item in sorted(da, key=value_len, reverse=True):
        # the output DA starts as a plain copy of the current DAI
        out_da.append(DAI(item.da_type, item.slot, item.value))
        if item.value is None:
            continue

        span = find_value(item.value, tokens, free_mask)

        # abstracted slots get a placeholder value in the output DA and an abstraction
        # instruction (recorded even if the value was not found in the sentence)
        if item.slot in abst_slots and item.value != 'dont_care':
            out_da[-1].value = 'X-' + item.slot
            instructions.append(Abst(item.slot, item.value,
                                     surface_form=' '.join(tokens[span[0]:span[1]]),
                                     start=span[0], end=span[1]))

    if delex_slot_names:
        # same search for the slot names themselves, longest name first
        named = [item for item in da if item.slot is not None]
        named.sort(key=lambda item: len(item.slot), reverse=True)
        for item in named:
            span = find_value(item.slot.replace('_', ' '), tokens, free_mask)
            if item.slot in abst_slots:
                instructions.append(Abst(item.slot, None,
                                         surface_form=' '.join(tokens[span[0]:span[1]]),
                                         start=span[0], end=span[1]))

    # replace left to right, tracking how many tokens the sentence has lost so far
    instructions.sort(key=lambda inst: inst.start)
    shift = 0
    for inst in instructions:
        # skip everything that should not (or cannot) be replaced in the text
        if inst.slot not in abst_slots or inst.value == 'dont_care' or inst.start < 0:
            continue

        # pick the placeholder (X-slot for slot names; X-<slot>, X-value or X for values)
        if delex_slot_names and inst.value is None:
            placeholder = 'X-slot'
        elif use_slot_names:
            placeholder = 'X-' + inst.slot
        elif delex_slot_names:
            placeholder = 'X-value'
        else:
            placeholder = 'X'

        new_start = inst.start - shift
        tokens[new_start:inst.end - shift] = [placeholder]

        # the instruction now points at the single placeholder token
        removed = inst.end - inst.start - 1
        inst.start = new_start
        inst.end = new_start + 1
        shift += removed

    text = ' '.join(tokens) if is_text else tokens
    return text, out_da, instructions
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with
    X or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on single spaces, or list of
        tokens; a token list is edited in place)
    @param delex_slots: slots to be delexicalized -- a set, frozenset, list or tuple of slot
        names, or a dict (with a set of values to leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @param delex_slot_names: boolean -- also search the sentence for the slot names themselves
        (underscores replaced by spaces) and replace found names of delexicalized slots with
        'X-slot'; with use_slot_names off, value placeholders are then 'X-value' instead of 'X'
    @param repeated: boolean -- keep searching for further occurrences of each value until
        find_value reports (-1, -1), instead of stopping after the first search
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA,
        and abstraction instructions
    """
    return_string = isinstance(sent, basestring)
    toks = sent.split(' ') if return_string else sent

    # convert plain collections of slot names to dicts (no values exempt for any slot);
    # checking for `set` alone made frozensets, lists and tuples crash on the
    # `delex_slots[slot]` lookups below
    if isinstance(delex_slots, (set, frozenset, list, tuple)):
        delex_slots = {slot: set() for slot in delex_slots}

    absts = []
    abst_da = DA()
    # handed to find_value on every call; presumably marks tokens that are still
    # free to be matched -- confirm against find_value
    toks_mask = [True] * len(toks)

    def value_len(dai):
        return len(dai.value) if dai.value is not None else 0

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da, key=value_len, reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # whether this value should be delexicalized does not change between occurrences
        to_delex = (dai.slot in delex_slots and
                    dai.value not in delex_slots[dai.slot] and
                    dai.value != 'dont_care')

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalized, replace the value in the delexicalized DAI
            # and save abstraction instruction (for the first search even if the value
            # was not found in the sentence, for the following ones only if it was found)
            if to_delex and (found == 0 or pos != (-1, -1)):
                abst_da[-1].value = 'X-' + dai.slot
                absts.append(Abst(dai.slot, dai.value,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))
            found += 1

    if delex_slot_names:
        # search for the slot names themselves, longer names first
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0  # number of tokens the sentence has lost through replacements so far
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots or
                abst.value in delex_slots[abst.slot] or
                abst.value == 'dont_care' or
                abst.start < 0):
            continue

        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            placeholder = 'X-slot'
        elif use_slot_names:
            placeholder = 'X-' + abst.slot
        else:
            placeholder = 'X' if not delex_slot_names else 'X-value'
        toks[abst.start - shift:abst.end - shift] = [placeholder]

        # update abstraction instruction indexes (now pointing at the placeholder token)
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with
    X or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on single spaces, or list of
        tokens; a token list is edited in place)
    @param delex_slots: slots to be delexicalized -- a set, frozenset, list or tuple of slot
        names, or a dict (with a set of values to leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @param delex_slot_names: boolean -- also search the sentence for the slot names themselves
        (underscores replaced by spaces) and replace found names of delexicalized slots with
        'X-slot'; with use_slot_names off, value placeholders are then 'X-value' instead of 'X'
    @param repeated: boolean -- keep searching for further occurrences of each value until
        find_value reports (-1, -1), instead of stopping after the first search
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA,
        and abstraction instructions
    """
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent
        return_string = False

    # Any plain collection of slot names is turned into the dict form with no exempt
    # values. Previously only `set` was converted, so e.g. a frozenset or a list of
    # slots failed with a TypeError as soon as `delex_slots[slot]` was evaluated.
    if isinstance(delex_slots, (set, frozenset, list, tuple)):
        delex_slots = {slot: set() for slot in delex_slots}

    absts = []
    abst_da = DA()
    # handed to find_value on every call; presumably marks tokens that are still
    # free to be matched -- confirm against find_value
    toks_mask = [True] * len(toks)

    def make_abst(slot, value, pos):
        # abstraction instruction for the given token span (empty surface form if not found)
        return Abst(slot, value, surface_form=' '.join(toks[pos[0]:pos[1]]),
                    start=pos[0], end=pos[1])

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    by_value_len = sorted(da,
                          key=lambda dai: len(dai.value) if dai.value is not None else 0,
                          reverse=True)
    for dai in by_value_len:
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalized, replace the value in the delexicalized DAI
            # and save abstraction instruction (the first one even if the value was not
            # found in the sentence, further ones only for actual occurrences)
            if (dai.slot in delex_slots and
                    dai.value not in delex_slots[dai.slot] and
                    dai.value != 'dont_care' and
                    (found == 0 or pos != (-1, -1))):
                abst_da[-1].value = 'X-' + dai.slot
                absts.append(make_abst(dai.slot, dai.value, pos))
            found += 1

    if delex_slot_names:
        # look for the slot names themselves as well, longer names first
        with_slot = [dai for dai in da if dai.slot is not None]
        for dai in sorted(with_slot, key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(make_abst(dai.slot, None, pos))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0  # how many tokens shorter the sentence has become so far
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots or
                abst.value in delex_slots[abst.slot] or
                abst.value == 'dont_care' or
                abst.start < 0):
            continue

        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        elif delex_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-value']
        else:
            toks[abst.start - shift:abst.end - shift] = ['X']

        # update abstraction instruction indexes (now pointing at the placeholder token)
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    if return_string:
        return ' '.join(toks), abst_da, absts
    return toks, abst_da, absts
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace their values with X or X-slot).

    @param da: concrete DA
    @param conc: concrete sentence text (string, split on single spaces)
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    tokens = conc.split(' ')
    instructions = []
    out_da = DA()
    unused = [True] * len(tokens)  # False for tokens already claimed by some value

    def visible_tokens():
        # sentence tokens with already claimed ones blanked out, so they can't match again
        return [tok if free else '' for tok, free in zip(tokens, unused)]

    def value_len(item):
        return len(item.value) if item.value is not None else 0

    # longer values are looked up first, so that their substrings don't block them
    for item in sorted(da, key=value_len, reverse=True):
        # the output DA starts as a plain copy of the current DAI
        out_da.append(DAI(item.da_type, item.slot, item.value))
        if item.value is None:
            continue

        # locate the value in the sentence: exact search first, approximate as fallback
        needle = item.value.split(' ')
        span = find_substr(needle, visible_tokens())
        if span is None:
            span = find_substr_approx(needle, visible_tokens())

        if span is None:
            span = -1, -1  # unknown position
        else:
            for idx in xrange(span[0], span[1]):
                unused[idx] = False
            if span == (0, 0):  # treated the same as an unknown position
                span = -1, -1

        # abstracted slots get a placeholder value in the output DA and an abstraction
        # instruction (recorded even if the value was not found in the sentence)
        if item.slot in abst_slots and item.value != 'dont_care':
            out_da[-1].value = 'X-' + item.slot
            instructions.append(Abst(item.slot, item.value, start=span[0], end=span[1]))

    # replace left to right, tracking how many tokens the sentence has lost so far
    instructions.sort(key=lambda inst: inst.start)
    shift = 0
    for inst in instructions:
        # skip everything that should not (or cannot) be replaced in the text
        if inst.slot not in abst_slots or inst.value == 'dont_care' or inst.start < 0:
            continue

        if slot_names:
            placeholder = 'X-' + inst.slot
        else:
            placeholder = 'X'
        new_start = inst.start - shift
        tokens[new_start:inst.end - shift] = [placeholder]

        # the instruction now points at the single placeholder token
        removed = inst.end - inst.start - 1
        inst.start = new_start
        inst.end = new_start + 1
        shift += removed

    return ' '.join(tokens), out_da, instructions