def _delex_texts(self):
    """Delexicalize texts in the buffers and save them separately in the member variables,
    along with the delexicalization instructions used for the operation."""
    self._delexed_texts = []
    self._absts = []
    for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
        delex_text = []
        absts = []
        # do the delexicalization, keep track of which slots we used
        for tok_idx, (form, lemma, tag) in enumerate(text):
            slot = da.has_value(lemma)
            if slot and slot in self._abst_slots:
                delex_text.append(('X-' + slot, 'X-' + slot, tag))
                absts.append(Abst(slot, lemma, form, tok_idx, tok_idx + 1))
            else:
                delex_text.append((form, lemma, tag))
        # fix coordinated delexicalized values
        self._delex_fix_coords(delex_text, da, absts)
        covered_slots = set([a.slot for a in absts])
        # check and warn if we left something non-delexicalized
        for dai in da:
            if (dai.slot in self._abst_slots and
                    dai.value not in [None, 'none', 'dont_care'] and
                    dai.slot not in covered_slots):
                log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                         (dai.slot, text_idx, unicode(da),
                          " ".join([form for form, _, _ in text])))
        # save the delexicalized text and the delexicalization instructions
        self._delexed_texts.append(delex_text)
        self._absts.append(absts)
def _create_delex_texts(self):
    """Delexicalize texts in the buffers and save them separately in the member variables,
    along with the delexicalization instructions used for the operation."""
    self._delex_texts = []
    self._absts = []
    for text_idx, (text, da) in enumerate(zip(self._texts, self._das)):
        delex_text = []
        absts = []
        # do the delexicalization, keep track of which slots we used
        for tok_idx, (form, lemma, tag) in enumerate(text):
            # abstract away from numbers
            abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
            abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
            # try to find if the surface form belongs to some slot
            slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
            # if we found a slot, put the numbers back into the value
            if slot:
                for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                    value = re.sub(r'_', num_match.group(1), value, count=1)
            # fall back to directly comparing against the DA value
            else:
                slot = da.has_value(lemma)
                value = lemma
            # if we found something, delexicalize it (check that the value corresponds to the DA!)
            if (slot and slot in self._abst_slots and
                    da.value_for_slot(slot) not in [None, 'none', 'dont_care'] and
                    value in da.value_for_slot(slot)):
                delex_text.append(('X-' + slot, 'X-' + slot, tag))
                absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
            # otherwise keep the token as it is
            else:
                delex_text.append((form, lemma, tag))
        # fix coordinated delexicalized values
        self._delex_fix_coords(delex_text, da, absts)
        covered_slots = set([a.slot for a in absts])
        # check and warn if we left something non-delexicalized
        for dai in da:
            if (dai.slot in self._abst_slots and
                    dai.value not in [None, 'none', 'dont_care'] and
                    dai.slot not in covered_slots):
                log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                         (dai.slot, text_idx, str(da),
                          " ".join([form for form, _, _ in text])))
        # save the delexicalized text and the delexicalization instructions
        self._delex_texts.append(delex_text)
        self._absts.append(absts)
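The reverse surface-form lookup above first replaces any standalone number with an underscore placeholder before querying self._rev_sf_dict, and then substitutes the concrete numbers back into the retrieved canonical value. A minimal sketch of that round trip, assuming (this is not shown in the excerpt) that the reverse dictionary maps (abstracted form, abstracted lemma, tag) triples to (slot, abstracted value) pairs; the toy dictionary entry is made up for illustration:

import re

# hypothetical reverse surface-form dictionary entry
rev_sf_dict = {('_ star', '_ star', 'NN'): ('rating', '_ star rating')}

form, lemma, tag = '4 star', '4 star', 'NN'
# abstract away from numbers before the lookup
abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
slot, value = rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
if slot:
    # put the concrete numbers back into the abstracted value, one placeholder at a time
    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
        value = re.sub(r'_', num_match.group(1), value, count=1)
assert (slot, value) == ('rating', '4 star rating')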
def get_abstraction(text, conc_da, slot_names=False):
    """Get the abstraction instructions and convert the string (replace *SLOT with X).
    If slot_names is True, "X-slot_name" is used instead."""
    abstr = []
    toks = tokenize(text).split(' ')
    for dai in conc_da:
        slot_abst = '*' + dai.slot.upper()
        try:
            idx = toks.index(slot_abst)
            toks[idx] = 'X' + ('-' + dai.slot if slot_names else '')
            abstr.append(Abst(slot=dai.slot, value=dai.value, start=idx, end=idx + 1))
        except ValueError:
            continue
    return ' '.join(toks), "\t".join([unicode(a) for a in abstr])
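get_abstraction expects templates in which each abstracted value has already been replaced by a *SLOT placeholder (e.g. *NAME, *AREA); the loop simply swaps these placeholders for X / X-slot and records their token positions. The following minimal illustration mirrors that substitution using plain (slot, value) tuples instead of the DA, DAI, and Abst classes, and made-up example data; it assumes the tokenizer leaves the *SLOT placeholders intact:

toks = '*NAME is a nice restaurant in *AREA'.split(' ')
dais = [('name', 'The Eagle'), ('area', 'riverside')]  # hypothetical (slot, value) pairs
abstr = []
for slot, value in dais:
    slot_abst = '*' + slot.upper()
    try:
        idx = toks.index(slot_abst)
        toks[idx] = 'X-' + slot          # slot_names=True behaviour
        abstr.append((slot, value, idx, idx + 1))
    except ValueError:
        continue
assert ' '.join(toks) == 'X-name is a nice restaurant in X-area'
assert abstr == [('name', 'The Eagle', 0, 1), ('area', 'riverside', 6, 7)]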
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on whitespace, or list of tokens)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @param delex_slot_names: boolean -- delexicalize slot names mentioned in the text as well?
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(conc, basestring):
        toks = conc.split(' ')
        return_string = True
    else:
        toks = conc

    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da, key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        pos = find_value(dai.value, toks, toks_mask)
        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save the abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value,
                              surface_form=' '.join(toks[pos[0]:pos[1]]),
                              start=pos[0], end=pos[1]))

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in abst_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
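Both delex_sent variants call a find_value helper that is not defined in this excerpt. Judging from the call sites, and from the inline exact-then-fuzzy search in abstract_sent further below, its contract appears to be: given a value string, the token list, and a mask of still-available tokens, return the (start, end) token span of the value, mask the matched tokens so repeated values match different spans, and return (-1, -1) when nothing matches. The following is a minimal sketch of a helper with that assumed contract (exact matching only); it is not the project's actual implementation:

def find_value_sketch(value, toks, toks_mask):
    """Return the (start, end) token span of `value` among still-unmasked tokens,
    marking the matched tokens as used; return (-1, -1) if not found. Exact match only."""
    val_toks = value.split(' ')
    for start in range(len(toks) - len(val_toks) + 1):
        span = range(start, start + len(val_toks))
        if all(toks_mask[i] and toks[i] == val_toks[i - start] for i in span):
            for i in span:  # mask the span so it is not matched again
                toks_mask[i] = False
            return start, start + len(val_toks)
    return -1, -1


toks = 'the golden palace is a cheap chinese restaurant'.split(' ')
mask = [True] * len(toks)
assert find_value_sketch('golden palace', toks, mask) == (1, 3)
assert find_value_sketch('golden palace', toks, mask) == (-1, -1)  # already masked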
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with
    X or X-slot_name).

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on whitespace, or list of tokens)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values to \
        leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @param delex_slot_names: boolean -- delexicalize slot names mentioned in the text as well?
    @param repeated: boolean -- delexicalize repeated occurrences of a value, not just the first one?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent

    if isinstance(delex_slots, set):  # convert sets to dicts
        delex_slots = {slot: set() for slot in delex_slots}

    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da, key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # if the value is to be delexicalized, replace the value in the delexicalized DAI
            # and save the abstraction instruction (even if not found in the sentence)
            if (dai.slot in delex_slots and dai.value not in delex_slots[dai.slot] and
                    dai.value != 'dont_care' and (found == 0 or pos != (-1, -1))):
                abst_da[-1].value = 'X-' + dai.slot
                # save the abstraction instruction
                absts.append(Abst(dai.slot, dai.value,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))
            found += 1

    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        if (abst.slot not in delex_slots or abst.value in delex_slots[abst.slot] or
                abst.value == 'dont_care' or abst.start < 0):
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
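A usage sketch for the variant above, building the input DA with the same DA() and DAI(da_type, slot, value) calls the function itself uses; the sentence, slot names, and values are made up for illustration, and the expected output assumes find_value matches the spans exactly:

da = DA()
da.append(DAI('inform', 'name', 'Golden Palace'))
da.append(DAI('inform', 'price_range', 'cheap'))
da.append(DAI('inform', 'food', 'Chinese'))

sent = 'Golden Palace is a cheap Chinese restaurant'
delexed, delex_da, absts = delex_sent(da, sent, delex_slots={'name', 'food'})

# expected (under the assumptions above):
#   delexed  -> 'X-name is a cheap X-food restaurant'
#   delex_da -> the DA with the name and food values replaced by X-name / X-food
#   absts    -> two Abst instructions recording the original values and their token spans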
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the abstraction (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    toks = conc.split(' ')
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da, key=lambda dai: len(dai.value) if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        # try to find the value in the sentence (first exact, then fuzzy)
        # while masking tokens of previously found values
        val_toks = dai.value.split(' ')
        pos = find_substr(val_toks, [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is None:
            pos = find_substr_approx(val_toks, [t if m else '' for t, m in zip(toks, toks_mask)])
        if pos is not None:
            for idx in xrange(pos[0], pos[1]):  # mask found things so they're not found twice
                toks_mask[idx] = False
        if pos is None or pos == (0, 0):  # default to -1 for unknown positions
            pos = -1, -1
        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save the abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value, start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # select only those that should actually be abstracted on the output
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text
        if slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X']
        # update abstraction instruction indexes
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks), abst_da, absts