Beispiel #1
0
 def _delex_das(self):
     """Create delexicalized copies of the buffered DAs and store them in `self._delexed_das`.

     A DAI value is replaced by the placeholder 'X-<slot>' if its slot is listed in
     `self._abst_slots` and the value is not one of None, 'none' or 'dont_care'.
     All other DAIs are copied with their original value. New DA/DAI objects are built,
     `self._das` itself is only read.
     """
     kept_values = [None, 'none', 'dont_care']
     delexed = []
     for da in self._das:
         delex_da = DA()
         for dai in da:
             if dai.value not in kept_values and dai.slot in self._abst_slots:
                 value = 'X-' + dai.slot
             else:
                 value = dai.value
             delex_da.append(DAI(dai.da_type, dai.slot, value))
         delexed.append(delex_da)
     self._delexed_das = delexed
Beispiel #2
0
 def _delex_das(self):
     """Delexicalize the DAs held in `self._das`, storing the result in `self._delexed_das`.

     Each DAI is copied into a new DA; the copy keeps the original value unless the slot
     is in `self._abst_slots` and the value is a real one (not None, 'none' or 'dont_care'),
     in which case the value becomes the placeholder 'X-<slot>'.
     """
     delexed_das = []
     for da in self._das:
         delexed = DA()
         for dai in da:
             # special values and slots outside the abstracted set are copied as they are
             keep_value = (dai.value in [None, 'none', 'dont_care'] or
                           dai.slot not in self._abst_slots)
             new_value = dai.value if keep_value else 'X-' + dai.slot
             delexed.append(DAI(dai.da_type, dai.slot, new_value))
         delexed_das.append(delexed)
     self._delexed_das = delexed_das
Beispiel #3
0
def parse_cambridge_da(da_text):
    """Parse a DA string into a DA object composed of DAIs (DA type, slot, value).

    The text is scanned for items of the form `type(args)`, where `type` consists of
    lowercase letters/underscores with an optional leading '?', and `args` is a
    (possibly empty) list of `slot` or `slot=value` entries separated by ',' or ';'
    (separators preceded by a space do not split). Single quotes enclosing a whole
    value are removed.

    @param da_text: the DA string to be parsed
    @return: a DA with one DAI per slot(-value) entry, or one slot-less DAI per empty item
    """
    da = DA()

    for match in re.finditer(r'(\??[a-z_]+)\(([^)]*)\)', da_text):
        da_type, args = match.groups()

        # nothing inside the parentheses (e.g. 'hello()'): DAI with the type only
        if not args:
            da.append(DAI(da_type, None, None))
            continue

        # one DAI per slot/value entry
        for svp in re.split('(?<! )[,;]', args):
            slot, eq_sign, value = svp.partition('=')

            # slot without a value, e.g. '?request(near)'
            if not eq_sign:
                da.append(DAI(da_type, svp, None))
                continue

            # strip quotes that wrap the whole value
            if re.match(r'^\'.*\'$', value):
                value = value[1:-1]
            # NOTE(review): re.match anchors at the start of the string, so the second
            # pattern cannot detect a stray quote at the end of a longer value
            # (re.search was probably intended) -- kept as it is to preserve behavior.
            assert not (re.match(r'^\'', value) or re.match(r'\'$', value))

            da.append(DAI(da_type, slot, value))

    return da
Beispiel #4
0
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Abstract (delexicalize) the values of the given slots in the given sentence.

    DAI values are looked up in the sentence via `find_value`, longer values first.
    For slots in `abst_slots` (except for the value 'dont_care'), the DAI value in the
    returned DA copy becomes 'X-<slot>' and the occurrence found in the sentence (if any)
    is replaced by a single placeholder token.

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on single spaces, or list of
        tokens; a token list is modified in place and returned as the text)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use 'X-<slot>' as the value placeholder; if False,
        the placeholder is 'X' (or 'X-value' if `delex_slot_names` is set)
    @param delex_slot_names: boolean -- also search for the slot names (with underscores
        changed to spaces) in the sentence and replace them with 'X-slot'
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA,
        and abstraction instructions (also for values not found in the sentence, which
        keep the position returned by `find_value`)
    """
    is_text = isinstance(conc, basestring)
    toks = conc.split(' ') if is_text else conc
    free_toks = [True] * len(toks)  # mask handed over to find_value along with the tokens
    abst_da = DA()
    absts = []

    def value_length(dai):
        return 0 if dai.value is None else len(dai.value)

    # look for longer values first so that their substrings do not block them,
    # building the abstracted DA along the way
    for dai in sorted(da, key=value_length, reverse=True):
        # start from a plain copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        pos = find_value(dai.value, toks, free_toks)
        # abstract the value in the DA copy and remember where it was found
        # (the instruction is stored even if the value is not in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            begin, end = pos[0], pos[1]
            absts.append(Abst(dai.slot, dai.value,
                              surface_form=' '.join(toks[begin:end]),
                              start=begin, end=end))

    if delex_slot_names:
        # the same for the slot names themselves, longer names first
        with_slot = [dai for dai in da if dai.slot is not None]
        for dai in sorted(with_slot, key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, free_toks)
            if dai.slot in abst_slots:
                begin, end = pos[0], pos[1]
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[begin:end]),
                                  start=begin, end=end))

    # replace left to right; `shift` is the number of tokens removed so far
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # skip everything that should not (or cannot) be replaced in the text
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # choose the placeholder (X-slot for slot names, X-<slot>/X-value/X for values)
        if delex_slot_names and abst.value is None:
            placeholder = 'X-slot'
        elif use_slot_names:
            placeholder = 'X-' + abst.slot
        elif delex_slot_names:
            placeholder = 'X-value'
        else:
            placeholder = 'X'
        span_len = abst.end - abst.start
        new_start = abst.start - shift
        toks[new_start:abst.end - shift] = [placeholder]
        # the instruction now points at the single placeholder token
        abst.start = new_start
        abst.end = new_start + 1
        shift += span_len - 1

    if is_text:
        return ' '.join(toks), abst_da, absts
    return toks, abst_da, absts
Beispiel #5
0
def delex_sent(da,
               sent,
               delex_slots,
               use_slot_names=True,
               delex_slot_names=False,
               repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence, i.e. replace
    their values with placeholder tokens (X or X-slot_name).

    DAI values are looked up in the sentence via `find_value`, longer values first.

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on single spaces, or list
        of tokens; a token list is modified in place and returned as the text)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values
        to leave untouched for each slot)
    @param use_slot_names: boolean -- use 'X-<slot>' as the value placeholder; if False,
        the placeholder is 'X' (or 'X-value' if `delex_slot_names` is set)
    @param delex_slot_names: boolean -- also search for the slot names (with underscores
        changed to spaces) in the sentence and replace them with 'X-slot'
    @param repeated: boolean -- keep searching for further occurrences of each value
        until `find_value` returns (-1, -1), instead of searching just once
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA,
        and abstraction instructions
    """
    got_string = isinstance(sent, basestring)
    toks = sent.split(' ') if got_string else sent
    if isinstance(delex_slots, set):
        # a plain set of slots means that no value is exempt for any of them
        delex_slots = dict((slot, set()) for slot in delex_slots)
    free_toks = [True] * len(toks)  # mask handed over to find_value along with the tokens
    abst_da = DA()
    absts = []

    def value_length(dai):
        return 0 if dai.value is None else len(dai.value)

    # look for longer values first so that their substrings do not block them,
    # building the delexicalized DA along the way
    for dai in sorted(da, key=value_length, reverse=True):
        # start from a plain copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # whether this slot/value pair is to be delexicalized at all
        delexable = (dai.slot in delex_slots and
                     dai.value not in delex_slots[dai.slot] and
                     dai.value != 'dont_care')

        # search once, or (if `repeated`) until nothing more is found
        first_try = True
        while True:
            pos = find_value(dai.value, toks, free_toks)
            located = pos != (-1, -1)
            # the first search result is always recorded (even if the value is not
            # in the sentence), subsequent ones only if something was found
            if delexable and (first_try or located):
                abst_da[-1].value = 'X-' + dai.slot
                absts.append(Abst(dai.slot, dai.value,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))
            first_try = False
            if not (repeated and located):
                break

    if delex_slot_names:
        # the same for the slot names themselves, longer names first
        with_slot = [dai for dai in da if dai.slot is not None]
        for dai in sorted(with_slot, key=lambda dai: len(dai.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, free_toks)
            if dai.slot in delex_slots:
                absts.append(Abst(dai.slot, None,
                                  surface_form=' '.join(toks[pos[0]:pos[1]]),
                                  start=pos[0], end=pos[1]))

    # replace left to right; `shift` is the number of tokens removed so far
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # skip everything that should not (or cannot) be replaced in the text
        if (abst.slot not in delex_slots or
                abst.value in delex_slots[abst.slot] or
                abst.value == 'dont_care' or
                abst.start < 0):
            continue
        # choose the placeholder (X-slot for slot names, X-<slot>/X-value/X for values)
        if delex_slot_names and abst.value is None:
            placeholder = 'X-slot'
        elif use_slot_names:
            placeholder = 'X-' + abst.slot
        elif delex_slot_names:
            placeholder = 'X-value'
        else:
            placeholder = 'X'
        span_len = abst.end - abst.start
        new_start = abst.start - shift
        toks[new_start:abst.end - shift] = [placeholder]
        # the instruction now points at the single placeholder token
        abst.start = new_start
        abst.end = new_start + 1
        shift += span_len - 1

    if got_string:
        return ' '.join(toks), abst_da, absts
    return toks, abst_da, absts
Beispiel #6
0
def delex_sent(da, sent, delex_slots, use_slot_names=True, delex_slot_names=False, repeated=False):
    """Replace the values of the given slots in the given sentence with placeholders
    (X or X-slot_name), i.e. delexicalize ("abstract") the sentence.

    Values are searched for in the sentence using `find_value`, starting with the longest.

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on single spaces, or list
        of tokens; a token list is modified in place and returned as the text)
    @param delex_slots: a set of slots to be delexicalized, or a dict (with a set of values
        to leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the value placeholder (X-slot_name),
        or just X (X-value if `delex_slot_names` is set)?
    @param delex_slot_names: boolean -- search the sentence for slot names as well (with
        underscores changed to spaces) and replace them with 'X-slot'?
    @param repeated: boolean -- search for all occurrences of a value (until `find_value`
        returns (-1, -1)), or just for one?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA,
        and abstraction instructions
    """
    if isinstance(sent, basestring):
        return_string = True
        toks = sent.split(' ')
    else:
        return_string = False
        toks = sent
    if isinstance(delex_slots, set):
        # sets become dicts with no exempt values for any slot
        delex_slots = dict((slot, set()) for slot in delex_slots)
    abst_da = DA()
    absts = []
    toks_mask = [True] * len(toks)

    def placeholder_for(abst):
        """Select the token to put in place of the given abstracted span."""
        if delex_slot_names and abst.value is None:
            return 'X-slot'  # a slot name
        if use_slot_names:
            return 'X-' + abst.slot
        return 'X-value' if delex_slot_names else 'X'

    # longest values go first, so that they are not blocked by their own substrings
    by_value_length = sorted(da,
                             key=lambda dai: 0 if dai.value is None else len(dai.value),
                             reverse=True)
    for dai in by_value_length:
        # the delexicalized DAI starts off as a copy of the current one
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # at least one lookup; more of them if `repeated` is set and the last one succeeded
        lookups = 0
        pos = (-1, -1)
        while lookups == 0 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            if dai.slot in delex_slots and dai.value not in delex_slots[dai.slot]:
                # the first lookup is recorded even if it failed, later ones only on success
                if dai.value != 'dont_care' and (lookups == 0 or pos != (-1, -1)):
                    abst_da[-1].value = 'X-' + dai.slot
                    surface = ' '.join(toks[pos[0]:pos[1]])
                    absts.append(Abst(dai.slot, dai.value, surface_form=surface,
                                      start=pos[0], end=pos[1]))
            lookups += 1

    if delex_slot_names:
        # slot names are searched for in the same manner (longest first)
        named = sorted([dai for dai in da if dai.slot is not None],
                       key=lambda dai: len(dai.slot),
                       reverse=True)
        for dai in named:
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                surface = ' '.join(toks[pos[0]:pos[1]])
                absts.append(Abst(dai.slot, None, surface_form=surface,
                                  start=pos[0], end=pos[1]))

    # do the replacements from the beginning of the sentence, keeping track of the
    # number of tokens that have disappeared so far
    absts.sort(key=lambda a: a.start)
    removed = 0
    for abst in absts:
        # only replace what should be delexicalized and has been found
        if (abst.slot not in delex_slots or abst.value in delex_slots[abst.slot] or
                abst.value == 'dont_care' or abst.start < 0):
            continue
        orig_len = abst.end - abst.start
        toks[abst.start - removed:abst.end - removed] = [placeholder_for(abst)]
        # make the instruction refer to the placeholder token
        abst.start -= removed
        abst.end = abst.start + 1
        removed += orig_len - 1

    text = ' '.join(toks) if return_string else toks
    return text, abst_da, absts
Beispiel #7
0
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace their values with X).

    @param da: concrete DA
    @param conc: concrete sentence text (string, split only on single spaces)
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    toks = conc.split(' ')
    free_toks = [True] * len(toks)  # False for tokens taken up by previously found values
    abst_da = DA()
    absts = []

    def unmasked_toks():
        """Return the tokens with those of the previously found values blanked out."""
        return [tok if free else '' for tok, free in zip(toks, free_toks)]

    def value_length(dai):
        return 0 if dai.value is None else len(dai.value)

    # look for longer values first so that their substrings do not block them,
    # building the abstracted DA along the way
    for dai in sorted(da, key=value_length, reverse=True):
        # start from a plain copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        # exact search first, the approximate one only as a fallback
        val_toks = dai.value.split(' ')
        pos = find_substr(val_toks, unmasked_toks())
        if pos is None:
            pos = find_substr_approx(val_toks, unmasked_toks())
        if pos is None:
            pos = (-1, -1)  # unknown position
        else:
            # mask what has been found so that it is not found again
            for idx in range(pos[0], pos[1]):
                free_toks[idx] = False
            if pos == (0, 0):
                pos = (-1, -1)  # treated as an unknown position as well
        # abstract the value in the DA copy and save the abstraction instruction
        # (even if the value has not been found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            absts.append(Abst(dai.slot, dai.value, start=pos[0], end=pos[1]))

    # replace left to right; `shift` is the number of tokens removed so far
    absts.sort(key=lambda a: a.start)
    shift = 0
    for abst in absts:
        # skip everything that should not (or cannot) be replaced in the text
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        placeholder = 'X-' + abst.slot if slot_names else 'X'
        span_len = abst.end - abst.start
        new_start = abst.start - shift
        toks[new_start:abst.end - shift] = [placeholder]
        # the instruction now points at the single placeholder token
        abst.start = new_start
        abst.end = new_start + 1
        shift += span_len - 1

    return ' '.join(toks), abst_da, absts