コード例 #1
0
 def __init__(self, ontology):
     """Collect the specially-handled slot names and load the morphology models.

     :param ontology: Passed on to the parent initializer; the optional
             'slot_attributes' and 'value_translation' entries are then
             looked up through self.ontology
     """
     super(PTICSNLGPreprocessing, self).__init__(ontology)
     # slots with relative / absolute time values
     self.rel_time_slots = set()
     self.abs_time_slots = set()
     # slots with temperature / temperature interval values
     self.temp_slots = set()
     self.temp_int_slots = set()
     # slots whose values get translated, and the translation tables
     self.translated_slots = set()
     self.translations = {}
     # attribute name -> set collecting the slots carrying that attribute;
     # the order matters: only the first matching attribute is used
     attr_targets = (
         ('relative_time', self.rel_time_slots),
         ('absolute_time', self.abs_time_slots),
         ('temperature', self.temp_slots),
         ('temperature_int', self.temp_int_slots),
     )
     if 'slot_attributes' in self.ontology:
         for slot in self.ontology['slot_attributes']:
             attrs = self.ontology['slot_attributes'][slot]
             for attr_name, target in attr_targets:
                 if attr_name in attrs:
                     target.add(slot)
                     break
     # translation tables are taken over from the ontology as they are
     if 'value_translation' in self.ontology:
         self.translations = self.ontology['value_translation']
         self.translated_slots.update(self.ontology['value_translation'])
     # morphological analyzer and generator
     tagger_model = online_update(
         'applications/PublicTransportInfoCS/data/czech.tagger')
     dict_model = online_update(
         'applications/PublicTransportInfoCS/data/czech.dict')
     self._analyzer = Analyzer(tagger_model)
     self._generator = Generator(dict_model)
コード例 #2
0
    def __init__(self, cases_list, strip_punct, lowercase_forms, personal_names):
        """Set up the expander: output post-processing plus the morphological analyzer and generator.

        @param cases_list: List of cases (given as strings) to be used for generation
                (Czech numbers 1-7 are used)
        @param strip_punct: If true, space-separated punctuation is removed from the forms;
                otherwise only the space before a period or comma is dropped
        @param lowercase_forms: If true, all forms are lowercased after post-processing
        @param personal_names: Are we inflecting personal names? (stored for later use)
        """
        self.cases_list = cases_list
        self.personal_names = personal_names
        self.stops = defaultdict(list)

        # pick the clean-up routine applied to every generated form
        if strip_punct:
            def postprocess_func(text):
                return re.sub(r' [\.,\-–\(\)\{\}\[\];\\\/+&](?: [\.,\-–\(\)\{\}\[\];])*( |$)', r'\1', text)
        else:
            def postprocess_func(text):
                return re.sub(r' ([\.,])', r'\1', text)

        if not lowercase_forms:
            self.__postprocess_func = postprocess_func
        else:
            def lc_func(text):
                return postprocess_func(text).lower()
            self.__postprocess_func = lc_func

        # initialize morphology
        tagger_model = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
        dict_model = online_update('applications/PublicTransportInfoCS/data/czech.dict')
        self.__analyzer = Analyzer(tagger_model)
        self.__generator = Generator(dict_model)
コード例 #3
0
ファイル: preprocessing.py プロジェクト: UFAL-DSG/alex
 def __init__(self, ontology):
     """Prepare slot bookkeeping from the ontology and load the morphology models.

     :param ontology: Handed to the parent initializer; 'slot_attributes' and
             'value_translation' are read from self.ontology when present
     """
     super(PTICSNLGPreprocessing, self).__init__(ontology)
     # time slots (relative, absolute)
     self.rel_time_slots, self.abs_time_slots = set(), set()
     # temperature slots (plain, interval)
     self.temp_slots, self.temp_int_slots = set(), set()
     # translated slots and their translation tables
     self.translated_slots = set()
     self.translations = {}
     # sort the slots into the sets above according to their attributes
     if 'slot_attributes' in self.ontology:
         for slot in self.ontology['slot_attributes']:
             slot_attrs = self.ontology['slot_attributes'][slot]
             if 'relative_time' in slot_attrs:
                 bucket = self.rel_time_slots
             elif 'absolute_time' in slot_attrs:
                 bucket = self.abs_time_slots
             elif 'temperature' in slot_attrs:
                 bucket = self.temp_slots
             elif 'temperature_int' in slot_attrs:
                 bucket = self.temp_int_slots
             else:
                 # no special handling for this slot
                 continue
             bucket.add(slot)
     # remember which slots have a translation table
     if 'value_translation' in self.ontology:
         self.translations = self.ontology['value_translation']
         for translated in self.ontology['value_translation']:
             self.translated_slots.add(translated)
     # morphology models
     tagger = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
     dictionary = online_update('applications/PublicTransportInfoCS/data/czech.dict')
     self._analyzer = Analyzer(tagger)
     self._generator = Generator(dictionary)
コード例 #4
0
class PTICSNLGPreprocessing(TemplateNLGPreprocessing):
    """Template NLG preprocessing routines for Czech public transport information.

    This serves for spelling out relative and absolute time expressions,
    as well as translating certain slot values into Czech.
    """

    def __init__(self, ontology):
        """Collect the specially-handled slot names and load the morphology models.

        :param ontology: Passed on to the parent initializer; the optional
                'slot_attributes' and 'value_translation' entries are then
                looked up through self.ontology
        """
        super(PTICSNLGPreprocessing, self).__init__(ontology)
        # keep track of relative and absolute time slots
        self.rel_time_slots = set()
        self.abs_time_slots = set()
        # keep track of temperature and temperature interval slots
        self.temp_slots = set()
        self.temp_int_slots = set()
        # keep track of translated slots
        self.translated_slots = set()
        self.translations = {}
        # load their lists from the ontology (only the first matching
        # attribute of each slot is taken into account)
        if 'slot_attributes' in self.ontology:
            for slot in self.ontology['slot_attributes']:
                attrs = self.ontology['slot_attributes'][slot]
                if 'relative_time' in attrs:
                    self.rel_time_slots.add(slot)
                elif 'absolute_time' in attrs:
                    self.abs_time_slots.add(slot)
                elif 'temperature' in attrs:
                    self.temp_slots.add(slot)
                elif 'temperature_int' in attrs:
                    self.temp_int_slots.add(slot)
        # load translations from the ontology
        if 'value_translation' in self.ontology:
            self.translations = self.ontology['value_translation']
            for slot in self.ontology['value_translation']:
                self.translated_slots.add(slot)
        analyzer_model = online_update(
            'applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update(
            'applications/PublicTransportInfoCS/data/czech.dict')
        self._analyzer = Analyzer(analyzer_model)
        self._generator = Generator(generator_model)

    def preprocess(self, template, svs_dict):
        """Preprocess values to be filled into an NLG template.
        Spells out temperature and time expressions and translates some of the values
        to Czech. Modifiers written in the template as '{slot/modifier}' are applied
        to the slot values and removed from the template.

        Supported modifiers are 'Cap1' (uppercase the first character) and
        'Infl <case> <word>' (inflect the value into the given case; if the
        generator returns no forms, put <word> in front of the unchanged value).

        :param template: The template string, possibly containing slot modifiers
        :param svs_dict: Slot-value dictionary (modified in place)
        :return: A tuple of the template with all modifiers stripped and \
                the same dictionary, with modified values
        """
        # regular changes to slot values; iterate over a snapshot so that
        # the dictionary can be safely updated on the way
        for slot_id, val in list(svs_dict.items()):
            # remove number suffixes from some slot IDs to produce actual slot names
            # (an empty slot ID is left as it is instead of raising IndexError)
            if slot_id and slot_id[-1] in string.digits:
                slot_name = slot_id[:-1]
            else:
                slot_name = slot_id
            # spell out time expressions
            if slot_name in self.rel_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=True)
            elif slot_name in self.abs_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=False)
            # spell out temperature expressions
            elif slot_name in self.temp_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=False)
            elif slot_name in self.temp_int_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=True)
            # translate some slot values (default to untranslated)
            elif slot_name in self.translated_slots:
                svs_dict[slot_id] = self.translations[slot_name].get(val, val)
        # reflect changes to slot values stored in the template
        slot_modif = {}

        def store_repl(match):
            # remember the modifier (the last one wins if a slot is repeated)
            # and leave just the plain '{slot}' reference in the template
            slot, modif = match.groups()
            slot_modif[slot] = modif
            return '{' + slot + '}'

        template = re.sub(r'\{([^}/]+)/([^}]+)\}', store_repl, template)

        for slot, modif in slot_modif.items():
            if modif == 'Cap1':
                # slicing keeps this safe for empty values
                value = svs_dict[slot]
                svs_dict[slot] = value[:1].upper() + value[1:]
            elif modif.startswith('Infl'):
                _, case, repl_word = modif.split(' ')
                words = self._analyzer.analyze(svs_dict[slot])
                forms = self._generator.inflect(words, case, check_fails=True)
                if forms:
                    # use the first returned form of each item
                    svs_dict[slot] = ' '.join([f[0] for f in forms])
                else:
                    # no forms returned: fall back to the replacement word
                    svs_dict[slot] = repl_word + ' ' + svs_dict[slot]

        return template, svs_dict

    # endings appended to 'hodin'/'minut' after the given numbers
    HR_ENDING = {1: 'u', 2: 'y', 3: 'y', 4: 'y'}
    # ending used after all other numbers
    HR_ENDING_DEFAULT = ''

    def spell_time(self, time, relative):
        """Convert a time expression into words (assuming accusative).

        :param time: The 24hr numerical time value in a string, e.g. '8:05'
        :param relative: If true, time is interpreted as relative, i.e. \
                0:15 will generate '15 minutes' and not '0 hours and \
                15 minutes'.
        :return: Czech time string with all numerals written out as words; \
                values without a colon are returned unchanged
        """
        if ':' not in time:  # 'now' and similar
            return time
        hours, mins = map(int, time.split(':'))
        # zero hours are left out of relative times
        say_hours = not (relative and hours == 0)
        time_str = []
        if say_hours:
            hr_id = 'hodin' + self.HR_ENDING.get(hours, self.HR_ENDING_DEFAULT)
            time_str.extend((word_for_number(hours, 'F4'), hr_id))
            # whole hours need no minutes part
            if mins == 0:
                return ' '.join(time_str)
            time_str.append('a')
        min_id = 'minut' + self.HR_ENDING.get(mins, self.HR_ENDING_DEFAULT)
        return ' '.join(time_str + [word_for_number(mins, 'F4'), min_id])

    # endings appended to 'stup' after the given (absolute) values
    DEG_ENDING = {1: 'eň', 2: 'ně', 3: 'ně', 4: 'ně'}
    # ending used after all other values
    DEG_ENDING_DEFAULT = 'ňů'

    def spell_temperature(self, value, interval):
        """Convert a temperature expression into words (assuming nominative).

        :param value: Temperature value (whole number in degrees as string), \
                e.g. '1' or '-10'.
        :param interval: Boolean indicating whether to treat this as a start \
                of an interval, i.e. omit the degrees word.
        :return: Czech temperature expression as string
        """
        ret = ''
        value = int(value)
        if value < 0:
            ret += 'mínus '
            value = abs(value)
        ret += word_for_number(value, 'M1')
        if not interval:
            # the form of the degrees word depends on the absolute value
            ret += ' stup' + self.DEG_ENDING.get(value,
                                                 self.DEG_ENDING_DEFAULT)
        return ret
コード例 #5
0
class ExpandStops(object):
    """This handles inflecting stop names into all desired cases in Czech."""

    def __init__(self, cases_list, strip_punct, lowercase_forms, personal_names):
        """Initialize the expander object, initialize the morphological analyzer and generator.

        @param cases_list: List of cases (given as strings) to be used for generation \
                (Czech numbers 1-7 are used)
        @param strip_punct: Strip all punctuation ?
        @param lowercase_forms: Lowercase all forms on the output?
        @param personal_names: Are we inflecting personal names?
        """
        # main stop name -> list of all its known surface forms
        self.stops = defaultdict(list)
        self.cases_list = cases_list
        self.personal_names = personal_names
        # initialize postprocessing
        if strip_punct:
            # remove space-separated punctuation altogether
            def postprocess_func(text):
                return re.sub(r' [\.,\-–\(\)\{\}\[\];\\\/+&](?: [\.,\-–\(\)\{\}\[\];])*( |$)', r'\1', text)
        else:
            # only remove the space in front of periods and commas
            def postprocess_func(text):
                return re.sub(r' ([\.,])', r'\1', text)
        if lowercase_forms:
            def lc_func(text):
                return postprocess_func(text).lower()
            self.__postprocess_func = lc_func
        else:
            self.__postprocess_func = postprocess_func
        # initialize morphology
        analyzer_model = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update('applications/PublicTransportInfoCS/data/czech.dict')
        self.__analyzer = Analyzer(analyzer_model)
        self.__generator = Generator(generator_model)

    def save(self, fname):
        """Save all stops currently held in memory to a file.

        One stop per line, sorted by the main form: the main form, a tab,
        and all its forms separated by '; '.

        @param fname: Name of the output file (written as UTF-8)
        """
        with codecs.open(fname, 'w', 'UTF-8') as f_out:
            for stop_name in sorted(self.stops.keys()):
                f_out.write(stop_name + "\t")
                f_out.write('; '.join(self.stops[stop_name]))
                f_out.write("\n")

    def parse_line(self, line):
        """Load one line from the input file (tab-separated main form or
        implicit main form supported).

        @param line: One input line; variants are separated by semicolons
        @return: A tuple of the main form and the list of (stripped) variants; \
                if no tab-separated main form is given, the first variant is used
        """
        if '\t' in line:
            # split on the first tab only, so that a stray tab among
            # the variants does not break the unpacking
            stop, variants = line.split('\t', 1)
        else:
            stop, variants = None, line
        variants = [var.strip() for var in variants.split(';')]
        if stop is None:
            stop = variants[0]
        return stop, variants

    def load_file(self, fname):
        """Just load a list of stops from a file and store it in memory.

        @param fname: Name of the input file (read as UTF-8); lines starting \
                with '#' and blank lines are ignored
        """
        with codecs.open(fname, 'r', 'UTF-8') as f_in:
            for line in f_in:
                # skip comments and blank lines (the latter would otherwise
                # create a stop with an empty name)
                if line.startswith('#') or not line.strip():
                    continue
                stop, variants = self.parse_line(line)
                self.stops[stop] = list(remove_dups_stable(variants + self.stops[stop]))

    def expand_file(self, fname):
        """Load a list of stops from a file and expand it.

        All variants that are not yet stored in memory for the given stop are
        inflected into all cases from the cases list and the resulting forms
        are added to the stored ones. A progress mark is written to the
        standard error output after every 1000 processed lines.

        @param fname: Name of the input file (read as UTF-8); lines starting \
                with '#' and blank lines are ignored
        """
        with codecs.open(fname, 'r', 'UTF-8') as f_in:
            ctr = 0
            for line in f_in:
                # skip comments and blank lines
                if line.startswith('#') or not line.strip():
                    continue
                # load variant names for a stop
                stop, variants = self.parse_line(line)
                # skip those that needn't be inflected any more
                to_inflect = [var for var in variants if var not in self.stops[stop]]
                # inflect the rest
                for variant in to_inflect:
                    words = self.__analyzer.analyze(variant)
                    # in all required cases
                    for case in self.cases_list:
                        forms = self.__generator.inflect(words, case, self.personal_names)
                        # use all possible combinations if there are more variants for this case
                        combined = remove_dups_stable([' '.join(var)
                                                       for var in itertools.product(*forms)])
                        # build a real list (not a lazy map object) so that
                        # it can be concatenated with the stored forms
                        inflected = [self.__postprocess_func(form) for form in combined]
                        self.stops[stop] = list(remove_dups_stable(self.stops[stop] + inflected))
                ctr += 1
                if ctr % 1000 == 0:
                    sys.stderr.write('. ')
        # finish the line of progress marks
        sys.stderr.write('\n')
コード例 #6
0
ファイル: preprocessing.py プロジェクト: UFAL-DSG/alex
class PTICSNLGPreprocessing(TemplateNLGPreprocessing):
    """Template NLG preprocessing routines for Czech public transport information.

    This serves for spelling out relative and absolute time expressions,
    as well as translating certain slot values into Czech.
    """

    def __init__(self, ontology):
        """Collect the specially-handled slot names and load the morphology models.

        :param ontology: Passed on to the parent initializer; the optional
                'slot_attributes' and 'value_translation' entries are then
                looked up through self.ontology
        """
        super(PTICSNLGPreprocessing, self).__init__(ontology)
        # keep track of relative and absolute time slots
        self.rel_time_slots = set()
        self.abs_time_slots = set()
        # keep track of temperature and temperature interval slots
        self.temp_slots = set()
        self.temp_int_slots = set()
        # keep track of translated slots
        self.translated_slots = set()
        self.translations = {}
        # load their lists from the ontology (only the first matching
        # attribute of each slot is taken into account)
        if 'slot_attributes' in self.ontology:
            for slot in self.ontology['slot_attributes']:
                attrs = self.ontology['slot_attributes'][slot]
                if 'relative_time' in attrs:
                    self.rel_time_slots.add(slot)
                elif 'absolute_time' in attrs:
                    self.abs_time_slots.add(slot)
                elif 'temperature' in attrs:
                    self.temp_slots.add(slot)
                elif 'temperature_int' in attrs:
                    self.temp_int_slots.add(slot)
        # load translations from the ontology
        if 'value_translation' in self.ontology:
            self.translations = self.ontology['value_translation']
            for slot in self.ontology['value_translation']:
                self.translated_slots.add(slot)
        analyzer_model = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update('applications/PublicTransportInfoCS/data/czech.dict')
        self._analyzer = Analyzer(analyzer_model)
        self._generator = Generator(generator_model)

    def preprocess(self, template, svs_dict):
        """Preprocess values to be filled into an NLG template.
        Spells out temperature and time expressions and translates some of the values
        to Czech. Modifiers written in the template as '{slot/modifier}' are applied
        to the slot values and removed from the template.

        Supported modifiers are 'Cap1' (uppercase the first character) and
        'Infl <case> <word>' (inflect the value into the given case; if the
        generator returns no forms, put <word> in front of the unchanged value).

        :param template: The template string, possibly containing slot modifiers
        :param svs_dict: Slot-value dictionary (modified in place)
        :return: A tuple of the template with all modifiers stripped and \
                the same dictionary, with modified values
        """
        # regular changes to slot values; iterate over a snapshot so that
        # the dictionary can be safely updated on the way
        for slot_id, val in list(svs_dict.items()):
            # remove number suffixes from some slot IDs to produce actual slot names
            # (an empty slot ID is left as it is instead of raising IndexError)
            if slot_id and slot_id[-1] in string.digits:
                slot_name = slot_id[:-1]
            else:
                slot_name = slot_id
            # spell out time expressions
            if slot_name in self.rel_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=True)
            elif slot_name in self.abs_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=False)
            # spell out temperature expressions
            elif slot_name in self.temp_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=False)
            elif slot_name in self.temp_int_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=True)
            # translate some slot values (default to untranslated)
            elif slot_name in self.translated_slots:
                svs_dict[slot_id] = self.translations[slot_name].get(val, val)
        # reflect changes to slot values stored in the template
        slot_modif = {}

        def store_repl(match):
            # remember the modifier (the last one wins if a slot is repeated)
            # and leave just the plain '{slot}' reference in the template
            slot, modif = match.groups()
            slot_modif[slot] = modif
            return '{' + slot + '}'

        template = re.sub(r'\{([^}/]+)/([^}]+)\}', store_repl, template)

        for slot, modif in slot_modif.items():
            if modif == 'Cap1':
                # slicing keeps this safe for empty values
                value = svs_dict[slot]
                svs_dict[slot] = value[:1].upper() + value[1:]
            elif modif.startswith('Infl'):
                _, case, repl_word = modif.split(' ')
                words = self._analyzer.analyze(svs_dict[slot])
                forms = self._generator.inflect(words, case, check_fails=True)
                if forms:
                    # use the first returned form of each item
                    svs_dict[slot] = ' '.join([f[0] for f in forms])
                else:
                    # no forms returned: fall back to the replacement word
                    svs_dict[slot] = repl_word + ' ' + svs_dict[slot]

        return template, svs_dict

    # endings appended to 'hodin'/'minut' after the given numbers
    HR_ENDING = {1: 'u', 2: 'y', 3: 'y', 4: 'y'}
    # ending used after all other numbers
    HR_ENDING_DEFAULT = ''

    def spell_time(self, time, relative):
        """Convert a time expression into words (assuming accusative).

        :param time: The 24hr numerical time value in a string, e.g. '8:05'
        :param relative: If true, time is interpreted as relative, i.e. \
                0:15 will generate '15 minutes' and not '0 hours and \
                15 minutes'.
        :return: Czech time string with all numerals written out as words; \
                values without a colon are returned unchanged
        """
        if ':' not in time:  # 'now' and similar
            return time
        hours, mins = map(int, time.split(':'))
        # zero hours are left out of relative times
        say_hours = not (relative and hours == 0)
        time_str = []
        if say_hours:
            hr_id = 'hodin' + self.HR_ENDING.get(hours, self.HR_ENDING_DEFAULT)
            time_str.extend((word_for_number(hours, 'F4'), hr_id))
            # whole hours need no minutes part
            if mins == 0:
                return ' '.join(time_str)
            time_str.append('a')
        min_id = 'minut' + self.HR_ENDING.get(mins, self.HR_ENDING_DEFAULT)
        return ' '.join(time_str + [word_for_number(mins, 'F4'), min_id])

    # endings appended to 'stup' after the given (absolute) values
    DEG_ENDING = {1: 'eň', 2: 'ně', 3: 'ně', 4: 'ně'}
    # ending used after all other values
    DEG_ENDING_DEFAULT = 'ňů'

    def spell_temperature(self, value, interval):
        """Convert a temperature expression into words (assuming nominative).

        :param value: Temperature value (whole number in degrees as string), \
                e.g. '1' or '-10'.
        :param interval: Boolean indicating whether to treat this as a start \
                of an interval, i.e. omit the degrees word.
        :return: Czech temperature expression as string
        """
        ret = ''
        value = int(value)
        if value < 0:
            ret += 'mínus '
            value = abs(value)
        ret += word_for_number(value, 'M1')
        if not interval:
            # the form of the degrees word depends on the absolute value
            ret += ' stup' + self.DEG_ENDING.get(value, self.DEG_ENDING_DEFAULT)
        return ret