def __init__(self, ontology):
    """Sort ontology slots into categories and load the morphology models.

    :param ontology: The domain ontology, handed on to the base class
        constructor. Its optional 'slot_attributes' and
        'value_translation' entries are read through ``self.ontology``
        (presumably set by the base class -- confirm there).
    """
    super(PTICSNLGPreprocessing, self).__init__(ontology)

    # slots holding relative / absolute times
    self.rel_time_slots = set()
    self.abs_time_slots = set()
    # slots holding temperatures / starts of temperature intervals
    self.temp_slots = set()
    self.temp_int_slots = set()
    # slots whose values get translated, and the translation tables
    self.translated_slots = set()
    self.translations = {}

    # attribute -> target set, in priority order: a slot is filed only
    # under the first attribute it carries
    categories = (
        ('relative_time', self.rel_time_slots),
        ('absolute_time', self.abs_time_slots),
        ('temperature', self.temp_slots),
        ('temperature_int', self.temp_int_slots),
    )
    if 'slot_attributes' in self.ontology:
        for slot in self.ontology['slot_attributes']:
            for attribute, target in categories:
                if attribute in self.ontology['slot_attributes'][slot]:
                    target.add(slot)
                    break

    # every slot that has a translation table counts as translated
    if 'value_translation' in self.ontology:
        self.translations = self.ontology['value_translation']
        self.translated_slots.update(self.ontology['value_translation'])

    # morphological analyzer and generator used for the 'Infl' modifier
    tagger_path = online_update(
        'applications/PublicTransportInfoCS/data/czech.tagger')
    dict_path = online_update(
        'applications/PublicTransportInfoCS/data/czech.dict')
    self._analyzer = Analyzer(tagger_path)
    self._generator = Generator(dict_path)
def __init__(self, cases_list, strip_punct, lowercase_forms, personal_names):
    """Set up the expander: output postprocessing and the morphology models.

    @param cases_list: List of cases (given as strings) to be used for
        generation (Czech numbers 1-7 are used)
    @param strip_punct: Strip all punctuation?
    @param lowercase_forms: Lowercase all forms on the output?
    @param personal_names: Are we inflecting personal names?
    """
    self.stops = defaultdict(list)
    self.cases_list = cases_list
    self.personal_names = personal_names

    # choose how generated forms are cleaned up
    if strip_punct:
        def clean(text):
            # drop a run of space-separated punctuation tokens, keeping
            # the space (or end of string) that follows the run
            return re.sub(r' [\.,\-–\(\)\{\}\[\];\\\/+&](?: [\.,\-–\(\)\{\}\[\];])*( |$)', r'\1', text)
    else:
        def clean(text):
            # only remove the space in front of a period or comma
            return re.sub(r' ([\.,])', r'\1', text)

    if lowercase_forms:
        def clean_and_lower(text):
            return clean(text).lower()
        self.__postprocess_func = clean_and_lower
    else:
        self.__postprocess_func = clean

    # initialize morphology
    tagger_path = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
    dict_path = online_update('applications/PublicTransportInfoCS/data/czech.dict')
    self.__analyzer = Analyzer(tagger_path)
    self.__generator = Generator(dict_path)
def __init__(self, ontology):
    """Sort ontology slots into categories and load the morphology models.

    :param ontology: The domain ontology, handed on to the base class
        constructor. Its optional 'slot_attributes' and
        'value_translation' entries are read through ``self.ontology``
        (presumably set by the base class -- confirm there).
    """
    super(PTICSNLGPreprocessing, self).__init__(ontology)

    # slots holding relative / absolute times
    self.rel_time_slots = set()
    self.abs_time_slots = set()
    # slots holding temperatures / starts of temperature intervals
    self.temp_slots = set()
    self.temp_int_slots = set()
    # slots whose values get translated, and the translation tables
    self.translated_slots = set()
    self.translations = {}

    # attribute -> target set, in priority order: a slot is filed only
    # under the first attribute it carries
    categories = (
        ('relative_time', self.rel_time_slots),
        ('absolute_time', self.abs_time_slots),
        ('temperature', self.temp_slots),
        ('temperature_int', self.temp_int_slots),
    )
    if 'slot_attributes' in self.ontology:
        for slot in self.ontology['slot_attributes']:
            for attribute, target in categories:
                if attribute in self.ontology['slot_attributes'][slot]:
                    target.add(slot)
                    break

    # every slot that has a translation table counts as translated
    if 'value_translation' in self.ontology:
        self.translations = self.ontology['value_translation']
        self.translated_slots.update(self.ontology['value_translation'])

    # morphological analyzer and generator used for the 'Infl' modifier
    tagger_path = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
    dict_path = online_update('applications/PublicTransportInfoCS/data/czech.dict')
    self._analyzer = Analyzer(tagger_path)
    self._generator = Generator(dict_path)
class PTICSNLGPreprocessing(TemplateNLGPreprocessing):
    """Template NLG preprocessing routines for Czech public transport information.

    This serves for spelling out relative and absolute time expressions
    and temperatures, as well as translating certain slot values into Czech.
    """

    # endings attached to 'hodin' / 'minut' for the counts 1-4; every
    # other count uses the default (bare stem)
    HR_ENDING = {1: 'u', 2: 'y', 3: 'y', 4: 'y'}
    HR_ENDING_DEFAULT = ''

    # endings attached to 'stup' for the counts 1-4, and for all others
    DEG_ENDING = {1: 'eň', 2: 'ně', 3: 'ně', 4: 'ně'}
    DEG_ENDING_DEFAULT = 'ňů'

    def __init__(self, ontology):
        """Sort ontology slots into categories and load the morphology models.

        :param ontology: The domain ontology, handed on to the base class
            constructor. Its optional 'slot_attributes' and
            'value_translation' entries are read through ``self.ontology``
            (presumably set by the base class -- confirm there).
        """
        super(PTICSNLGPreprocessing, self).__init__(ontology)
        # keep track of relative and absolute time slots
        self.rel_time_slots = set()
        self.abs_time_slots = set()
        # keep track of temperature and temperature interval slots
        self.temp_slots = set()
        self.temp_int_slots = set()
        # keep track of translated slots
        self.translated_slots = set()
        self.translations = {}
        # load their lists from the ontology; a slot is filed only under
        # the first matching attribute
        if 'slot_attributes' in self.ontology:
            for slot in self.ontology['slot_attributes']:
                attributes = self.ontology['slot_attributes'][slot]
                if 'relative_time' in attributes:
                    self.rel_time_slots.add(slot)
                elif 'absolute_time' in attributes:
                    self.abs_time_slots.add(slot)
                elif 'temperature' in attributes:
                    self.temp_slots.add(slot)
                elif 'temperature_int' in attributes:
                    self.temp_int_slots.add(slot)
        # load translations from the ontology
        if 'value_translation' in self.ontology:
            self.translations = self.ontology['value_translation']
            for slot in self.ontology['value_translation']:
                self.translated_slots.add(slot)
        # morphological analyzer and generator used for the 'Infl' modifier
        analyzer_model = online_update(
            'applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update(
            'applications/PublicTransportInfoCS/data/czech.dict')
        self._analyzer = Analyzer(analyzer_model)
        self._generator = Generator(generator_model)

    def preprocess(self, template, svs_dict):
        """Preprocess values to be filled into an NLG template.

        Spells out temperature and time expressions and translates some of
        the values to Czech. Slot modifiers written in the template as
        ``{slot/modifier}`` are applied to the slot values and removed from
        the template, leaving a plain ``{slot}`` placeholder.

        Supported modifiers:

        * ``Cap1`` -- uppercase the first character of the value,
        * ``Infl <case> <replacement>`` -- inflect the value into the given
          case; if no inflected form is produced, the replacement text
          (which may itself contain spaces) is put in front of the
          unchanged value instead.

        :param template: The template string, possibly with slot modifiers
        :param svs_dict: Slot-value dictionary (modified in place)
        :return: A tuple of the template with the modifiers removed and
            the same dictionary, with modified values
        :raise ValueError: If an 'Infl' modifier lacks the case or the
            replacement part
        """
        # regular changes to slot values; iterate over a snapshot because
        # the values are reassigned inside the loop
        for slot_id, val in list(svs_dict.items()):
            # remove number suffixes from some slot IDs to produce actual slot names
            slot_name = slot_id[:-1] if slot_id[-1] in string.digits else slot_id
            # spell out time expressions
            if slot_name in self.rel_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=True)
            elif slot_name in self.abs_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=False)
            # spell out temperature expressions
            elif slot_name in self.temp_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=False)
            elif slot_name in self.temp_int_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=True)
            # translate some slot values (default to untranslated)
            elif slot_name in self.translated_slots:
                svs_dict[slot_id] = self.translations[slot_name].get(val, val)

        # collect the modifiers stored in the template and strip them from it
        slot_modif = {}

        def store_repl(match):
            slot, modif = match.groups()
            slot_modif[slot] = modif
            return '{' + slot + '}'

        template = re.sub(r'\{([^}/]+)/([^}]+)\}', store_repl, template)

        # reflect the modifiers in the slot values
        for slot, modif in slot_modif.items():
            value = svs_dict[slot]
            if modif == 'Cap1':
                # slicing (rather than indexing) keeps an empty value from
                # raising IndexError
                svs_dict[slot] = value[:1].upper() + value[1:]
            elif modif.startswith('Infl'):
                # limit the split so that the replacement may be multi-word
                _, case, repl_word = modif.split(' ', 2)
                words = self._analyzer.analyze(value)
                forms = self._generator.inflect(words, case, check_fails=True)
                if forms:
                    # take the first form offered for each word
                    svs_dict[slot] = ' '.join([f[0] for f in forms])
                else:
                    svs_dict[slot] = repl_word + ' ' + value

        return template, svs_dict

    def spell_time(self, time, relative):
        """Convert a time expression into words (assuming accusative).

        :param time: The 24hr numerical time value in a string, e.g. '8:05'
        :param relative: If true, time is interpreted as relative, i.e.
            0:15 will generate '15 minutes' and not '0 hours and
            15 minutes'.
        :return: Czech time string with all numerals written out as words;
            a value without a colon is returned unchanged
        """
        if ':' not in time:  # 'now' and similar
            return time
        hours, mins = map(int, time.split(':'))
        time_str = []
        # zero hours are left out of relative times
        if not (relative and hours == 0):
            hr_id = 'hodin' + self.HR_ENDING.get(hours, self.HR_ENDING_DEFAULT)
            time_str.extend((word_for_number(hours, 'F4'), hr_id))
            # whole hours: nothing more to say
            if mins == 0:
                return ' '.join(time_str)
            time_str.append('a')
        min_id = 'minut' + self.HR_ENDING.get(mins, self.HR_ENDING_DEFAULT)
        return ' '.join(time_str + [word_for_number(mins, 'F4'), min_id])

    def spell_temperature(self, value, interval):
        """Convert a temperature expression into words (assuming nominative).

        :param value: Temperature value (whole number in degrees as string),
            e.g. '1' or '-10'.
        :param interval: Boolean indicating whether to treat this as a start
            of an interval, i.e. omit the degrees word.
        :return: Czech temperature expression as string
        """
        ret = ''
        value = int(value)
        if value < 0:
            ret += 'mínus '
            value = abs(value)
        ret += word_for_number(value, 'M1')
        if not interval:
            ret += ' stup' + self.DEG_ENDING.get(value, self.DEG_ENDING_DEFAULT)
        return ret
class ExpandStops(object):
    """This handles inflecting stop names into all desired cases in Czech."""

    def __init__(self, cases_list, strip_punct, lowercase_forms, personal_names):
        """Initialize the expander object, initialize the morphological analyzer and generator.

        @param cases_list: List of cases (given as strings) to be used for
            generation (Czech numbers 1-7 are used)
        @param strip_punct: Strip all punctuation?
        @param lowercase_forms: Lowercase all forms on the output?
        @param personal_names: Are we inflecting personal names?
        """
        self.stops = defaultdict(list)
        self.cases_list = cases_list
        self.personal_names = personal_names

        # initialize postprocessing
        if strip_punct:
            def postprocess_func(text):
                # drop a run of space-separated punctuation tokens, keeping
                # the space (or end of string) that follows the run
                return re.sub(r' [\.,\-–\(\)\{\}\[\];\\\/+&](?: [\.,\-–\(\)\{\}\[\];])*( |$)', r'\1', text)
        else:
            def postprocess_func(text):
                # only remove the space in front of a period or comma
                return re.sub(r' ([\.,])', r'\1', text)

        if lowercase_forms:
            def lc_func(text):
                return postprocess_func(text).lower()
            self.__postprocess_func = lc_func
        else:
            self.__postprocess_func = postprocess_func

        # initialize morphology
        analyzer_model = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update('applications/PublicTransportInfoCS/data/czech.dict')
        self.__analyzer = Analyzer(analyzer_model)
        self.__generator = Generator(generator_model)

    def save(self, fname):
        """Save all stops currently held in memory to a file.

        One line is written per stop, sorted by stop name: the name, a tab,
        and the stored forms joined by '; '.

        @param fname: Name of the output file (written as UTF-8)
        """
        with codecs.open(fname, 'w', 'UTF-8') as f_out:
            for stop_name in sorted(self.stops.keys()):
                f_out.write(stop_name + "\t")
                f_out.write('; '.join(self.stops[stop_name]))
                f_out.write("\n")

    def parse_line(self, line):
        """Load one line from the input file (tab-separated main form or
        implicit main form supported).

        @param line: One input line: either 'main form<TAB>variants' or just
            'variants', where the variants are separated by ';'
        @return: A tuple of the main form and the list of variants, all
            stripped of surrounding whitespace; the first variant serves
            as the main form if none is given explicitly
        """
        stop = None
        variants = line
        if '\t' in line:
            stop, variants = line.split('\t')
            # strip the explicit main form the same way the variants are
            stop = stop.strip()
        variants = [var.strip() for var in variants.split(';')]
        # missing or empty main form: fall back to the first variant
        if not stop:
            stop = variants[0]
        return stop, variants

    def load_file(self, fname):
        """Just load a list of stops from a file and store it in memory.

        Newly read variants are put in front of those already stored for
        the same stop, with duplicates removed.

        @param fname: Name of the input file (read as UTF-8)
        """
        with codecs.open(fname, 'r', 'UTF-8') as f_in:
            for line in f_in:
                # skip comments, and blank lines (which would otherwise
                # register a stop with an empty name)
                if line.startswith('#') or not line.strip():
                    continue
                stop, variants = self.parse_line(line)
                self.stops[stop] = list(remove_dups_stable(variants + self.stops[stop]))

    def expand_file(self, fname):
        """Load a list of stops from a file and expand it.

        Every variant not yet stored for its stop is analyzed and inflected
        into all cases of C{self.cases_list}; the resulting forms are
        postprocessed and appended to the forms stored for the stop.
        A progress dot is printed to standard error every 1000 lines.

        @param fname: Name of the input file (read as UTF-8)
        """
        with codecs.open(fname, 'r', 'UTF-8') as f_in:
            ctr = 0
            for line in f_in:
                # skip comments, and blank lines (which would otherwise
                # register a stop with an empty name)
                if line.startswith('#') or not line.strip():
                    continue
                # load variant names for a stop
                stop, variants = self.parse_line(line)
                # skip those that needn't be inflected any more
                to_inflect = [var for var in variants if var not in self.stops[stop]]
                # inflect the rest
                for variant in to_inflect:
                    words = self.__analyzer.analyze(variant)
                    # in all required cases
                    for case in self.cases_list:
                        forms = self.__generator.inflect(words, case, self.personal_names)
                        # use all possible combinations if there are more variants for this case
                        combinations = remove_dups_stable([' '.join(var)
                                                           for var in itertools.product(*forms)])
                        # build a real list so that it can be concatenated below
                        inflected = [self.__postprocess_func(form) for form in combinations]
                        self.stops[stop] = list(remove_dups_stable(self.stops[stop] + inflected))
                ctr += 1
                if ctr % 1000 == 0:
                    print >> sys.stderr, '.',
        print >> sys.stderr
class PTICSNLGPreprocessing(TemplateNLGPreprocessing):
    """Template NLG preprocessing routines for Czech public transport information.

    This serves for spelling out relative and absolute time expressions
    and temperatures, as well as translating certain slot values into Czech.
    """

    # endings attached to 'hodin' / 'minut' for the counts 1-4; every
    # other count uses the default (bare stem)
    HR_ENDING = {1: 'u', 2: 'y', 3: 'y', 4: 'y'}
    HR_ENDING_DEFAULT = ''

    # endings attached to 'stup' for the counts 1-4, and for all others
    DEG_ENDING = {1: 'eň', 2: 'ně', 3: 'ně', 4: 'ně'}
    DEG_ENDING_DEFAULT = 'ňů'

    def __init__(self, ontology):
        """Sort ontology slots into categories and load the morphology models.

        :param ontology: The domain ontology, handed on to the base class
            constructor. Its optional 'slot_attributes' and
            'value_translation' entries are read through ``self.ontology``
            (presumably set by the base class -- confirm there).
        """
        super(PTICSNLGPreprocessing, self).__init__(ontology)
        # keep track of relative and absolute time slots
        self.rel_time_slots = set()
        self.abs_time_slots = set()
        # keep track of temperature and temperature interval slots
        self.temp_slots = set()
        self.temp_int_slots = set()
        # keep track of translated slots
        self.translated_slots = set()
        self.translations = {}
        # load their lists from the ontology; a slot is filed only under
        # the first matching attribute
        if 'slot_attributes' in self.ontology:
            for slot in self.ontology['slot_attributes']:
                attributes = self.ontology['slot_attributes'][slot]
                if 'relative_time' in attributes:
                    self.rel_time_slots.add(slot)
                elif 'absolute_time' in attributes:
                    self.abs_time_slots.add(slot)
                elif 'temperature' in attributes:
                    self.temp_slots.add(slot)
                elif 'temperature_int' in attributes:
                    self.temp_int_slots.add(slot)
        # load translations from the ontology
        if 'value_translation' in self.ontology:
            self.translations = self.ontology['value_translation']
            for slot in self.ontology['value_translation']:
                self.translated_slots.add(slot)
        # morphological analyzer and generator used for the 'Infl' modifier
        analyzer_model = online_update('applications/PublicTransportInfoCS/data/czech.tagger')
        generator_model = online_update('applications/PublicTransportInfoCS/data/czech.dict')
        self._analyzer = Analyzer(analyzer_model)
        self._generator = Generator(generator_model)

    def preprocess(self, template, svs_dict):
        """Preprocess values to be filled into an NLG template.

        Spells out temperature and time expressions and translates some of
        the values to Czech. Slot modifiers written in the template as
        ``{slot/modifier}`` are applied to the slot values and removed from
        the template, leaving a plain ``{slot}`` placeholder.

        Supported modifiers:

        * ``Cap1`` -- uppercase the first character of the value,
        * ``Infl <case> <replacement>`` -- inflect the value into the given
          case; if no inflected form is produced, the replacement text
          (which may itself contain spaces) is put in front of the
          unchanged value instead.

        :param template: The template string, possibly with slot modifiers
        :param svs_dict: Slot-value dictionary (modified in place)
        :return: A tuple of the template with the modifiers removed and
            the same dictionary, with modified values
        :raise ValueError: If an 'Infl' modifier lacks the case or the
            replacement part
        """
        # regular changes to slot values; iterate over a snapshot because
        # the values are reassigned inside the loop
        for slot_id, val in list(svs_dict.items()):
            # remove number suffixes from some slot IDs to produce actual slot names
            slot_name = slot_id[:-1] if slot_id[-1] in string.digits else slot_id
            # spell out time expressions
            if slot_name in self.rel_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=True)
            elif slot_name in self.abs_time_slots:
                svs_dict[slot_id] = self.spell_time(val, relative=False)
            # spell out temperature expressions
            elif slot_name in self.temp_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=False)
            elif slot_name in self.temp_int_slots:
                svs_dict[slot_id] = self.spell_temperature(val, interval=True)
            # translate some slot values (default to untranslated)
            elif slot_name in self.translated_slots:
                svs_dict[slot_id] = self.translations[slot_name].get(val, val)

        # collect the modifiers stored in the template and strip them from it
        slot_modif = {}

        def store_repl(match):
            slot, modif = match.groups()
            slot_modif[slot] = modif
            return '{' + slot + '}'

        template = re.sub(r'\{([^}/]+)/([^}]+)\}', store_repl, template)

        # reflect the modifiers in the slot values
        for slot, modif in slot_modif.items():
            value = svs_dict[slot]
            if modif == 'Cap1':
                # slicing (rather than indexing) keeps an empty value from
                # raising IndexError
                svs_dict[slot] = value[:1].upper() + value[1:]
            elif modif.startswith('Infl'):
                # limit the split so that the replacement may be multi-word
                _, case, repl_word = modif.split(' ', 2)
                words = self._analyzer.analyze(value)
                forms = self._generator.inflect(words, case, check_fails=True)
                if forms:
                    # take the first form offered for each word
                    svs_dict[slot] = ' '.join([f[0] for f in forms])
                else:
                    svs_dict[slot] = repl_word + ' ' + value

        return template, svs_dict

    def spell_time(self, time, relative):
        """Convert a time expression into words (assuming accusative).

        :param time: The 24hr numerical time value in a string, e.g. '8:05'
        :param relative: If true, time is interpreted as relative, i.e.
            0:15 will generate '15 minutes' and not '0 hours and
            15 minutes'.
        :return: Czech time string with all numerals written out as words;
            a value without a colon is returned unchanged
        """
        if ':' not in time:  # 'now' and similar
            return time
        hours, mins = map(int, time.split(':'))
        time_str = []
        # zero hours are left out of relative times
        if not (relative and hours == 0):
            hr_id = 'hodin' + self.HR_ENDING.get(hours, self.HR_ENDING_DEFAULT)
            time_str.extend((word_for_number(hours, 'F4'), hr_id))
            # whole hours: nothing more to say
            if mins == 0:
                return ' '.join(time_str)
            time_str.append('a')
        min_id = 'minut' + self.HR_ENDING.get(mins, self.HR_ENDING_DEFAULT)
        return ' '.join(time_str + [word_for_number(mins, 'F4'), min_id])

    def spell_temperature(self, value, interval):
        """Convert a temperature expression into words (assuming nominative).

        :param value: Temperature value (whole number in degrees as string),
            e.g. '1' or '-10'.
        :param interval: Boolean indicating whether to treat this as a start
            of an interval, i.e. omit the degrees word.
        :return: Czech temperature expression as string
        """
        ret = ''
        value = int(value)
        if value < 0:
            ret += 'mínus '
            value = abs(value)
        ret += word_for_number(value, 'M1')
        if not interval:
            ret += ' stup' + self.DEG_ENDING.get(value, self.DEG_ENDING_DEFAULT)
        return ret