def tags_for_symbol(self, symbol):
    """Return the list of tag strings for *symbol*.

    The data may supply either a single tagset (flat list of strings) or
    several alternative tagsets (a list of lists).  In the latter case one
    set is picked at random and cached per symbol, so repeated queries
    within this object's lifetime stay consistent.
    """
    all_tags = self.__data.get('tags') or {}
    raw = all_tags.get(symbol)
    # TODO: just change wrap_as_list to check for None... but that would
    # affect a LOT of other code
    raw = wrap_as_list(raw) if raw else []
    if all(type(tag) is str for tag in raw):
        return raw
    # a workaround to choose one of multiple tagsets from data
    # all([]) == True, so an empty list takes the branch above
    assert raw
    assert all(type(tag) is list for tag in raw)
    # cache so that repeated queries are NOT inconsistent.
    # should be okay, since VERBSET_BANK re-instantiates for every query...
    cache = self.__randomly_picked_symbol_tags
    if symbol not in cache:
        cache[symbol] = pick_random(raw).copy()
    return cache[symbol]
def __pre_or_post_words(self, lang, kind):
    """Build a symbol -> word mapping from the 'prewords'/'postwords' data
    for *lang*, picking one word at random when a list is supplied.
    """
    assert kind == 'prewords' or kind == 'postwords'
    entries = self.__data['langs'][lang].get(kind, {})
    assert type(entries) is dict
    chosen = {}
    for symbol, value in entries.items():
        if type(value) is str:
            chosen[symbol] = value
            continue
        if type(value) is list:
            assert all(type(subvalue) is str for subvalue in value)
            chosen[symbol] = pick_random(value)
            continue
        assert type(value) is dict  # YAML
        raise Exception('preword/postword data should be list or str')
    return chosen
def word(self, lang):
    """Return the word for *lang*, caching the choice for this object's
    lifetime.  A list in the data means "pick one of these at random".
    """
    if lang not in self.__words:
        data = self._get_word_data(lang)
        if type(data) is str:
            chosen = data
        elif type(data) is list:
            # if wordset has multiple entries [man, person, ...], just pick
            # one at random for WordSet's lifetime
            assert data
            assert all(type(entry) is str for entry in data)
            chosen = pick_random(data)
        else:
            assert type(data) is dict  # YAML
            raise Exception(
                'per-lang verb data should be string or list of strings')
        self.__words[lang] = chosen
    assert type(self.__words[lang]) is str
    return self.__words[lang]
def transformation_for_symbol(self, symbol):
    """Return the list of transformation strings for *symbol*.

    Data may provide several alternative transformation sets (a list of
    lists); in that case one set is chosen at random and cached per symbol
    so repeated queries agree.
    """
    available = self._data().get('transformations', {})
    entry = available.get(symbol)
    entry = wrap_as_list(entry) if entry else []
    if all(type(item) is str for item in entry):
        return entry
    # choose one of multiple transformation sets, cached per symbol
    assert entry
    assert all(type(item) is list for item in entry)
    cache = self.__randomly_picked_symbol_transformations
    if symbol not in cache:
        cache[symbol] = pick_random(entry).copy()
    return cache[symbol]
def tags(self):
    """Return this synset's semantic tags as a list of strings (cached).

    Don't have to worry about multiple synsets here, since semantic tags
    apply to all words in the synset.  Tag data may be a single string, a
    flat list of strings, or a list of alternative tag lists; in the last
    case one list is picked at random.

    Raises Exception on malformed (dict-shaped) YAML tag data.
    """
    if self.__tags is None:
        tag_data = self._data().get('tags')
        if tag_data:
            if type(tag_data) is str:
                self.__tags = [tag_data]
            elif type(tag_data) is list:
                if all(type(item) is str for item in tag_data):
                    self.__tags = tag_data  # single-list
                elif all(type(item) is list for item in tag_data):
                    self.__tags = pick_random(tag_data)  # list of lists
                else:
                    # BUG FIX: old code asserted a bare generator object
                    # (always truthy, so the check never ran) and raised
                    # with the undefined name `tags` (NameError) instead
                    # of `tag_data`.
                    assert all(type(item) is dict for item in tag_data)
                    raise Exception('malformed tag list', tag_data)
            else:
                # BUG FIX: was `type(tags)` / `..., tags` — undefined name.
                assert type(tag_data) is dict  # YAML
                raise Exception('malformed tags', tag_data)
        else:
            # if you try to return wrap(data.get('tags', [])), you can
            # wind up with [[]]
            self.__tags = []
    assert all(type(tag) is str for tag in self.__tags)
    return self.__tags
def ppform(self, lang):
    """Return the pp-form for *lang*, choosing once at random from the
    data (default 'standard') and caching it for this object's lifetime.
    """
    if lang in self.__ppform:
        return self.__ppform[lang]
    per_lang = self._data().get('ppforms', {})
    options = wrap_as_list(per_lang.get(lang, ['standard']))
    self.__ppform[lang] = pick_random(options)
    return self.__ppform[lang]
def _generate_determiner(self, node):
    """Generate zh text for a determiner node, appending a measure word.

    For a singular target the measure word (M) is looked up in the noun
    form bank and appended after the determiner base; for non-singular
    targets the collective marker 些 is appended instead.

    This should occur here and not in NounSet, because it's zh-specific,
    and language-specific code is meant to stay in Generators — but
    unfortunately, it's also data-specific code...
    """
    lexical_targets = node.lexical_targets()
    # BUG FIX: was `is 1` — identity comparison with an int literal is
    # implementation-dependent (and a SyntaxWarning in modern CPython).
    assert len(lexical_targets) == 1
    assert lexical_targets[0].type() == 'noun'
    # string instead of list, to enable segmentation antics
    words = self._get_det_base(node)
    target = lexical_targets[0]
    # so that "DT 些" has a plural meaning (*这 些 水). so horribly brittle...
    assert 'object' in target._get_option('tags')
    if target.number() == 'singular':
        noun = self._get_noun_base(target)
        noun_form = self._noun_form_bank.get(noun)
        # workaround to allow skipping some less important data entry for now
        if utility.CHECK_DATABASE:
            # would call None.get() if noun is missing from nouns_zh.yml
            assert noun_form
            measure_words_from_file = noun_form.get('M', '个')
        else:
            if noun_form:
                measure_words_from_file = noun_form.get('M', '个')
            else:
                assert noun_form is None
                measure_words_from_file = '个'
        # TODO: allow measure word omission (e.g. 这 世界 - only allowed for some words?)
        if measure_words_from_file == '个':
            measure_word = '个'
        else:
            # normalize the data to a list of candidate measure words
            if type(measure_words_from_file) is str:
                candidates = [measure_words_from_file]
            elif type(measure_words_from_file) is list:
                assert all(
                    type(item) is str for item in measure_words_from_file)
                candidates = measure_words_from_file
            else:
                # I suppose it could be a number or a bool...
                assert type(measure_words_from_file) in [dict, type(None)]
                raise Exception('M: expected str or list (YAML)',
                                noun_form.get('pinyin'))
            if utility.rand() <= 0.9:
                # allows multiple M's per word
                # BUG FIX: was pick_random(measure_words_from_file), which
                # for a plain string picks a random CHARACTER of the
                # measure word; pick from the normalized candidate list.
                measure_word = utility.pick_random(candidates)
            else:
                measure_word = '个'
        assert type(measure_word) is str
        words += ' ' + measure_word
    else:
        words += '些'
    self._generate_node_text(node, words)