def add_to_lookup(self, org_key, org_value):
    key = org_key.strip()
    target_key = JapaneseLanguage.zenhan_normalize(key)
    target_key = target_key.upper()
    value = org_value.strip()
    if JapaneseLanguage.is_CJKword(target_key) is True:
        if target_key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            return
        matches = self._match_jp
        splits = target_key
        check_key = target_key[0]
        self._pairs_jp[target_key] = value
    else:
        if target_key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            return
        matches = self._match
        splits = target_key.split()
        check_key = splits[0]
        self._pairs[target_key] = value
    # Bucket the phrase under its first word (EN) or first character (JP).
    if check_key not in matches:
        matches[check_key] = []
    matches[check_key].append(splits)
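# --- Illustrative sketch (not part of this class) ---
# add_to_lookup buckets every phrase under its first word (EN) or first
# character (JP) so that later lookups only scan candidates sharing that
# keyword. The same indexing technique, standalone:
#
#     index = {}
#     for phrase in (["HELLO", "WORLD"], ["HELLO", "THERE"], ["HI"]):
#         index.setdefault(phrase[0], []).append(phrase)
#     # index == {'HELLO': [['HELLO', 'WORLD'], ['HELLO', 'THERE']],
#     #           'HI': [['HI']]}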
def resolve_to_string(self, client_context):
    name = self.name.resolve(client_context)
    name = JapaneseLanguage.zenhan_normalize(name)
    name = name.upper()
    target_word = self.resolve_children(client_context)
    target_word = JapaneseLanguage.zenhan_normalize(target_word)
    var = target_word.upper()
    if client_context.brain.dynamics.is_dynamic_map(name) is True:
        value = client_context.brain.dynamics.dynamic_map(client_context, name, var)
    else:
        if client_context.brain.maps.contains(name) is False:
            YLogger.debug(client_context, "No map defined for [%s], using default-map as value", var)
            value = self.get_default_value(client_context)
        else:
            the_map = client_context.brain.maps.map(name)
            if var in the_map:
                value = the_map[var]
            else:
                YLogger.debug(client_context, "No value defined for [%s], using default-map as value", var)
                value = self.get_default_value(client_context)
    YLogger.debug(client_context, "MAP [%s] resolved to [%s] = [%s]", self.to_string(), name, value)
    return value
def words_in_set(self, client_context, words, word_no):
    set_words = client_context.brain.sets.set_list(self.set_name)
    if not set_words:
        YLogger.debug(self, "No set with name [%s]", self.set_name)
        return EqualsMatch(False, word_no)
    is_CJK = client_context.brain.sets.is_cjk(self.set_name)
    set_values = client_context.brain.sets.values(self.set_name)
    word = words.word(word_no)
    check_word = JapaneseLanguage.zenhan_normalize(word)
    word = check_word.upper()
    if is_CJK is True:
        keyword = word[0]
    else:
        keyword = word
    if keyword in set_words:
        phrases = set_words[keyword]
        # Try the longest phrases first so the longest match wins.
        phrases = sorted(phrases, key=len, reverse=True)
        for phrase in phrases:
            if is_CJK is True:
                phrase_words = client_context.brain.tokenizer.texts_to_words(phrase)
                phrase = "".join(phrase_words)
                phrase_text = phrase
            else:
                phrase_text = " ".join(phrase)
            phrase_word_no = 0
            words_word_no = word_no
            while phrase_word_no < len(phrase) and words_word_no < words.num_words():
                word = words.word(words_word_no)
                check_word = JapaneseLanguage.zenhan_normalize(word)
                word = check_word.upper()
                if is_CJK is True:
                    # CJK phrases are single strings: compare a word-sized slice.
                    phrase_word = phrase[phrase_word_no:(phrase_word_no + len(word))]
                    if phrase_word == word:
                        if (phrase_word_no + len(word)) == len(phrase):
                            return EqualsMatch(True, words_word_no, set_values[phrase_text])
                    else:
                        break
                    phrase_word_no += len(word)
                else:
                    phrase_word = phrase[phrase_word_no]
                    if phrase_word == word:
                        if phrase_word_no + 1 == len(phrase):
                            return EqualsMatch(True, words_word_no, set_values[phrase_text])
                    else:
                        break
                    phrase_word_no += 1
                words_word_no += 1
    return EqualsMatch(False, word_no)
def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        tokenizer.is_convert = False
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_convert = True
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved
    match_count = 0
    word_no = 0
    is_match = False
    last_CJK = True
    for word in words:
        if match_count > 0:
            # Skip words already consumed by a multi-word match.
            match_count -= 1
            word_no += 1
            continue
        word_CJK = JapaneseLanguage.is_CJKword(word)
        if word_CJK is True:
            pairs = self._pairs_jp
            matches = self._match_jp
            keyword = word[0]
        else:
            pairs = self._pairs
            matches = self._match
            keyword = word
        if keyword in matches:
            phrases = matches[keyword]
            match_count, key = self.match(word_CJK, words, word_no, phrases)
        if match_count > 0:
            resolved += pairs[key]
            match_count -= 1
            is_match = True
            word_CJK = JapaneseLanguage.is_CJKword(pairs[key])
        else:
            # No replacement: keep the word, spacing EN words and EN->JP boundaries.
            if is_match is False:
                if word_CJK is False or (last_CJK is False and word_CJK is True):
                    resolved += ' '
            is_match = False
            resolved += word
        word_no += 1
        last_CJK = word_CJK
    return resolved.strip()
def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved
    last_CJK = True
    match_count = 0
    word_no = 0
    for word in words:
        if match_count > 0:
            # Skip words already consumed by a multi-word match.
            match_count -= 1
            word_no += 1
            continue
        target_word = JapaneseLanguage.zenhan_normalize(word)
        target_word = target_word.upper()
        is_CJK = JapaneseLanguage.is_CJKword(target_word)
        if is_CJK is True:
            pairs = self._pairs_jp
            matches = self._match_jp
            keyword = target_word[0]
        else:
            pairs = self._pairs
            matches = self._match
            keyword = target_word
        if keyword in matches:
            phrases = matches[keyword]
            match_count, key = self.match(is_CJK, words, word_no, phrases)
        if match_count > 0:
            if is_CJK is False or last_CJK != is_CJK:
                resolved += ' '
            resolved += pairs[key]
            match_count -= 1
        else:
            if is_CJK is False or last_CJK != is_CJK:
                resolved += ' '
            resolved += word
        last_CJK = is_CJK
        word_no += 1
    return resolved.strip()
def __init__(self, attribs, text, userid='*', element=None, brain=None):
    PatternNode.__init__(self, userid)
    if 'name' in attribs:
        name = attribs['name']
    elif text:
        name = text
    else:
        raise ParserException("No name specified as attribute or text",
                              xml_element=element, nodename='set(pattern)')
    if name == '':
        raise ParserException("No name specified as attribute or text",
                              xml_element=element, nodename='set(pattern)')
    name = JapaneseLanguage.zenhan_normalize(name)
    set_name = name.upper()
    if brain is not None:
        if brain.sets.storename(set_name) is None:
            if brain.dynamics.is_dynamic_set(set_name) is False:
                raise ParserException("Set[%s] not found" % set_name,
                                      xml_element=element, nodename='set(pattern)')
    self._set_name = set_name
def parse_expression(self, graph, expression):
    name_found = False
    if 'name' in expression.attrib:
        name = self.parse_attrib_value_as_word_node(graph, expression, 'name')
        name_text = JapaneseLanguage.zenhan_normalize(name.children[0].word)
        map_name = name_text.upper()
        if graph.aiml_parser.brain.maps.storename(map_name) is None:
            if graph.aiml_parser.brain.dynamics.is_dynamic_map(map_name) is False:
                raise ParserException("Map[%s] name not found" % map_name,
                                      xml_element=expression, nodename='map')
        self.name = name
        name_found = True
    self.parse_text(graph, self.get_text_from_element(expression))
    for child in expression:
        tag_name = TextUtils.tag_from_text(child.tag)
        if tag_name == 'name':
            self.name = self.parse_children_as_word_node(graph, child)
            name_found = True
        else:
            graph.parse_tag_expression(child, self)
        self.parse_text(graph, self.get_tail_from_element(child))
    if name_found is False:
        raise ParserException("Name not found in map", xml_element=expression, nodename='map')
def convert_key(self, key_item):
    if key_item is not None:
        key_item = JapaneseLanguage.zenhan_normalize(key_item)
        if key_item.startswith("?") is False:
            key_item = key_item.upper()
    return key_item
def _load_file_contents(self, map_collection, filename):
    YLogger.debug(self, "Loading map [%s]", filename)
    the_map = {}
    try:
        line_no = 0
        with open(filename, 'r', encoding='utf8') as my_file:
            for line in my_file:
                line_no += 1
                line = line.strip()
                if line == '' or line[0] == '#':
                    continue
                splits = line.split(":")
                if len(splits) > 1:
                    target_word = splits[0].strip()
                    if target_word == '':
                        error_info = "key is empty"
                        map_collection.set_error_info(filename, line_no, error_info)
                        continue
                    target_word = JapaneseLanguage.zenhan_normalize(target_word)
                    key = re.sub(' +', ' ', target_word.upper())
                    # Only the first ':' splits key from value, so values may contain ':'.
                    value = ":".join(splits[1:]).strip()
                    if key not in the_map:
                        the_map[key] = value
                    else:
                        error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
                        map_collection.set_error_info(filename, line_no, error_info)
                else:
                    error_info = "invalid parameters [%s]" % line
                    map_collection.set_error_info(filename, line_no, error_info)
    except Exception as excep:
        YLogger.exception(self, "Failed to load map [%s]", excep, filename)
    if len(the_map) > 0:
        name = self.get_just_filename_from_filepath(filename)
        map_name = JapaneseLanguage.zenhan_normalize(name)
        map_collection.add_map(map_name, the_map, filename)
    return self.storage_engine.configuration.maps_storage
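# --- Hedged example of the map-file layout this loader assumes ---
# One "key:value" pair per line; blank lines and lines starting with '#'
# are skipped, and only the first ':' separates key from value. Keys are
# zenhan-normalised, upper-cased and whitespace-collapsed before storage:
#
#     # colours.txt
#     ＲＥＤ : 赤
#     BLUE  : 青
#     URL   : http://example.com:8080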
def texts_to_words(self, texts):
    if not texts:
        return []
    if self._is_convert is True:
        # Normalise widths: full-width ASCII/digits -> half-width, half-width kana -> full-width.
        han_texts = mojimoji.zen_to_han(texts, kana=False)
        zen_texts = mojimoji.han_to_zen(han_texts, digit=False, ascii=False)
    else:
        han_texts = texts
        zen_texts = texts
    if JapaneseLanguage.is_CJKword(zen_texts) is True:
        if self._is_template is False:
            words = []
            target_text = ''
            words_CJK = JapaneseLanguage.is_CJKchar(zen_texts[0])
            for ch in zen_texts:
                char_CJK = JapaneseLanguage.is_CJKchar(ch)
                if words_CJK != char_CJK:
                    # Script boundary: tokenize the buffered run with the matching tokenizer.
                    if words_CJK is True:
                        words.extend(self._texts_to_words_jp(target_text))
                    else:
                        words.extend(self._texts_to_words_en(target_text))
                    words_CJK = char_CJK
                    target_text = ''
                target_text += ch
            if len(target_text) > 0:
                if words_CJK is True:
                    words.extend(self._texts_to_words_jp(target_text))
                else:
                    words.extend(self._texts_to_words_en(target_text))
        else:
            words = self._template_texts_to_words_jp(texts)
    else:
        if self._is_template is False:
            words = self._texts_to_words_en(han_texts)
        else:
            words = self._texts_to_words_en(texts)
    return words
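# --- Hedged sketch of the zenhan conversion used above ---
# texts_to_words relies on the mojimoji package: full-width ASCII and digits
# are folded to half-width, then half-width kana are widened back to
# full-width, using the same calls as in the method:
#
#     import mojimoji
#     han = mojimoji.zen_to_han("ＡＢＣ１２３ｱｲｳ", kana=False)    # 'ABC123ｱｲｳ'
#     zen = mojimoji.han_to_zen(han, digit=False, ascii=False)  # 'ABC123アイウ'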
def add_rest_template(self, name, params, filename, idx):
    template_name = JapaneseLanguage.zenhan_normalize(name)
    template_name = template_name.upper()
    if template_name not in self._templates:
        self._templates[template_name] = params
    else:
        error_info = "duplicate template_name='%s' (host='%s' is invalid)" % (name, params.host)
        self.set_error_info(filename, idx, error_info)
def __init__(self, attribs, text, userid='*', element=None):
    PatternNode.__init__(self, userid)
    self._words = {}
    self._values = {}
    if 'words' in attribs:
        words = attribs['words']
    elif text:
        words = text
    else:
        raise ParserException("No words specified as attribute or text",
                              xml_element=element, nodename='iset')
    check_words = JapaneseLanguage.zenhan_normalize(words)
    self._is_CJK = JapaneseLanguage.is_CJKword(check_words)
    if self._parse_words(words) is False:
        raise ParserException("empty element in words",
                              xml_element=element, nodename='iset')
    self._iset_name = "iset_%d" % PatternISetNode.iset_count
    PatternISetNode.iset_count += 1
def add_botname(self, name, botInfo, filename, line):
    bot_name = JapaneseLanguage.zenhan_normalize(name)
    bot_name = bot_name.upper()
    if bot_name not in self._botnames:
        self._botnames[bot_name] = botInfo
    else:
        error_info = "duplicate botname='%s' (url='%s' is invalid)" % (name, botInfo.url)
        self.set_error_info(filename, line, error_info)
def _load_file_contents(self, set_collection, filename):
    YLogger.debug(self, "Loading set [%s]", filename)
    try:
        the_set = {}
        set_list = []
        check_list = []
        is_cjk = False
        values = {}
        line_no = 0
        with open(filename, 'r', encoding='utf8') as my_file:
            for line in my_file:
                line_no += 1
                line = line.strip()
                if line:
                    if line[0] == '#':
                        continue
                    chk_words = JapaneseLanguage.zenhan_normalize(line)
                    chk_words = chk_words.upper()
                    cjk = self.check_cjk(is_cjk, chk_words)
                    if cjk is True:
                        line_text = re.sub(' ', '', line)
                        chk_words = re.sub(' ', '', chk_words)
                        if is_cjk is False:
                            is_cjk = True
                    else:
                        line_text = re.sub(' +', ' ', line)
                        chk_words = re.sub(' +', ' ', chk_words)
                    if chk_words in check_list:
                        error_info = "duplicate value='%s'" % line
                        set_collection.set_error_info(filename, line_no, error_info)
                    else:
                        set_list.append(line_text)
                        check_list.append(chk_words)
        the_set, values = self.make_set_table(is_cjk, set_list)
    except Exception as excep:
        YLogger.exception(self, "Failed to load set [%s]", excep, filename)
    if len(the_set) > 0:
        name = self.get_just_filename_from_filepath(filename)
        set_name = JapaneseLanguage.zenhan_normalize(name)
        set_collection.add_set(set_name, the_set, filename, is_cjk, values)
def _convert_name(self, name):
    if name is None:
        return name
    name = name.strip()
    if name == '':
        return None
    server_name = JapaneseLanguage.zenhan_normalize(name)
    server_name = server_name.upper()
    return server_name
def replace_by_words(self, tokenizer, replacable):
    resolved = ''
    if tokenizer is None:
        words = replacable.split()
    else:
        tokenizer.is_convert = False
        tokenizer.is_punctuation = False
        words = tokenizer.texts_to_words(replacable)
        tokenizer.is_convert = True
        tokenizer.is_punctuation = True
    if len(words) == 0:
        return resolved
    match_count = 0
    word_no = 0
    new_words = []
    for word in words:
        if match_count > 0:
            # Skip words already consumed by a multi-word match.
            match_count -= 1
            word_no += 1
            continue
        word_CJK = JapaneseLanguage.is_CJKword(word)
        if word_CJK is True:
            pairs = self._pairs_jp
            matches = self._match_jp
            keyword = word[0]
        else:
            pairs = self._pairs
            matches = self._match
            keyword = word
        if keyword in matches:
            phrases = matches[keyword]
            match_count, key = self.match(word_CJK, words, word_no, phrases)
        if match_count > 0:
            new_words.append(pairs[key])
            match_count -= 1
        else:
            new_words.append(word)
        word_no += 1
    if len(new_words) > 0:
        if tokenizer is None:
            to_join = [word.strip() for word in new_words if word and word != ' ']
            resolved = " ".join(to_join)
        else:
            resolved = tokenizer.words_to_texts(new_words)
    return resolved
def equals(self, client_context, words, word_no):
    if client_context.match_nlu is True:
        return EqualsMatch(False, word_no)
    word = words.word(word_no)
    if self.userid != '*':
        if self.userid != client_context.userid:
            return EqualsMatch(False, word_no)
    if client_context.brain.properties.has_property(self.property):
        value = client_context.brain.properties.property(self.property)
        value_len = len(value)
        word_len = len(word)
        texts = word
        add_count = 0
        if value_len > word_len:
            # The property value may span several words: concatenate following
            # words until the accumulated text reaches the property length.
            texts_len = word_len
            check_index = 0
            for word in words.words:
                if check_index <= word_no:
                    check_index += 1
                    continue
                if word == '__TOPIC__':
                    break
                texts_len += len(word)
                texts += word
                add_count += 1
                if texts_len >= value_len:
                    break
        check_texts = JapaneseLanguage.zenhan_normalize(texts)
        value_texts = JapaneseLanguage.zenhan_normalize(value)
        if check_texts.upper() == value_texts.upper():
            word_no += add_count
            YLogger.debug(client_context, "Found words [%s] as bot property", value)
            return EqualsMatch(True, word_no, value)
    return EqualsMatch(False, word_no)
def add_property(self, key, value, filename=None, line=0):
    if key == '':
        error_info = "key is empty"
        self.set_error_info(filename, line, error_info)
        return
    template = JapaneseLanguage.zenhan_normalize(key)
    template = template.upper()
    if self.has_property(template) is False:
        self.pairs.append([template, value])
    else:
        error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
        self.set_error_info(filename, line, error_info)
def add_to_lookup(self, org_key, org_value, filename=None, line=0):
    key = org_key.strip()
    if key == '':
        error_info = "key is empty"
        self.set_error_info(filename, line, error_info)
        return
    target_key = JapaneseLanguage.zenhan_normalize(key)
    target_key = re.sub(' +', ' ', target_key.upper())
    value = org_value.strip()
    if JapaneseLanguage.is_CJKword(target_key) is True:
        if target_key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        matches = self._match_jp
        splits = target_key
        check_key = target_key[0]
        self._pairs_jp[target_key] = value
    else:
        if target_key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        matches = self._match
        splits = target_key.split()
        check_key = splits[0]
        self._pairs[target_key] = value
    if check_key not in matches:
        matches[check_key] = []
    matches[check_key].append(splits)
def add_set_values(self, value):
    checkwords = JapaneseLanguage.zenhan_normalize(value)
    checkwords = checkwords.upper()
    if checkwords in self._values:
        return
    self._values[checkwords] = value
    if self._is_CJK is True:
        splits = checkwords
        key = splits[0].upper()
    else:
        splits = checkwords.split()
        key = splits[0].upper()
    if key not in self._words:
        self._words[key] = []
    self._words[key].append(splits)
def add_to_lookup(self, org_key, org_value):
    key = org_key
    value = org_value.strip()
    if JapaneseLanguage.is_CJKword(org_key) is True:
        key = key.strip()
        if key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            return
        matches = self._match_jp
        splits = key
        check_key = key[0]
        self._pairs_jp[key] = value
    else:
        if key[0] != ' ':
            # Unspaced keys become regex replacements rather than word lookups.
            key = key.strip()
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(key)
            start = pattern_text.lstrip()
            middle = pattern_text
            end = pattern_text.rstrip()
            pattern = "(^%s|%s|%s$)" % (start, middle, end)
            replacement = value
            replace_info = [key, re.compile(pattern), replacement]
            self._replace.append(replace_info)
            return
        key = key.strip()
        if key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            return
        matches = self._match
        splits = key.split()
        check_key = splits[0]
        self._pairs[key] = value
    if check_key not in matches:
        matches[check_key] = []
    matches[check_key].append(splits)
def match(self, is_CJK, words, word_no, phrases):
    match_count = 0
    # Try the longest phrases first so the longest possible match wins.
    phrases = sorted(phrases, key=len, reverse=True)
    for phrase in phrases:
        key = ''
        phrase_word_no = 0
        words_word_no = word_no
        while phrase_word_no < len(phrase) and words_word_no < len(words):
            word = words[words_word_no]
            target_word = JapaneseLanguage.zenhan_normalize(word)
            target_word = target_word.upper()
            if is_CJK is True:
                # CJK phrases are stored as a single string: compare a word-sized slice.
                phrase_word = phrase[phrase_word_no:(phrase_word_no + len(word))]
                if phrase_word == target_word:
                    key += target_word
                    match_count += 1
                    if (phrase_word_no + len(word)) == len(phrase):
                        return match_count, key
                else:
                    match_count = 0
                    break
                phrase_word_no += len(word)
            else:
                phrase_word = phrase[phrase_word_no]
                if phrase_word == target_word:
                    if key != '':
                        key += ' '
                    key += target_word
                    match_count += 1
                    if (phrase_word_no + 1) == len(phrase):
                        return match_count, key
                else:
                    match_count = 0
                    break
                phrase_word_no += 1
            words_word_no += 1
    return 0, ''
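# --- Note on the matching strategy (illustrative) ---
# Because match() sorts candidate phrases longest-first, given the stored
# phrases ["HELLO"] and ["HELLO", "WORLD"] and the input words
# ["HELLO", "WORLD", ...], the two-word phrase is tried and wins before the
# one-word phrase. CJK phrases are compared as character slices of a single
# string; EN phrases are compared word by word.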
def make_set_table(self, is_cjk, set_list):
    the_set = {}
    values = {}
    for value in set_list:
        check_words = JapaneseLanguage.zenhan_normalize(value)
        check_words = check_words.upper()
        if check_words in values:
            continue
        values[check_words] = value
        if is_cjk is True:
            splits = check_words
            key = splits[0]
        else:
            splits = check_words.split()
            key = splits[0]
        if key not in the_set:
            the_set[key] = []
        the_set[key].append(splits)
    return the_set, values
def contains(self, name):
    bot_name = JapaneseLanguage.zenhan_normalize(name)
    bot_name = bot_name.upper()
    return bot_name in self._botnames
def regex(self, key):
    template = JapaneseLanguage.zenhan_normalize(key)
    template = template.upper()
    return self.property(template)
def contains(self, name):
    template_name = JapaneseLanguage.zenhan_normalize(name)
    template_name = template_name.upper()
    return template_name in self._templates
def check_cjk(self, is_cjk, value):
    if is_cjk is False:
        check_words = JapaneseLanguage.zenhan_normalize(value)
        if JapaneseLanguage.is_CJKword(check_words) is True:
            is_cjk = True
    return is_cjk
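# --- Hedged stand-in (assumption) for the is_CJKword test used here ---
# JapaneseLanguage.is_CJKword is assumed to flag text containing kana or
# kanji; a minimal Unicode-range approximation would be:
#
#     def is_cjk_word(word):
#         return any(0x3040 <= ord(ch) <= 0x30FF      # hiragana / katakana
#                    or 0x4E00 <= ord(ch) <= 0x9FFF   # CJK unified ideographs
#                    for ch in word)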
def add_to_lookup(self, org_key, org_value, filename=None, line=0):
    key = org_key
    value = org_value.strip()
    if key.strip() == '':
        error_info = "key is empty"
        self.set_error_info(filename, line, error_info)
        return
    if JapaneseLanguage.is_CJKword(org_key) is True:
        key = key.strip()
        if key in self._pairs_jp:
            YLogger.error(self, "%s = %s already exists in jp_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        matches = self._match_jp
        splits = key
        check_key = key[0]
        self._pairs_jp[key] = value
    else:
        if key[0] != ' ':
            # Unspaced keys become regex replacements rather than word lookups.
            key = key.strip()
            if key in self._replace_key:
                YLogger.error(self, "%s = %s already exists in replace_collection", key, value)
                error_info = "duplicate replace_chars='%s' (value='%s' is invalid)" % (key, value)
                self.set_error_info(filename, line, error_info)
                return
            pattern_text = DoubleStringPatternSplitCollection.normalise_pattern(key)
            start = pattern_text.lstrip()
            middle = pattern_text
            end = pattern_text.rstrip()
            pattern = "(^%s|%s|%s$)" % (start, middle, end)
            replacement = value
            replace_info = [key, re.compile(pattern), replacement]
            self._replace.append(replace_info)
            self._replace_key.append(key)
            return
        key = key.strip()
        if key in self._pairs:
            YLogger.error(self, "%s = %s already exists in en_collection", key, value)
            error_info = "duplicate key='%s' (value='%s' is invalid)" % (key, value)
            self.set_error_info(filename, line, error_info)
            return
        matches = self._match
        splits = key.split()
        check_key = splits[0]
        self._pairs[key] = value
    if check_key not in matches:
        matches[check_key] = []
    matches[check_key].append(splits)
def remove(self, name):
    bot_name = JapaneseLanguage.zenhan_normalize(name)
    bot_name = bot_name.upper()
    if bot_name in self._botnames:
        del self._botnames[bot_name]
def botInfo(self, name):
    bot_name = JapaneseLanguage.zenhan_normalize(name)
    bot_name = bot_name.upper()
    if bot_name in self._botnames:
        return self._botnames[bot_name]
    return None