def init(self, logger):
    """Configure the multilingual ``name`` check from the country options.

    Reads the ``language`` and ``multilingual_style`` options and returns
    False when either one is missing. Otherwise registers error class 50604
    and, depending on the style, installs:

    * ``self.aggregator(tags)``: builds the list of ``{"name": ...}`` values
      derived from the ``name:<lang>`` tags;
    * ``self.split``: the matching ``split_*`` method;
    * ``self.lang_regex_script``: ``[lang, compiled regex]`` pairs, one per
      configured language.

    For an unknown style neither ``self.aggregator`` nor ``self.split`` is set.
    """
    Plugin.init(self, logger)

    options = self.father.config.options
    language = options.get("language")
    style = options.get("multilingual_style")
    if not (language and style):
        return False

    self.errors[50604] = self.def_class(item = 5060, level = 2, tags = ['name', 'fix:chair'],
        title = T_('Multilingual not matching'))

    self.lang = lang = language
    self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

    def pair_names(tags, separator, with_singles, with_reversed):
        """Build the candidate names from the two first configured languages.

        When both ``name:<lang>`` values are set and differ once stripped,
        the candidates are (optionally) each value alone, then
        "first<sep>second" and (optionally) "second<sep>first". Otherwise the
        single candidate is the stripped ``name:<lang0>`` value, falling back
        on ``name:<lang1>`` when that key is absent.
        """
        # Keys are computed lazily, as the lambdas they replace did, so a
        # short language list only fails when the aggregator is really used.
        key_first = "name:" + lang[0]
        key_second = "name:" + lang[1]
        first = tags.get(key_first)
        second = tags.get(key_second)
        if not (first and second and first.strip() != second.strip()):
            return [{"name": tags.get(key_first, tags.get(key_second)).strip()}]
        # Strip the tag *values*: the previous code stripped the language
        # code, leaving the second part of combined names unstripped.
        first = first.strip()
        second = second.strip()
        candidates = [first, second] if with_singles else []
        candidates.append(first + separator + second)
        if with_reversed:
            candidates.append(second + separator + first)
        return [{"name": candidate} for candidate in candidates]

    def slash_aggregator(with_reversed):
        """Return an aggregator joining the two names with a slash."""
        def aggregator(tags):
            name = tags.get("name")
            # No proposal at all when the name contains a dash or a parenthesis
            if name is not None and ("-" in name or "(" in name):
                return []
            # Reuse the spacing around the slash already present in name
            separator = " / " if name is None or " / " in name else "/"
            return pair_names(tags, separator, True, with_reversed)
        return aggregator

    def join_names(values, separator):
        """Join the non-empty values, stripped, with separator."""
        return separator.join(value.strip() for value in values if value)

    if style == "be":
        self.aggregator = lambda tags: pair_names(tags, " - ", False, True)
        self.split = self.split_be
    elif style == "sp_eu":
        self.aggregator = slash_aggregator(True)
        self.split = self.split_sp_eu
    elif style == "sp_ast":
        self.aggregator = slash_aggregator(False)
        self.split = self.split_sp_ast
    elif style == "xk":
        self.aggregator = lambda tags: pair_names(tags, " - ", True, True)
        self.split = self.split_be
    elif style == "ma":
        self.aggregator = lambda tags: [{"name": join_names([
            tags.get("name:fr"),
            tags.get("name:zgh", tags.get("name:ber")),
            tags.get("name:ar"),
        ], " ")}]
        self.split = self.split_ma
    elif style == "dj":
        def aggregator_dj(tags):
            name_fr = tags.get("name:fr")
            # A slash separates the two names when the French one ends with a digit
            separator = " / " if name_fr and name_fr[-1] in '0123456789' else " "
            return [{"name": join_names([name_fr, tags.get("name:ar")], separator)}]
        self.aggregator = aggregator_dj
        self.split = self.split_dj

    # One anchored regex per language: Common characters plus its scripts
    self.lang_regex_script = [
        [code, regex.compile(r"^[\p{{Common}}{0}]+$".format(gen_regex(language2scripts[code])), flags=regex.V1)]
        for code in lang
    ]
def init(self, logger):
    """Configure the multilingual ``name`` check from the country options.

    Reads the ``language`` and ``multilingual-style`` options and returns
    False when either one is missing. Otherwise registers error class 50604
    and, depending on the style ("be", "xk", "ma" or "dj"), installs
    ``self.aggregator`` (``name:<lang>`` tags -> list of ``{"name": ...}``
    values) and ``self.split`` (the matching ``split_*`` method), then
    builds ``self.lang_regex_script`` as ``[lang, compiled regex]`` pairs.

    For an unknown style neither ``self.aggregator`` nor ``self.split`` is set.
    """
    Plugin.init(self, logger)

    options = self.father.config.options
    language = options.get("language")
    style = options.get("multilingual-style")
    if not (language and style):
        return False

    self.errors[50604] = {
        "item": 5060,
        "level": 2,
        "tag": ["name", "fix:chair"],
        "desc": T_(u"Multilingual not matching"),
    }

    self.lang = lang = language
    self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

    def pair_names(tags, with_singles):
        """Build the " - " separated candidates from the two first languages.

        When both ``name:<lang>`` values are set and differ once stripped,
        the candidates are (optionally) each value alone, then both orders
        joined with " - ". Otherwise the single candidate is the stripped
        ``name:<lang0>`` value, falling back on ``name:<lang1>`` when that
        key is absent.
        """
        # Keys are computed lazily, as the lambdas they replace did.
        key_first = "name:" + lang[0]
        key_second = "name:" + lang[1]
        first = tags.get(key_first)
        second = tags.get(key_second)
        if not (first and second and first.strip() != second.strip()):
            return [{"name": tags.get(key_first, tags.get(key_second)).strip()}]
        # Strip the tag *values*: the previous code stripped the language
        # code, leaving the second part of combined names unstripped.
        first = first.strip()
        second = second.strip()
        candidates = [first, second] if with_singles else []
        candidates.append(first + " - " + second)
        candidates.append(second + " - " + first)
        return [{"name": candidate} for candidate in candidates]

    def join_names(values, separator):
        """Join the non-empty values, stripped, with separator."""
        return separator.join(value.strip() for value in values if value)

    if style == "be":
        self.aggregator = lambda tags: pair_names(tags, False)
        self.split = self.split_be
    elif style == "xk":
        self.aggregator = lambda tags: pair_names(tags, True)
        self.split = self.split_be
    elif style == "ma":
        self.aggregator = lambda tags: [{"name": join_names([
            tags.get("name:fr"),
            tags.get("name:zgh", tags.get("name:ber")),
            tags.get("name:ar"),
        ], " ")}]
        self.split = self.split_ma
    elif style == "dj":
        def aggregator_dj(tags):
            name_fr = tags.get("name:fr")
            # A slash separates the two names when the French one ends with a digit
            separator = " / " if name_fr and name_fr[-1] in '0123456789' else " "
            return [{"name": join_names([name_fr, tags.get("name:ar")], separator)}]
        self.aggregator = aggregator_dj
        self.split = self.split_dj

    # One anchored regex per language: Common characters plus its scripts
    self.lang_regex_script = [
        [code, regex.compile(r"^[\p{Common}%s]+$" % gen_regex(language2scripts[code]), flags=regex.V1)]
        for code in lang
    ]
class Name_Multilingual(Plugin):
    """Check that ``name`` and the ``name:<lang>`` tags agree with each other.

    The expected layout of ``name`` depends on the ``multilingual_style``
    option. Fixes are proposed in both directions: splitting ``name`` into
    ``name:<lang>`` tags and aggregating ``name:<lang>`` tags into ``name``.
    """

    def init(self, logger):
        """Configure the check from the country options.

        Returns False when the ``language`` or ``multilingual_style`` option
        is missing. Otherwise registers error class 50604 and installs
        ``self.aggregator``, ``self.split`` and ``self.lang_regex_script``
        according to the style. For an unknown style neither
        ``self.aggregator`` nor ``self.split`` is set.
        """
        Plugin.init(self, logger)

        options = self.father.config.options
        language = options.get("language")
        style = options.get("multilingual_style")
        if not (language and style):
            return False

        self.errors[50604] = self.def_class(item = 5060, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Multilingual not matching'))

        self.lang = lang = language
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

        def pair_names(tags, separator, with_singles, with_reversed):
            """Build the candidate names from the two first configured languages.

            When both ``name:<lang>`` values are set and differ once
            stripped, the candidates are (optionally) each value alone, then
            "first<sep>second" and (optionally) "second<sep>first".
            Otherwise the single candidate is the stripped ``name:<lang0>``
            value, falling back on ``name:<lang1>`` when that key is absent.
            """
            # Keys are computed lazily, as the lambdas they replace did.
            key_first = "name:" + lang[0]
            key_second = "name:" + lang[1]
            first = tags.get(key_first)
            second = tags.get(key_second)
            if not (first and second and first.strip() != second.strip()):
                return [{"name": tags.get(key_first, tags.get(key_second)).strip()}]
            # Strip the tag *values*: the previous code stripped the language
            # code, leaving the second part of combined names unstripped.
            first = first.strip()
            second = second.strip()
            candidates = [first, second] if with_singles else []
            candidates.append(first + separator + second)
            if with_reversed:
                candidates.append(second + separator + first)
            return [{"name": candidate} for candidate in candidates]

        def slash_aggregator(with_reversed):
            """Return an aggregator joining the two names with a slash."""
            def aggregator(tags):
                name = tags.get("name")
                # No proposal at all when the name contains a dash or a parenthesis
                if name is not None and ("-" in name or "(" in name):
                    return []
                # Reuse the spacing around the slash already present in name
                separator = " / " if name is None or " / " in name else "/"
                return pair_names(tags, separator, True, with_reversed)
            return aggregator

        def join_names(values, separator):
            """Join the non-empty values, stripped, with separator."""
            return separator.join(value.strip() for value in values if value)

        if style == "be":
            self.aggregator = lambda tags: pair_names(tags, " - ", False, True)
            self.split = self.split_be
        elif style == "sp_eu":
            self.aggregator = slash_aggregator(True)
            self.split = self.split_sp_eu
        elif style == "sp_ast":
            self.aggregator = slash_aggregator(False)
            self.split = self.split_sp_ast
        elif style == "xk":
            self.aggregator = lambda tags: pair_names(tags, " - ", True, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: [{"name": join_names([
                tags.get("name:fr"),
                tags.get("name:zgh", tags.get("name:ber")),
                tags.get("name:ar"),
            ], " ")}]
            self.split = self.split_ma
        elif style == "dj":
            def aggregator_dj(tags):
                name_fr = tags.get("name:fr")
                # A slash separates the two names when the French one ends with a digit
                separator = " / " if name_fr and name_fr[-1] in '0123456789' else " "
                return [{"name": join_names([name_fr, tags.get("name:ar")], separator)}]
            self.aggregator = aggregator_dj
            self.split = self.split_dj

        # One anchored regex per language: Common characters plus its scripts
        self.lang_regex_script = [
            [code, regex.compile(r"^[\p{{Common}}{0}]+$".format(gen_regex(language2scripts[code])), flags=regex.V1)]
            for code in lang
        ]

    def filter_fix_already_existing(self, names, s):
        """Drop from each fix of ``s`` the entries whose value is in ``names``.

        Fixes left empty are removed. Returns a new list of new dicts.
        """
        trimmed = (
            {key: value for key, value in candidate.items() if value not in names}
            for candidate in s
        )
        return [candidate for candidate in trimmed if len(candidate) > 0]

    def node(self, data, tags):
        """Compare ``name`` with the ``name:<lang>`` tags and propose fixes.

        Returns None when there is nothing to report, else a one-item list
        holding error class 50604 with the de-duplicated fixes.
        """
        name = tags.get("name")
        # Stripped name:<lang> values, None for missing or blank ones
        names = [
            (value and value.strip()) or None
            for value in (tags.get("name:" + code) for code in self.lang)
        ]
        names_counts = sum(1 for value in names if value)
        if not name and names_counts == 0:
            return

        fix = []
        s = self.split(name) if name else None

        # Split: name -> name:xx
        if s:
            ss = self.filter_fix_already_existing(names, s)
            # Remove the uniq fix, if does not change an already existing tag
            if names_counts == 0:
                # NOTE(review): tags.get() receives a (key, value) tuple here,
                # not the key alone - behaviour kept as is, confirm the intent.
                ss = [d for d in ss if len(d) > 1 or tags.get(list(d.items())[0])]
            fix = fix + ss

        # Aggregate: name:xx -> name
        if names_counts > 0:
            # Existing tags take precedence over the values split from name
            tag_sets = [dict(z, **tags) for z in s] if s else [tags]
            for tag_set in tag_sets:
                a = self.aggregator(tag_set)
                if {"name": name} not in a:
                    fix = fix + a

        if fix:
            # De-duplicate while keeping the order (dicts are not hashable)
            fix_ = []
            for f in fix:
                if f not in fix_:
                    fix_.append(f)
            return [{"class": 50604, "subclass": 0, "fix": fix_}]

    def way(self, data, tags, nds):
        """Apply the same check as for nodes."""
        return self.node(data, tags)

    def relation(self, data, tags, members):
        """Apply the same check as for nodes."""
        return self.node(data, tags)

    def split_delimitor(self, name, delimitor, ordered):
        """Split ``name`` on ``delimitor`` into ``name:<lang>`` candidates.

        With a single part, one candidate is produced per language whose
        script regex matches it. With two parts, they are mapped to the two
        first languages when the scripts match; the swapped mapping is also
        tried unless ``ordered`` is true. Any other number of parts yields
        an empty list.
        """
        parts = [part.strip() for part in name.split(delimitor)]
        ret = []
        if len(parts) == 1:
            for (lang, regex_) in self.lang_regex_script:
                if regex_.match(parts[0]):
                    ret.append({"name:" + lang: parts[0]})
        elif len(parts) == 2:
            first_re = self.lang_regex_script[0][1]
            second_re = self.lang_regex_script[1][1]
            if first_re.match(parts[0]) and second_re.match(parts[1]):
                ret.append({"name:" + self.lang[0]: parts[0], "name:" + self.lang[1]: parts[1]})
            if not ordered and second_re.match(parts[0]) and first_re.match(parts[1]):
                ret.append({"name:" + self.lang[0]: parts[1], "name:" + self.lang[1]: parts[0]})
        return ret

    def split_be(self, name):
        """Split on " - ", accepting both language orders."""
        return self.split_delimitor(name, ' - ', False)

    def split_sp_eu(self, name):
        """Split on "/" in any order; None when name has a dash or parenthesis."""
        if "-" not in name and "(" not in name:
            return self.split_delimitor(name, '/', False)

    def split_sp_ast(self, name):
        """Split on "/" in language order; None when name has a dash or parenthesis."""
        if "-" not in name and "(" not in name:
            return self.split_delimitor(name, '/', True)

    # Single-character matchers used by split_diff_alphabets
    char_common = regex.compile(r"[\p{Common}]", flags=regex.V1)
    char_ma = {
        'fr': regex.compile(r"[{0}]".format(gen_regex(language2scripts['fr'])), flags=regex.V1),
        'ar': regex.compile(r"[{0}]".format(gen_regex(language2scripts['ar'])), flags=regex.V1),
        'zgh': regex.compile(r"[{0}]".format(gen_regex(language2scripts['zgh'])), flags=regex.V1),
    }.items()

    def split_ma(self, name):
        """Split ``name`` by alphabet between 'ar', 'fr' and 'zgh'."""
        return self.split_diff_alphabets(name, ['ar', 'fr', 'zgh'])

    def split_dj(self, name):
        """Split ``name`` by alphabet between 'ar' and 'fr', trimming " /"."""
        ret = self.split_diff_alphabets(name, ['ar', 'fr'])
        if not ret:
            return None
        return [
            {key: value.strip(' /') for key, value in candidate.items()}
            for candidate in ret
        ]

    def split_diff_alphabets(self, name, languages):
        """Cut ``name`` into one contiguous chunk per alphabet.

        Records, for each language of ``languages``, the first and last index
        of a character matching its ``char_ma`` regex. Returns None when no
        such character is found or when the ranges overlap, else a one-item
        list holding the ``{"name:<lang>": chunk}`` dict.
        """
        min_max = {code: {'min': None, 'max': None} for code in languages}
        for i, c in enumerate(name):
            if self.char_common.match(c):
                continue
            for (code, script_re) in self.char_ma:
                # char_ma may know more languages than requested (e.g. 'zgh'
                # when called from split_dj): skip them instead of raising
                # KeyError on min_max[code].
                if code in min_max and script_re.match(c):
                    span = min_max[code]
                    if span['min'] is None:
                        span['min'] = i
                    span['max'] = i

        detected = [(code, span) for code, span in min_max.items() if span['min'] is not None]
        if len(detected) == 0:
            return # No text detected

        detected.sort(key = lambda item: item[1]['min'])
        bounds = []
        for _, span in detected:
            bounds += [span['min'], span['max']]
        if bounds != sorted(bounds):
            return # Abort, there is overlap

        # Expand the ranges so that together they cover the whole name
        detected[0][1]['min'] = 0
        detected[-1][1]['max'] = len(name) - 1
        for i in range(1, len(detected)):
            detected[i - 1][1]['max'] = detected[i][1]['min'] - 1

        # Split
        z = {
            "name:" + code: name[span['min']:span['max'] + 1].strip()
            for code, span in detected
        }
        if len(z) > 0:
            return [z]
def init(self, logger):
    """Configure the multilingual ``name`` check from the country options.

    Returns False when the ``language`` or ``multilingual-style`` option is
    missing. Otherwise registers error class 50604 and, depending on the
    style ("be", "xk", "ma" or "dj"), installs ``self.aggregator``
    (``name:<lang>`` tags -> list of ``{"name": ...}`` values) and
    ``self.split`` (the matching ``split_*`` method), then builds
    ``self.lang_regex_script`` as ``[lang, compiled regex]`` pairs.

    For an unknown style neither ``self.aggregator`` nor ``self.split`` is set.
    """
    Plugin.init(self, logger)

    options = self.father.config.options
    language = options.get("language")
    style = options.get("multilingual-style")
    if not (language and style):
        return False

    self.errors[50604] = {"item": 5060, "level": 2, "tag": ["name", "fix:chair"], "desc": T_(u"Multilingual not matching") }

    self.lang = lang = language
    self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

    def both_differ(tags):
        """Tell whether both name:<lang> values are set and differ once stripped."""
        first = tags.get("name:" + lang[0])
        second = tags.get("name:" + lang[1])
        return bool(first and second and first.strip() != second.strip())

    def fallback(tags):
        """Single candidate: name:<lang0>, or name:<lang1> when that key is absent."""
        return [{"name": tags.get("name:" + lang[0], tags.get("name:" + lang[1])).strip()}]

    def stripped(tags, index):
        # Strip the tag *value*: the previous code stripped the language
        # code for the second operand, leaving that value unstripped.
        return tags["name:" + lang[index]].strip()

    def combined(tags):
        """Both orders of the two names joined with " - "."""
        first, second = stripped(tags, 0), stripped(tags, 1)
        return [{"name": first + " - " + second}, {"name": second + " - " + first}]

    def singles(tags):
        """Each of the two names alone."""
        return [{"name": stripped(tags, 0)}, {"name": stripped(tags, 1)}]

    def join_names(values, separator):
        """Join the non-empty values, stripped, with separator."""
        return separator.join(value.strip() for value in values if value)

    if style == "be":
        self.aggregator = lambda tags: combined(tags) if both_differ(tags) else fallback(tags)
        self.split = self.split_be
    elif style == "xk":
        self.aggregator = lambda tags: singles(tags) + combined(tags) if both_differ(tags) else fallback(tags)
        self.split = self.split_be
    elif style == "ma":
        self.aggregator = lambda tags: [
            {"name": join_names([tags.get("name:fr"), tags.get("name:zgh", tags.get("name:ber")), tags.get("name:ar")], " ")}
        ]
        self.split = self.split_ma
    elif style == "dj":
        # A slash separates the two names when the French one ends with a digit
        self.aggregator = lambda tags: [
            {"name": join_names([tags.get("name:fr"), tags.get("name:ar")], " / ")}
        ] if tags.get("name:fr") and tags.get("name:fr")[-1] in '0123456789' else [
            {"name": join_names([tags.get("name:fr"), tags.get("name:ar")], " ")}
        ]
        self.split = self.split_dj

    # One anchored regex per language: Common characters plus its scripts
    self.lang_regex_script = [
        [code, regex.compile(r"^[\p{Common}%s]+$" % gen_regex(language2scripts[code]), flags=regex.V1)]
        for code in lang
    ]
def init(self, logger):
    """Register the error classes of item 5070 and compile the regexes.

    Sets up:

    * ``self.errors`` entries 50701, 50702 and 50703;
    * the character-class regexes ``non_printable``, ``other_symbol``,
      ``non_letter``, ``alone_char`` and ``roman_number``;
    * ``self.scripts``, ``self.uniq_scripts`` and ``self.lang`` derived from
      ``language2scripts``;
    * ``self.default`` and ``self.uniq_script`` from the ``language`` option;
    * ``self.names``, a list of name-like tag keys.

    Raises:
        Exception: when a configured language has no entry in the script table.
    """
    Plugin.init(self, logger)
    self.errors[50701] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
        title = T_('Some value chars does not match the language charset'),
        detail = T_(
'''Words are not written in the appropriate alphabet of the language.'''),
        fix = T_(
'''Usually, a wrong language has been chosen. Sometimes the word has been transliterated, and needs to be changed back to the original alphabet. `name:ar=Salaam` should be either `name:en=Salaam` (if known by untranslated name) or `name:en=Peace` (translated) or `name:ar=سلام` (original).'''))
    self.errors[50702] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
        title = T_('Non printable char'),
        detail = T_(
'''A non-printable character such as linefeed (0x000a) has been used.'''),
        fix = T_(
'''Remove the character.'''))
    self.errors[50703] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
        title = T_('Unexpected symbol in name'),
        detail = T_(
'''A symbol is used instead of a letter from the appropriate alphabet.'''),
        fix = T_(
'''Change the character into a punctuation mark or something else more appropriate.'''),
        resource = 'http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]')

    country = self.father.config.options.get("country")

    # The backslashes of \p{...} are doubled: "\p" is an invalid escape
    # sequence in a non-raw literal. The resulting patterns are unchanged.
    self.non_printable = regex.compile(u"[\\p{Line_Separator}\\p{Paragraph_Separator}\\p{Control}\\p{Private_Use}\\p{Surrogate}\\p{Unassigned}]", flags=regex.V1)
    # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
    self.other_symbol = regex.compile(u"[[\\p{General_Category=Other_Symbol}]--[\\p{Block=Latin 1 Supplement}\\p{Block=Braille Patterns}\\p{Block=CJK Radicals Supplement}\\p{Block=Kangxi Radicals}\\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
    self.non_letter = regex.compile(u"[^\\p{Letter}\\p{Mark}\\p{Separator}]", flags=regex.V1)
    non_look_like_latin = (
        u"\\p{Hangul}\\p{Bengali}\\p{Bopomofo}\\p{Braille}\\p{Canadian_Aboriginal}"
        u"\\p{Devanagari}\\p{Ethiopic}\\p{Gujarati}\\p{Gurmukhi}\\p{Han}\\p{Hangul}"
        u"\\p{Hanunoo}\\p{Hebrew}\\p{Hiragana}\\p{Inherited}\\p{Kannada}\\p{Katakana}"
        u"\\p{Khmer}\\p{Lao}\\p{Malayalam}\\p{Oriya}\\p{Runic}\\p{Sinhala}\\p{Syriac}"
        u"\\p{TaiLe}\\p{Tamil}\\p{Thaana}\\p{Thai}\\p{Tibetan}"
    )
    ammend = ""
    if country and country.startswith("BG"):
        ammend = "|TT" # Bulgarian survey point
    # A capital letter (or the country amendment) standing alone between
    # spaces, string boundaries or characters of the scripts listed above
    self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, ammend, non_look_like_latin), flags=regex.V1)
    self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

    self.scripts = language2scripts

    # Languages with exactly one entry not starting with "[" get their first
    # entry recorded as unique script, the others get None
    self.uniq_scripts = {}
    for code, scripts in self.scripts.items():
        if scripts and ilen(filter(lambda ss: ss[0] != "[", scripts)) == 1:
            self.uniq_scripts[code] = scripts[0]
        else:
            self.uniq_scripts[code] = None

    # At this point self.lang maps each language to the gen_regex() result
    self.lang = {code: gen_regex(scripts) for code, scripts in self.scripts.items()}

    self.default = None
    languages = self.father.config.options.get("language")
    if languages:
        if not isinstance(languages, list):
            languages = [languages]

        # Assert the languages are mapped to scripts
        for language in languages:
            if language not in self.lang:
                raise Exception("No script setup for language '%s'" % language)

        # Disable default scripts if one language is not mapped to scripts
        if any(not self.lang[language] for language in languages):
            languages = None

        # Build default regex
        if languages:
            self.default = regex.compile(r"[\p{Common}%s]" % "".join(self.lang[language] for language in languages), flags=regex.V1)

    self.uniq_script = self.uniq_scripts.get(languages[0]) if languages and len(languages) == 1 else None

    # Replace the fragments by compiled regexes, dropping the None ones.
    # Must stay after self.default, which is built from the raw fragments.
    self.lang = {
        code: regex.compile(r"[\p{Common}%s]" % fragment, flags=regex.V1)
        for code, fragment in self.lang.items()
        if fragment is not None
    }

    self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
def init(self, logger):
    """Register the error classes of item 5070 and compile the regexes.

    Sets up ``self.errors`` entries 50701, 50702 and 50703, the
    character-class regexes (``non_printable``, ``other_symbol``,
    ``non_letter``, ``alone_char``, ``roman_number``), the script tables
    (``scripts``, ``uniq_scripts``, ``lang``), ``self.default`` and
    ``self.uniq_script`` from the ``language`` option, and ``self.names``.

    Raises:
        Exception: when a configured language has no entry in the script table.
    """
    Plugin.init(self, logger)
    self.errors[50701] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Some value chars does not match the language charset") }
    self.errors[50702] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Non printable char") }
    self.errors[50703] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Symbol char") }

    options = self.father.config.options
    country = options.get("country")

    # The backslashes of \p{...} are doubled: "\p" is an invalid escape
    # sequence in a non-raw literal. The resulting patterns are unchanged.
    self.non_printable = regex.compile(u"[\\p{Line_Separator}\\p{Paragraph_Separator}\\p{Control}\\p{Private_Use}\\p{Surrogate}\\p{Unassigned}]", flags=regex.V1)
    # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
    self.other_symbol = regex.compile(u"[[\\p{General_Category=Other_Symbol}]--[\\p{Block=Latin 1 Supplement}\\p{Block=Braille Patterns}\\p{Block=CJK Radicals Supplement}\\p{Block=Kangxi Radicals}\\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
    self.non_letter = regex.compile(u"[^\\p{Letter}\\p{Mark}\\p{Separator}]", flags=regex.V1)
    non_look_like_latin = (
        u"\\p{Hangul}\\p{Bengali}\\p{Bopomofo}\\p{Braille}\\p{Canadian_Aboriginal}"
        u"\\p{Devanagari}\\p{Ethiopic}\\p{Gujarati}\\p{Gurmukhi}\\p{Han}\\p{Hangul}"
        u"\\p{Hanunoo}\\p{Hebrew}\\p{Hiragana}\\p{Inherited}\\p{Kannada}\\p{Katakana}"
        u"\\p{Khmer}\\p{Lao}\\p{Malayalam}\\p{Oriya}\\p{Runic}\\p{Sinhala}\\p{Syriac}"
        u"\\p{TaiLe}\\p{Tamil}\\p{Thaana}\\p{Thai}\\p{Tibetan}"
    )
    # Bulgarian survey point: "TT" is matched as well as a single capital
    ammend = "|TT" if country and country.startswith("BG") else ""
    self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, ammend, non_look_like_latin), flags=regex.V1)
    self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

    self.scripts = language2scripts

    # Languages with exactly one entry not starting with "[" get their first
    # entry recorded as unique script, the others get None
    self.uniq_scripts = {}
    for code, scripts in self.scripts.items():
        has_single = scripts and ilen(filter(lambda ss: ss[0] != "[", scripts)) == 1
        self.uniq_scripts[code] = scripts[0] if has_single else None

    # At this point self.lang maps each language to the gen_regex() result
    self.lang = {}
    for code, scripts in self.scripts.items():
        self.lang[code] = gen_regex(scripts)

    self.default = None
    languages = options.get("language")
    if languages:
        if not isinstance(languages, list):
            languages = [languages]

        # Assert the languages are mapped to scripts
        for language in languages:
            if language not in self.lang:
                raise Exception("No script setup for language '%s'" % language)

        # Disable default scripts if one language is not mapped to scripts
        if any(not self.lang[language] for language in languages):
            languages = None

        # Build default regex
        if languages:
            fragments = "".join(self.lang[language] for language in languages)
            self.default = regex.compile(r"[\p{Common}%s]" % fragments, flags=regex.V1)

    if languages and len(languages) == 1:
        self.uniq_script = self.uniq_scripts.get(languages[0])
    else:
        self.uniq_script = None

    # Replace the fragments by compiled regexes, dropping the None ones.
    # Must stay after self.default, which is built from the raw fragments.
    for code, fragment in list(self.lang.items()):
        if fragment is None:
            del self.lang[code]
        else:
            self.lang[code] = regex.compile(r"[\p{Common}%s]" % fragment, flags=regex.V1)

    self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
def init(self, logger):
    """Register the error classes of item 5070 and compile the regexes.

    Sets up ``self.errors`` entries 50701, 50702 and 50703, the
    character-class regexes (``non_printable``, ``other_symbol``,
    ``non_letter``, ``alone_char``, ``roman_number``), the script tables
    (``scripts``, ``uniq_scripts``, ``lang``), ``self.default`` and
    ``self.uniq_script`` from the ``language`` option, and ``self.names``.

    Raises:
        Exception: when a configured language has no entry in the script table.
    """
    Plugin.init(self, logger)

    def error_class(description):
        """Build the description dict shared by the three error classes."""
        return { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": description }

    self.errors[50701] = error_class(T_f(u"Some value chars does not match the language charset"))
    self.errors[50702] = error_class(T_f(u"Non printable char"))
    self.errors[50703] = error_class(T_f(u"Symbol char"))

    country = self.father.config.options.get("country")

    # The backslashes of \p{...} are doubled: "\p" is an invalid escape
    # sequence in a non-raw literal. The resulting patterns are unchanged.
    self.non_printable = regex.compile(u"[\\p{Line_Separator}\\p{Paragraph_Separator}\\p{Control}\\p{Private_Use}\\p{Surrogate}\\p{Unassigned}]", flags=regex.V1)
    # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
    self.other_symbol = regex.compile(u"[[\\p{General_Category=Other_Symbol}]--[\\p{Block=Latin 1 Supplement}\\p{Block=Braille Patterns}\\p{Block=CJK Radicals Supplement}\\p{Block=Kangxi Radicals}\\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
    self.non_letter = regex.compile(u"[^\\p{Letter}\\p{Mark}\\p{Separator}]", flags=regex.V1)
    non_look_like_latin = (
        u"\\p{Hangul}\\p{Bengali}\\p{Bopomofo}\\p{Braille}\\p{Canadian_Aboriginal}"
        u"\\p{Devanagari}\\p{Ethiopic}\\p{Gujarati}\\p{Gurmukhi}\\p{Han}\\p{Hangul}"
        u"\\p{Hanunoo}\\p{Hebrew}\\p{Hiragana}\\p{Inherited}\\p{Kannada}\\p{Katakana}"
        u"\\p{Khmer}\\p{Lao}\\p{Malayalam}\\p{Oriya}\\p{Runic}\\p{Sinhala}\\p{Syriac}"
        u"\\p{TaiLe}\\p{Tamil}\\p{Thaana}\\p{Thai}\\p{Tibetan}"
    )
    ammend = ""
    if country and country.startswith("BG"):
        ammend = "|TT" # Bulgarian survey point
    self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, ammend, non_look_like_latin), flags=regex.V1)
    self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

    self.scripts = language2scripts

    # Languages with exactly one entry not starting with "[" get their first
    # entry recorded as unique script, the others get None
    self.uniq_scripts = {}
    for k, s in self.scripts.items():
        if s and ilen(filter(lambda ss: ss[0] != "[", s)) == 1:
            self.uniq_scripts[k] = s[0]
        else:
            self.uniq_scripts[k] = None

    # At this point self.lang maps each language to the gen_regex() result
    self.lang = {}
    for (k, s) in self.scripts.items():
        self.lang[k] = gen_regex(s)

    self.default = None
    languages = self.father.config.options.get("language")
    if languages:
        if not isinstance(languages, list):
            languages = [languages]

        # Assert the languages are mapped to scripts
        unknown = [language for language in languages if language not in self.lang]
        if unknown:
            # Report the first unknown language, as the former loop did
            raise Exception("No script setup for language '%s'" % unknown[0])

        # Disable default scripts if one language is not mapped to scripts
        if not all(self.lang[language] for language in languages):
            languages = None

        # Build default regex
        if languages:
            self.default = regex.compile(r"[\p{Common}%s]" % "".join(map(lambda l: self.lang[l], languages)), flags=regex.V1)

    self.uniq_script = self.uniq_scripts.get(languages[0]) if languages and len(languages) == 1 else None

    # Replace the fragments by compiled regexes, dropping the None ones.
    # Must stay after self.default, which is built from the raw fragments.
    for code, fragment in list(self.lang.items()):
        if fragment is None:  # identity test instead of "== None"
            del self.lang[code]
        else:
            self.lang[code] = regex.compile(r"[\p{Common}%s]" % fragment, flags=regex.V1)

    self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
class Name_Multilingual(Plugin):
    """Check that ``name`` and the ``name:<lang>`` tags agree with each other.

    The expected layout of ``name`` depends on the ``multilingual-style``
    option ("be", "xk" or "ma"). Fixes are proposed in both directions:
    splitting ``name`` into ``name:<lang>`` tags and aggregating
    ``name:<lang>`` tags into ``name``.

    The code only uses constructs valid on both Python 2.7 and Python 3.
    """

    def init(self, logger):
        """Configure the check from the country options.

        Returns False when the ``language`` or ``multilingual-style`` option
        is missing. Otherwise registers error class 50604 and installs
        ``self.aggregator``, ``self.split`` and ``self.lang_regex_script``
        according to the style. For an unknown style neither
        ``self.aggregator`` nor ``self.split`` is set.
        """
        Plugin.init(self, logger)

        options = self.father.config.options
        language = options.get("language")
        style = options.get("multilingual-style")
        if not (language and style):
            return False

        self.errors[50604] = {
            "item": 5060,
            "level": 2,
            "tag": ["name", "fix:chair"],
            "desc": T_(u"Multilingual not matching"),
        }

        self.lang = lang = language
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

        def pair_names(tags, with_singles):
            """Build the " - " separated candidates from the two first languages.

            When both ``name:<lang>`` values are set and differ once
            stripped, the candidates are (optionally) each value alone, then
            both orders joined with " - ". Otherwise the single candidate is
            the stripped ``name:<lang0>`` value, falling back on
            ``name:<lang1>`` when that key is absent.
            """
            # Keys are computed lazily, as the lambdas they replace did.
            key_first = "name:" + lang[0]
            key_second = "name:" + lang[1]
            first = tags.get(key_first)
            second = tags.get(key_second)
            if not (first and second and first.strip() != second.strip()):
                return [{"name": tags.get(key_first, tags.get(key_second)).strip()}]
            # Strip the tag *values*: the previous code stripped the language
            # code, leaving the second part of combined names unstripped.
            first = first.strip()
            second = second.strip()
            candidates = [first, second] if with_singles else []
            candidates.append(first + " - " + second)
            candidates.append(second + " - " + first)
            return [{"name": candidate} for candidate in candidates]

        if style == "be":
            self.aggregator = lambda tags: pair_names(tags, False)
            self.split = self.split_be
        elif style == "xk":
            self.aggregator = lambda tags: pair_names(tags, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: [{
                "name": " ".join(
                    value.strip()
                    for value in [
                        tags.get("name:fr"),
                        tags.get("name:zgh", tags.get("name:ber")),
                        tags.get("name:ar"),
                    ]
                    if value
                )
            }]
            self.split = self.split_ma

        # One anchored regex per language: Common characters plus its scripts.
        # u"" with an escaped backslash gives the same pattern as the former
        # ur"" literal, which is not valid syntax on Python 3. A real list is
        # built because split_be indexes it.
        self.lang_regex_script = [
            [code, regex.compile(u"^[\\p{Common}%s]+$" % gen_regex(language2scripts[code]), flags=regex.V1)]
            for code in lang
        ]

    def filter_fix_already_existing(self, names, s):
        """Drop from each fix of ``s`` the entries whose value is in ``names``.

        Fixes left empty are removed. Returns a new list of new dicts.
        """
        trimmed = [
            dict((key, value) for key, value in candidate.items() if value not in names)
            for candidate in s
        ]
        return [candidate for candidate in trimmed if len(candidate) > 0]

    def node(self, data, tags):
        """Compare ``name`` with the ``name:<lang>`` tags and propose fixes.

        Returns None when there is nothing to report, else a one-item list
        holding error class 50604 with the de-duplicated fixes.
        """
        name = tags.get("name")
        # Stripped name:<lang> values, None for missing or blank ones
        names = [
            (value and value.strip()) or None
            for value in [tags.get("name:" + code) for code in self.lang]
        ]
        names_counts = len([value for value in names if value])
        if not name and names_counts == 0:
            return

        fix = []
        s = self.split(name) if name else None

        # Split: name -> name:xx
        if s:
            ss = self.filter_fix_already_existing(names, s)
            # Remove the uniq fix, if does not change an already existing tag
            if names_counts == 0:
                # NOTE(review): tags.get() receives a (key, value) tuple here,
                # not the key alone - behaviour kept as is, confirm the intent.
                ss = [d for d in ss if len(d) > 1 or tags.get(list(d.items())[0])]
            fix = fix + ss

        # Aggregate: name:xx -> name
        if names_counts > 0:
            # Existing tags take precedence over the values split from name
            tag_sets = [dict(z, **tags) for z in s] if s else [tags]
            for tag_set in tag_sets:
                a = self.aggregator(tag_set)
                if {"name": name} not in a:
                    fix = fix + a

        if fix:
            # De-duplicate while keeping the order (dicts are not hashable)
            fix_ = []
            for f in fix:
                if f not in fix_:
                    fix_.append(f)
            return [{"class": 50604, "subclass": 0, "fix": fix_}]

    def way(self, data, tags, nds):
        """Apply the same check as for nodes."""
        return self.node(data, tags)

    def relation(self, data, tags, members):
        """Apply the same check as for nodes."""
        return self.node(data, tags)

    def split_be(self, name):
        """Split ``name`` on " - " into ``name:<lang>`` candidates.

        With a single part, one candidate is produced per language whose
        script regex matches it. With two parts, they are mapped to the two
        first languages, in both orders, when the scripts match. Any other
        number of parts yields an empty list.
        """
        parts = [part.strip() for part in name.split(' - ')]
        ret = []
        if len(parts) == 1:
            for (lang, regex_) in self.lang_regex_script:
                if regex_.match(parts[0]):
                    ret.append({"name:" + lang: parts[0]})
        elif len(parts) == 2:
            first_re = self.lang_regex_script[0][1]
            second_re = self.lang_regex_script[1][1]
            if first_re.match(parts[0]) and second_re.match(parts[1]):
                ret.append({"name:" + self.lang[0]: parts[0], "name:" + self.lang[1]: parts[1]})
            if second_re.match(parts[0]) and first_re.match(parts[1]):
                ret.append({"name:" + self.lang[0]: parts[1], "name:" + self.lang[1]: parts[0]})
        return ret

    # Single-character matchers used by split_ma
    char_common = regex.compile(r"[\p{Common}]", flags=regex.V1)
    char_ma = {
        'fr': regex.compile(r"[%s]" % gen_regex(language2scripts['fr']), flags=regex.V1),
        'ar': regex.compile(r"[%s]" % gen_regex(language2scripts['ar']), flags=regex.V1),
        'zgh': regex.compile(r"[%s]" % gen_regex(language2scripts['zgh']), flags=regex.V1),
    }.items()

    def split_ma(self, name):
        """Cut ``name`` into one contiguous chunk per alphabet.

        Records, for 'ar', 'fr' and 'zgh', the first and last index of a
        character matching the language's ``char_ma`` regex. Returns None
        when no such character is found or when the ranges overlap, else a
        one-item list holding the ``{"name:<lang>": chunk}`` dict.
        """
        min_max = dict((code, {'min': None, 'max': None}) for code in ['ar', 'fr', 'zgh'])
        for i, c in enumerate(name):
            if self.char_common.match(c):
                continue
            for (code, script_re) in self.char_ma:
                if script_re.match(c):
                    span = min_max[code]
                    if span['min'] is None:
                        span['min'] = i
                    span['max'] = i

        detected = [(code, span) for code, span in min_max.items() if span['min'] is not None]
        if len(detected) == 0:
            return  # No text detected

        detected = sorted(detected, key=lambda item: item[1]['min'])
        bounds = []
        for _, span in detected:
            bounds += [span['min'], span['max']]
        if bounds != sorted(bounds):
            return  # Abort, there is overlap

        # Expand the ranges so that together they cover the whole name
        detected[0][1]['min'] = 0
        detected[-1][1]['max'] = len(name) - 1
        for i in range(1, len(detected)):
            detected[i - 1][1]['max'] = detected[i][1]['min'] - 1

        # Split
        z = dict(
            ("name:" + code, name[span['min']:span['max'] + 1].strip())
            for code, span in detected
        )
        if len(z) > 0:
            return [z]