def init(self, logger):
        """Set up the multilingual ``name`` checker from the analyser options.

        Reads the ``language`` and ``multilingual_style`` options.  Depending on
        the style, installs ``self.aggregator`` (builds candidate ``name``
        values from the ``name:xx`` tags) and ``self.split`` (the matching
        ``split_*`` method).  An unrecognised style leaves both unset.

        Returns:
            False when ``language`` or ``multilingual_style`` is not set,
            otherwise None.
        """
        Plugin.init(self, logger)
        options = self.father.config.options
        lang = options.get("language")
        style = options.get("multilingual_style")
        if not (lang and style):
            return False

        self.errors[50604] = self.def_class(item = 5060, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Multilingual not matching'))
        self.lang = lang
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

        def combine(tags, separator, with_singles, with_reverse):
            # Build the candidate names from the first two configured languages.
            first = tags.get("name:" + lang[0])
            second = tags.get("name:" + lang[1])
            if not (first and second and first.strip() != second.strip()):
                # At most one usable translation, or both identical: a single candidate.
                return [{"name": tags.get("name:" + lang[0], second).strip()}]
            # Strip the tag *values*; the previous code stripped the language
            # code of the second operand, leaving its value untrimmed.
            first, second = first.strip(), second.strip()
            candidates = [{"name": first}, {"name": second}] if with_singles else []
            candidates.append({"name": first + separator + second})
            if with_reverse:
                candidates.append({"name": second + separator + first})
            return candidates

        def slash_aggregator(with_reverse):
            # Aggregator factory for the "/"-separated styles.
            def aggregate(tags):
                name = tags.get("name")
                if name is not None and ("-" in name or "(" in name):
                    return []
                # Follow the spacing already used in the existing name, if any.
                separator = " / " if name is None or " / " in name else "/"
                return combine(tags, separator, True, with_reverse)
            return aggregate

        def join_present(values, separator):
            # Join the non-empty values, each stripped, into one candidate.
            return [{"name": separator.join(v.strip() for v in values if v)}]

        if style == "be":
            self.aggregator = lambda tags: combine(tags, " - ", False, True)
            self.split = self.split_be
        elif style == "sp_eu":
            self.aggregator = slash_aggregator(True)
            self.split = self.split_sp_eu
        elif style == "sp_ast":
            self.aggregator = slash_aggregator(False)
            self.split = self.split_sp_ast
        elif style == "xk":
            self.aggregator = lambda tags: combine(tags, " - ", True, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: join_present(
                [tags.get("name:fr"), tags.get("name:zgh", tags.get("name:ber")), tags.get("name:ar")], " ")
            self.split = self.split_ma
        elif style == "dj":
            def aggregate_dj(tags):
                french = tags.get("name:fr")
                # A French name ending with a digit is joined with " / ".
                separator = " / " if french and french[-1] in '0123456789' else " "
                return join_present([french, tags.get("name:ar")], separator)
            self.aggregator = aggregate_dj
            self.split = self.split_dj

        # One [language, regex] pair per configured language; the regex accepts
        # only Common characters plus that language's scripts.
        self.lang_regex_script = [
            [l, regex.compile(r"^[\p{{Common}}{0}]+$".format(gen_regex(language2scripts[l])), flags=regex.V1)]
            for l in lang
        ]
Ejemplo n.º 2
0
    def init(self, logger):
        """Set up the multilingual ``name`` checker from the analyser options.

        Reads the ``language`` and ``multilingual-style`` options and, for the
        styles "be", "xk", "ma" and "dj", installs ``self.aggregator`` (builds
        candidate ``name`` values from ``name:xx`` tags) and ``self.split``.
        Any other style leaves both unset.

        Returns:
            False when ``language`` or ``multilingual-style`` is not set,
            otherwise None.
        """
        Plugin.init(self, logger)
        options = self.father.config.options
        lang = options.get("language")
        style = options.get("multilingual-style")
        if not (lang and style):
            return False

        self.errors[50604] = {
            "item": 5060,
            "level": 2,
            "tag": ["name", "fix:chair"],
            "desc": T_(u"Multilingual not matching")
        }
        self.lang = lang
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get(
            "name:" + lang[1])

        def combine(tags, with_singles):
            # Candidate names from the first two configured languages.
            first = tags.get("name:" + lang[0])
            second = tags.get("name:" + lang[1])
            if not (first and second and first.strip() != second.strip()):
                # At most one usable translation, or both identical.
                return [{
                    "name": tags.get("name:" + lang[0], second).strip()
                }]
            # Strip the tag values themselves; stripping the language code
            # (as before) left the second operand untrimmed.
            first, second = first.strip(), second.strip()
            candidates = []
            if with_singles:
                candidates.append({"name": first})
                candidates.append({"name": second})
            candidates.append({"name": first + " - " + second})
            candidates.append({"name": second + " - " + first})
            return candidates

        def join_present(values, separator):
            # Join the non-empty values, each stripped, into one candidate.
            return [{
                "name": separator.join(v.strip() for v in values if v)
            }]

        if style == "be":
            self.aggregator = lambda tags: combine(tags, False)
            self.split = self.split_be
        elif style == "xk":
            self.aggregator = lambda tags: combine(tags, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: join_present([
                tags.get("name:fr"),
                tags.get("name:zgh", tags.get("name:ber")),
                tags.get("name:ar")
            ], " ")
            self.split = self.split_ma
        elif style == "dj":

            def aggregate_dj(tags):
                french = tags.get("name:fr")
                # A French name ending with a digit is joined with " / ".
                if french and french[-1] in '0123456789':
                    separator = " / "
                else:
                    separator = " "
                return join_present([french, tags.get("name:ar")], separator)

            self.aggregator = aggregate_dj
            self.split = self.split_dj

        # One [language, regex] pair per configured language; the regex accepts
        # only Common characters plus that language's scripts.
        self.lang_regex_script = [[
            l,
            regex.compile(r"^[\p{Common}%s]+$" % gen_regex(
                language2scripts[l]),
                          flags=regex.V1)
        ] for l in lang]
class Name_Multilingual(Plugin):
    """Check that ``name`` agrees with the per-language ``name:xx`` tags.

    ``init`` selects, from the ``multilingual_style`` option, how ``name`` is
    split into ``name:xx`` candidates and how ``name:xx`` tags are aggregated
    back into ``name`` candidates.  ``node``/``way``/``relation`` report class
    50604 with those candidates as fixes.
    """

    def init(self, logger):
        """Configure the plugin from the analyser options.

        Installs ``self.aggregator`` and ``self.split`` for the known styles
        ("be", "sp_eu", "sp_ast", "xk", "ma", "dj"); an unrecognised style
        leaves both unset.

        Returns:
            False when ``language`` or ``multilingual_style`` is not set,
            otherwise None.
        """
        Plugin.init(self, logger)
        options = self.father.config.options
        lang = options.get("language")
        style = options.get("multilingual_style")
        if not (lang and style):
            return False

        self.errors[50604] = self.def_class(item = 5060, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Multilingual not matching'))
        self.lang = lang
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

        def combine(tags, separator, with_singles, with_reverse):
            # Build the candidate names from the first two configured languages.
            first = tags.get("name:" + lang[0])
            second = tags.get("name:" + lang[1])
            if not (first and second and first.strip() != second.strip()):
                # At most one usable translation, or both identical: a single candidate.
                return [{"name": tags.get("name:" + lang[0], second).strip()}]
            # Strip the tag *values*; the previous code stripped the language
            # code of the second operand, leaving its value untrimmed.
            first, second = first.strip(), second.strip()
            candidates = [{"name": first}, {"name": second}] if with_singles else []
            candidates.append({"name": first + separator + second})
            if with_reverse:
                candidates.append({"name": second + separator + first})
            return candidates

        def slash_aggregator(with_reverse):
            # Aggregator factory for the "/"-separated styles.
            def aggregate(tags):
                name = tags.get("name")
                if name is not None and ("-" in name or "(" in name):
                    return []
                # Follow the spacing already used in the existing name, if any.
                separator = " / " if name is None or " / " in name else "/"
                return combine(tags, separator, True, with_reverse)
            return aggregate

        def join_present(values, separator):
            # Join the non-empty values, each stripped, into one candidate.
            return [{"name": separator.join(v.strip() for v in values if v)}]

        if style == "be":
            self.aggregator = lambda tags: combine(tags, " - ", False, True)
            self.split = self.split_be
        elif style == "sp_eu":
            self.aggregator = slash_aggregator(True)
            self.split = self.split_sp_eu
        elif style == "sp_ast":
            self.aggregator = slash_aggregator(False)
            self.split = self.split_sp_ast
        elif style == "xk":
            self.aggregator = lambda tags: combine(tags, " - ", True, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: join_present(
                [tags.get("name:fr"), tags.get("name:zgh", tags.get("name:ber")), tags.get("name:ar")], " ")
            self.split = self.split_ma
        elif style == "dj":
            def aggregate_dj(tags):
                french = tags.get("name:fr")
                # A French name ending with a digit is joined with " / ".
                separator = " / " if french and french[-1] in '0123456789' else " "
                return join_present([french, tags.get("name:ar")], separator)
            self.aggregator = aggregate_dj
            self.split = self.split_dj

        # One [language, regex] pair per configured language; the regex accepts
        # only Common characters plus that language's scripts.
        self.lang_regex_script = [
            [l, regex.compile(r"^[\p{{Common}}{0}]+$".format(gen_regex(language2scripts[l])), flags=regex.V1)]
            for l in lang
        ]

    def filter_fix_already_existing(self, names, s):
        """Drop from each fix dict the entries whose value is already in *names*.

        Returns the list of remaining non-empty dicts, in the original order.
        """
        trimmed = ({k: v for k, v in candidate.items() if v not in names} for candidate in s)
        return [d for d in trimmed if len(d) > 0]

    def node(self, data, tags):
        """Compare ``name`` with the ``name:xx`` tags and propose fixes.

        Returns a one-item list holding the class 50604 issue with the
        de-duplicated fixes, or None when there is nothing to propose.
        """
        name = tags.get("name")
        # Stripped per-language names; empty or missing ones become None.
        names = [(value and value.strip()) or None
                 for value in (tags.get("name:" + l) for l in self.lang)]
        names_counts = ilen(filter(None, names))

        if not name and names_counts == 0:
            return

        fix = []

        s = self.split(name) if name else None

        # Split: name -> name:xx
        if s:
            ss = self.filter_fix_already_existing(names, s)

            # Remove the uniq fix, if does not change an already existing tag
            if names_counts == 0:
                # NOTE(review): the lookup key is a (key, value) tuple, so this
                # tags.get() presumably never matches and single-entry fixes are
                # always dropped here; confirm whether the key alone was intended.
                ss = [d for d in ss if len(d) > 1 or tags.get(list(d.items())[0])]

            fix = fix + ss

        # Aggregate: name:xx -> name
        if names_counts > 0:
            # For each split candidate, overlay the real tags on top of it;
            # without a split, aggregate the real tags directly.
            tag_sets = [dict(z, **tags) for z in s] if s else [tags]
            for candidate_tags in tag_sets:
                a = self.aggregator(candidate_tags)
                if {"name": name} not in a:
                    fix = fix + a

        if fix:
            # De-duplicate while keeping order (dicts are not hashable).
            fix_ = []
            for f in fix:
                if f not in fix_:
                    fix_.append(f)
            return [{"class": 50604, "subclass": 0, "fix": fix_}]

    def way(self, data, tags, nds):
        """Apply the same check as ``node``."""
        return self.node(data, tags)

    def relation(self, data, tags, members):
        """Apply the same check as ``node``."""
        return self.node(data, tags)

    def split_delimitor(self, name, delimitor, ordered):
        """Split *name* on *delimitor* into ``name:xx`` candidates.

        A single part yields one candidate per language whose script regex
        matches it.  Two parts yield the pair in configured order when the
        scripts match and, unless *ordered*, also the swapped pair.  Any other
        number of parts yields no candidate.
        """
        s = [part.strip() for part in name.split(delimitor)]
        ret = []
        if len(s) == 1:
            for (lang, regex_) in self.lang_regex_script:
                if regex_.match(s[0]):
                    ret.append({"name:" + lang: s[0]})
        elif len(s) == 2:
            first_regex = self.lang_regex_script[0][1]
            second_regex = self.lang_regex_script[1][1]
            if first_regex.match(s[0]) and second_regex.match(s[1]):
                ret.append({"name:" + self.lang[0]: s[0], "name:" + self.lang[1]: s[1]})
            if not ordered and second_regex.match(s[0]) and first_regex.match(s[1]):
                ret.append({"name:" + self.lang[0]: s[1], "name:" + self.lang[1]: s[0]})
        return ret

    def split_be(self, name):
        """Split on " - ", accepting either language order."""
        return self.split_delimitor(name, ' - ', False)

    def split_sp_eu(self, name):
        """Split on "/" in either order; None if *name* contains "-" or "("."""
        if "-" not in name and "(" not in name:
            return self.split_delimitor(name, '/', False)

    def split_sp_ast(self, name):
        """Split on "/" in configured order; None if *name* contains "-" or "("."""
        if "-" not in name and "(" not in name:
            return self.split_delimitor(name, '/', True)

    # Matches one character of the Common script.
    char_common = regex.compile(r"[\p{Common}]", flags=regex.V1)
    # (language, single-character regex) pairs used by split_diff_alphabets.
    char_ma = {
        'fr': regex.compile(r"[{0}]".format(gen_regex(language2scripts['fr'])), flags=regex.V1),
        'ar': regex.compile(r"[{0}]".format(gen_regex(language2scripts['ar'])), flags=regex.V1),
        'zgh': regex.compile(r"[{0}]".format(gen_regex(language2scripts['zgh'])), flags=regex.V1),
    }.items()

    def split_ma(self, name):
        """Split *name* by script among ar, fr and zgh."""
        return self.split_diff_alphabets(name, ['ar', 'fr', 'zgh'])

    def split_dj(self, name):
        """Split *name* by script among ar and fr, trimming spaces and "/"."""
        ret = self.split_diff_alphabets(name, ['ar', 'fr'])
        if not ret:
            return None
        return [{key: value.strip(' /') for key, value in r.items()} for r in ret]

    def split_diff_alphabets(self, name, languages):
        """Split *name* into contiguous per-script segments.

        Returns a one-item list with a ``name:xx`` dict, or None when no
        script-specific text is found or when the script ranges overlap.
        """
        min_max = {l: {'min': None, 'max': None} for l in languages}

        # Record the first and last index at which each language's script appears.
        for i, c in enumerate(name):
            if self.char_common.match(c):
                continue
            for (l, char_regex) in self.char_ma:
                bounds = min_max.get(l)
                if bounds is None:
                    # char_ma also covers languages the caller did not ask
                    # for (e.g. zgh for split_dj); skip them instead of
                    # raising KeyError.
                    continue
                if char_regex.match(c):
                    if bounds['min'] is None:
                        bounds['min'] = i
                    bounds['max'] = i

        min_max_filtered = [l_mm for l_mm in min_max.items() if l_mm[1]['min'] is not None]
        if len(min_max_filtered) == 0:
            return # No text detected
        min_max_sorted = sorted(min_max_filtered, key = lambda v: v[1]['min'])
        # Flattened [min0, max0, min1, max1, ...]; must already be ascending.
        flat_bounds = [bound for (_, mm) in min_max_sorted for bound in (mm['min'], mm['max'])]
        if flat_bounds != sorted(flat_bounds):
            return # Abort, there is overlap

        # Expand the ranges so that together they cover the whole name
        min_max_sorted[0][1]['min'] = 0
        min_max_sorted[-1][1]['max'] = len(name) - 1
        for i in range(1, len(min_max_sorted)):
            min_max_sorted[i - 1][1]['max'] = min_max_sorted[i][1]['min'] - 1

        # Split
        z = {"name:" + l: name[mm['min']:mm['max'] + 1].strip() for (l, mm) in min_max_sorted}
        if len(z) > 0:
            return [z]
Ejemplo n.º 4
0
    def init(self, logger):
        """Set up the multilingual ``name`` checker from the analyser options.

        Reads the ``language`` and ``multilingual-style`` options and, for the
        styles "be", "xk", "ma" and "dj", installs ``self.aggregator`` (builds
        candidate ``name`` values from ``name:xx`` tags) and ``self.split``.
        Any other style leaves both unset.

        Returns:
            False when ``language`` or ``multilingual-style`` is not set,
            otherwise None.
        """
        Plugin.init(self, logger)
        options = self.father.config.options
        lang = options.get("language")
        style = options.get("multilingual-style")
        if not (lang and style):
            return False

        self.errors[50604] = {"item": 5060, "level": 2, "tag": ["name", "fix:chair"], "desc": T_(u"Multilingual not matching") }
        self.lang = lang
        self.present = lambda tags: tags.get("name:" + lang[0]) and tags.get("name:" + lang[1])

        def dash_candidates(tags, with_singles):
            # Candidate names from the first two configured languages.
            first = tags.get("name:" + lang[0])
            second = tags.get("name:" + lang[1])
            if not (first and second and first.strip() != second.strip()):
                # At most one usable translation, or both identical.
                return [{"name": tags.get("name:" + lang[0], second).strip()}]
            # Strip the tag values themselves; stripping the language code
            # (as before) left the second operand untrimmed.
            first, second = first.strip(), second.strip()
            singles = [{"name": first}, {"name": second}] if with_singles else []
            return singles + [
              {"name": first + " - " + second},
              {"name": second + " - " + first},
            ]

        def join_present(values, separator):
            # Join the non-empty values, each stripped, into one candidate.
            return [{"name": separator.join(v.strip() for v in values if v)}]

        if style == "be":
            self.aggregator = lambda tags: dash_candidates(tags, False)
            self.split = self.split_be
        elif style == "xk":
            self.aggregator = lambda tags: dash_candidates(tags, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = lambda tags: join_present(
                [tags.get("name:fr"), tags.get("name:zgh", tags.get("name:ber")), tags.get("name:ar")], " ")
            self.split = self.split_ma
        elif style == "dj":
            def aggregate_dj(tags):
                french = tags.get("name:fr")
                # A French name ending with a digit is joined with " / ".
                separator = " / " if french and french[-1] in '0123456789' else " "
                return join_present([french, tags.get("name:ar")], separator)
            self.aggregator = aggregate_dj
            self.split = self.split_dj

        # One [language, regex] pair per configured language; the regex accepts
        # only Common characters plus that language's scripts.
        self.lang_regex_script = [
            [l, regex.compile(r"^[\p{Common}%s]+$" % gen_regex(language2scripts[l]), flags=regex.V1)]
            for l in lang
        ]
Ejemplo n.º 5
0
    def init(self, logger):
        Plugin.init(self, logger)
        self.errors[50701] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Some value chars does not match the language charset'),
            detail = T_(
'''Words are not written in the appropriate alphabet of the
language.'''),
            fix = T_(
'''Usually, a wrong language has been chosen. Sometimes the word has been
transliterated, and needs to be changed back to the original alphabet.
`name:ar=Salaam` should be either `name:en=Salaam` (if known by
untranslated name) or `name:en=Peace` (translated) or `name:ar=سلام`
(original).'''))
        self.errors[50702] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Non printable char'),
            detail = T_(
'''A non-printable character such as linefeed (0x000a) has been
used.'''),
            fix = T_(
'''Remove the character.'''))
        self.errors[50703] = self.def_class(item = 5070, level = 2, tags = ['name', 'fix:chair'],
            title = T_('Unexpected symbol in name'),
            detail = T_(
'''A symbol is used instead of a letter from the appropriate
alphabet.'''),
            fix = T_(
'''Change the character into a punctuation mark or something else more
appropriate.'''),
            resource = 'http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]')

        country = self.father.config.options.get("country")

        self.non_printable = regex.compile(u"[\p{Line_Separator}\p{Paragraph_Separator}\p{Control}\p{Private_Use}\p{Surrogate}\p{Unassigned}]", flags=regex.V1)
        # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
        self.other_symbol = regex.compile(u"[[\p{General_Category=Other_Symbol}]--[\p{Block=Latin 1 Supplement}\p{Block=Braille Patterns}\p{Block=CJK Radicals Supplement}\p{Block=Kangxi Radicals}\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
        self.non_letter = regex.compile(u"[^\p{Letter}\p{Mark}\p{Separator}]", flags=regex.V1)
        non_look_like_latin = u"\p{Hangul}\p{Bengali}\p{Bopomofo}\p{Braille}\p{Canadian_Aboriginal}\p{Devanagari}\p{Ethiopic}\p{Gujarati}\p{Gurmukhi}\p{Han}\p{Hangul}\p{Hanunoo}\p{Hebrew}\p{Hiragana}\p{Inherited}\p{Kannada}\p{Katakana}\p{Khmer}\p{Lao}\p{Malayalam}\p{Oriya}\p{Runic}\p{Sinhala}\p{Syriac}\p{TaiLe}\p{Tamil}\p{Thaana}\p{Thai}\p{Tibetan}"
        ammend = ""
        if country and country.startswith("BG"):
            ammend = "|TT" # Bulgarian survey point
        self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, ammend, non_look_like_latin), flags=regex.V1)
        self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

        self.scripts = language2scripts

        self.uniq_scripts = {}
        for k, s in self.scripts.items():
            if s and ilen(filter(lambda ss: ss[0] != "[", s)) == 1:
                self.uniq_scripts[k] = s[0]
            else:
                self.uniq_scripts[k] = None

        self.lang = {}
        for (k, s) in self.scripts.items():
            self.lang[k] = gen_regex(s)

        self.default = None
        languages = self.father.config.options.get("language")
        if languages:
            if not isinstance(languages, list):
                languages = [languages]

            # Assert the languages are mapped to scripts
            for language in languages:
                if language not in self.lang:
                    raise Exception("No script setup for language '%s'" % language)

            # Disable default scripts if one language is not mapped to scripts
            for language in languages:
                if not self.lang[language]:
                    languages = None

            # Build default regex
            if languages:
                self.default = regex.compile(r"[\p{Common}%s]" % "".join(map(lambda l: self.lang[l], languages)), flags=regex.V1)

        self.uniq_script = self.uniq_scripts.get(languages[0]) if languages and len(languages) == 1 else None

        for l, s in list(self.lang.items()):
            if s is None:
                del(self.lang[l])
            else:
                self.lang[l] = regex.compile(r"[\p{Common}%s]" % s, flags=regex.V1)

        self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
Ejemplo n.º 6
0
    def init(self, logger):
        """Register the item 5070 classes and precompile the script regexes.

        Builds the character-class regexes used to detect non-printable
        characters, symbols and lone Latin letters, then derives from
        ``language2scripts`` one regex per language (``self.lang``) and, when the
        ``language`` option is set and every listed language has scripts, a
        combined regex in ``self.default``.

        Raises:
            Exception: when a configured language has no entry in
                ``language2scripts``.
        """
        Plugin.init(self, logger)
        self.errors[50701] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Some value chars does not match the language charset") }
        self.errors[50702] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Non printable char") }
        self.errors[50703] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Symbol char") }

        country = self.father.config.options.get("country")

        # Backslashes are doubled because "\p" is not a valid escape sequence
        # in a non-raw literal; the resulting pattern text is unchanged.
        self.non_printable = regex.compile(u"[\\p{Line_Separator}\\p{Paragraph_Separator}\\p{Control}\\p{Private_Use}\\p{Surrogate}\\p{Unassigned}]", flags=regex.V1)
        # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
        self.other_symbol = regex.compile(u"[[\\p{General_Category=Other_Symbol}]--[\\p{Block=Latin 1 Supplement}\\p{Block=Braille Patterns}\\p{Block=CJK Radicals Supplement}\\p{Block=Kangxi Radicals}\\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
        self.non_letter = regex.compile(u"[^\\p{Letter}\\p{Mark}\\p{Separator}]", flags=regex.V1)
        non_look_like_latin = u"\\p{Hangul}\\p{Bengali}\\p{Bopomofo}\\p{Braille}\\p{Canadian_Aboriginal}\\p{Devanagari}\\p{Ethiopic}\\p{Gujarati}\\p{Gurmukhi}\\p{Han}\\p{Hangul}\\p{Hanunoo}\\p{Hebrew}\\p{Hiragana}\\p{Inherited}\\p{Kannada}\\p{Katakana}\\p{Khmer}\\p{Lao}\\p{Malayalam}\\p{Oriya}\\p{Runic}\\p{Sinhala}\\p{Syriac}\\p{TaiLe}\\p{Tamil}\\p{Thaana}\\p{Thai}\\p{Tibetan}"
        amend = ""
        if country and country.startswith("BG"):
            amend = "|TT" # Bulgarian survey point
        self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, amend, non_look_like_latin), flags=regex.V1)
        self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

        self.scripts = language2scripts

        # For each language: its first script entry when exactly one entry does
        # not start with "[", otherwise None; plus its character-class fragment.
        self.uniq_scripts = {}
        self.lang = {}
        for k, s in self.scripts.items():
            has_single_script = s and ilen(filter(lambda ss: ss[0] != "[", s)) == 1
            self.uniq_scripts[k] = s[0] if has_single_script else None
        for (k, s) in self.scripts.items():
            self.lang[k] = gen_regex(s)

        self.default = None
        languages = self.father.config.options.get("language")
        if languages:
            if not isinstance(languages, list):
                languages = [languages]

            # Assert the languages are mapped to scripts
            for language in languages:
                if language not in self.lang:
                    raise Exception("No script setup for language '%s'" % language)

            # Disable default scripts if one language is not mapped to scripts
            if any(not self.lang[language] for language in languages):
                languages = None

            # Build default regex
            if languages:
                self.default = regex.compile(r"[\p{Common}%s]" % "".join(map(lambda l: self.lang[l], languages)), flags=regex.V1)

        self.uniq_script = self.uniq_scripts.get(languages[0]) if languages and len(languages) == 1 else None

        # Replace each fragment by its compiled regex; drop languages without one.
        for l, s in list(self.lang.items()):
            if s is None:
                del self.lang[l]
            else:
                self.lang[l] = regex.compile(r"[\p{Common}%s]" % s, flags=regex.V1)

        self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
Ejemplo n.º 7
0
    def init(self, logger):
        """Register the item 5070 classes and precompile the script regexes.

        Builds the character-class regexes used to detect non-printable
        characters, symbols and lone Latin letters, then derives from
        ``language2scripts`` one regex per language (``self.lang``) and, when the
        ``language`` option is set and every listed language has scripts, a
        combined regex in ``self.default``.

        Raises:
            Exception: when a configured language has no entry in
                ``language2scripts``.
        """
        Plugin.init(self, logger)
        self.errors[50701] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Some value chars does not match the language charset") }
        self.errors[50702] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Non printable char") }
        self.errors[50703] = { "item": 5070, "level": 2, "tag": ["name", "fix:chair"], "desc": T_f(u"Symbol char") }

        country = self.father.config.options.get("country")

        # Backslashes are doubled because "\p" is not a valid escape sequence
        # in a non-raw literal; the resulting pattern text is unchanged.
        self.non_printable = regex.compile(u"[\\p{Line_Separator}\\p{Paragraph_Separator}\\p{Control}\\p{Private_Use}\\p{Surrogate}\\p{Unassigned}]", flags=regex.V1)
        # http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:General_Category=Other_Symbol:]
        self.other_symbol = regex.compile(u"[[\\p{General_Category=Other_Symbol}]--[\\p{Block=Latin 1 Supplement}\\p{Block=Braille Patterns}\\p{Block=CJK Radicals Supplement}\\p{Block=Kangxi Radicals}\\p{Block=CJK Strokes}]--[↔→◄►№]]", flags=regex.V1)
        self.non_letter = regex.compile(u"[^\\p{Letter}\\p{Mark}\\p{Separator}]", flags=regex.V1)
        non_look_like_latin = u"\\p{Hangul}\\p{Bengali}\\p{Bopomofo}\\p{Braille}\\p{Canadian_Aboriginal}\\p{Devanagari}\\p{Ethiopic}\\p{Gujarati}\\p{Gurmukhi}\\p{Han}\\p{Hangul}\\p{Hanunoo}\\p{Hebrew}\\p{Hiragana}\\p{Inherited}\\p{Kannada}\\p{Katakana}\\p{Khmer}\\p{Lao}\\p{Malayalam}\\p{Oriya}\\p{Runic}\\p{Sinhala}\\p{Syriac}\\p{TaiLe}\\p{Tamil}\\p{Thaana}\\p{Thai}\\p{Tibetan}"
        suffix_alternative = ""
        if country and country.startswith("BG"):
            suffix_alternative = "|TT" # Bulgarian survey point
        self.alone_char = regex.compile(u"(^| |[%s])(?:[A-Z]%s)(?= |[%s]|$)" % (non_look_like_latin, suffix_alternative, non_look_like_latin), flags=regex.V1)
        self.roman_number = regex.compile(u"(^| )(?:[IVXLDCM]+)(?= |$)", flags=regex.V1)

        self.scripts = language2scripts

        # For each language: its first script entry when exactly one entry does
        # not start with "[", otherwise None.
        self.uniq_scripts = {}
        for k, s in self.scripts.items():
            if s and ilen(filter(lambda ss: ss[0] != "[", s)) == 1:
                self.uniq_scripts[k] = s[0]
            else:
                self.uniq_scripts[k] = None

        # Character-class fragment per language (compiled further below).
        self.lang = {}
        for (k, s) in self.scripts.items():
            self.lang[k] = gen_regex(s)

        self.default = None
        languages = self.father.config.options.get("language")
        if languages:
            if not isinstance(languages, list):
                languages = [languages]

            # Assert the languages are mapped to scripts
            for language in languages:
                if language not in self.lang:
                    raise Exception("No script setup for language '%s'" % language)

            # Disable default scripts if one language is not mapped to scripts
            if not all(self.lang[language] for language in languages):
                languages = None

            # Build default regex
            if languages:
                self.default = regex.compile(r"[\p{Common}%s]" % "".join(map(lambda l: self.lang[l], languages)), flags=regex.V1)

        self.uniq_script = self.uniq_scripts.get(languages[0]) if languages and len(languages) == 1 else None

        # Replace each fragment by its compiled regex; drop languages without
        # one.  Identity test: None is a singleton.
        for l, s in list(self.lang.items()):
            if s is None:
                del self.lang[l]
            else:
                self.lang[l] = regex.compile(r"[\p{Common}%s]" % s, flags=regex.V1)

        self.names = [u"name", u"name_1", u"name_2", u"alt_name", u"loc_name", u"old_name", u"official_name", u"short_name"]
Ejemplo n.º 8
0
class Name_Multilingual(Plugin):
    def init(self, logger):
        """Configure the multilingual name checker from the analyser options.

        Reads the "language" and "multilingual-style" options. When either is
        missing or empty, returns False right after the base initialisation
        and configures nothing else. Otherwise registers error class 50604
        and sets:

        - self.lang: the configured "language" value. It is indexed as a
          sequence of language codes; the first two are the ones combined.
        - self.present: predicate on a tags dict, truthy when both
          name:<lang[0]> and name:<lang[1]> are set.
        - self.aggregator: callable(tags) -> list of {"name": ...} candidates
          built from the localized names.
        - self.split: callable(name) splitting a combined name into name:xx
          tags (split_be for "be" and "xk", split_ma for "ma").
        - self.lang_regex_script: list of [language, compiled regex] pairs;
          each regex matches a whole string made of \\p{Common} characters
          plus the scripts mapped to that language.

        Styles other than "be", "xk" and "ma" leave self.aggregator and
        self.split unset.
        """
        Plugin.init(self, logger)
        options = self.father.config.options
        lang = options.get("language")
        style = options.get("multilingual-style")
        if not (lang and style):
            return False

        self.errors[50604] = {
            "item": 5060,
            "level": 2,
            "tag": ["name", "fix:chair"],
            "desc": T_(u"Multilingual not matching")
        }
        self.lang = lang
        self.present = lambda tags: (tags.get("name:" + lang[0])
                                     and tags.get("name:" + lang[1]))

        def distinct_pair(tags):
            # Stripped (first, second) names when both are set and differ,
            # otherwise None.
            first = tags.get("name:" + lang[0])
            second = tags.get("name:" + lang[1])
            if first and second and first.strip() != second.strip():
                return first.strip(), second.strip()
            return None

        def single_name(tags):
            # Fallback when the two names cannot be combined: use the first
            # language tag if the key exists, else the second one. Nothing is
            # proposed when neither key exists (previously this raised
            # AttributeError on None).
            only = tags.get("name:" + lang[0], tags.get("name:" + lang[1]))
            if only is None:
                return []
            return [{"name": only.strip()}]

        def aggregate_dash(tags, with_single):
            # Candidates "A - B" and "B - A"; with_single additionally
            # accepts each name alone. Both sides are stripped: the original
            # code called strip() on the language code instead of the second
            # tag value, leaving that value unstripped.
            pair = distinct_pair(tags)
            if pair is None:
                return single_name(tags)
            first, second = pair
            combined = [
                {"name": first + " - " + second},
                {"name": second + " - " + first},
            ]
            if with_single:
                return [{"name": first}, {"name": second}] + combined
            return combined

        def aggregate_ma(tags):
            # French, then Tamazight (name:zgh, or name:ber when the zgh key
            # is absent), then Arabic, joined by single spaces; missing or
            # empty values are skipped.
            parts = [
                tags.get("name:fr"),
                tags.get("name:zgh", tags.get("name:ber")),
                tags.get("name:ar"),
            ]
            return [{"name": " ".join(part.strip() for part in parts if part)}]

        if style == "be":
            self.aggregator = lambda tags: aggregate_dash(tags, False)
            self.split = self.split_be
        elif style == "xk":
            self.aggregator = lambda tags: aggregate_dash(tags, True)
            self.split = self.split_be
        elif style == "ma":
            self.aggregator = aggregate_ma
            self.split = self.split_ma

        # Must be a real list: split_be indexes it and iterates it on every
        # call. The pattern is spelled with an escaped backslash because the
        # ur"" prefix is not valid on Python 3; the string value is unchanged.
        self.lang_regex_script = [
            [
                code,
                regex.compile(u"^[\\p{Common}%s]+$" % gen_regex(
                    language2scripts[code]),
                              flags=regex.V1)
            ] for code in lang
        ]

    def filter_fix_already_existing(self, names, s):
        """Remove from each proposed fix the values that are already known.

        names: collection of name values already present on the object.
        s: iterable of dicts mapping a tag key to a proposed value.

        Returns a list, in the order of s, of copies of those dicts without
        the entries whose value is in names; dicts left empty are dropped.
        """
        # Comprehensions replace the tuple-parameter lambda (a syntax error
        # on Python 3) and always produce a real list, which callers
        # concatenate with other lists.
        pruned = (
            {key: value for key, value in z.items() if value not in names}
            for z in s
        )
        return [d for d in pruned if d]

    def node(self, data, tags):
        """Check that "name" and the name:<lang> tags agree with each other.

        data: unused here.
        tags: dict of the object's tags.

        Returns None when there is nothing to check or nothing to propose,
        otherwise a one-element list
        [{"class": 50604, "subclass": 0, "fix": [...]}] whose "fix" entry
        holds the de-duplicated candidate tag changes.
        """
        name = tags.get("name")
        # One entry per configured language: the stripped name:<lang> value,
        # or None when it is missing or blank.
        names = [(value and value.strip()) or None
                 for value in (tags.get("name:" + code) for code in self.lang)]
        names_counts = sum(1 for value in names if value)

        if not name and names_counts == 0:
            return

        fix = []

        s = self.split(name) if name else None

        # Split: name -> name:xx
        if s:
            ss = list(self.filter_fix_already_existing(names, s))

            # Remove the uniq fix, if does not change an already existing tag.
            # The lookup uses the tag key; the original passed the whole
            # (key, value) item to tags.get(), which could never match.
            if names_counts == 0:
                ss = [d for d in ss
                      if len(d) > 1 or any(tags.get(key) for key in d)]

            fix = fix + ss

        # Aggregate: name:xx -> name
        if names_counts > 0:
            # With a split available, test each split variant completed by
            # the real tags (real tags win); otherwise test the tags alone.
            candidates = [dict(z, **tags) for z in s] if s else [tags]
            for candidate in candidates:
                a = self.aggregator(candidate)
                if {"name": name} not in a:
                    fix = fix + a

        if fix:
            # Drop duplicates while keeping first-seen order (dicts are not
            # hashable, hence the linear membership test).
            fix_ = []
            for f in fix:
                if f not in fix_:
                    fix_.append(f)
            return [{"class": 50604, "subclass": 0, "fix": fix_}]

    def way(self, data, tags, nds):
        """Check a way by delegating to node(); nds is not used."""
        return self.node(data, tags)

    def relation(self, data, tags, members):
        """Check a relation by delegating to node(); members is not used."""
        return self.node(data, tags)

    def split_be(self, name):
        """Split a "first - second" style name into name:<lang> tags.

        name: the combined name string.

        Returns a list of dicts of proposed tags:
        - one part: one {"name:<lang>": part} dict per entry of
          self.lang_regex_script whose regex matches the part;
        - two parts: a dict assigning the parts to the first two languages
          for each ordering in which both regexes match;
        - any other number of parts: an empty list.
        """
        # A real list is needed: it is measured with len() and indexed,
        # which a lazy map() result does not support on Python 3.
        parts = [part.strip() for part in name.split(' - ')]
        ret = []
        if len(parts) == 1:
            for (lang, regex_) in self.lang_regex_script:
                if regex_.match(parts[0]):
                    ret.append({"name:" + lang: parts[0]})
        elif len(parts) == 2:
            first, second = parts
            match_a = self.lang_regex_script[0][1].match
            match_b = self.lang_regex_script[1][1].match
            # Parts given in the configured language order.
            if match_a(first) and match_b(second):
                ret.append({
                    "name:" + self.lang[0]: first,
                    "name:" + self.lang[1]: second
                })
            # Parts given in the reverse order.
            if match_b(first) and match_a(second):
                ret.append({
                    "name:" + self.lang[0]: second,
                    "name:" + self.lang[1]: first
                })
        return ret

    # Matches a single character having the \p{Common} property.
    char_common = regex.compile(r"[\p{Common}]", flags=regex.V1)
    # (language, single-character matcher) items for the languages handled
    # by split_ma; each matcher is built from the scripts mapped to that
    # language in language2scripts.
    char_ma = dict(
        (code,
         regex.compile(r"[%s]" % gen_regex(language2scripts[code]),
                       flags=regex.V1))
        for code in ('fr', 'ar', 'zgh')
    ).items()

    def split_ma(self, name):
        """Split a name written in several scripts into name:<lang> tags.

        Each character of name that is not matched by self.char_common is
        tested against the matchers in self.char_ma; for every language the
        first and last matching positions are recorded. When the per-language
        ranges do not interleave, they are widened to cover the whole string
        and each slice becomes the value of the corresponding name:<lang> tag.

        Returns a one-element list holding the dict of proposed tags, or None
        when no language is detected or when the ranges overlap.
        """
        spans = {code: {'min': None, 'max': None}
                 for code in ('ar', 'fr', 'zgh')}

        for i, c in enumerate(name):
            if self.char_common.match(c):
                continue
            for code, pattern in self.char_ma:
                if pattern.match(c):
                    if spans[code]['min'] is None:
                        spans[code]['min'] = i
                    spans[code]['max'] = i

        # Keep only the detected languages, ordered by first occurrence.
        # A real list is built because it is measured and indexed below.
        found = sorted(
            ((code, span) for code, span in spans.items()
             if span['min'] is not None),
            key=lambda item: item[1]['min'])
        if not found:
            return  # No text detected

        # Flattened [min, max, min, max, ...]; the ranges are disjoint and
        # ordered exactly when this sequence is already sorted.
        bounds = [bound for _, span in found
                  for bound in (span['min'], span['max'])]
        if bounds != sorted(bounds):
            return  # Abort, there is overlap

        # Expand: the ranges together cover the whole string, each one
        # ending just before the next one starts.
        found[0][1]['min'] = 0
        found[-1][1]['max'] = len(name) - 1
        for previous, following in zip(found, found[1:]):
            previous[1]['max'] = following[1]['min'] - 1

        # Split
        z = {"name:" + code: name[span['min']:span['max'] + 1].strip()
             for code, span in found}
        if z:
            return [z]