Example 1
def check_is_valid_glyph_string(glyphs, iso=None):
    """
    a string of glyphs like "a b c d e f" should be single-space separated
    single unicode characters
    """
    if type(glyphs) is not str or len(glyphs) < 1:
        log.error("Do not use empty glyph sequences")
        return False

    if re.findall(r"\n", glyphs):
        log.error("Glyph sequences should not contain line breaks")
        return False

    if re.findall(r" {2,}", glyphs):
        log.error("More than single space in '%s'" % glyphs)
        print([g for g in re.findall(r" {2,}", glyphs)])
        return False

    pruned, removed = prune_superflous_marks(glyphs)
    if len(removed) > 0:
        log.error("Superflous marks that are implicitly extracted via "
                  "decomposition: '%s'" % "','".join(removed))
        return False

    for c in glyphs:
        if unicodedata2.category(c) == "Sk":
            log.warning("'%s' contains modifier symbol '%s' in characters. It "
                        "is very likely this should be a combining mark "
                        "instead." % (iso, c))

    return True
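
A usage sketch, assuming the helpers this function relies on (log, prune_superflous_marks, unicodedata2) are importable and behave as in the other examples here:

print(check_is_valid_glyph_string("a b c"))    # True
print(check_is_valid_glyph_string("a  b c"))   # False - double space
print(check_is_valid_glyph_string(""))         # False - empty input
print(check_is_valid_glyph_string("a b\nc"))   # False - contains a line break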
Example 2
def is_separator(character: str) -> bool:
    if character.isspace() or character in ["|", "+", ",", ";", "<", ">"]:
        return True

    character_category = unicodedata.category(character)  # type: str

    return "Z" in character_category
Example 3
def prune_superflous_marks(string):
    """
    From a given string return a set of unique characters with all those
    standalone Mark charaters removed that are already implicitly present in
    a decomposable character

    @param string str
    @return set pruned, set removed
    """
    unique_strings = character_list_from_string(string)
    removed = []

    for c in unique_strings:
        # No need to bother about glyph clusters with more than one character,
        # since that inherently will not be a mistakenly listed mark
        if len(c) > 1:
            continue
        if unicodedata2.category(c).startswith("M"):
            for s in unique_strings:
                if s != c and c in parse_chars(s):
                    removed.append(c)

    if removed == []:
        return unique_strings, ()

    pruned = list_unique([c for c in unique_strings if c not in removed])
    removed = list_unique(removed)

    return pruned, removed
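
The core check is that a standalone combining mark is redundant if some other listed character decomposes to it. A minimal, self-contained sketch of that idea, using the stdlib unicodedata module and NFD normalization as a stand-in for parse_chars():

import unicodedata

mark = "\u0301"  # COMBINING ACUTE ACCENT
print(unicodedata.category(mark))                 # 'Mn' - a Mark category
print(mark in unicodedata.normalize("NFD", "é"))  # True - implicit in "é"
# So in a list like ["é", "\u0301"] the bare mark would be pruned.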
Example 4
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category
Example 5
def detectScript(txt):
    charScript = [script(c) for c in txt]

    for i, ch in enumerate(txt):
        scr = charScript[i]
        if scr in UNKNOWN_SCRIPT:
            if i:
                scr = charScript[i - 1]
            else:
                scr = None
            cat = category(ch)
            if ch in MIRRORED and cat == "Pe":
                scr = None
        charScript[i] = scr

    # Any unknowns should be mapped to the _next_ script
    prev = None
    for i in range(len(txt) - 1, -1, -1):
        if charScript[i] is None:
            charScript[i] = prev
        else:
            prev = charScript[i]

    # There may be unknowns at the end of the string, fall back to
    # preceding script
    prev = "Zxxx"  # last resort
    for i in range(len(txt)):
        if charScript[i] is None:
            charScript[i] = prev
        else:
            prev = charScript[i]

    assert None not in charScript

    return charScript
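
The two fill passes are easiest to follow on a toy list; this sketch reproduces just those passes with plain Python (no script()/category() lookups):

# Pre-computed script list where None marks characters of unknown script.
scripts = ["Latn", None, None, "Arab", None]

# Backward pass: unknowns take the script of the next known character.
prev = None
for i in range(len(scripts) - 1, -1, -1):
    if scripts[i] is None:
        scripts[i] = prev
    else:
        prev = scripts[i]

# Forward pass: remaining (trailing) unknowns take the preceding script.
prev = "Zxxx"  # last resort
for i in range(len(scripts)):
    if scripts[i] is None:
        scripts[i] = prev
    else:
        prev = scripts[i]

print(scripts)  # ['Latn', 'Arab', 'Arab', 'Arab', 'Arab']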
Example 6
def prune_chars(self, retainDecomposed=False):
    """
    A helper to parse all orthographies' charsets in all languages. This
    decomposes glyphs and prunes any glyphs that are redundant. Also
    transforms the dict attributes from strings to lists.
    """
    for lang in self.values():
        if "orthographies" in lang:
            for o in lang["orthographies"]:
                for type in [
                        "base", "auxiliary", "numerals", "punctuation",
                        "marks"
                ]:
                    if type in o:
                        o[type] = parse_chars(o[type], True,
                                              retainDecomposed)
                        if type == "base":
                            o[type] = [
                                c for c in o[type]
                                if not unicodedata2.category(c).startswith(
                                    "M")
                            ]  # noqa
                # Remove any components in auxiliary after decomposition
                # that are already in base
                if "base" in o and "auxiliary" in o:
                    o["auxiliary"] = [
                        a for a in o["auxiliary"] if a not in o["base"]
                    ]
Example 7
	def getCategoryStr(self):
		cat = unicodedata.category(self.text)
		if cat == 'Cn':
			try:
				cat = unicodeInfo.unicodeData['en'][self.num][1]
			except KeyError:
				pass
		catNames = [self.getCategoryValue(cat, l) for l in unicodeInfo.langs]
		return cat + ' - ' + ' / '.join(catNames)
Example 8
def is_punctuation(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    if "P" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Punctuation" in character_range
Example 9
def is_symbol(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    if "S" in character_category or "N" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Forms" in character_range
Example 10
def check_types(Langs):
    for iso, lang in Langs.items():
        if "includes" in lang:
            if not check_is_valid_list(lang["includes"]):
                logging.error("'%s' has invalid list 'includes'" % iso)

        if "source" in lang:
            if not check_is_valid_list(lang["source"]):
                logging.error("'%s' has invalid list 'source'" % iso)

        if "orthographies" in lang:
            if not check_is_valid_list(lang["orthographies"]):
                logging.error("'%s' has invalid list 'orthographies'" % iso)

            for o in lang["orthographies"]:
                if "base" in o:
                    if iso == "arg":
                        for i, c in enumerate(list(o["base"].replace(" ",
                                                                     ""))):
                            if unicodedata2.category(c).startswith("Z"):
                                logging.error("'%s' has invalid whitespace "
                                              "characters '%s' at %d" %
                                              (iso, unicodedata2.name(c), i))

                    if not check_is_valid_glyph_string(o["base"]):
                        logging.error("'%s' has invalid 'base' glyph list" %
                                      iso)

                if "combinations" in o:
                    if not check_is_valid_combation_string(o["combinations"]):
                        logging.error("'%s' has invalid 'combination' string" %
                                      iso)

        if "name" not in lang and "preferred_name" not in lang:
            logging.error("'%s' has neither 'name' nor 'preferred_name'" % iso)

        if "name" in lang and "preferred_name" in lang and \
                lang["name"] == lang["preferred_name"]:
            logging.error("'%s' has 'name' and 'preferred_name', but they are "
                          "identical" % iso)

        # if "todo_status" in lang and lang["todo_status"] not in VALID_TODOS:
        #     logging.error("'%s' has an invalid 'todo_status'" % iso)

        if "status" in lang and lang["status"] not in VALID_STATUS:
            logging.error("'%s' has an invalid 'status'" % iso)
Example 11
def sort_key_character_category(c):
    """
    Sorting comparator to sort unicode characters by their unicode type, first
    Letters (Uppercase, then lowercase, if applicable), then Marks, then
    anything else, secondary sort by unicode ASC
    """
    order = ["Lu", "Lt", "Ll", "LC", "L", "Lo", "Mn", "Me", "M", "Mc"]

    # Get the two-letter general category code (e.g. "Lu", "Mn")
    cat = unicodedata2.category(c)[:2]

    # Get the index of that letter in the order, or higher if not found
    order = order.index(cat) if cat in order else len(order)

    # Concat the primary order with the unicode int, so as secondary sort we
    # get unicode ASC
    order = "%s-%s" % (str(order).zfill(2), str(ord(c)).zfill(8))
    return order
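
Typical usage is as a key function for sorted(). Assuming the function above (with unicodedata2, or the stdlib unicodedata as a drop-in), uppercase letters sort before lowercase, which sort before combining marks, with codepoint order as the tiebreaker:

chars = ["\u0301", "b", "B", "a", "A"]
print(sorted(chars, key=sort_key_character_category))
# ['A', 'B', 'a', 'b', '\u0301']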
Example 12
def parse_marks(input):
    """
    From a space separated string
    """
    chars = parse_chars(input)
    return [c for c in chars if unicodedata2.category(c).startswith("M")]
Example 13
def is_private_use_only(character: str) -> bool:
    character_category = unicodedata.category(character)  # type: str

    return "Co" == character_category
Example 14
	# return file names:
	if len(args) < 1: 
		print "Please specify an inputfont."
		sys.exit(2)
	elif len(args) < 2: 
		inPath = args[0]
		outPath = os.path.splitext(inPath)[0] + DEFAULT_OUTPATH_ADDITION.lower().strip() + os.path.splitext(inPath)[1]
	else: 
		inPath = args[0]
		outPath = args[1]
	return inPath, outPath

#########################################################################################################

MARK_GLYPH_CODEPOINT_RANGE = [ int(m) for m in range(65000) if unicodedata.category(unichr(m)) == "Mn" ]; m=None # also allow for "M"?
MARK_GLYPH_CODEPOINT_RANGE.remove(int("034F", 16))

PPF2_SUPPORTED = 0 # not tested yet! and deactivated ...

#########################################################################################################

def saveFile(data,file):
	modus = "wb"
	file = os.path.abspath(file)
	if os.path.exists(file): os.remove(file)
	directory = os.path.dirname(file)
	if not os.path.exists(directory): os.makedirs(directory)
	theFile = open(file,modus)
	theFile.write(data)
	theFile.close()
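
The snippet above is Python 2 (print statement, unichr). For reference, a Python 3 sketch of the same MARK_GLYPH_CODEPOINT_RANGE built further up would be:

import unicodedata

# All non-spacing mark ("Mn") codepoints below 65000, minus U+034F
# COMBINING GRAPHEME JOINER (itself category Mn).
MARK_GLYPH_CODEPOINT_RANGE = [
    cp for cp in range(65000) if unicodedata.category(chr(cp)) == "Mn"
]
MARK_GLYPH_CODEPOINT_RANGE.remove(0x034F)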
Example 15
def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"
Example 16
def check_types(Langs):
    for iso, lang in Langs.items():
        if "includes" in lang:
            if not check_is_valid_list(lang["includes"]):
                log.error("'%s' has invalid list 'includes'" % iso)

        if "source" in lang:
            if not check_is_valid_list(lang["source"]):
                log.error("'%s' has invalid list 'source'" % iso)

        if "orthographies" in lang:
            if not check_is_valid_list(lang["orthographies"]):
                log.error("'%s' has invalid list 'orthographies'" % iso)

            for o in lang["orthographies"]:
                if "base" in o:
                    if iso == "arg":
                        chars = list(o["base"].replace(" ", ""))
                        for i, c in enumerate(chars):
                            if unicodedata2.category(c).startswith("Z"):
                                log.error("'%s' has invalid whitespace "
                                          "characters '%s' at %d" %
                                          (iso, unicodedata2.name(c), i))

                    if not check_is_valid_glyph_string(o["base"], iso):
                        log.error("'%s' has invalid 'base' glyph list" % iso)

                if "auxiliary" in o:
                    if not check_is_valid_glyph_string(o["auxiliary"], iso):
                        log.error("'%s' has invalid 'auxiliary' glyph list" %
                                  iso)

                allowed = [
                    "autonym",
                    "inherit",
                    "script",
                    "base",
                    "marks",
                    "auxiliary",
                    "numerals",
                    "status",
                    "note",
                    "punctuation",  # tolerated for now, but unused
                    "preferred_as_group",
                    "design_note"
                ]
                invalid = [k for k in o.keys() if k not in allowed]
                if len(invalid):
                    log.warn("'%s' has invalid orthography keys: '%s'" %
                             (iso, "', '".join(invalid)))

                if "status" not in o:
                    log.error("'%s' has an orthography (script '%s') that is "
                              "missing 'status'" % (iso, o["script"]))
                else:
                    if o["status"] not in ORTHOGRAPHY_STATUSES:
                        log.error("'%s' has an orthography status '%s' which "
                                  "is invalid, should be one of %s" %
                                  (iso, o["status"],
                                   ", ".join(ORTHOGRAPHY_STATUSES)))

            primary_orthography = [
                o for o in lang["orthographies"]
                if "status" in o and o["status"] == "primary"
            ]
            if len(primary_orthography) == 0:
                log.error("'%s' has no primary orthography" % iso)

        if "name" not in lang and "preferred_name" not in lang:
            log.error("'%s' has neither 'name' nor 'preferred_name'" % iso)

        if "name" in lang and "preferred_name" in lang and \
                lang["name"] == lang["preferred_name"]:
            log.error("'%s' has 'name' and 'preferred_name', but they are "
                      "identical" % iso)

        if "status" in lang and lang["status"] not in STATUSES:
            log.error("'%s' has an invalid 'status'" % iso)

        if "validity" not in lang:
            log.warn("'%s' is missing 'validity'" % iso)

        if "validity" in lang and lang["validity"] not in VALIDITYLEVELS:
            log.error("'%s' has invalid 'validity'" % iso)

        if "speakers" in lang:
            if (re.search(r"[^\d]", str(lang["speakers"]))):
                log.error("'%s' has invalid 'speakers' '%s' - only numbers "
                          "are allowed" % (iso, lang["speakers"]))
Example 17
def save_sorted(Langs=None):
    """
    Helper script to re-save the hyperglot.yaml sorted alphabetically,
    alternatively from the passed in Langs object (which can have been
    modified)
    """
    log.setLevel(logging.WARNING)
    if Langs is None:
        Langs = Languages(inherit=False, prune=False)
        print("Running pre-save validation, please fix any issues flagged.")
        # validate()

    # Save with superfluous marks removed
    for iso, lang in Langs.items():
        if "orthographies" in lang:
            for i, o in enumerate(lang["orthographies"]):
                for type in ["base", "auxiliary", "numerals"]:
                    if type in o:
                        chars = o[type]
                        pruned, removed = prune_superflous_marks(" ".join(
                            o[type]))

                        if len(removed) > 0:
                            log.info("Saving '%s' with '%s' pruned of "
                                     "superfluous marks (implicitly "
                                     "included in combining glyphs): "
                                     "%s" % (iso, type, "','".join(removed)))

                        chars = pruned

                        # Do not include anything (after decomposition)
                        # that is already listed in base
                        if "base" in o and type != "base":
                            chars = [c for c in chars if c not in o["base"]]

                        joined = " ".join(chars)

                        Langs[iso]["orthographies"][i][type] = joined

                # Automate extracting and writing marks (in addition to any
                # that might have been defined manually). Note that we only
                # extract marks from 'base' since 'marks' are part of the
                # base level checking. Marks in 'auxiliary' will simply be
                # saved (if necessary) in 'auxiliary'.
                marks = []
                if "marks" in o:
                    marks = parse_chars(o["marks"],
                                        decompose=True,
                                        retainDecomposed=False)
                if "base" in o:
                    marks = set(marks + parse_marks(o["base"]))
                if len(marks) > 0:
                    # Note: Let's store marks with two spaces between to
                    # make them more legible; when parsing the attribute
                    # back in all whitespaces are removed
                    o["marks"] = "  ".join(sorted(marks))
                    if "base" in o:
                        base, removed = prune_superflous_marks(" ".join(
                            o["base"]))

                        # Save base without marks
                        _base = [
                            c for c in base
                            if not uni.category(c).startswith("M")
                        ]
                        o["base"] = " ".join(_base)

    # Sort by keys
    alphabetic = dict(OrderedDict(sorted(Langs.items())))

    file = open(DB, "w")
    yaml.dump(alphabetic, file, **DUMP_ARGS)
    print("Saved lib/hyperglot/hyperglot.yaml")
Example 18
def get_stats_from_chars(text_chars, db=None):

    report = {}

    uppercase = []
    numerals = []
    punctuation = []
    controlchars = []
    spaces = []
    other = []

    # Include decomposed forms
    for c in text_chars:
        decomposed = ud.normalize("NFKD", c)
        if len(decomposed) > 1:
            text_chars = text_chars + [d for d in decomposed]

    text_chars = set(text_chars)

    for c in text_chars:
        # print(c, ud.category(c))
        cat = ud.category(c)

        if cat == "Lu":
            uppercase.append(c)
        elif cat.startswith("N"):
            numerals.append(c)
        elif cat.startswith("P"):
            punctuation.append(c)
        elif cat.startswith("C") and len(c) > 1:
            controlchars.append(c)
        elif cat.startswith("Z"):
            spaces.append(c)
        else:
            other.append(c)

    # Remove all but "other" from chars, we don't care about them for diffing
    for remove in [
            uppercase, numerals, punctuation, controlchars, spaces,
        ["\n", "\t"]
    ]:
        text_chars = text_chars.difference(set(remove))

    report["iso_in_db"] = db is not None
    report["found_in_text"] = {
        "uppercase": sorted(uppercase),
        "numerals": sorted(numerals),
        "punctuation": sorted(punctuation),
        "chars": sorted(text_chars)
    }

    # Compare to orthographies
    if db is not None:
        db_chars = []
        if "orthographies" in db:
            for o in db["orthographies"]:
                if "base" in o:
                    db_chars = db_chars + o["base"]
                if "auxiliary" in o:
                    db_chars = db_chars + o["auxiliary"]

        db_chars = set(db_chars)

        not_in_db = text_chars.difference(db_chars)
        missing_from_text = db_chars.difference(text_chars)
        decomposed = set(parse_chars("".join(text_chars), decompose=True))

        # print("Listed in DB but not in text", missing_from_text)
        # print("Appears in text but not listed in DB", not_in_db)
        # print("Text can be written with DB characters",
        #       decomposed.issubset(db_chars))
        missing_from_db = ""
        for c in not_in_db:
            missing = ud.normalize("NFKD", c)
            missing_parts = ""
            for part in missing:
                if part not in db_chars:
                    missing_parts = missing_parts + part
            if missing_parts:
                missing_from_db = missing_from_db + missing_parts
        # print("missing from db", sorted(list(missing_from_db)))
        missing_from_db = sorted(list(set(missing_from_db)))

        report["not_in_text"] = sorted(missing_from_text)
        report["not_in_db"] = sorted(not_in_db)
        if missing_from_db:
            report["missing_from_db"] = missing_from_db
        report["db_chars_valid"] = decomposed.issubset(db_chars)

    return report
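
The bucketing at the top of this function is a switch on the category prefix; a self-contained sketch with the stdlib unicodedata module:

import unicodedata as ud

buckets = {"uppercase": [], "numerals": [], "punctuation": [], "other": []}
for c in sorted(set("Wörld, 42!")):
    cat = ud.category(c)
    if cat == "Lu":
        buckets["uppercase"].append(c)
    elif cat.startswith("N"):
        buckets["numerals"].append(c)
    elif cat.startswith("P"):
        buckets["punctuation"].append(c)
    elif not cat.startswith(("C", "Z")):
        buckets["other"].append(c)
print(buckets)
# {'uppercase': ['W'], 'numerals': ['2', '4'],
#  'punctuation': ['!', ','], 'other': ['d', 'l', 'r', 'ö']}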
Example 19
def infoline(cp):
	i = get_info(cp)
	char = i.char.encode('unicode_escape').decode() if category(i.char).startswith('C') else i.char
	return [oct(i.cp), i.cp, hex(i.cp), i.html, char, i.block.name, i.name]
Example 20
def remove_accents(s):
    return ''.join((c for c in unicodedata2.normalize('NFD', s)
                    if unicodedata2.category(c) != 'Mn'))


def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
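
Both helpers strip combining marks after NFD decomposition; a quick usage check, assuming unicodeToAscii as defined above:

import unicodedata

print(unicodeToAscii("café"))   # 'cafe'
print(unicodeToAscii("Señor"))  # 'Senor'
print(len(unicodedata.normalize("NFD", "é")))  # 2 - 'e' plus U+0301, the mark that gets dropped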