Esempio n. 1
0
    def ldml_write(self, exemplars, sort=True):
        """Write exemplars to a string that can be written to a LDML formatted file.

        Args:
            exemplars: iterable of exemplar strings (grapheme clusters).
            sort: when True, sort the exemplars (UTS #35 requires sorted
                exemplars); when False, keep the caller's order, which is
                assumed to already be by frequency.

        Returns:
            A space-joined string when self.unittest is set, otherwise the
            UnicodeSet text produced by palaso.sldr.UnicodeSets.list2us().
        """
        if sort:
            # Exemplars mentioned in UTS #35 need to be sorted.
            # sorted() already returns a new list; no manual copy loop needed.
            list_exemplars = sorted(exemplars)
        else:
            # Graphemes should be sorted by frequency,
            # and since they already are,
            # do nothing further here with the order.
            list_exemplars = exemplars

        # NFC-normalize each exemplar before script filtering.
        list_nfc_exemplars = map(self.ucd.normalize_nfc, list_exemplars)

        # Ignore exemplars not of most common script found in the data.
        # The main script is loop-invariant, so compute it once up front.
        main_script = self._get_script()
        list_nfc_exemplars_main_script = []
        for exemplar in list_nfc_exemplars:
            char = exemplar[0]  # only look at the first character in an exemplar.
            script = Script.getScript(char)
            script_name = Script.getShortName(script)
            # Keep exemplars of the main script, and characters with no
            # specific script (Common/Inherited), which belong anywhere.
            if script_name == main_script or not self.ucd.is_specific_script(char):
                list_nfc_exemplars_main_script.append(exemplar)

        if self.unittest:
            return ' '.join(list_nfc_exemplars_main_script)
        return palaso.sldr.UnicodeSets.list2us(
            list_nfc_exemplars_main_script, self.ucd)
Esempio n. 2
0
    def ldml_write(self, exemplars, sort=True):
        """Write exemplars to a string that can be written to a LDML formatted file.

        Args:
            exemplars: iterable of exemplar strings (grapheme clusters).
            sort: when True, sort exemplars as UTS #35 requires; when False,
                preserve the incoming (frequency) order.

        Returns:
            A space-joined string when self.unittest is set, otherwise the
            UnicodeSet text produced by palaso.sldr.UnicodeSets.list2us().
        """
        if sort:
            # Exemplars mentioned in UTS #35 need to be sorted.
            # sorted() already produces a list; the append loop was redundant.
            list_exemplars = sorted(exemplars)
        else:
            # Graphemes should be sorted by frequency,
            # and since they already are,
            # do nothing further here with the order.
            list_exemplars = exemplars

        list_nfc_exemplars = map(self.ucd.normalize_nfc, list_exemplars)

        # Ignore exemplars not of most common script found in the data.
        # Hoist the main-script lookup out of the loop (it is invariant).
        main_script = self._get_script()
        list_nfc_exemplars_main_script = []
        for exemplar in list_nfc_exemplars:
            char = exemplar[0]  # only look at the first character in an exemplar.
            script = Script.getScript(char)
            script_name = Script.getShortName(script)
            # Characters without a specific script (Common/Inherited) are
            # kept regardless of the main script.
            if script_name == main_script or not self.ucd.is_specific_script(char):
                list_nfc_exemplars_main_script.append(exemplar)

        if self.unittest:
            return ' '.join(list_nfc_exemplars_main_script)
        return palaso.sldr.UnicodeSets.list2us(list_nfc_exemplars_main_script, self.ucd)
Esempio n. 3
0
 def is_specific_script(char):
     """True if the character has a specific Script property,
     that is, not the values Common or Inherited.
     """
     code = Script.getScriptCode(Script.getScript(char))
     return code not in (UScriptCode.COMMON, UScriptCode.INHERITED)
Esempio n. 4
0
 def is_specific_script(char):
     """Report whether the character carries a specific Script property.

     Returns False when the script is Common or Inherited, True otherwise.
     """
     script = Script.getScript(char)
     script_code = Script.getScriptCode(script)
     is_generic = (script_code == UScriptCode.COMMON
                   or script_code == UScriptCode.INHERITED)
     return not is_generic
Esempio n. 5
0
def make_json(puz_uid, hint_uid, size=(30, 30), limit=1000, sample=100, numtrans=3):
    """Build a puzzle, dump it to "puzzle.json", and return the clue list.

    Args:
        puz_uid: identifier passed through to get_script/gen_puzzle2.
        hint_uid: identifier passed through to translate_clues.
        size: (rows, cols) grid size forwarded to gen_puzzle2.
        limit: limit forwarded to gen_puzzle2.
        sample: kept for interface compatibility (unused here).
        numtrans: number of translations forwarded to translate_clues.

    Returns:
        The list of (possibly column-mirrored) clue records written to JSON.
    """
    script = get_script(puz_uid)
    # Right-to-left scripts need the grid and clue columns mirrored.
    rtl = Script(Script.getCode(script)[0]).isRightToLeft()
    c = translate_clues(gen_puzzle2(puz_uid, size, limit, script),
                        puz_uid, hint_uid, numtrans)

    # Replace blank cells with empty strings; reverse each row for RTL.
    grid = []
    for row in c.best_grid:
        new_row = [cell if cell != " " else "" for cell in row]
        if rtl:
            new_row = new_row[::-1]
        grid.append(new_row)

    # Sort clues by fields (4, 2, 3) — presumably (direction, row, column);
    # TODO confirm against the wordlist record layout.
    wl = []
    for clue in sorted(c.best_wordlist, key=lambda x: (x[4], x[2], x[3])):
        new_clue = clue[:]
        if rtl:
            # Mirror the column coordinate for right-to-left layouts.
            new_clue[3] = len(grid[0]) - 1 - clue[3]
        wl.append(new_clue)

    allcaps = "".join("".join(clue[0]) for clue in wl).isupper()
    # Use a context manager so the file handle is closed even on error
    # (the original passed open() directly to json.dump and never closed it).
    with open("puzzle.json", "w") as f:
        json.dump({"grid": grid, "clues": wl, "rtl": rtl, "allcaps": allcaps}, f)
    return wl
Esempio n. 6
0
 def _get_script(self):
     """Return most frequently occurring script."""
     most_common = self.scripts.most_common(1)
     if not most_common:
         # No characters were recorded, so there is no dominant script.
         return ''
     script_code, _count = most_common[0]
     return Script.getShortName(self.codes_for_scripts[script_code])
Esempio n. 7
0
 def _get_script(self):
     """Return the short name of the script seen most often, or '' if none."""
     top = self.scripts.most_common(1)
     if top:
         code = top[0][0]
         script = self.codes_for_scripts[code]
         return Script.getShortName(script)
     # Empty counter: no script data has been recorded yet.
     return ''
Esempio n. 8
0
def normalize_string(in_str, allowed_scripts):
    """
    Normalizes in_str by replacing letters and digits in other scripts with
    exemplar values.

    Args:
        in_str: String to process
        allowed_scripts: List of script short names (like "Mymr") to preserve

    Returns:
        The normalized string.
    """
    # TODO: Consider checking ScriptExtensions here as well
    # Accumulate pieces in a list and join once at the end; repeated
    # string concatenation in a loop is quadratic in the worst case.
    pieces = []
    for ch in in_str:
        ch_script = Script.getScript(ch)
        ch_type = Char.charType(ch)
        ch_bucket = CHAR_TYPE_TO_BUCKET[ch_type]
        ch_digit = Char.digit(ch)
        if ch_script.getShortName() in allowed_scripts:
            # ch is in an allowed script:
            # copy directly to the output
            pieces.append(ch)
        elif ch_bucket == 1:
            # ch is a letter in a disallowed script:
            # normalize to the sample char for that script
            pieces.append(Script.getSampleString(ch_script))
        elif ch_bucket == 3 and ch_digit != -1:
            # ch is a decimal digit in a disallowed script:
            # normalize to the zero digit in that numbering system
            # (decimal digit blocks are contiguous, so subtracting the
            # digit value lands on that block's zero).
            pieces.append(chr(ord(ch) - ch_digit))
        elif ch_type == UCharCategory.CURRENCY_SYMBOL:
            # ch is a currency symbol in a disallowed script:
            # normalize to $
            pieces.append("$")
        else:
            # all other characters:
            # copy directly to the output
            pieces.append(ch)
    return "".join(pieces)
Esempio n. 9
0
    def process(self, text):
        """Analyze a string.

        Side effects on self:
          * self.scripts — increments a per-script-code character count.
          * self.codes_for_scripts — maps each script code to its script.
          * self.clusters — increments counts of Exemplar objects built from
            multigraphs, punctuation characters, and base+trailer clusters.

        The text is NFD-normalized first, so combining marks follow their
        base characters as separate code points.
        """
        i = 0
        text = self.ucd.normalize('NFD', text)

        # Record script of each character.
        for char in text:
            script = Script.getScript(char)
            script_code = Script.getScriptCode(script)
            self.scripts[script_code] += 1
            self.codes_for_scripts[script_code] = script

        # Record clusters
        while i < len(text):

            # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
            # of multigraphs already specified in a LDML file.
            # Longest possible matches are looked at first.
            for multigraph_length in range(self.max_multigraph_length, 0, -1):
                multigraph = text[i:i + multigraph_length]

                if (multigraph in self._main or
                   multigraph in self._auxiliary or
                   multigraph in self._index or
                   multigraph in self._punctuation):
                    exemplar = Exemplar(multigraph)
                    self.clusters[exemplar] += 1
                    i += multigraph_length
                    break

            # NOTE(review): after a multigraph match above, control falls
            # through and the character at the advanced position is also
            # processed below in the same iteration, without re-checking
            # for a multigraph starting there — confirm this is intended.

            # No multigraphs were found at this position,
            # so continue processing a single character
            # if we have not gone beyond the end of the text.
            if not i < len(text):
                break

            char = text[i]

            # Test for punctuation.
            if self.ucd.ispunct(char):
                exemplar = Exemplar(char)
                self.clusters[exemplar] += 1
                i += 1
                continue

            # Find grapheme clusters.

            # Ensure exemplar base has needed properties.
            if not self.allowable(char):
                i += 1
                continue

            # The current character is a base character.
            base = char

            # Then find the end of the cluster
            # (which may consist of only base characters).
            length = base_length = 1
            while i + length < len(text):
                trailer = text[i + length]
                if Char.hasBinaryProperty(trailer, UProperty.DEFAULT_IGNORABLE_CODE_POINT):
                    # A Default_Ignorable_Code_Point was found, so the cluster continues.
                    length += 1
                    continue
                if self.ucd.ismark(trailer):
                    # A Mark was found, so the cluster continues.
                    length += 1

                    # Marks such as nuktas are considered part of the base.
                    if self.ucd.is_always_combine(trailer):
                        # A Mark such as a nukta was found, so the base continues,
                        # as well as the cluster.
                        base_length += 1
                        base = text[i:i + base_length]
                    continue
                else:
                    # No more marks, so the end of the cluster has been reached.
                    break

            # Extract cluster

            # If no nuktas have been found,
            # then the base will be the single character already called base (or char).
            # If no non-nukta marks have been found,
            # then the trailers variable will be an empty string.
            trailers = text[i + base_length:i + length]
            exemplar = Exemplar(base, trailers)

            self.clusters[exemplar] += 1
            i += length
Esempio n. 10
0
    def process(self, text):
        """Analyze a string.

        Side effects on self:
          * self.scripts — increments a per-script-code character count.
          * self.codes_for_scripts — maps each script code to its script.
          * self.clusters — increments counts of Exemplar objects built from
            multigraphs, punctuation characters, and base+trailer clusters.

        This variant extends clusters across ZWJ and terminates (but still
        includes) a trailing ZWNJ. The text is NFD-normalized first, so
        combining marks follow their bases as separate code points.
        """
        i = 0
        text = self.ucd.normalize('NFD', text)

        # Record script of each character.
        for char in text:
            script = Script.getScript(char)
            script_code = Script.getScriptCode(script)
            self.scripts[script_code] += 1
            self.codes_for_scripts[script_code] = script

        # Record clusters
        while i < len(text):

            # Look for multigraphs (from length of max_multigraph_length down to 1) character(s)
            # of multigraphs already specified in a LDML file.
            # Longest possible matches are looked at first.
            for multigraph_length in range(self.max_multigraph_length, 0, -1):
                multigraph = text[i:i + multigraph_length]

                if (multigraph in self._main or
                   multigraph in self._auxiliary or
                   multigraph in self._index or
                   multigraph in self._punctuation):
                    exemplar = Exemplar(multigraph)
                    self.clusters[exemplar] += 1
                    i += multigraph_length
                    break

            # NOTE(review): after a multigraph match above, control falls
            # through and the character at the advanced position is also
            # processed below in the same iteration, without re-checking
            # for a multigraph starting there — confirm this is intended.

            # No multigraphs were found at this position,
            # so continue processing a single character
            # if we have not gone beyond the end of the text.
            if not i < len(text):
                break

            char = text[i]

            # Test for punctuation.
            if self.ucd.ispunct(char):
                exemplar = Exemplar(char)
                self.clusters[exemplar] += 1
                i += 1
                continue

            # Find grapheme clusters.

            # Ensure exemplar base has needed properties.
            if not self.allowable(char):
                i += 1
                continue

            # The current character is a base character.
            base = char

            # Then find the end of the cluster
            # (which may consist of only base characters).
            length = base_length = 1
            while i + length < len(text):
                trailer = text[i + length]
                if self.ucd.is_zwj(trailer):
                    # ZWJ found, so the cluster continues.
                    length += 1
                    continue
                if self.ucd.is_zwnj(trailer):
                    # ZWNJ found, so the end of the cluster has been reached,
                    # but the ZWNJ itself is still included in the cluster.
                    length += 1
                    break
                if self.ucd.ismark(trailer):
                    # A Mark was found, so the cluster continues.
                    length += 1

                    # Marks such as nuktas are considered part of the base.
                    if self.ucd.is_always_combine(trailer):
                        # A Mark such as a nukta was found, so the base continues,
                        # as well as the cluster.
                        base_length += 1
                        base = text[i:i + base_length]
                    continue
                else:
                    # No more marks, so the end of the cluster has been reached.
                    break

            # Extract cluster

            # If no nuktas have been found,
            # then the base will be the single character already called base (or char).
            # If no non-nukta marks have been found,
            # then the trailers variable will be an empty string.
            trailers = text[i + base_length:i + length]
            exemplar = Exemplar(base, trailers)

            self.clusters[exemplar] += 1
            i += length