Beispiel #1
0
    def get_lcp(self):
        """Return the longest-common-prefix (LCP) array of the sorted suffixes.

        Entry ``r`` of the result is the number of leading tokens shared by
        the suffix of rank ``r`` and the suffix of rank ``r - 1`` in
        ``self.suffixes``; entry 0 is always 0. The suffix array is built
        first (via ``self.get_suffixes()``) if it is not complete yet.
        """

        if not self._completed_suffixes:
            self.get_suffixes()

        assert self._completed_suffixes

        # rank[g][t] is the position of suffix (g, t) inside self.suffixes
        rank = [[0] * len(tokens) for tokens in self.data]
        lcp = [0] * self.length

        for position in range(self.length):
            g, t = self.suffixes[position]
            rank[g][t] = position

        for g, current in enumerate(self.data):
            # number of tokens already known to match; carried from one
            # suffix of this charstring to the next (minus one) so the
            # comparison does not restart from scratch each time
            matched = 0
            for t in range(len(current)):
                position = rank[g][t]
                if position <= 0:
                    # the first suffix in sorted order has no predecessor
                    continue

                prev_g, prev_t = self.suffixes[position - 1]
                previous = self.data[prev_g]
                while (prev_t + matched < len(previous)
                       and t + matched < len(current)
                       and previous[prev_t + matched] == current[t + matched]):
                    matched += 1
                lcp[position] = matched

                matched = max(matched - 1, 0)

        return lcp
Beispiel #2
0
    def get_lcp(self):
        """Return the longest-common-prefix (LCP) array of the sorted suffixes.

        Entry ``r`` of the result is the number of leading tokens shared by
        the suffix of rank ``r`` and the suffix of rank ``r - 1`` in
        ``self.suffixes``; entry 0 is always 0. The suffix array is built
        first (via ``self.get_suffixes()``) if it is not complete yet.
        """

        if not self._completed_suffixes:
            self.get_suffixes()

        assert self._completed_suffixes

        # rank[g][t] is the position of suffix (g, t) inside self.suffixes
        rank = [[0] * len(tokens) for tokens in self.data]
        lcp = [0] * self.length

        for position in range(self.length):
            g, t = self.suffixes[position]
            rank[g][t] = position

        for g, current in enumerate(self.data):
            # number of tokens already known to match; carried from one
            # suffix of this charstring to the next (minus one) so the
            # comparison does not restart from scratch each time
            matched = 0
            for t in range(len(current)):
                position = rank[g][t]
                if position <= 0:
                    # the first suffix in sorted order has no predecessor
                    continue

                prev_g, prev_t = self.suffixes[position - 1]
                previous = self.data[prev_g]
                while (prev_t + matched < len(previous)
                       and t + matched < len(current)
                       and previous[prev_t + matched] == current[t + matched]):
                    matched += 1
                lcp[position] = matched

                matched = max(matched - 1, 0)

        return lcp
Beispiel #3
0
 def _sortByDecompositionBase(self, glyphNames, ascending,
                              allowPseudoUnicode):
     baseToGlyphNames = {None: []}
     for glyphName in glyphNames:
         if allowPseudoUnicode:
             value = self.pseudoUnicodeForGlyphName(glyphName)
         else:
             value = self.unicodeForGlyphName(glyphName)
         if value is None:
             base = None
         else:
             base = unicodeTools.decompositionBase(value)
             base = self.glyphNameForUnicode(base)
             # try to add the glyph names suffix to the base.
             # this will handle mapping aacute.alt to a.alt
             # instead of aacute.alt to a.
             if base is not None:
                 if "." in glyphName and not glyphName.startswith("."):
                     suffix = glyphName.split(".")[1]
                     if base + "." + suffix in self.font:
                         base = base + "." + suffix
         if base not in baseToGlyphNames:
             baseToGlyphNames[base] = []
         baseToGlyphNames[base].append(glyphName)
     # get the list of glyphs with no base.
     noBase = baseToGlyphNames.pop(None)
     # find all bases that are not in the overall glyph names list
     missingBase = []
     for base in sorted(baseToGlyphNames):
         if base is None:
             continue
         if base not in noBase:
             missingBase.append(base)
     # work through the found bases
     processedBases = set()
     sortedResult = []
     for base in noBase:
         if base in processedBases:
             continue
         processedBases.add(base)
         # the base could be in the list more than once.
         # if so, add the proper number of instances of the base.
         count = noBase.count(base)
         r = [base for i in range(count)]
         # add the referencing glyphs
         r += baseToGlyphNames.get(base, [])
         sortedResult.append(r)
     # work through the missing bases
     for base in sorted(missingBase):
         sortedResult.append(baseToGlyphNames[base])
     # reverse if necessary
     if not ascending:
         sortedResult.reverse()
     return sortedResult
Beispiel #4
0
 def _sortByDecompositionBase(self, glyphNames, ascending, allowPseudoUnicode):
     baseToGlyphNames = {None:[]}
     for glyphName in glyphNames:
         if allowPseudoUnicode:
             value = self.pseudoUnicodeForGlyphName(glyphName)
         else:
             value = self.unicodeForGlyphName(glyphName)
         if value is None:
             base = None
         else:
             base = unicodeTools.decompositionBase(value)
             base = self.glyphNameForUnicode(base)
             # try to add the glyph names suffix to the base.
             # this will handle mapping aacute.alt to a.alt
             # instead of aacute.alt to a.
             if base is not None:
                 if "." in glyphName and not glyphName.startswith("."):
                     suffix = glyphName.split(".")[1]
                     if base + "." + suffix in self.font:
                         base = base + "." + suffix
         if base not in baseToGlyphNames:
             baseToGlyphNames[base] = []
         baseToGlyphNames[base].append(glyphName)
     # get the list of glyphs with no base.
     noBase = baseToGlyphNames.pop(None)
     # find all bases that are not in the overall glyph names list
     missingBase = []
     for base in sorted(baseToGlyphNames):
         if base is None:
             continue
         if base not in noBase:
             missingBase.append(base)
     # work through the found bases
     processedBases = set()
     sortedResult = []
     for base in noBase:
         if base in processedBases:
             continue
         processedBases.add(base)
         # the base could be in the list more than once.
         # if so, add the proper number of instances of the base.
         count = noBase.count(base)
         r = [base for i in range(count)]
         # add the referencing glyphs
         r += baseToGlyphNames.get(base, [])
         sortedResult.append(r)
     # work through the missing bases
     for base in sorted(missingBase):
         sortedResult.append(baseToGlyphNames[base])
     # reverse if necessary
     if not ascending:
         sortedResult.reverse()
     return sortedResult
Beispiel #5
0
def _encode_base64(data, maxlinelength=76, indent_level=1):
    data = b64encode(data)
    if data and maxlinelength:
        # split into multiple lines right-justified to 'maxlinelength' chars
        indent = b"\n" + b"  " * indent_level
        max_length = max(16, maxlinelength - len(indent))
        chunks = []
        for i in range(0, len(data), max_length):
            chunks.append(indent)
            chunks.append(data[i:i + max_length])
        chunks.append(indent)
        data = b"".join(chunks)
    return data
Beispiel #6
0
    def process_chstrings(self, glyph_set):
        """Re-encode every charstring over a compact integer alphabet.

        The glyphs of `glyph_set` are visited in sorted key order. Each
        distinct token is assigned the next free integer; the token and
        its cost are appended to `self.rev_keymap` and `self.cost_map`
        at that position. For every glyph the encoded program is appended
        to `self.data` as a tuple, one `(glyph index, offset)` pair per
        token is appended to `self.suffixes`, and `self.length` grows by
        the program length. `self.alphabet_size` ends up as the number of
        distinct tokens seen.
        """

        self.glyph_set_keys = sorted(glyph_set.keys())

        token_ids = {}  # charstring token -> integer symbol
        fresh_id = 0

        for name in self.glyph_set_keys:
            char_string = glyph_set[name]._glyph
            char_string.decompile()
            encoded = []
            tokens = iter(enumerate(char_string.program))
            for pos, tok in tokens:
                assert tok not in ("callsubr", "callgsubr", "return")
                assert tok != "endchar" or pos == len(char_string.program) - 1
                if tok in ("hintmask", "cntrmask"):
                    # A subroutine call cannot be placed between a mask
                    # operator and the token following it, so the two are
                    # fused into one symbol.
                    tok = (tok, next(tokens)[1])
                if tok not in token_ids:
                    token_ids[tok] = fresh_id
                    self.rev_keymap.append(tok)
                    self.cost_map.append(tokenCost(tok))
                    fresh_id += 1
                encoded.append(token_ids[tok])

            encoded = tuple(encoded)
            glyph_idx = len(self.data)
            self.length += len(encoded)
            self.suffixes.extend(
                (glyph_idx, offset) for offset in range(len(encoded)))
            self.data.append(encoded)

        self.alphabet_size = fresh_id
Beispiel #7
0
    def process_chstrings(self, glyph_set):
        """Re-encode every charstring over a compact integer alphabet.

        The glyphs of `glyph_set` are visited in sorted key order. Each
        distinct token is assigned the next free integer; the token and
        its cost are appended to `self.rev_keymap` and `self.cost_map`
        at that position. For every glyph the encoded program is appended
        to `self.data` as a tuple, one `(glyph index, offset)` pair per
        token is appended to `self.suffixes`, and `self.length` grows by
        the program length. `self.alphabet_size` ends up as the number of
        distinct tokens seen.
        """

        self.glyph_set_keys = sorted(glyph_set.keys())

        token_ids = {}  # charstring token -> integer symbol
        fresh_id = 0

        for name in self.glyph_set_keys:
            char_string = glyph_set[name]._glyph
            char_string.decompile()
            encoded = []
            tokens = iter(enumerate(char_string.program))
            for pos, tok in tokens:
                assert tok not in ("callsubr", "callgsubr", "return")
                assert tok != "endchar" or pos == len(char_string.program) - 1
                if tok in ("hintmask", "cntrmask"):
                    # A subroutine call cannot be placed between a mask
                    # operator and the token following it, so the two are
                    # fused into one symbol.
                    tok = (tok, next(tokens)[1])
                if tok not in token_ids:
                    token_ids[tok] = fresh_id
                    self.rev_keymap.append(tok)
                    self.cost_map.append(tokenCost(tok))
                    fresh_id += 1
                encoded.append(token_ids[tok])

            encoded = tuple(encoded)
            glyph_idx = len(self.data)
            self.length += len(encoded)
            self.suffixes.extend(
                (glyph_idx, offset) for offset in range(len(encoded)))
            self.data.append(encoded)

        self.alphabet_size = fresh_id
Beispiel #8
0
    def process_subrs(glyph_set_keys, encodings, fdlen, fdselect, substrings,
                      rev_keymap, subr_limit, nest_limit):
        """
        Decide which candidate substrings become subroutines, place them
        in the global or a local subr array, and build their programs.

        Arguments:
        glyph_set_keys -- glyph names, in the same order as `encodings`
        encodings -- per glyph, a list of (start index, candidate) pairs
        fdlen -- number of local subr arrays to create
        fdselect -- callable mapping a glyph name to its FD index, or None
                    (every glyph then uses index 0)
        substrings -- all candidate substrings
        rev_keymap -- maps an integer symbol back to its charstring token
        subr_limit -- maximum number of entries per subr array
        nest_limit -- maximum allowed `_max_call_depth` of a subr

        Side effects on the candidates: `_fdidx` (FD indexes they are
        reachable from), `_flatten` (set on every rejected candidate),
        `_global`, `_position` and `_program` are assigned here.

        Returns:
        A tuple (gsubrs, lsubrs): the list of global subrs and a list,
        indexed by FD index, of lists of local subrs.
        """

        def mark_reachable(cand_subr, fdidx):
            # record fdidx on the candidate and, recursively, on every
            # candidate its own encoding refers to
            try:
                if fdidx not in cand_subr._fdidx:
                    cand_subr._fdidx.append(fdidx)
            except AttributeError:
                cand_subr._fdidx = [fdidx]

            for it in cand_subr._encoding:
                mark_reachable(it[1], fdidx)

        if fdselect is not None:
            for g, enc in zip(glyph_set_keys, encodings):
                sel = fdselect(g)
                for it in enc:
                    mark_reachable(it[1], sel)
        else:
            for encoding in encodings:
                for it in encoding:
                    mark_reachable(it[1], 0)

        # keep only candidates that are used, reachable from some glyph
        # and actually save space
        subrs = [
            s for s in substrings
            if s.usages() > 0 and hasattr(s, '_fdidx') and bool(s._fdidx)
            and s.subr_saving(use_usages=True, true_cost=True) > 0
        ]

        bad_substrings = [
            s for s in substrings if s.usages() == 0
            or not hasattr(s, '_fdidx') or not bool(s._fdidx)
            or s.subr_saving(use_usages=True, true_cost=True) <= 0
        ]
        log.debug("%d substrings unused or negative saving subrs",
                  len(bad_substrings))

        for s in bad_substrings:
            s._flatten = True

        gsubrs = []
        lsubrs = [[] for _ in range(fdlen)]

        # ascending by saving, so the best candidate is popped from the end
        subrs.sort(
            key=lambda s: s.subr_saving(use_usages=True, true_cost=True))

        while subrs and (any(len(s) < subr_limit
                             for s in lsubrs) or len(gsubrs) < subr_limit):
            subr = subrs[-1]
            del subrs[-1]
            if len(subr._fdidx) == 1:
                # used from a single FD: may go either local or global
                lsub_index = lsubrs[subr._fdidx[0]]
                if len(gsubrs) < subr_limit:
                    if len(lsub_index) < subr_limit:
                        # both have space
                        gcost = Compreffor.test_call_cost(subr, gsubrs)
                        lcost = Compreffor.test_call_cost(subr, lsub_index)

                        if gcost < lcost:
                            Compreffor.insert_by_usage(subr, gsubrs)
                            subr._global = True
                        else:
                            Compreffor.insert_by_usage(subr, lsub_index)
                    else:
                        # just gsubrs has space
                        Compreffor.insert_by_usage(subr, gsubrs)
                        subr._global = True
                elif len(lsub_index) < subr_limit:
                    # just lsubrs has space
                    Compreffor.insert_by_usage(subr, lsub_index)
                else:
                    # we must skip :(
                    bad_substrings.append(subr)
            else:
                if len(gsubrs) < subr_limit:
                    # we can put it in globals
                    Compreffor.insert_by_usage(subr, gsubrs)
                    subr._global = True
                else:
                    # no room for this one
                    bad_substrings.append(subr)

        # add any leftover subrs to bad_substrings. The entries of `subrs`
        # are the candidate objects themselves (they get `_flatten` set
        # below), so they are added as-is rather than indexed.
        bad_substrings.extend(subrs)

        if fdselect is not None:
            # CID-keyed: Avoid `callsubr` usage in global subroutines
            bad_lsubrs = Compreffor.collect_lsubrs_called_from(gsubrs)
            bad_substrings.extend(bad_lsubrs)
            lsubrs = [[s for s in lsubrarr if s not in bad_lsubrs]
                      for lsubrarr in lsubrs]

        for s in bad_substrings:
            s._flatten = True

        # fix any nesting issues
        Compreffor.calc_nesting(gsubrs)
        for subrs in lsubrs:
            Compreffor.calc_nesting(subrs)

        too_nested = [
            s for s in itertools.chain(*lsubrs)
            if s._max_call_depth > nest_limit
        ]
        too_nested.extend(
            [s for s in gsubrs if s._max_call_depth > nest_limit])
        for s in too_nested:
            s._flatten = True
        bad_substrings.extend(too_nested)
        lsubrs = [[s for s in lsubrarr if s._max_call_depth <= nest_limit]
                  for lsubrarr in lsubrs]
        gsubrs = [s for s in gsubrs if s._max_call_depth <= nest_limit]
        too_nested = len(too_nested)

        log.debug("%d substrings nested too deep", too_nested)
        log.debug("%d substrings being flattened", len(bad_substrings))

        # reorganize to minimize call cost of most frequent subrs
        gbias = psCharStrings.calcSubrBias(gsubrs)
        lbias = [psCharStrings.calcSubrBias(s) for s in lsubrs]

        for subr_arr, bias in zip(itertools.chain([gsubrs], lsubrs),
                                  itertools.chain([gbias], lbias)):
            subr_arr.sort(key=lambda s: s.usages(), reverse=True)

            # for the two larger bias values the usage-sorted array is
            # rotated block-wise; the block boundaries depend on the bias
            if bias == 1131:
                subr_arr[:] = subr_arr[216:1240] + subr_arr[0:216] + subr_arr[
                    1240:]
            elif bias == 32768:
                subr_arr[:] = (subr_arr[2264:33901] + subr_arr[216:1240] +
                               subr_arr[0:216] + subr_arr[1240:2264] +
                               subr_arr[33901:])
            for idx, subr in enumerate(subr_arr):
                subr._position = idx

        for subr in sorted(bad_substrings, key=lambda s: len(s)):
            # NOTE: it is important this is run in order so shorter
            # substrings are run before longer ones
            if hasattr(subr, '_fdidx') and len(subr._fdidx) > 0:
                program = [rev_keymap[tok] for tok in subr.value()]
                Compreffor.update_program(program, subr.encoding(), gbias,
                                          lbias, None)
                Compreffor.expand_hintmask(program)
                subr._program = program

        # build the final programs; the global array is paired with a
        # selector of None, each local array with its FD index
        for subr_arr, sel in zip(itertools.chain([gsubrs], lsubrs),
                                 itertools.chain([None], range(fdlen))):
            for subr in subr_arr:
                program = [rev_keymap[tok] for tok in subr.value()]
                if program[-1] not in ("endchar", "return"):
                    program.append("return")
                Compreffor.update_program(program, subr.encoding(), gbias,
                                          lbias, sel)
                Compreffor.expand_hintmask(program)
                subr._program = program

        return (gsubrs, lsubrs)
Beispiel #9
0
    def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
        """
        Choose a subroutinization encoding for all charstrings in
        `glyph_set` using an iterative Dynamic Programming algorithm.
        Initially uses the results from SubstringFinder and then
        iteratively optimizes.

        Arguments:
        glyph_set -- the set of charstrings to encode (required)
        fdselect -- the FDSelect array of the source font, or None
        fdlen -- the number of FD's in the source font, or 1 if there are none

        Returns:
        A three-part dictionary with keys 'gsubrs', 'lsubrs', and
        'glyph_encodings'. The 'glyph_encodings' encoding dictionary
        specifies how to break up each charstring. Encoding[i]
        describes how to encode glyph i. Each entry is something
        like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
        into the charstring that indicates where a subr starts and c_*
        is a CandidateSubr. The 'gsubrs' entry contains an array of global
        subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
        by FDidx, where each entry is a list of local subroutines.
        """

        # generate substrings for marketplace
        sf = SubstringFinder(glyph_set)

        if self.test_mode:
            substrings = sf.get_substrings(min_freq=0,
                                           check_positive=False,
                                           sort_by_length=False)
        else:
            substrings = sf.get_substrings(min_freq=2,
                                           check_positive=True,
                                           sort_by_length=False)

        # TODO remove unnecessary substrings?

        data = sf.data
        rev_keymap = sf.rev_keymap
        cost_map = sf.cost_map
        glyph_set_keys = sf.glyph_set_keys
        del sf

        if not self.SINGLE_PROCESS:
            pool = multiprocessing.Pool(processes=self.PROCESSES)
        else:

            class DummyPool:
                pass

            pool = DummyPool()
            pool.map = lambda f, *l, **kwargs: map(f, *l)
            # no-op shutdown hooks, so both kinds of pool can be released
            # the same way below
            pool.close = lambda: None
            pool.join = lambda: None

        # the pool is only needed for the market rounds; make sure its
        # worker processes are shut down even if a round raises
        try:
            substr_dict = {}

            timer.split()

            log.debug("glyphstrings+substrings=%d",
                      len(data) + len(substrings))

            # set up dictionary with initial values
            for idx, substr in enumerate(substrings):
                substr._adjusted_cost = substr.cost()
                substr._price = substr._adjusted_cost
                # this is the frequency that the substring appears,
                # not necessarily used
                substr._usages = substr.freq
                substr._list_idx = idx
                # NOTE: avoid excess data copying on fork
                # probably can just pass substr
                # if threading instead
                substr_dict[substr.value()] = (idx, substr._price)

            for run_count in range(self.NROUNDS):
                # calibrate prices
                for idx, substr in enumerate(substrings):
                    marg_cost = float(
                        substr._adjusted_cost) / (substr._usages + self.K)
                    substr._price = marg_cost * self.ALPHA + substr._price * (
                        1 - self.ALPHA)
                    substr_dict[substr.value()] = (idx, substr._price)

                # minimize substring costs
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(substrings)))
                substr_encodings = pool.map(
                    functools.partial(optimize_charstring,
                                      cost_map=cost_map,
                                      substr_dict=substr_dict,
                                      progress=self._progress),
                    enumerate([s.value() for s in substrings]),
                    chunksize=csize)

                for substr, result in zip(substrings, substr_encodings):
                    substr._encoding = [(enc_item[0], substrings[enc_item[1]])
                                        for enc_item in result["encoding"]]
                    substr._adjusted_cost = result["market_cost"]
                del substr_encodings

                # minimize charstring costs in current market through DP
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(data)))
                encodings = pool.map(functools.partial(optimize_charstring,
                                                       cost_map=cost_map,
                                                       substr_dict=substr_dict,
                                                       progress=self._progress),
                                     data,
                                     chunksize=csize)
                encodings = [[(enc_item[0], substrings[enc_item[1]])
                              for enc_item in i["encoding"]]
                             for i in encodings]

                # update substring frequencies based on cost minimization
                for substr in substrings:
                    substr._usages = 0

                for calling_substr in substrings:
                    for start, substr in calling_substr._encoding:
                        if substr:
                            substr._usages += 1
                for glyph_idx, enc in enumerate(encodings):
                    for start, substr in enc:
                        if substr:
                            substr._usages += 1

                if log.isEnabledFor(logging.INFO):
                    log.info("Round %d Done!", (run_count + 1))
                    # the statistics divide by len(substrings) and take a
                    # max over it, so they are skipped when it is empty
                    if substrings:
                        log.info(
                            "avg: %f",
                            (float(sum(substr._usages
                                       for substr in substrings)) /
                             len(substrings)))
                        log.info("max: %d",
                                 max(substr._usages for substr in substrings))
                        log.info("used: %d",
                                 sum(substr._usages > 0
                                     for substr in substrings))

                if run_count <= self.NROUNDS - 2 and not self.test_mode:
                    with timer("cutdown"):
                        if run_count < self.NROUNDS - 2:
                            bad_substrings = [
                                s for s in substrings
                                if s.subr_saving(use_usages=True) <= 0
                            ]
                            substrings = [
                                s for s in substrings
                                if s.subr_saving(use_usages=True) > 0
                            ]
                        else:
                            bad_substrings = [
                                s for s in substrings if s.subr_saving(
                                    use_usages=True, true_cost=False) <= 0
                            ]
                            substrings = [
                                s for s in substrings if s.subr_saving(
                                    use_usages=True, true_cost=False) > 0
                            ]

                        for substr in bad_substrings:
                            # heuristic to encourage use of called substrings:
                            for idx, called_substr in substr._encoding:
                                called_substr._usages += substr._usages - 1
                            del substr_dict[substr.value()]
                        for idx, s in enumerate(substrings):
                            s._list_idx = idx
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug(
                                "%d substrings with non-positive savings removed",
                                len(bad_substrings))
                            log.debug(
                                "(%d had positive usage)",
                                len([s for s in bad_substrings
                                     if s._usages > 0]))
        finally:
            pool.close()
            pool.join()

        log.info("Finished iterative market (%gs)", timer.split())
        log.info("%d candidate subrs found", len(substrings))

        gsubrs, lsubrs = Compreffor.process_subrs(glyph_set_keys, encodings,
                                                  fdlen, fdselect, substrings,
                                                  rev_keymap,
                                                  self.NSUBRS_LIMIT,
                                                  self.SUBR_NEST_LIMIT)

        return {
            "glyph_encodings": dict(zip(glyph_set_keys, encodings)),
            "lsubrs": lsubrs,
            "gsubrs": gsubrs
        }
Beispiel #10
0
def optimize_charstring(charstring, cost_map, substr_dict, progress=False):
    """Optimize a charstring (encoded using keymap) using
    the substrings in substr_dict. This is the Dynamic Programming portion
    of `iterative_encode`.

    Arguments:
    charstring -- either a tuple of tokens, or an `(index, tokens)` pair
                  with an int index; in the pair form the substring
                  registered under `index` is never chosen (it may only
                  match the whole of `tokens`)
    cost_map -- indexed by token, gives the cost of emitting that token
    substr_dict -- maps a tuple of tokens to `(substring index, price)`
    progress -- when true, write a "." to stderr when done

    Returns:
    A dictionary with keys "encoding", a list of
    `(start position, substring index)` pairs in increasing start order,
    and "market_cost", the minimal total cost found.
    """

    # Default first, so the name is always bound even when the input
    # looks like a pair but does not start with an int.
    skip_idx = None
    if (len(charstring) > 1 and isinstance(charstring[1], tuple)
            and isinstance(charstring[0], int)):
        skip_idx = charstring[0]
        charstring = charstring[1]

    length = len(charstring)
    # results[i] is the cheapest cost of encoding charstring[i:]
    results = [0] * (length + 1)
    next_enc_idx = [None] * length
    next_enc_substr = [None] * length
    for i in reversed(range(length)):
        min_option = float("inf")
        min_enc_idx = length
        min_enc_substr = None
        cur_cost = 0
        for j in range(i + 1, length + 1):
            cur_cost += cost_map[charstring[j - 1]]

            entry = substr_dict.get(charstring[i:j])
            if entry is None:
                # note: must not be branching, so just make _price actual cost
                substr = None
                option = cur_cost + results[j]
            elif entry[0] != skip_idx:
                option = entry[1] + results[j]
                substr = entry[0]
            else:
                # the substring being optimized must not encode itself
                assert i == 0 and j == length
                substr = None
                option = cur_cost + results[j]

            # strict comparison: on ties the shortest span wins
            if option < min_option:
                min_option = option
                min_enc_idx = j
                min_enc_substr = substr

        results[i] = min_option
        next_enc_idx[i] = min_enc_idx
        next_enc_substr[i] = min_enc_substr

    market_cost = results[0]

    # walk the chosen spans from the start and keep those that are
    # substring references
    encoding = []
    cur_enc_idx = 0
    while cur_enc_idx < length:
        last_idx = cur_enc_idx
        cur_enc_substr = next_enc_substr[cur_enc_idx]
        cur_enc_idx = next_enc_idx[cur_enc_idx]

        if cur_enc_substr is not None:
            encoding.append((last_idx, cur_enc_substr))

    if progress:
        sys.stderr.write(".")
        sys.stderr.flush()
    return {"encoding": encoding, "market_cost": market_cost}
Beispiel #11
0
def optimize_charstring(charstring, cost_map, substr_dict, progress=False):
    """Find the cheapest encoding of a charstring (encoded using keymap)
    using the substrings in substr_dict. This is the Dynamic Programming
    portion of `iterative_encode`.

    Arguments:
    charstring -- either a tuple of tokens, or a pair `(idx, tokens)` where
                  `idx` is an int and `tokens` is the token tuple. In the
                  pair form the substring whose index equals `idx` is never
                  chosen, so a substring is not encoded as a call to itself.
    cost_map -- mapping from token to the cost of emitting it literally
    substr_dict -- mapping from token tuple to `(substring index, price)`
    progress -- if true, write a "." to stderr once the charstring is done

    Returns:
    A dictionary with keys 'encoding' and 'market_cost'. 'encoding' is a
    list of `(start, substring index)` pairs in increasing `start` order,
    one for each substring call in the chosen encoding. 'market_cost' is
    the total cost of that encoding.
    """

    # Always bind skip_idx. Previously it was left unbound when element 1
    # was a tuple but element 0 was not an int, which raised
    # UnboundLocalError on the first substr_dict hit.
    skip_idx = None
    if (len(charstring) > 1
            and isinstance(charstring[1], tuple)
            and isinstance(charstring[0], int)):
        skip_idx, charstring = charstring[0], charstring[1]

    length = len(charstring)

    # results[i] is the minimum cost of encoding charstring[i:];
    # results[length] stays 0 (nothing left to encode).
    results = [0] * (length + 1)
    # For each start i: where the best first piece ends, and which
    # substring (if any) that piece is a call to.
    next_enc_idx = [None] * length
    next_enc_substr = [None] * length

    for i in reversed(range(length)):
        min_option = float("inf")
        min_enc_idx = length
        min_enc_substr = None
        cur_cost = 0  # literal cost of charstring[i:j]
        for j in range(i + 1, length + 1):
            cur_cost += cost_map[charstring[j - 1]]

            entry = substr_dict.get(charstring[i:j])
            if entry is not None and entry[0] != skip_idx:
                # pay the substring's market price instead of literal cost
                substr = entry[0]
                option = entry[1] + results[j]
            else:
                if entry is not None:
                    # the only slice allowed to match the skipped substring
                    # is the whole charstring itself
                    assert i == 0 and j == length
                # note: must not be branching, so just make _price actual cost
                substr = None
                option = cur_cost + results[j]

            # strict '<' keeps the earliest (shortest) piece on ties
            if option < min_option:
                min_option = option
                min_enc_idx = j
                min_enc_substr = substr

        results[i] = min_option
        next_enc_idx[i] = min_enc_idx
        next_enc_substr[i] = min_enc_substr

    market_cost = results[0]

    # Walk the chosen pieces from the start, recording only substring calls.
    encoding = []
    pos = 0
    while pos < length:
        chosen = next_enc_substr[pos]
        if chosen is not None:
            encoding.append((pos, chosen))
        pos = next_enc_idx[pos]

    if progress:
        sys.stderr.write(".")
        sys.stderr.flush()
    return {"encoding": encoding, "market_cost": market_cost}
Beispiel #12
0
    def process_subrs(glyph_set_keys, encodings, fdlen, fdselect, substrings, rev_keymap, subr_limit, nest_limit):
        """Distribute candidate subroutines into global/local subr lists
        and build the program of every substring.

        Arguments:
        glyph_set_keys -- glyph keys, parallel to `encodings`; each key is
                          passed to `fdselect` when one is given
        encodings -- one encoding per glyph, each a list of items whose
                     second element is the subr being called
        fdlen -- number of local subr lists to create
        fdselect -- callable mapping a glyph key to its FD index, or None
                    (then every glyph is treated as FD 0)
        substrings -- all candidate subr objects
        rev_keymap -- mapping from keymap token back to program token
        subr_limit -- maximum number of subrs in the global list and in
                      each local list
        nest_limit -- subrs whose `_max_call_depth` exceeds this are flattened

        Returns:
        A tuple `(gsubrs, lsubrs)`: the list of global subrs and a list,
        indexed by FD index, of local subr lists. Substrings that are not
        placed get `_flatten = True`.
        """

        def mark_reachable(cand_subr, fdidx):
            # record that `cand_subr` (and everything it calls) is used
            # from font dict `fdidx`
            try:
                if fdidx not in cand_subr._fdidx:
                    cand_subr._fdidx.append(fdidx)
            except AttributeError:
                cand_subr._fdidx = [fdidx]

            for it in cand_subr._encoding:
                mark_reachable(it[1], fdidx)

        if fdselect is not None:
            for g, enc in zip(glyph_set_keys, encodings):
                sel = fdselect(g)
                for it in enc:
                    mark_reachable(it[1], sel)
        else:
            for encoding in encodings:
                for it in encoding:
                    mark_reachable(it[1], 0)

        def worth_keeping(s):
            # used, reachable from at least one glyph, and actually saves bytes
            return (s.usages() > 0
                    and hasattr(s, '_fdidx')
                    and bool(s._fdidx)
                    and s.subr_saving(use_usages=True, true_cost=True) > 0)

        # split candidates in a single pass, preserving their order
        subrs = []
        bad_substrings = []
        for s in substrings:
            if worth_keeping(s):
                subrs.append(s)
            else:
                bad_substrings.append(s)
        log.debug("%d substrings unused or negative saving subrs", len(bad_substrings))

        for s in bad_substrings:
            s._flatten = True

        gsubrs = []
        lsubrs = [[] for _ in range(fdlen)]

        # ascending by saving, so the most valuable candidate is popped first
        subrs.sort(key=lambda s: s.subr_saving(use_usages=True, true_cost=True))

        while subrs and (any(len(s) < subr_limit for s in lsubrs) or
                         len(gsubrs) < subr_limit):
            subr = subrs.pop()
            if len(subr._fdidx) == 1:
                # used from a single FD: may live in that FD's local list
                lsub_index = lsubrs[subr._fdidx[0]]
                if len(gsubrs) < subr_limit:
                    if len(lsub_index) < subr_limit:
                        # both have space
                        gcost = Compreffor.test_call_cost(subr, gsubrs)
                        lcost = Compreffor.test_call_cost(subr, lsub_index)

                        if gcost < lcost:
                            Compreffor.insert_by_usage(subr, gsubrs)
                            subr._global = True
                        else:
                            Compreffor.insert_by_usage(subr, lsub_index)
                    else:
                        # just gsubrs has space
                        Compreffor.insert_by_usage(subr, gsubrs)
                        subr._global = True
                elif len(lsub_index) < subr_limit:
                    # just lsubrs has space
                    Compreffor.insert_by_usage(subr, lsub_index)
                else:
                    # we must skip :(
                    bad_substrings.append(subr)
            else:
                if len(gsubrs) < subr_limit:
                    # we can put it in globals
                    Compreffor.insert_by_usage(subr, gsubrs)
                    subr._global = True
                else:
                    # no room for this one
                    bad_substrings.append(subr)

        # Add any leftover subrs to bad_substrings. `subrs` holds the subr
        # objects themselves, so extend with them directly (the previous
        # `s[1]` indexed into each subr instead).
        bad_substrings.extend(subrs)

        for s in bad_substrings:
            s._flatten = True

        # fix any nesting issues
        # NOTE(review): calc_nesting presumably fills in `_max_call_depth`,
        # which is read just below -- confirm against its definition.
        Compreffor.calc_nesting(gsubrs)
        for local_subrs in lsubrs:
            Compreffor.calc_nesting(local_subrs)

        too_nested = [s for s in itertools.chain(*lsubrs) if s._max_call_depth > nest_limit]
        too_nested.extend([s for s in gsubrs if s._max_call_depth > nest_limit])
        for s in too_nested:
            s._flatten = True
        bad_substrings.extend(too_nested)
        lsubrs = [[s for s in lsubrarr if s._max_call_depth <= nest_limit] for lsubrarr in lsubrs]
        gsubrs = [s for s in gsubrs if s._max_call_depth <= nest_limit]

        log.debug("%d substrings nested too deep", len(too_nested))
        log.debug("%d substrings being flattened", len(bad_substrings))

        # reorganize to minimize call cost of most frequent subrs
        gbias = psCharStrings.calcSubrBias(gsubrs)
        lbias = [psCharStrings.calcSubrBias(s) for s in lsubrs]

        for subr_arr, bias in zip(itertools.chain([gsubrs], lsubrs),
                                  itertools.chain([gbias], lbias)):
            subr_arr.sort(key=lambda s: s.usages(), reverse=True)

            # NOTE(review): the slices below rotate the usage-sorted list;
            # presumably this places the most-used subrs at the indices whose
            # biased call operand is cheapest to encode -- confirm.
            if bias == 1131:
                subr_arr[:] = subr_arr[216:1240] + subr_arr[0:216] + subr_arr[1240:]
            elif bias == 32768:
                subr_arr[:] = (subr_arr[2264:33901] + subr_arr[216:1240] +
                            subr_arr[0:216] + subr_arr[1240:2264] + subr_arr[33901:])
            for idx, subr in enumerate(subr_arr):
                subr._position = idx

        for subr in sorted(bad_substrings, key=len):
            # NOTE: it is important this is run in order so shorter
            # substrings are run before longer ones
            if hasattr(subr, '_fdidx') and len(subr._fdidx) > 0:
                program = [rev_keymap[tok] for tok in subr.value()]
                Compreffor.update_program(program, subr.encoding(), gbias, lbias, None)
                Compreffor.expand_hintmask(program)
                subr._program = program

        # build the programs of the subrs that were actually placed;
        # `sel` is None for the global list and the FD index for local lists
        for subr_arr, sel in zip(itertools.chain([gsubrs], lsubrs),
                                  itertools.chain([None], range(fdlen))):
            for subr in subr_arr:
                program = [rev_keymap[tok] for tok in subr.value()]
                if program[-1] not in ("endchar", "return"):
                    program.append("return")
                Compreffor.update_program(program, subr.encoding(), gbias, lbias, sel)
                Compreffor.expand_hintmask(program)
                subr._program = program

        return (gsubrs, lsubrs)
Beispiel #13
0
    def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
        """
        Choose a subroutinization encoding for all charstrings in
        `glyph_set` using an iterative Dynamic Programming algorithm.
        Initially uses the results from SubstringFinder and then
        iteratively optimizes.

        Arguments:
        glyph_set -- the set of charstrings to encode (required)
        fdselect -- the FDSelect array of the source font, or None
        fdlen -- the number of FD's in the source font, or 1 if there are none

        Returns:
        A three-part dictionary with keys 'gsubrs', 'lsubrs', and
        'glyph_encodings'. The 'glyph_encodings' encoding dictionary
        specifies how to break up each charstring. Encoding[i]
        describes how to encode glyph i. Each entry is something
        like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
        into the charstring that indicates where a subr starts and c_*
        is a CandidateSubr. The 'gsubrs' entry contains an array of global
        subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
        by FDidx, where each entry is a list of local subroutines.
        """

        # generate substrings for marketplace
        sf = SubstringFinder(glyph_set)

        if self.test_mode:
            substrings = sf.get_substrings(min_freq=0, check_positive=False, sort_by_length=False)
        else:
            substrings = sf.get_substrings(min_freq=2, check_positive=True, sort_by_length=False)

        # TODO remove unnecessary substrings?

        data = sf.data
        rev_keymap = sf.rev_keymap
        cost_map = sf.cost_map
        glyph_set_keys = sf.glyph_set_keys
        del sf

        if not self.SINGLE_PROCESS:
            pool = multiprocessing.Pool(processes=self.PROCESSES)
            pool_map = pool.map
        else:
            pool = None

            def pool_map(func, iterable, chunksize=None):
                # in-process stand-in for Pool.map; chunksize is ignored
                return map(func, iterable)

        # The worker pool must be shut down on every exit path, otherwise
        # its processes outlive this call.
        try:
            substr_dict = {}

            timer.split()

            log.debug("glyphstrings+substrings=%d", len(data) + len(substrings))

            # set up dictionary with initial values
            for idx, substr in enumerate(substrings):
                substr._adjusted_cost = substr.cost()
                substr._price = substr._adjusted_cost
                substr._usages = substr.freq # this is the frequency that the substring appears,
                                            # not necessarily used
                substr._list_idx = idx
                substr_dict[substr.value()] = (idx, substr._price) # NOTE: avoid excess data copying on fork
                                                                   # probably can just pass substr
                                                                   # if threading instead

            for run_count in range(self.NROUNDS):
                # calibrate prices
                for idx, substr in enumerate(substrings):
                    marg_cost = float(substr._adjusted_cost) / (substr._usages + self.K)
                    substr._price = marg_cost * self.ALPHA + substr._price * (1 - self.ALPHA)
                    substr_dict[substr.value()] = (idx, substr._price)

                optimize = functools.partial(optimize_charstring,
                                             cost_map=cost_map,
                                             substr_dict=substr_dict,
                                             progress=self._progress)

                # minimize substring costs; each substring is sent with its
                # own index so it is not encoded as a call to itself
                csize = int(math.ceil(self.POOL_CHUNKRATIO*len(substrings)))
                substr_encodings = pool_map(optimize,
                                            enumerate([s.value() for s in substrings]),
                                            chunksize=csize)

                for substr, result in zip(substrings, substr_encodings):
                    substr._encoding = [(enc_item[0], substrings[enc_item[1]]) for enc_item in result["encoding"]]
                    substr._adjusted_cost = result["market_cost"]
                del substr_encodings

                # minimize charstring costs in current market through DP
                csize = int(math.ceil(self.POOL_CHUNKRATIO*len(data)))
                encodings = pool_map(optimize, data, chunksize=csize)
                encodings = [[(enc_item[0], substrings[enc_item[1]]) for enc_item in i["encoding"]] for i in encodings]

                # update substring frequencies based on cost minimization
                for substr in substrings:
                    substr._usages = 0

                for calling_substr in substrings:
                    for start, substr in calling_substr._encoding:
                        if substr:
                            substr._usages += 1
                for enc in encodings:
                    for start, substr in enc:
                        if substr:
                            substr._usages += 1

                if log.isEnabledFor(logging.INFO):
                    log.info("Round %d Done!", (run_count + 1))
                    # the statistics are undefined (division by zero, max of
                    # an empty sequence) when there are no substrings at all
                    if substrings:
                        usages = [substr._usages for substr in substrings]
                        log.info("avg: %f", (float(sum(usages)) / len(usages)))
                        log.info("max: %d", max(usages))
                        log.info("used: %d", sum(u > 0 for u in usages))

                # drop unprofitable substrings after every round but the last
                if run_count <= self.NROUNDS - 2 and not self.test_mode:
                    with timer("cutdown"):
                        if run_count < self.NROUNDS - 2:
                            saving_kwargs = {"use_usages": True}
                        else:
                            # final cutdown
                            saving_kwargs = {"use_usages": True, "true_cost": False}

                        # partition in one pass, preserving order
                        bad_substrings = []
                        kept_substrings = []
                        for s in substrings:
                            if s.subr_saving(**saving_kwargs) > 0:
                                kept_substrings.append(s)
                            else:
                                bad_substrings.append(s)
                        substrings = kept_substrings

                        for substr in bad_substrings:
                            # heuristic to encourage use of called substrings:
                            for _, called_substr in substr._encoding:
                                called_substr._usages += substr._usages - 1
                            del substr_dict[substr.value()]
                        # indices shift after the cutdown; they are written
                        # back into substr_dict at the next price calibration
                        for idx, s in enumerate(substrings):
                            s._list_idx = idx
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug("%d substrings with non-positive savings removed", len(bad_substrings))
                            log.debug("(%d had positive usage)", len([s for s in bad_substrings if s._usages > 0]))
        finally:
            if pool is not None:
                pool.close()
                pool.join()

        log.info("Finished iterative market (%gs)", timer.split())
        log.info("%d candidate subrs found", len(substrings))

        gsubrs, lsubrs = Compreffor.process_subrs(
                                            glyph_set_keys,
                                            encodings,
                                            fdlen,
                                            fdselect,
                                            substrings,
                                            rev_keymap,
                                            self.NSUBRS_LIMIT,
                                            self.SUBR_NEST_LIMIT)

        return {"glyph_encodings": dict(zip(glyph_set_keys, encodings)),
                "lsubrs": lsubrs,
                "gsubrs": gsubrs}