コード例 #1
0
ファイル: pyCompressor.py プロジェクト: anthrotype/compreffor
    def get_substrings(self, min_freq=2, check_positive=True, sort_by_length=False):
        """
        Collect repeated substrings of the charstrings as CandidateSubr objects.

        The suffixes are sorted via get_suffixes(), then the LCP array from
        get_lcp() is walked with a stack of open intervals. Surviving
        candidates are stored on `self.substrings`, which is also returned.

        Arguments:
        min_freq -- candidates whose computed frequency is below this are dropped
        check_positive -- if True, keep only candidates whose subr_saving() > 0
        sort_by_length -- if True, order the result by ascending len(),
                          otherwise by descending subr_saving()
        """
        self.get_suffixes()
        lcp_values = self.get_lcp()

        with timer("extract substrings"):
            # Stack of (length, first suffix index) pairs that are still open.
            open_intervals = deque()
            self.substrings = []

            for pos, shared_len in enumerate(lcp_values):
                # The first shared_len items are still the same, so every open
                # interval longer than that ends here and is accounted for.
                # Note: non-branching substrings aren't included
                # TODO: don't allow overlapping substrings into the same set
                while open_intervals and open_intervals[-1][0] > shared_len:
                    length, first = open_intervals.pop()
                    occurrences = pos - first
                    if occurrences >= min_freq:
                        candidate = CandidateSubr(length,
                                                  self.suffixes[first],
                                                  occurrences,
                                                  self.data,
                                                  self.cost_map)
                        if candidate.subr_saving() > 0 or not check_positive:
                            self.substrings.append(candidate)

                # Open a new interval only when the shared prefix got longer
                # than the one on top of the stack.
                if not open_intervals or open_intervals[-1][0] < shared_len:
                    open_intervals.append((shared_len, pos - 1))

        log.debug("%d substrings found", len(self.substrings))
        with timer("sort substrings"):
            if sort_by_length:
                self.substrings.sort(key=len)
            else:
                self.substrings.sort(key=lambda cand: cand.subr_saving(),
                                     reverse=True)
        return self.substrings
コード例 #2
0
    def get_substrings(self,
                       min_freq=2,
                       check_positive=True,
                       sort_by_length=False):
        """
        Find repeated substrings (CandidateSubr objects) in the charstrings.

        Calls get_suffixes() to sort the suffixes, then scans the LCP array
        returned by get_lcp() while keeping a stack of still-open intervals.
        The resulting list is stored on `self.substrings` and returned.

        Arguments:
        min_freq -- minimum computed frequency a candidate needs to be kept
        check_positive -- if True, discard candidates whose subr_saving() <= 0
        sort_by_length -- if True, sort ascending by len(); else sort
                          descending by subr_saving()
        """
        self.get_suffixes()
        lcp_array = self.get_lcp()

        with timer("extract substrings"):
            # Each stack entry is (substring length, index of its first suffix).
            pending = deque()
            self.substrings = []

            for cursor, prefix_len in enumerate(lcp_array):
                # The first prefix_len items are still shared; anything longer
                # on the stack has ended and is popped and accounted for.
                # Note: non-branching substrings aren't included
                # TODO: don't allow overlapping substrings into the same set
                while pending and pending[-1][0] > prefix_len:
                    sub_len, first_suffix = pending.pop()
                    count = cursor - first_suffix
                    if count >= min_freq:
                        found = CandidateSubr(sub_len,
                                              self.suffixes[first_suffix],
                                              count,
                                              self.data,
                                              self.cost_map)
                        if found.subr_saving() > 0 or not check_positive:
                            self.substrings.append(found)

                # Push only when the shared prefix exceeds the stack top.
                if not pending or pending[-1][0] < prefix_len:
                    pending.append((prefix_len, cursor - 1))

        log.debug("%d substrings found", len(self.substrings))
        with timer("sort substrings"):
            if sort_by_length:
                self.substrings.sort(key=len)
            else:
                self.substrings.sort(key=lambda found: found.subr_saving(),
                                     reverse=True)
        return self.substrings
コード例 #3
0
ファイル: cxxCompressor.py プロジェクト: behdad/compreffor
def compreff(font, nrounds=None, max_subrs=None):
    """Compress `font`, a TTFont object, in place.

    The font's 'CFF ' table must contain exactly one top dict (asserted).
    The top dict is serialized with `write_data`, passed to
    `lib.compreff`, and the interpreted result is turned into global and
    local subroutines that are applied back onto the top dict.

    `nrounds` and `max_subrs` default to `Compreffor.NROUNDS` and
    `Compreffor.NSUBRS_LIMIT` when left as None.
    """
    top_dicts = font['CFF '].cff.topDictIndex
    assert len(top_dicts) == 1
    td = top_dicts[0]

    nrounds = Compreffor.NROUNDS if nrounds is None else nrounds
    max_subrs = Compreffor.NSUBRS_LIMIT if max_subrs is None else max_subrs

    serialized = write_data(td)
    with timer("run 'lib.compreff()'"):
        raw_results = lib.compreff(serialized, nrounds)
    subrs, glyph_encodings = interpret_data(td, raw_results)

    with timer("decompile charstrings"):
        for charstring in td.CharStrings.values():
            charstring.decompile()

    # in order of charset
    programs = [charstring.program for charstring in td.CharStrings.values()]
    for program in programs:
        Compreffor.collapse_hintmask(program)

    # every subroutine shares the same list of charstring programs
    for subr in subrs:
        subr.chstrings = programs

    if hasattr(td, 'FDSelect'):
        fdlen = len(td.FDArray)

        def fdselect(glyph):
            # second element of getItemAndSelector() is used as the selector
            return td.CharStrings.getItemAndSelector(glyph)[1]
    else:
        fdlen = 1
        fdselect = None

    gsubrs, lsubrs = Compreffor.process_subrs(td.charset,
                                              glyph_encodings,
                                              fdlen,
                                              fdselect,
                                              subrs,
                                              IdKeyMap(),
                                              max_subrs,
                                              Compreffor.SUBR_NEST_LIMIT)

    glyph_to_encoding = dict(zip(td.charset, glyph_encodings))

    Compreffor.apply_subrs(td, glyph_to_encoding, gsubrs, lsubrs)
コード例 #4
0
def compreff(font, nrounds=None, max_subrs=None):
    """Subroutinize the CFF table of `font` (a TTFont object) in place.

    Exactly one top dict is expected in the 'CFF ' table (asserted).
    Its data is written out with `write_data`, run through
    `lib.compreff`, read back with `interpret_data`, and the resulting
    subroutines are applied to the top dict.

    When `nrounds` or `max_subrs` is None, `Compreffor.NROUNDS` or
    `Compreffor.NSUBRS_LIMIT` is used instead.
    """
    top_dict_index = font['CFF '].cff.topDictIndex
    assert len(top_dict_index) == 1
    td = top_dict_index[0]

    nrounds = Compreffor.NROUNDS if nrounds is None else nrounds
    max_subrs = Compreffor.NSUBRS_LIMIT if max_subrs is None else max_subrs

    payload = write_data(td)
    with timer("run 'lib.compreff()'"):
        lib_output = lib.compreff(payload, nrounds)
    subrs, glyph_encodings = interpret_data(td, lib_output)

    with timer("decompile charstrings"):
        for glyph_cs in td.CharStrings.values():
            glyph_cs.decompile()

    # in order of charset
    chstrings = [glyph_cs.program for glyph_cs in td.CharStrings.values()]
    for program in chstrings:
        Compreffor.collapse_hintmask(program)

    # all subroutines point at the same list of programs
    for candidate in subrs:
        candidate.chstrings = chstrings

    if not hasattr(td, 'FDSelect'):
        fdselect = None
        fdlen = 1
    else:
        def fdselect(glyph):
            # the selector is the second item returned by getItemAndSelector()
            return td.CharStrings.getItemAndSelector(glyph)[1]

        fdlen = len(td.FDArray)

    gsubrs, lsubrs = Compreffor.process_subrs(
        td.charset, glyph_encodings, fdlen, fdselect, subrs, IdKeyMap(),
        max_subrs, Compreffor.SUBR_NEST_LIMIT)

    per_glyph = dict(zip(td.charset, glyph_encodings))

    Compreffor.apply_subrs(td, per_glyph, gsubrs, lsubrs)
コード例 #5
0
    def get_suffixes(self):
        """Return the suffix array, sorting it in place on first use.

        Each entry is indexed as (i, j) and ordered by the slice
        `self.data[i][j:]`. After sorting, `_completed_suffixes` is set so
        later calls return `self.suffixes` without sorting again.
        """
        if not self._completed_suffixes:
            with timer("get suffixes via Python sort"):

                def suffix_of(entry):
                    return self.data[entry[0]][entry[1]:]

                self.suffixes.sort(key=suffix_of)
                self._completed_suffixes = True

        return self.suffixes
コード例 #6
0
ファイル: pyCompressor.py プロジェクト: anthrotype/compreffor
    def get_suffixes(self):
        """Return `self.suffixes`, sorted.

        The list is sorted in place only once: entries are compared by
        the slice `self.data[entry[0]][entry[1]:]`, and the
        `_completed_suffixes` flag is raised so that subsequent calls skip
        the sort.
        """
        already_sorted = self._completed_suffixes
        if not already_sorted:
            with timer("get suffixes via Python sort"):

                def tail(entry):
                    return self.data[entry[0]][entry[1]:]

                self.suffixes.sort(key=tail)
                self._completed_suffixes = True

        return self.suffixes
コード例 #7
0
    def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
        """
        Choose a subroutinization encoding for all charstrings in
        `glyph_set` using an iterative Dynamic Programming algorithm.
        Initially uses the results from SubstringFinder and then
        iteratively optimizes.

        Arguments:
        glyph_set -- the set of charstrings to encode (required)
        fdselect -- the FDSelect array of the source font, or None
        fdlen -- the number of FD's in the source font, or 1 if there are none

        Returns:
        A three-part dictionary with keys 'gsubrs', 'lsubrs', and
        'glyph_encodings'. The 'glyph_encodings' encoding dictionary
        specifies how to break up each charstring. Encoding[i]
        describes how to encode glyph i. Each entry is something
        like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
        into the charstring that indicates where a subr starts and c_*
        is a CandidateSubr. The 'gsubrs' entry contains an array of global
        subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
        by FDidx, where each entry is a list of local subroutines.

        The worker pool created for the optimization rounds is always shut
        down before the subroutines are processed, including when a round
        raises.
        """

        # generate substrings for marketplace
        sf = SubstringFinder(glyph_set)

        if self.test_mode:
            substrings = sf.get_substrings(min_freq=0,
                                           check_positive=False,
                                           sort_by_length=False)
        else:
            substrings = sf.get_substrings(min_freq=2,
                                           check_positive=True,
                                           sort_by_length=False)

        # TODO remove unnecessary substrings?

        data = sf.data
        rev_keymap = sf.rev_keymap
        cost_map = sf.cost_map
        glyph_set_keys = sf.glyph_set_keys
        del sf

        if not self.SINGLE_PROCESS:
            pool = multiprocessing.Pool(processes=self.PROCESSES)
        else:

            class DummyPool:
                """In-process stand-in for the part of the Pool API used here."""

                def map(self, f, *l, **kwargs):
                    # keyword arguments such as chunksize are ignored
                    return map(f, *l)

                def terminate(self):
                    pass

                def join(self):
                    pass

            pool = DummyPool()

        try:
            substr_dict = {}

            timer.split()

            log.debug("glyphstrings+substrings=%d",
                      len(data) + len(substrings))

            # set up dictionary with initial values
            for idx, substr in enumerate(substrings):
                substr._adjusted_cost = substr.cost()
                substr._price = substr._adjusted_cost
                # this is the frequency that the substring appears,
                # not necessarily used
                substr._usages = substr.freq
                substr._list_idx = idx
                # NOTE: avoid excess data copying on fork
                # probably can just pass substr if threading instead
                substr_dict[substr.value()] = (idx, substr._price)

            for run_count in range(self.NROUNDS):
                # calibrate prices
                for idx, substr in enumerate(substrings):
                    marg_cost = float(
                        substr._adjusted_cost) / (substr._usages + self.K)
                    substr._price = marg_cost * self.ALPHA + substr._price * (
                        1 - self.ALPHA)
                    substr_dict[substr.value()] = (idx, substr._price)

                # minimize substring costs
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(substrings)))
                substr_encodings = pool.map(
                    functools.partial(optimize_charstring,
                                      cost_map=cost_map,
                                      substr_dict=substr_dict,
                                      progress=self._progress),
                    enumerate([s.value() for s in substrings]),
                    chunksize=csize)

                for substr, result in zip(substrings, substr_encodings):
                    # results refer to substrings by list index; resolve them
                    substr._encoding = [(enc_item[0], substrings[enc_item[1]])
                                        for enc_item in result["encoding"]]
                    substr._adjusted_cost = result["market_cost"]
                del substr_encodings

                # minimize charstring costs in current market through DP
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(data)))
                encodings = pool.map(
                    functools.partial(optimize_charstring,
                                      cost_map=cost_map,
                                      substr_dict=substr_dict,
                                      progress=self._progress),
                    data,
                    chunksize=csize)
                encodings = [[(enc_item[0], substrings[enc_item[1]])
                              for enc_item in i["encoding"]]
                             for i in encodings]

                # update substring frequencies based on cost minimization
                for substr in substrings:
                    substr._usages = 0

                for calling_substr in substrings:
                    for start, substr in calling_substr._encoding:
                        if substr:
                            substr._usages += 1
                for glyph_idx, enc in enumerate(encodings):
                    for start, substr in enc:
                        if substr:
                            substr._usages += 1

                if log.isEnabledFor(logging.INFO):
                    log.info("Round %d Done!", (run_count + 1))
                    # avg and max are undefined for an empty candidate list
                    # (division by zero / max() of an empty sequence)
                    if substrings:
                        log.info(
                            "avg: %f",
                            (float(sum(substr._usages
                                       for substr in substrings)) /
                             len(substrings)))
                        log.info("max: %d",
                                 max(substr._usages for substr in substrings))
                    log.info("used: %d",
                             sum(substr._usages > 0 for substr in substrings))

                if run_count <= self.NROUNDS - 2 and not self.test_mode:
                    with timer("cutdown"):
                        if run_count < self.NROUNDS - 2:
                            bad_substrings = [
                                s for s in substrings
                                if s.subr_saving(use_usages=True) <= 0
                            ]
                            substrings = [
                                s for s in substrings
                                if s.subr_saving(use_usages=True) > 0
                            ]
                        else:
                            # the last cutdown passes true_cost=False
                            bad_substrings = [
                                s for s in substrings if s.subr_saving(
                                    use_usages=True, true_cost=False) <= 0
                            ]
                            substrings = [
                                s for s in substrings if s.subr_saving(
                                    use_usages=True, true_cost=False) > 0
                            ]

                        for substr in bad_substrings:
                            # heuristic to encourage use of called substrings:
                            for idx, called_substr in substr._encoding:
                                called_substr._usages += substr._usages - 1
                            del substr_dict[substr.value()]
                        # list positions changed, so renumber the survivors
                        for idx, s in enumerate(substrings):
                            s._list_idx = idx
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug(
                                "%d substrings with non-positive savings removed",
                                len(bad_substrings))
                            log.debug(
                                "(%d had positive usage)",
                                len([s for s in bad_substrings
                                     if s._usages > 0]))

            log.info("Finished iterative market (%gs)", timer.split())
            log.info("%d candidate subrs found", len(substrings))
        finally:
            # pool.map() blocks until its results are complete, so nothing is
            # outstanding on the normal path; on error this stops the workers.
            pool.terminate()
            pool.join()

        gsubrs, lsubrs = Compreffor.process_subrs(glyph_set_keys, encodings,
                                                  fdlen, fdselect, substrings,
                                                  rev_keymap,
                                                  self.NSUBRS_LIMIT,
                                                  self.SUBR_NEST_LIMIT)

        return {
            "glyph_encodings": dict(zip(glyph_set_keys, encodings)),
            "lsubrs": lsubrs,
            "gsubrs": gsubrs
        }
コード例 #8
0
ファイル: pyCompressor.py プロジェクト: anthrotype/compreffor
    def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
        """
        Choose a subroutinization encoding for all charstrings in
        `glyph_set` using an iterative Dynamic Programming algorithm.
        Initially uses the results from SubstringFinder and then
        iteratively optimizes.

        Arguments:
        glyph_set -- the set of charstrings to encode (required)
        fdselect -- the FDSelect array of the source font, or None
        fdlen -- the number of FD's in the source font, or 1 if there are none

        Returns:
        A three-part dictionary with keys 'gsubrs', 'lsubrs', and
        'glyph_encodings'. The 'glyph_encodings' encoding dictionary
        specifies how to break up each charstring. Encoding[i]
        describes how to encode glyph i. Each entry is something
        like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
        into the charstring that indicates where a subr starts and c_*
        is a CandidateSubr. The 'gsubrs' entry contains an array of global
        subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
        by FDidx, where each entry is a list of local subroutines.

        The worker pool created for the optimization rounds is always shut
        down before the subroutines are processed, including when a round
        raises.
        """

        # generate substrings for marketplace
        sf = SubstringFinder(glyph_set)

        if self.test_mode:
            substrings = sf.get_substrings(min_freq=0, check_positive=False, sort_by_length=False)
        else:
            substrings = sf.get_substrings(min_freq=2, check_positive=True, sort_by_length=False)

        # TODO remove unnecessary substrings?

        data = sf.data
        rev_keymap = sf.rev_keymap
        cost_map = sf.cost_map
        glyph_set_keys = sf.glyph_set_keys
        del sf

        if not self.SINGLE_PROCESS:
            pool = multiprocessing.Pool(processes=self.PROCESSES)
        else:
            class DummyPool:
                """In-process stand-in for the part of the Pool API used here."""

                def map(self, f, *l, **kwargs):
                    # keyword arguments such as chunksize are ignored
                    return map(f, *l)

                def terminate(self):
                    pass

                def join(self):
                    pass

            pool = DummyPool()

        try:
            substr_dict = {}

            timer.split()

            log.debug("glyphstrings+substrings=%d", len(data) + len(substrings))

            # set up dictionary with initial values
            for idx, substr in enumerate(substrings):
                substr._adjusted_cost = substr.cost()
                substr._price = substr._adjusted_cost
                # this is the frequency that the substring appears,
                # not necessarily used
                substr._usages = substr.freq
                substr._list_idx = idx
                # NOTE: avoid excess data copying on fork
                # probably can just pass substr if threading instead
                substr_dict[substr.value()] = (idx, substr._price)

            for run_count in range(self.NROUNDS):
                # calibrate prices
                for idx, substr in enumerate(substrings):
                    marg_cost = float(substr._adjusted_cost) / (substr._usages + self.K)
                    substr._price = marg_cost * self.ALPHA + substr._price * (1 - self.ALPHA)
                    substr_dict[substr.value()] = (idx, substr._price)

                # minimize substring costs
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(substrings)))
                substr_encodings = pool.map(functools.partial(optimize_charstring,
                                                              cost_map=cost_map,
                                                              substr_dict=substr_dict,
                                                              progress=self._progress),
                                            enumerate([s.value() for s in substrings]),
                                            chunksize=csize)

                for substr, result in zip(substrings, substr_encodings):
                    # results refer to substrings by list index; resolve them
                    substr._encoding = [(enc_item[0], substrings[enc_item[1]]) for enc_item in result["encoding"]]
                    substr._adjusted_cost = result["market_cost"]
                del substr_encodings

                # minimize charstring costs in current market through DP
                csize = int(math.ceil(self.POOL_CHUNKRATIO * len(data)))
                encodings = pool.map(functools.partial(optimize_charstring,
                                                       cost_map=cost_map,
                                                       substr_dict=substr_dict,
                                                       progress=self._progress),
                                     data,
                                     chunksize=csize)
                encodings = [[(enc_item[0], substrings[enc_item[1]]) for enc_item in i["encoding"]] for i in encodings]

                # update substring frequencies based on cost minimization
                for substr in substrings:
                    substr._usages = 0

                for calling_substr in substrings:
                    for start, substr in calling_substr._encoding:
                        if substr:
                            substr._usages += 1
                for glyph_idx, enc in enumerate(encodings):
                    for start, substr in enc:
                        if substr:
                            substr._usages += 1

                if log.isEnabledFor(logging.INFO):
                    log.info("Round %d Done!", (run_count + 1))
                    # avg and max are undefined for an empty candidate list
                    # (division by zero / max() of an empty sequence)
                    if substrings:
                        log.info("avg: %f", (float(sum(substr._usages for substr in substrings)) / len(substrings)))
                        log.info("max: %d", max(substr._usages for substr in substrings))
                    log.info("used: %d", sum(substr._usages > 0 for substr in substrings))

                if run_count <= self.NROUNDS - 2 and not self.test_mode:
                    with timer("cutdown"):
                        if run_count < self.NROUNDS - 2:
                            bad_substrings = [s for s in substrings if s.subr_saving(use_usages=True) <= 0]
                            substrings = [s for s in substrings if s.subr_saving(use_usages=True) > 0]
                        else:
                            # the last cutdown passes true_cost=False
                            bad_substrings = [s for s in substrings if s.subr_saving(use_usages=True, true_cost=False) <= 0]
                            substrings = [s for s in substrings if s.subr_saving(use_usages=True, true_cost=False) > 0]

                        for substr in bad_substrings:
                            # heuristic to encourage use of called substrings:
                            for idx, called_substr in substr._encoding:
                                called_substr._usages += substr._usages - 1
                            del substr_dict[substr.value()]
                        # list positions changed, so renumber the survivors
                        for idx, s in enumerate(substrings):
                            s._list_idx = idx
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug("%d substrings with non-positive savings removed", len(bad_substrings))
                            log.debug("(%d had positive usage)", len([s for s in bad_substrings if s._usages > 0]))

            log.info("Finished iterative market (%gs)", timer.split())
            log.info("%d candidate subrs found", len(substrings))
        finally:
            # pool.map() blocks until its results are complete, so nothing is
            # outstanding on the normal path; on error this stops the workers.
            pool.terminate()
            pool.join()

        gsubrs, lsubrs = Compreffor.process_subrs(
                                            glyph_set_keys,
                                            encodings,
                                            fdlen,
                                            fdselect,
                                            substrings,
                                            rev_keymap,
                                            self.NSUBRS_LIMIT,
                                            self.SUBR_NEST_LIMIT)

        return {"glyph_encodings": dict(zip(glyph_set_keys, encodings)),
                "lsubrs": lsubrs,
                "gsubrs": gsubrs}