def get_substrings(self, min_freq=2, check_positive=True, sort_by_length=False):
    """
    Return repeated substrings (type CandidateSubr) from the charstrings
    sorted by subroutine savings with freq >= min_freq using the LCP array.

    Arguments:
    min_freq -- the minimum frequency required to include a substring
    check_positive -- if True, only allow substrings with positive subr_saving
    sort_by_length -- if True, return substrings sorted by length, else by saving
    """
    self.get_suffixes()
    lcp = self.get_lcp()

    with timer("extract substrings"):
        start_indices = deque()
        self.substrings = []

        for i, min_l in enumerate(lcp):
            # First min_l items are still the same.

            # Pop the rest from previous and account for.
            # Note: non-branching substrings aren't included
            # TODO: don't allow overlapping substrings into the same set

            while start_indices and start_indices[-1][0] > min_l:
                l, start_idx = start_indices.pop()
                freq = i - start_idx
                if freq < min_freq:
                    continue

                substr = CandidateSubr(l, self.suffixes[start_idx], freq,
                                       self.data, self.cost_map)
                if substr.subr_saving() > 0 or not check_positive:
                    self.substrings.append(substr)

            if not start_indices or min_l > start_indices[-1][0]:
                start_indices.append((min_l, i - 1))

    log.debug("%d substrings found", len(self.substrings))

    with timer("sort substrings"):
        if sort_by_length:
            self.substrings.sort(key=lambda s: len(s))
        else:
            self.substrings.sort(key=lambda s: s.subr_saving(), reverse=True)

    return self.substrings
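# Illustrative sketch (not part of compreffor): the same stack-based scan of
# the LCP array, applied to a single plain string, to list repeated
# "branching" substrings with their frequencies. All names here are
# hypothetical; the real code works over tokenized charstrings instead.
from collections import deque

def repeated_substrings(text, min_freq=2):
    # Naive O(n^2 log n) suffix array: fine for a demo, too slow for real data.
    suffixes = sorted(range(len(text)), key=lambda i: text[i:])

    def common_prefix(a, b):
        n = 0
        while n < min(len(a), len(b)) and a[n] == b[n]:
            n += 1
        return n

    # lcp[i] = common-prefix length of suffixes[i-1] and suffixes[i];
    # a trailing 0 sentinel drains the stack at the end of the scan.
    lcp = [0] + [common_prefix(text[suffixes[i - 1]:], text[suffixes[i]:])
                 for i in range(1, len(suffixes))] + [0]

    results = []
    start_indices = deque()
    for i, min_l in enumerate(lcp):
        # Popping (l, start_idx) means suffixes[start_idx:i] all share a
        # prefix of length l, so that prefix occurs i - start_idx times.
        while start_indices and start_indices[-1][0] > min_l:
            l, start_idx = start_indices.pop()
            freq = i - start_idx
            if freq >= min_freq:
                pos = suffixes[start_idx]
                results.append((text[pos:pos + l], freq))
        if not start_indices or min_l > start_indices[-1][0]:
            start_indices.append((min_l, i - 1))
    return results

# repeated_substrings("abcabcab") yields ("abcab", 2), ("ab", 3), ("cab", 2),
# etc. Note "abc" is absent: it always extends to "abcab", so it is
# non-branching and never surfaces as its own LCP interval.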
def compreff(font, nrounds=None, max_subrs=None):
    """Main function that compresses `font`, a TTFont object, in place."""
    assert len(font['CFF '].cff.topDictIndex) == 1
    td = font['CFF '].cff.topDictIndex[0]

    if nrounds is None:
        nrounds = Compreffor.NROUNDS
    if max_subrs is None:
        max_subrs = Compreffor.NSUBRS_LIMIT

    input_data = write_data(td)
    with timer("run 'lib.compreff()'"):
        results = lib.compreff(input_data, nrounds)
    subrs, glyph_encodings = interpret_data(td, results)

    with timer("decompile charstrings"):
        for cs in td.CharStrings.values():
            cs.decompile()

    # in order of charset
    chstrings = [x.program for x in td.CharStrings.values()]
    for cs in chstrings:
        Compreffor.collapse_hintmask(cs)

    for s in subrs:
        s.chstrings = chstrings

    if hasattr(td, 'FDSelect'):
        fdselect = lambda g: td.CharStrings.getItemAndSelector(g)[1]
        fdlen = len(td.FDArray)
    else:
        fdselect = None
        fdlen = 1

    nest_limit = Compreffor.SUBR_NEST_LIMIT
    gsubrs, lsubrs = Compreffor.process_subrs(td.charset, glyph_encodings,
                                              fdlen, fdselect, subrs,
                                              IdKeyMap(), max_subrs,
                                              nest_limit)

    encoding = dict(zip(td.charset, glyph_encodings))
    Compreffor.apply_subrs(td, encoding, gsubrs, lsubrs)
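# Illustrative usage sketch (assumed, not part of this module): since
# compreff() mutates the TTFont in place, a caller loads a CFF-flavoured
# font, compresses it, and saves it out. The file names are hypothetical.
from fontTools.ttLib import TTFont

font = TTFont("Input.otf")
compreff(font, nrounds=4)  # nrounds/max_subrs fall back to Compreffor defaults if omitted
font.save("Input.subr.otf")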
def get_suffixes(self):
    """Return the sorted suffix array"""
    if self._completed_suffixes:
        return self.suffixes

    with timer("get suffixes via Python sort"):
        self.suffixes.sort(key=lambda idx: self.data[idx[0]][idx[1]:])
        self._completed_suffixes = True

    return self.suffixes
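# Illustrative sketch: the suffix array above is a list of
# (charstring_index, offset) pairs sorted by the suffix each pair points at,
# so one sort covers suffixes from every charstring at once. A toy version
# over plain strings (hypothetical data):
data = ["banana", "bandana"]
suffixes = [(i, j) for i, s in enumerate(data) for j in range(len(s))]
suffixes.sort(key=lambda idx: data[idx[0]][idx[1]:])
# The sorted list now begins with the entries for "a", "a", "ana", "ana", ...
# drawn from both strings, which is what makes the cross-charstring LCP
# scan in get_substrings possible.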
def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
    """
    Choose a subroutinization encoding for all charstrings in
    `glyph_set` using an iterative Dynamic Programming algorithm.
    Initially uses the results from SubstringFinder and then
    iteratively optimizes.

    Arguments:
    glyph_set -- the set of charstrings to encode (required)
    fdselect -- the FDSelect array of the source font, or None
    fdlen -- the number of FD's in the source font, or 1 if there are none

    Returns:
    A three-part dictionary with keys 'gsubrs', 'lsubrs', and
    'glyph_encodings'. The 'glyph_encodings' encoding dictionary
    specifies how to break up each charstring. Encoding[i] describes
    how to encode glyph i. Each entry is something like
    [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
    into the charstring that indicates where a subr starts and c_*
    is a CandidateSubr. The 'gsubrs' entry contains an array of global
    subroutines (CandidateSubr objects) and 'lsubrs' is an array
    indexed by FDidx, where each entry is a list of local subroutines.
    """
    # generate substrings for marketplace
    sf = SubstringFinder(glyph_set)

    if self.test_mode:
        substrings = sf.get_substrings(min_freq=0, check_positive=False,
                                       sort_by_length=False)
    else:
        substrings = sf.get_substrings(min_freq=2, check_positive=True,
                                       sort_by_length=False)

    # TODO remove unnecessary substrings?

    data = sf.data
    rev_keymap = sf.rev_keymap
    cost_map = sf.cost_map
    glyph_set_keys = sf.glyph_set_keys
    del sf

    if not self.SINGLE_PROCESS:
        pool = multiprocessing.Pool(processes=self.PROCESSES)
    else:
        class DummyPool: pass
        pool = DummyPool()
        pool.map = lambda f, *l, **kwargs: map(f, *l)

    substr_dict = {}

    timer.split()

    log.debug("glyphstrings+substrings=%d", len(data) + len(substrings))

    # set up dictionary with initial values
    for idx, substr in enumerate(substrings):
        substr._adjusted_cost = substr.cost()
        substr._price = substr._adjusted_cost
        substr._usages = substr.freq  # this is the frequency that the substring appears,
                                      # not necessarily used
        substr._list_idx = idx
        substr_dict[substr.value()] = (idx, substr._price)  # NOTE: avoid excess data copying on fork
                                                            # probably can just pass substr
                                                            # if threading instead

    for run_count in range(self.NROUNDS):
        # calibrate prices
        for idx, substr in enumerate(substrings):
            marg_cost = float(substr._adjusted_cost) / (substr._usages + self.K)
            substr._price = marg_cost * self.ALPHA + substr._price * (1 - self.ALPHA)
            substr_dict[substr.value()] = (idx, substr._price)

        # minimize substring costs
        csize = int(math.ceil(self.POOL_CHUNKRATIO * len(substrings)))
        substr_encodings = pool.map(functools.partial(optimize_charstring,
                                                      cost_map=cost_map,
                                                      substr_dict=substr_dict,
                                                      progress=self._progress),
                                    enumerate([s.value() for s in substrings]),
                                    chunksize=csize)

        for substr, result in zip(substrings, substr_encodings):
            substr._encoding = [(enc_item[0], substrings[enc_item[1]])
                                for enc_item in result["encoding"]]
            substr._adjusted_cost = result["market_cost"]
        del substr_encodings

        # minimize charstring costs in current market through DP
        csize = int(math.ceil(self.POOL_CHUNKRATIO * len(data)))
        encodings = pool.map(functools.partial(optimize_charstring,
                                               cost_map=cost_map,
                                               substr_dict=substr_dict,
                                               progress=self._progress),
                             data,
                             chunksize=csize)
        encodings = [[(enc_item[0], substrings[enc_item[1]])
                      for enc_item in i["encoding"]] for i in encodings]

        # update substring frequencies based on cost minimization
        for substr in substrings:
            substr._usages = 0

        for calling_substr in substrings:
            for start, substr in calling_substr._encoding:
                if substr:
                    substr._usages += 1
        for glyph_idx, enc in enumerate(encodings):
            for start, substr in enc:
                if substr:
                    substr._usages += 1

        if log.isEnabledFor(logging.INFO):
            log.info("Round %d Done!", (run_count + 1))
            log.info("avg: %f", (float(sum(substr._usages for substr in substrings)) /
                                 len(substrings)))
            log.info("max: %d", max(substr._usages for substr in substrings))
            log.info("used: %d", sum(substr._usages > 0 for substr in substrings))

        if run_count <= self.NROUNDS - 2 and not self.test_mode:
            with timer("cutdown"):
                if run_count < self.NROUNDS - 2:
                    bad_substrings = [s for s in substrings
                                      if s.subr_saving(use_usages=True) <= 0]
                    substrings = [s for s in substrings
                                  if s.subr_saving(use_usages=True) > 0]
                else:
                    bad_substrings = [s for s in substrings
                                      if s.subr_saving(use_usages=True, true_cost=False) <= 0]
                    substrings = [s for s in substrings
                                  if s.subr_saving(use_usages=True, true_cost=False) > 0]

                for substr in bad_substrings:
                    # heuristic to encourage use of called substrings:
                    for idx, called_substr in substr._encoding:
                        called_substr._usages += substr._usages - 1
                    del substr_dict[substr.value()]
                for idx, s in enumerate(substrings):
                    s._list_idx = idx
                if log.isEnabledFor(logging.DEBUG):
                    log.debug("%d substrings with non-positive savings removed",
                              len(bad_substrings))
                    log.debug("(%d had positive usage)",
                              len([s for s in bad_substrings if s._usages > 0]))

    log.info("Finished iterative market (%gs)", timer.split())
    log.info("%d candidate subrs found", len(substrings))

    gsubrs, lsubrs = Compreffor.process_subrs(glyph_set_keys,
                                              encodings,
                                              fdlen,
                                              fdselect,
                                              substrings,
                                              rev_keymap,
                                              self.NSUBRS_LIMIT,
                                              self.SUBR_NEST_LIMIT)

    return {"glyph_encodings": dict(zip(glyph_set_keys, encodings)),
            "lsubrs": lsubrs,
            "gsubrs": gsubrs}
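# Illustrative sketch (hypothetical numbers): the "calibrate prices" step
# above is an exponential moving average that pulls each substring's price
# toward its marginal cost, adjusted_cost / (usages + K). With ALPHA = 0.1
# and K = 0.1 assumed here (see the Compreffor class constants for the real
# values), a substring whose cost and usage stay fixed sees its price decay
# geometrically toward that marginal cost:
ALPHA, K = 0.1, 0.1
adjusted_cost, usages, price = 40.0, 3, 40.0
for round_no in range(3):
    marg_cost = adjusted_cost / (usages + K)   # 40 / 3.1 ~= 12.90
    price = marg_cost * ALPHA + price * (1 - ALPHA)
    print(round_no, round(price, 2))           # 37.29, 34.85, 32.66
# Lower prices make the substring more attractive to the DP in the next
# round, which can raise its usage count and push the price down further;
# rarely used substrings keep high prices and are culled in the cutdown step.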