def get_lcp(self):
    """Return the LCP array for the sorted suffix list.

    Entry ``k`` (for ``k > 0``) is the number of leading tokens shared by
    the suffixes stored at positions ``k`` and ``k - 1`` of
    ``self.suffixes``.  Entry 0 is never written and stays 0.  If the
    suffix list has not been completed yet, ``self.get_suffixes()`` is
    called first.
    """
    if not self._completed_suffixes:
        self.get_suffixes()
    assert self._completed_suffixes

    # rank[g][t] -> position of suffix (g, t) inside self.suffixes
    rank = [[0] * len(tokens) for tokens in self.data]
    for position in range(self.length):
        owner, offset = self.suffixes[position]
        rank[owner][offset] = position

    lcp = [0] * self.length
    for owner, current in enumerate(self.data):
        shared = 0  # carried from one suffix of this string to the next
        for offset in range(len(current)):
            position = rank[owner][offset]
            if position <= 0:
                # first suffix in sorted order has no predecessor
                continue

            prev_owner, prev_offset = self.suffixes[position - 1]
            previous = self.data[prev_owner]
            while (prev_offset + shared < len(previous)
                   and offset + shared < len(current)
                   and previous[prev_offset + shared] == current[offset + shared]):
                shared += 1
            lcp[position] = shared

            # dropping the first token can shorten the match by at most one
            if shared > 0:
                shared -= 1
    return lcp
def _sortByDecompositionBase(self, glyphNames, ascending, allowPseudoUnicode): baseToGlyphNames = {None: []} for glyphName in glyphNames: if allowPseudoUnicode: value = self.pseudoUnicodeForGlyphName(glyphName) else: value = self.unicodeForGlyphName(glyphName) if value is None: base = None else: base = unicodeTools.decompositionBase(value) base = self.glyphNameForUnicode(base) # try to add the glyph names suffix to the base. # this will handle mapping aacute.alt to a.alt # instead of aacute.alt to a. if base is not None: if "." in glyphName and not glyphName.startswith("."): suffix = glyphName.split(".")[1] if base + "." + suffix in self.font: base = base + "." + suffix if base not in baseToGlyphNames: baseToGlyphNames[base] = [] baseToGlyphNames[base].append(glyphName) # get the list of glyphs with no base. noBase = baseToGlyphNames.pop(None) # find all bases that are not in the overall glyph names list missingBase = [] for base in sorted(baseToGlyphNames): if base is None: continue if base not in noBase: missingBase.append(base) # work through the found bases processedBases = set() sortedResult = [] for base in noBase: if base in processedBases: continue processedBases.add(base) # the base could be in the list more than once. # if so, add the proper number of instances of the base. count = noBase.count(base) r = [base for i in range(count)] # add the referencing glyphs r += baseToGlyphNames.get(base, []) sortedResult.append(r) # work through the missing bases for base in sorted(missingBase): sortedResult.append(baseToGlyphNames[base]) # reverse if necessary if not ascending: sortedResult.reverse() return sortedResult
def _sortByDecompositionBase(self, glyphNames, ascending, allowPseudoUnicode): baseToGlyphNames = {None:[]} for glyphName in glyphNames: if allowPseudoUnicode: value = self.pseudoUnicodeForGlyphName(glyphName) else: value = self.unicodeForGlyphName(glyphName) if value is None: base = None else: base = unicodeTools.decompositionBase(value) base = self.glyphNameForUnicode(base) # try to add the glyph names suffix to the base. # this will handle mapping aacute.alt to a.alt # instead of aacute.alt to a. if base is not None: if "." in glyphName and not glyphName.startswith("."): suffix = glyphName.split(".")[1] if base + "." + suffix in self.font: base = base + "." + suffix if base not in baseToGlyphNames: baseToGlyphNames[base] = [] baseToGlyphNames[base].append(glyphName) # get the list of glyphs with no base. noBase = baseToGlyphNames.pop(None) # find all bases that are not in the overall glyph names list missingBase = [] for base in sorted(baseToGlyphNames): if base is None: continue if base not in noBase: missingBase.append(base) # work through the found bases processedBases = set() sortedResult = [] for base in noBase: if base in processedBases: continue processedBases.add(base) # the base could be in the list more than once. # if so, add the proper number of instances of the base. count = noBase.count(base) r = [base for i in range(count)] # add the referencing glyphs r += baseToGlyphNames.get(base, []) sortedResult.append(r) # work through the missing bases for base in sorted(missingBase): sortedResult.append(baseToGlyphNames[base]) # reverse if necessary if not ascending: sortedResult.reverse() return sortedResult
def _encode_base64(data, maxlinelength=76, indent_level=1): data = b64encode(data) if data and maxlinelength: # split into multiple lines right-justified to 'maxlinelength' chars indent = b"\n" + b" " * indent_level max_length = max(16, maxlinelength - len(indent)) chunks = [] for i in range(0, len(data), max_length): chunks.append(indent) chunks.append(data[i:i + max_length]) chunks.append(indent) data = b"".join(chunks) return data
def process_chstrings(self, glyph_set):
    """Re-encode every charstring as integers and append it to self.data.

    Glyphs are visited in sorted key order.  Each distinct token receives
    the next free integer; the token itself is appended to
    ``self.rev_keymap`` and its ``tokenCost`` to ``self.cost_map``.  For
    every encoded charstring ``self.length`` grows by its length and one
    ``(glyph index, offset)`` pair per token is appended to
    ``self.suffixes``.  ``self.alphabet_size`` ends up as the number of
    distinct tokens.
    """
    self.glyph_set_keys = sorted(glyph_set.keys())

    token_ids = {}  # charstring token -> integer symbol
    alphabet = 0

    for glyph_name in self.glyph_set_keys:
        cs = glyph_set[glyph_name]._glyph
        cs.decompile()

        encoded = []
        stream = iter(enumerate(cs.program))
        for pos, token in stream:
            assert token not in ("callsubr", "callgsubr", "return")
            assert token != "endchar" or pos == len(cs.program) - 1

            if token in ("hintmask", "cntrmask"):
                # Fuse the mask operator with the token after it: a
                # subroutine call cannot be placed between the two.
                token = (token, next(stream)[1])

            if token not in token_ids:
                token_ids[token] = alphabet
                self.rev_keymap.append(token)
                self.cost_map.append(tokenCost(token))
                alphabet += 1
            encoded.append(token_ids[token])

        encoded = tuple(encoded)
        size = len(encoded)
        self.length += size
        owner = len(self.data)
        self.suffixes.extend((owner, offset) for offset in range(size))
        self.data.append(encoded)

    self.alphabet_size = alphabet
def process_chstrings(self, glyph_set):
    """Re-encode every charstring as integers and append it to self.data.

    Glyphs are visited in sorted key order.  Each distinct token receives
    the next free integer; the token itself is appended to
    ``self.rev_keymap`` and its ``tokenCost`` to ``self.cost_map``.  For
    every encoded charstring ``self.length`` grows by its length and one
    ``(glyph index, offset)`` pair per token is appended to
    ``self.suffixes``.  ``self.alphabet_size`` ends up as the number of
    distinct tokens.
    """
    self.glyph_set_keys = sorted(glyph_set.keys())

    symbol_for = {}  # charstring token -> integer symbol
    symbol_count = 0

    for name in self.glyph_set_keys:
        charstring = glyph_set[name]._glyph
        charstring.decompile()

        symbols = []
        numbered = iter(enumerate(charstring.program))
        for index, token in numbered:
            assert token not in ("callsubr", "callgsubr", "return")
            assert token != "endchar" or index == len(charstring.program) - 1

            if token in ("hintmask", "cntrmask"):
                # The operand after a mask operator travels with it: a
                # subroutine call cannot be placed between the two.
                _, operand = next(numbered)
                token = (token, operand)

            if token not in symbol_for:
                symbol_for[token] = symbol_count
                self.rev_keymap.append(token)
                self.cost_map.append(tokenCost(token))
                symbol_count += 1
            symbols.append(symbol_for[token])

        symbols = tuple(symbols)
        self.length += len(symbols)
        glyph_index = len(self.data)
        self.suffixes.extend(
            (glyph_index, offset) for offset in range(len(symbols)))
        self.data.append(symbols)

    self.alphabet_size = symbol_count
def process_subrs(glyph_set_keys, encodings, fdlen, fdselect, substrings,
                  rev_keymap, subr_limit, nest_limit):
    """
    Distribute candidate subrs over the global and local subr arrays and
    build the program of every candidate.

    Arguments:
    glyph_set_keys -- glyph names, paired positionally with `encodings`
    encodings -- one encoding per glyph; the second element of each item
                 is the candidate subr that is called
    fdlen -- number of local subr arrays to create
    fdselect -- callable mapping a glyph name to its FD index, or None,
                in which case every glyph counts as FD index 0
    substrings -- the candidate subrs to distribute
    rev_keymap -- maps the tokens of `subr.value()` back to program tokens
    subr_limit -- maximum number of subrs allowed in any single array
    nest_limit -- maximum `_max_call_depth` a kept subr may have

    Candidates that are unused, unreachable, unprofitable, left over when
    the arrays are full, or nested too deeply are marked `_flatten = True`.
    When `fdselect` is given, the local subrs reported by
    `Compreffor.collect_lsubrs_called_from(gsubrs)` are flattened as well.

    Returns:
    A tuple `(gsubrs, lsubrs)`, where `lsubrs` holds one list per FD index.
    """

    def mark_reachable(cand_subr, fdidx):
        # record fdidx on this subr and on everything its encoding calls
        try:
            if fdidx not in cand_subr._fdidx:
                cand_subr._fdidx.append(fdidx)
        except AttributeError:
            cand_subr._fdidx = [fdidx]

        for it in cand_subr._encoding:
            mark_reachable(it[1], fdidx)

    if fdselect is not None:
        for g, enc in zip(glyph_set_keys, encodings):
            sel = fdselect(g)
            for it in enc:
                mark_reachable(it[1], sel)
    else:
        for encoding in encodings:
            for it in encoding:
                mark_reachable(it[1], 0)

    subrs = [
        s for s in substrings
        if s.usages() > 0 and hasattr(s, '_fdidx') and bool(s._fdidx)
        and s.subr_saving(use_usages=True, true_cost=True) > 0
    ]

    bad_substrings = [
        s for s in substrings
        if s.usages() == 0 or not hasattr(s, '_fdidx')
        or not bool(s._fdidx)
        or s.subr_saving(use_usages=True, true_cost=True) <= 0
    ]
    log.debug("%d substrings unused or negative saving subrs",
              len(bad_substrings))

    for s in bad_substrings:
        s._flatten = True

    gsubrs = []
    lsubrs = [[] for _ in range(fdlen)]

    # ascending by saving, so the most valuable candidate is popped first
    subrs.sort(
        key=lambda s: s.subr_saving(use_usages=True, true_cost=True))

    while subrs and (any(len(s) < subr_limit for s in lsubrs)
                     or len(gsubrs) < subr_limit):
        subr = subrs.pop()
        if len(subr._fdidx) == 1:
            lsub_index = lsubrs[subr._fdidx[0]]
            if len(gsubrs) < subr_limit:
                if len(lsub_index) < subr_limit:
                    # both have space
                    gcost = Compreffor.test_call_cost(subr, gsubrs)
                    lcost = Compreffor.test_call_cost(subr, lsub_index)

                    if gcost < lcost:
                        Compreffor.insert_by_usage(subr, gsubrs)
                        subr._global = True
                    else:
                        Compreffor.insert_by_usage(subr, lsub_index)
                else:
                    # just gsubrs has space
                    Compreffor.insert_by_usage(subr, gsubrs)
                    subr._global = True
            elif len(lsub_index) < subr_limit:
                # just lsubrs has space
                Compreffor.insert_by_usage(subr, lsub_index)
            else:
                # we must skip :(
                bad_substrings.append(subr)
        else:
            if len(gsubrs) < subr_limit:
                # we can put it in globals
                Compreffor.insert_by_usage(subr, gsubrs)
                subr._global = True
            else:
                # no room for this one
                bad_substrings.append(subr)

    # add any leftover subrs to bad_substrings.  The leftovers are the
    # candidate objects themselves, so they are added as-is rather than
    # being indexed like tuples.
    bad_substrings.extend(subrs)

    if fdselect is not None:
        # CID-keyed: Avoid `callsubr` usage in global subroutines
        bad_lsubrs = Compreffor.collect_lsubrs_called_from(gsubrs)
        bad_substrings.extend(bad_lsubrs)
        lsubrs = [[s for s in lsubrarr if s not in bad_lsubrs]
                  for lsubrarr in lsubrs]

    for s in bad_substrings:
        s._flatten = True

    # fix any nesting issues
    Compreffor.calc_nesting(gsubrs)
    for lsubrarr in lsubrs:
        Compreffor.calc_nesting(lsubrarr)

    too_nested = [
        s for s in itertools.chain(*lsubrs)
        if s._max_call_depth > nest_limit
    ]
    too_nested.extend(
        [s for s in gsubrs if s._max_call_depth > nest_limit])
    for s in too_nested:
        s._flatten = True
    bad_substrings.extend(too_nested)
    lsubrs = [[s for s in lsubrarr if s._max_call_depth <= nest_limit]
              for lsubrarr in lsubrs]
    gsubrs = [s for s in gsubrs if s._max_call_depth <= nest_limit]

    log.debug("%d substrings nested too deep", len(too_nested))
    log.debug("%d substrings being flattened", len(bad_substrings))

    # reorganize to minimize call cost of most frequent subrs
    gbias = psCharStrings.calcSubrBias(gsubrs)
    lbias = [psCharStrings.calcSubrBias(s) for s in lsubrs]

    for subr_arr, bias in zip(itertools.chain([gsubrs], lsubrs),
                              itertools.chain([gbias], lbias)):
        subr_arr.sort(key=lambda s: s.usages(), reverse=True)

        # the slices rotate the usage-sorted array in place; which
        # rotation applies depends on the bias computed for this array
        if bias == 1131:
            subr_arr[:] = (subr_arr[216:1240] + subr_arr[0:216]
                           + subr_arr[1240:])
        elif bias == 32768:
            subr_arr[:] = (subr_arr[2264:33901] + subr_arr[216:1240]
                           + subr_arr[0:216] + subr_arr[1240:2264]
                           + subr_arr[33901:])
        for idx, subr in enumerate(subr_arr):
            subr._position = idx

    for subr in sorted(bad_substrings, key=lambda s: len(s)):
        # NOTE: it is important this is run in order so shorter
        # substrings are run before longer ones
        if hasattr(subr, '_fdidx') and len(subr._fdidx) > 0:
            program = [rev_keymap[tok] for tok in subr.value()]
            Compreffor.update_program(program, subr.encoding(), gbias,
                                      lbias, None)
            Compreffor.expand_hintmask(program)
            subr._program = program

    for subr_arr, sel in zip(itertools.chain([gsubrs], lsubrs),
                             itertools.chain([None], range(fdlen))):
        for subr in subr_arr:
            program = [rev_keymap[tok] for tok in subr.value()]
            if program[-1] not in ("endchar", "return"):
                program.append("return")
            Compreffor.update_program(program, subr.encoding(), gbias,
                                      lbias, sel)
            Compreffor.expand_hintmask(program)
            subr._program = program

    return (gsubrs, lsubrs)
def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
    """
    Choose a subroutinization encoding for all charstrings in
    `glyph_set` using an iterative Dynamic Programming algorithm.
    Initially uses the results from SubstringFinder and then
    iteratively optimizes.

    Arguments:
    glyph_set -- the set of charstrings to encode (required)
    fdselect -- the FDSelect array of the source font, or None
    fdlen -- the number of FD's in the source font, or 1 if there are none

    Returns:
    A three-part dictionary with keys 'gsubrs', 'lsubrs', and
    'glyph_encodings'. The 'glyph_encodings' encoding dictionary
    specifies how to break up each charstring. Encoding[i]
    describes how to encode glyph i. Each entry is something
    like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
    into the charstring that indicates where a subr starts and c_*
    is a CandidateSubr. The 'gsubrs' entry contains an array of global
    subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
    by FDidx, where each entry is a list of local subroutines.

    When a multiprocessing pool is created for the optimization rounds it
    is closed and joined before the subrs are processed, also if a round
    raises.
    """

    # generate substrings for marketplace
    sf = SubstringFinder(glyph_set)

    if self.test_mode:
        substrings = sf.get_substrings(min_freq=0, check_positive=False,
                                       sort_by_length=False)
    else:
        substrings = sf.get_substrings(min_freq=2, check_positive=True,
                                       sort_by_length=False)

    # TODO remove unnecessary substrings?

    data = sf.data
    rev_keymap = sf.rev_keymap
    cost_map = sf.cost_map
    glyph_set_keys = sf.glyph_set_keys
    del sf

    if not self.SINGLE_PROCESS:
        pool = multiprocessing.Pool(processes=self.PROCESSES)
    else:
        class DummyPool:
            pass
        pool = DummyPool()
        # same call shape as Pool.map; keyword arguments such as
        # chunksize are accepted and ignored
        pool.map = lambda f, *l, **kwargs: map(f, *l)

    try:
        substr_dict = {}

        timer.split()

        log.debug("glyphstrings+substrings=%d", len(data) + len(substrings))

        # set up dictionary with initial values
        for idx, substr in enumerate(substrings):
            substr._adjusted_cost = substr.cost()
            substr._price = substr._adjusted_cost
            # this is the frequency that the substring appears,
            # not necessarily used
            substr._usages = substr.freq
            substr._list_idx = idx
            # NOTE: avoid excess data copying on fork
            # probably can just pass substr
            # if threading instead
            substr_dict[substr.value()] = (idx, substr._price)

        for run_count in range(self.NROUNDS):
            # calibrate prices
            for idx, substr in enumerate(substrings):
                marg_cost = float(
                    substr._adjusted_cost) / (substr._usages + self.K)
                substr._price = (marg_cost * self.ALPHA
                                 + substr._price * (1 - self.ALPHA))
                substr_dict[substr.value()] = (idx, substr._price)

            # minimize substring costs
            csize = int(math.ceil(self.POOL_CHUNKRATIO * len(substrings)))
            substr_encodings = pool.map(
                functools.partial(optimize_charstring,
                                  cost_map=cost_map,
                                  substr_dict=substr_dict,
                                  progress=self._progress),
                enumerate([s.value() for s in substrings]),
                chunksize=csize)

            for substr, result in zip(substrings, substr_encodings):
                substr._encoding = [
                    (enc_item[0], substrings[enc_item[1]])
                    for enc_item in result["encoding"]
                ]
                substr._adjusted_cost = result["market_cost"]
            del substr_encodings

            # minimize charstring costs in current market through DP
            csize = int(math.ceil(self.POOL_CHUNKRATIO * len(data)))
            encodings = pool.map(
                functools.partial(optimize_charstring,
                                  cost_map=cost_map,
                                  substr_dict=substr_dict,
                                  progress=self._progress),
                data,
                chunksize=csize)
            encodings = [
                [(enc_item[0], substrings[enc_item[1]])
                 for enc_item in i["encoding"]]
                for i in encodings
            ]

            # update substring frequencies based on cost minimization
            for substr in substrings:
                substr._usages = 0

            for calling_substr in substrings:
                for start, substr in calling_substr._encoding:
                    if substr:
                        substr._usages += 1
            for glyph_idx, enc in enumerate(encodings):
                for start, substr in enc:
                    if substr:
                        substr._usages += 1

            if log.isEnabledFor(logging.INFO):
                log.info("Round %d Done!", (run_count + 1))
                # the statistics are undefined without any substrings
                if substrings:
                    log.info(
                        "avg: %f",
                        (float(sum(substr._usages for substr in substrings))
                         / len(substrings)))
                    log.info("max: %d",
                             max(substr._usages for substr in substrings))
                    log.info("used: %d",
                             sum(substr._usages > 0
                                 for substr in substrings))

            if run_count <= self.NROUNDS - 2 and not self.test_mode:
                with timer("cutdown"):
                    if run_count < self.NROUNDS - 2:
                        bad_substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True) <= 0
                        ]
                        substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True) > 0
                        ]
                    else:
                        bad_substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True,
                                             true_cost=False) <= 0
                        ]
                        substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True,
                                             true_cost=False) > 0
                        ]

                    for substr in bad_substrings:
                        # heuristic to encourage use of called substrings:
                        for idx, called_substr in substr._encoding:
                            called_substr._usages += substr._usages - 1
                        del substr_dict[substr.value()]
                    for idx, s in enumerate(substrings):
                        s._list_idx = idx
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug(
                            "%d substrings with non-positive savings removed",
                            len(bad_substrings))
                        log.debug(
                            "(%d had positive usage)",
                            len([s for s in bad_substrings
                                 if s._usages > 0]))
    finally:
        if not self.SINGLE_PROCESS:
            # every pool.map call has returned by now, so nothing is
            # outstanding; release the worker processes
            pool.close()
            pool.join()

    log.info("Finished iterative market (%gs)", timer.split())
    log.info("%d candidate subrs found", len(substrings))

    gsubrs, lsubrs = Compreffor.process_subrs(glyph_set_keys, encodings,
                                              fdlen, fdselect, substrings,
                                              rev_keymap,
                                              self.NSUBRS_LIMIT,
                                              self.SUBR_NEST_LIMIT)

    return {
        "glyph_encodings": dict(zip(glyph_set_keys, encodings)),
        "lsubrs": lsubrs,
        "gsubrs": gsubrs
    }
def optimize_charstring(charstring, cost_map, substr_dict, progress=False):
    """Optimize a charstring (encoded using keymap) using
    the substrings in substr_dict. This is the Dynamic Programming portion
    of `iterative_encode`.

    Arguments:
    charstring -- either a tuple of integer tokens, or an `(index, tokens)`
                  pair.  In the second form `index` names the entry of
                  `substr_dict` that stands for `tokens` itself; that
                  entry is never chosen, its span is priced at plain
                  token cost instead.
    cost_map -- maps each token to its cost
    substr_dict -- maps a token tuple to an `(index, price)` pair
    progress -- when true, write a "." to stderr once the work is done

    Returns:
    A dictionary with the keys 'encoding', a list of
    `(start, substring index)` pairs in increasing start order, and
    'market_cost', the minimal total cost that was found.
    """

    # bound up front so the name always exists, whatever shape came in
    skip_idx = None
    if len(charstring) > 1 and type(charstring[1]) is tuple:
        if type(charstring[0]) is int:
            skip_idx = charstring[0]
            charstring = charstring[1]

    length = len(charstring)
    # results[i] is the cheapest cost of encoding charstring[i:]
    results = [0] * (length + 1)
    next_enc_idx = [None] * length
    next_enc_substr = [None] * length
    for i in reversed(range(length)):
        min_option = float("inf")
        min_enc_idx = length
        min_enc_substr = None
        cur_cost = 0
        for j in range(i + 1, length + 1):
            cur_cost += cost_map[charstring[j - 1]]

            # slice and hash the span once per cell
            entry = substr_dict.get(charstring[i:j])
            if entry is None:
                # note: must not be branching, so just make _price
                # actual cost
                substr = None
                option = cur_cost + results[j]
            elif entry[0] != skip_idx:
                option = entry[1] + results[j]
                substr = entry[0]
            else:
                # only the whole input can match its own entry
                assert i == 0 and j == length
                substr = None
                option = cur_cost + results[j]

            if option < min_option:
                min_option = option
                min_enc_idx = j
                min_enc_substr = substr

        results[i] = min_option
        next_enc_idx[i] = min_enc_idx
        next_enc_substr[i] = min_enc_substr

    market_cost = results[0]

    # walk the chosen split points from the front to collect the calls
    encoding = []
    cur_enc_idx = 0
    while cur_enc_idx < length:
        last_idx = cur_enc_idx
        cur_enc_substr = next_enc_substr[cur_enc_idx]
        cur_enc_idx = next_enc_idx[cur_enc_idx]

        if cur_enc_substr is not None:
            encoding.append((last_idx, cur_enc_substr))

    if progress:
        sys.stderr.write(".")
        sys.stderr.flush()
    return {"encoding": encoding, "market_cost": market_cost}
def process_subrs(glyph_set_keys, encodings, fdlen, fdselect, substrings,
                  rev_keymap, subr_limit, nest_limit):
    """
    Distribute candidate subrs over the global and local subr arrays and
    build the program of every candidate.

    Arguments:
    glyph_set_keys -- glyph names, paired positionally with `encodings`
    encodings -- one encoding per glyph; the second element of each item
                 is the candidate subr that is called
    fdlen -- number of local subr arrays to create
    fdselect -- callable mapping a glyph name to its FD index, or None,
                in which case every glyph counts as FD index 0
    substrings -- the candidate subrs to distribute
    rev_keymap -- maps the tokens of `subr.value()` back to program tokens
    subr_limit -- maximum number of subrs allowed in any single array
    nest_limit -- maximum `_max_call_depth` a kept subr may have

    Candidates that are unused, unreachable, unprofitable, left over when
    the arrays are full, or nested too deeply are marked `_flatten = True`.

    Returns:
    A tuple `(gsubrs, lsubrs)`, where `lsubrs` holds one list per FD index.
    """

    def mark_reachable(cand_subr, fdidx):
        # record fdidx on this subr and on everything its encoding calls
        try:
            if fdidx not in cand_subr._fdidx:
                cand_subr._fdidx.append(fdidx)
        except AttributeError:
            cand_subr._fdidx = [fdidx]

        for it in cand_subr._encoding:
            mark_reachable(it[1], fdidx)

    if fdselect is not None:
        for g, enc in zip(glyph_set_keys, encodings):
            sel = fdselect(g)
            for it in enc:
                mark_reachable(it[1], sel)
    else:
        for encoding in encodings:
            for it in encoding:
                mark_reachable(it[1], 0)

    subrs = [s for s in substrings
             if s.usages() > 0 and hasattr(s, '_fdidx') and bool(s._fdidx)
             and s.subr_saving(use_usages=True, true_cost=True) > 0]

    bad_substrings = [s for s in substrings
                      if s.usages() == 0 or not hasattr(s, '_fdidx')
                      or not bool(s._fdidx)
                      or s.subr_saving(use_usages=True, true_cost=True) <= 0]
    log.debug("%d substrings unused or negative saving subrs",
              len(bad_substrings))

    for s in bad_substrings:
        s._flatten = True

    gsubrs = []
    lsubrs = [[] for _ in range(fdlen)]

    # ascending by saving, so the most valuable candidate is popped first
    subrs.sort(key=lambda s: s.subr_saving(use_usages=True, true_cost=True))

    while subrs and (any(len(s) < subr_limit for s in lsubrs)
                     or len(gsubrs) < subr_limit):
        subr = subrs.pop()
        if len(subr._fdidx) == 1:
            lsub_index = lsubrs[subr._fdidx[0]]
            if len(gsubrs) < subr_limit:
                if len(lsub_index) < subr_limit:
                    # both have space
                    gcost = Compreffor.test_call_cost(subr, gsubrs)
                    lcost = Compreffor.test_call_cost(subr, lsub_index)

                    if gcost < lcost:
                        Compreffor.insert_by_usage(subr, gsubrs)
                        subr._global = True
                    else:
                        Compreffor.insert_by_usage(subr, lsub_index)
                else:
                    # just gsubrs has space
                    Compreffor.insert_by_usage(subr, gsubrs)
                    subr._global = True
            elif len(lsub_index) < subr_limit:
                # just lsubrs has space
                Compreffor.insert_by_usage(subr, lsub_index)
            else:
                # we must skip :(
                bad_substrings.append(subr)
        else:
            if len(gsubrs) < subr_limit:
                # we can put it in globals
                Compreffor.insert_by_usage(subr, gsubrs)
                subr._global = True
            else:
                # no room for this one
                bad_substrings.append(subr)

    # add any leftover subrs to bad_substrings.  The leftovers are the
    # candidate objects themselves, so they are added as-is rather than
    # being indexed like tuples.
    bad_substrings.extend(subrs)

    for s in bad_substrings:
        s._flatten = True

    # fix any nesting issues
    Compreffor.calc_nesting(gsubrs)
    for lsubrarr in lsubrs:
        Compreffor.calc_nesting(lsubrarr)

    too_nested = [s for s in itertools.chain(*lsubrs)
                  if s._max_call_depth > nest_limit]
    too_nested.extend([s for s in gsubrs if s._max_call_depth > nest_limit])
    for s in too_nested:
        s._flatten = True
    bad_substrings.extend(too_nested)
    lsubrs = [[s for s in lsubrarr if s._max_call_depth <= nest_limit]
              for lsubrarr in lsubrs]
    gsubrs = [s for s in gsubrs if s._max_call_depth <= nest_limit]

    log.debug("%d substrings nested too deep", len(too_nested))
    log.debug("%d substrings being flattened", len(bad_substrings))

    # reorganize to minimize call cost of most frequent subrs
    gbias = psCharStrings.calcSubrBias(gsubrs)
    lbias = [psCharStrings.calcSubrBias(s) for s in lsubrs]

    for subr_arr, bias in zip(itertools.chain([gsubrs], lsubrs),
                              itertools.chain([gbias], lbias)):
        subr_arr.sort(key=lambda s: s.usages(), reverse=True)

        # the slices rotate the usage-sorted array in place; which
        # rotation applies depends on the bias computed for this array
        if bias == 1131:
            subr_arr[:] = (subr_arr[216:1240] + subr_arr[0:216]
                           + subr_arr[1240:])
        elif bias == 32768:
            subr_arr[:] = (subr_arr[2264:33901] + subr_arr[216:1240]
                           + subr_arr[0:216] + subr_arr[1240:2264]
                           + subr_arr[33901:])
        for idx, subr in enumerate(subr_arr):
            subr._position = idx

    for subr in sorted(bad_substrings, key=lambda s: len(s)):
        # NOTE: it is important this is run in order so shorter
        # substrings are run before longer ones
        if hasattr(subr, '_fdidx') and len(subr._fdidx) > 0:
            program = [rev_keymap[tok] for tok in subr.value()]
            Compreffor.update_program(program, subr.encoding(), gbias,
                                      lbias, None)
            Compreffor.expand_hintmask(program)
            subr._program = program

    for subr_arr, sel in zip(itertools.chain([gsubrs], lsubrs),
                             itertools.chain([None], range(fdlen))):
        for subr in subr_arr:
            program = [rev_keymap[tok] for tok in subr.value()]
            if program[-1] not in ("endchar", "return"):
                program.append("return")
            Compreffor.update_program(program, subr.encoding(), gbias,
                                      lbias, sel)
            Compreffor.expand_hintmask(program)
            subr._program = program

    return (gsubrs, lsubrs)
def iterative_encode(self, glyph_set, fdselect=None, fdlen=1):
    """
    Choose a subroutinization encoding for all charstrings in
    `glyph_set` using an iterative Dynamic Programming algorithm.
    Initially uses the results from SubstringFinder and then
    iteratively optimizes.

    Arguments:
    glyph_set -- the set of charstrings to encode (required)
    fdselect -- the FDSelect array of the source font, or None
    fdlen -- the number of FD's in the source font, or 1 if there are none

    Returns:
    A three-part dictionary with keys 'gsubrs', 'lsubrs', and
    'glyph_encodings'. The 'glyph_encodings' encoding dictionary
    specifies how to break up each charstring. Encoding[i]
    describes how to encode glyph i. Each entry is something
    like [(x_1, c_1), (x_2, c_2), ..., (x_k, c_k)], where x_* is an index
    into the charstring that indicates where a subr starts and c_*
    is a CandidateSubr. The 'gsubrs' entry contains an array of global
    subroutines (CandidateSubr objects) and 'lsubrs' is an array indexed
    by FDidx, where each entry is a list of local subroutines.

    When a multiprocessing pool is created for the optimization rounds it
    is closed and joined before the subrs are processed, also if a round
    raises.
    """

    # generate substrings for marketplace
    sf = SubstringFinder(glyph_set)

    if self.test_mode:
        substrings = sf.get_substrings(min_freq=0, check_positive=False,
                                       sort_by_length=False)
    else:
        substrings = sf.get_substrings(min_freq=2, check_positive=True,
                                       sort_by_length=False)

    # TODO remove unnecessary substrings?

    data = sf.data
    rev_keymap = sf.rev_keymap
    cost_map = sf.cost_map
    glyph_set_keys = sf.glyph_set_keys
    del sf

    if not self.SINGLE_PROCESS:
        pool = multiprocessing.Pool(processes=self.PROCESSES)
    else:
        class DummyPool:
            pass
        pool = DummyPool()
        # same call shape as Pool.map; keyword arguments such as
        # chunksize are accepted and ignored
        pool.map = lambda f, *l, **kwargs: map(f, *l)

    try:
        substr_dict = {}

        timer.split()

        log.debug("glyphstrings+substrings=%d", len(data) + len(substrings))

        # set up dictionary with initial values
        for idx, substr in enumerate(substrings):
            substr._adjusted_cost = substr.cost()
            substr._price = substr._adjusted_cost
            # this is the frequency that the substring appears,
            # not necessarily used
            substr._usages = substr.freq
            substr._list_idx = idx
            # NOTE: avoid excess data copying on fork
            # probably can just pass substr
            # if threading instead
            substr_dict[substr.value()] = (idx, substr._price)

        for run_count in range(self.NROUNDS):
            # calibrate prices
            for idx, substr in enumerate(substrings):
                marg_cost = (float(substr._adjusted_cost)
                             / (substr._usages + self.K))
                substr._price = (marg_cost * self.ALPHA
                                 + substr._price * (1 - self.ALPHA))
                substr_dict[substr.value()] = (idx, substr._price)

            # minimize substring costs
            csize = int(math.ceil(self.POOL_CHUNKRATIO*len(substrings)))
            substr_encodings = pool.map(
                functools.partial(optimize_charstring,
                                  cost_map=cost_map,
                                  substr_dict=substr_dict,
                                  progress=self._progress),
                enumerate([s.value() for s in substrings]),
                chunksize=csize)

            for substr, result in zip(substrings, substr_encodings):
                substr._encoding = [(enc_item[0], substrings[enc_item[1]])
                                    for enc_item in result["encoding"]]
                substr._adjusted_cost = result["market_cost"]
            del substr_encodings

            # minimize charstring costs in current market through DP
            csize = int(math.ceil(self.POOL_CHUNKRATIO*len(data)))
            encodings = pool.map(
                functools.partial(optimize_charstring,
                                  cost_map=cost_map,
                                  substr_dict=substr_dict,
                                  progress=self._progress),
                data,
                chunksize=csize)
            encodings = [[(enc_item[0], substrings[enc_item[1]])
                          for enc_item in i["encoding"]]
                         for i in encodings]

            # update substring frequencies based on cost minimization
            for substr in substrings:
                substr._usages = 0

            for calling_substr in substrings:
                for start, substr in calling_substr._encoding:
                    if substr:
                        substr._usages += 1
            for glyph_idx, enc in enumerate(encodings):
                for start, substr in enc:
                    if substr:
                        substr._usages += 1

            if log.isEnabledFor(logging.INFO):
                log.info("Round %d Done!", (run_count + 1))
                # the statistics are undefined without any substrings
                if substrings:
                    log.info("avg: %f",
                             (float(sum(substr._usages
                                        for substr in substrings))
                              / len(substrings)))
                    log.info("max: %d",
                             max(substr._usages for substr in substrings))
                    log.info("used: %d",
                             sum(substr._usages > 0
                                 for substr in substrings))

            if run_count <= self.NROUNDS - 2 and not self.test_mode:
                with timer("cutdown"):
                    if run_count < self.NROUNDS - 2:
                        bad_substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True) <= 0]
                        substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True) > 0]
                    else:
                        bad_substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True,
                                             true_cost=False) <= 0]
                        substrings = [
                            s for s in substrings
                            if s.subr_saving(use_usages=True,
                                             true_cost=False) > 0]

                    for substr in bad_substrings:
                        # heuristic to encourage use of called substrings:
                        for idx, called_substr in substr._encoding:
                            called_substr._usages += substr._usages - 1
                        del substr_dict[substr.value()]
                    for idx, s in enumerate(substrings):
                        s._list_idx = idx
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug(
                            "%d substrings with non-positive savings removed",
                            len(bad_substrings))
                        log.debug(
                            "(%d had positive usage)",
                            len([s for s in bad_substrings
                                 if s._usages > 0]))
    finally:
        if not self.SINGLE_PROCESS:
            # every pool.map call has returned by now, so nothing is
            # outstanding; release the worker processes
            pool.close()
            pool.join()

    log.info("Finished iterative market (%gs)", timer.split())
    log.info("%d candidate subrs found", len(substrings))

    gsubrs, lsubrs = Compreffor.process_subrs(
        glyph_set_keys,
        encodings,
        fdlen,
        fdselect,
        substrings,
        rev_keymap,
        self.NSUBRS_LIMIT,
        self.SUBR_NEST_LIMIT)

    return {"glyph_encodings": dict(zip(glyph_set_keys, encodings)),
            "lsubrs": lsubrs,
            "gsubrs": gsubrs}