def _print_convert_msg (url, verbose, config): if verbose > 0: line_length = config.get_int('status_line_length', 60) urltext = str (url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Converting %s..." % urltext)
def _print_convert_msg (url, id, verbose, config): if verbose > 1: line_length = config.get_int('status_line_length', 60) urltext = str (url) if len (urltext) > line_length: urltext = urltext[:line_length - 20] + "....." + urltext[-15:] message("Converted %4d: %s" % (id, urltext))
def _write_doc(self, out_dict, pluckerdoc, url, id, verbose):
    """Serialize one PluckerDocument into out_dict, keyed by record id.

    Text documents may be split into several records by the dumper; each
    split record is entered separately.  Values stored in out_dict are
    (dump, url, id, verbose) tuples that are written out later.

    Raises ValueError when the caller-supplied id disagrees with the
    mapper's registered id for the document.
    """
    def _status(rec_url, rec_id, verbose, config):
        # Report each converted record when verbosity is high enough.
        if verbose > 1:
            limit = config.get_int('status_line_length', 60)
            text = str(rec_url)
            if len(text) > limit:
                text = text[:limit - 20] + "....." + text[-15:]
            message("Converted %4d: %s" % (rec_id, text))

    expected = self._mapper.get_or_add(pluckerdoc)
    if id != expected:
        raise ValueError("bad id %d instead of %d" % (id, expected))

    if not pluckerdoc.is_text_document():
        out_dict[id] = (pluckerdoc.dump_record(id), url, id, verbose)
        _status(url, id, verbose, self._config)
        return

    # Text documents: one dump per split record.
    splits = pluckerdoc.dump_record_with_splits(self._mapper)
    if splits[0][1] != id:
        message("****** bad id %d instead of %d" % (splits[0][1], id,))
    for (rec_url, rec_id, data) in splits:
        if rec_id == 0:
            rec_id = id  # id 0 marks the original (unsplit) record
        out_dict[rec_id] = (data, rec_url, rec_id, verbose)
        _status(rec_url, rec_id, verbose, self._config)
def get_or_add(self, url_or_doc):
    """Return (registering if necessary) the record id for a URL or document.

    For a standard URL, returns the numeric record ID.
    For a URL which has a fragment-id:
        If the fragment is a paragraph of a text page, a pair
        (record-id, paragraph-id) is returned.
        Otherwise, just the record id is returned.
    If arg is PluckerDocument, returns the id assigned for that document.
    If arg is integer, treats it as a registered-document id.  Get-only.

    Raises ValueError for any other argument type.
    """
    # NOTE(review): Python 2 code — type(url_or_doc) == type('') matches
    # only `str`, not `unicode`; presumably intentional, confirm callers.
    if type(url_or_doc) == type(''):
        import urllib
        # split "url#fragment" into (url, fragment-or-None)
        url, tag = urllib.splittag(url_or_doc)
        # resolve any configured alias before looking up the id
        finalurl = self._alias_list.get(url, url)
        if tag:
            id = self._get_id_for_url((finalurl, tag))
        else:
            id = self._get_id_for_url(finalurl)
        return id
    elif isinstance(url_or_doc, PluckerDocs.PluckerDocument):
        url = url_or_doc.get_url()
        # remember the document for its URL (first document wins)
        if not self._url_to_doc_mapping.has_key(url):
            self._url_to_doc_mapping[url] = url_or_doc
        # if an id was already reserved for the URL, attach it to the doc
        if not self._doc_to_id_mapping.has_key(url_or_doc) and self._url_to_id_mapping.has_key(url):
            self._doc_to_id_mapping[url_or_doc] = self._url_to_id_mapping[url]
        if not self._doc_to_id_mapping.has_key(url_or_doc):
            message(2, "New document %s added", url_or_doc)
        return self._get_id_for_doc(url_or_doc)
    else:
        raise ValueError("not a URL or an instance of " + str(PluckerDocs.PluckerDocument))
def _get_id_for_doc(self, idoc, add=1):
    """Return the record id for a document (or (doc, fragment) tuple).

    idoc may be a PluckerDocument or a (document, fragment) tuple; for a
    tuple the result is (id, fragment).  If the document has no id yet
    and add is true, a new id is assigned (special document types get
    fixed, reserved record numbers); if add is false, returns None.
    """
    if type(idoc) == type(()):
        doc = idoc[0]
    else:
        doc = idoc
    id = self._doc_to_id_mapping.get(doc)
    if not id:
        # fall back to an id reserved for the document's URL, and cache it
        id = self._url_to_id_mapping.get(doc.get_url())
        if id:
            self._doc_to_id_mapping[doc] = id
    if not id:
        if not add:
            return None
        # Special document types live at fixed, reserved record numbers;
        # everything else gets the next sequential id.
        if isinstance(doc, PluckerDocs.PluckerIndexDocument):
            # there's only one, and it always has record # 1
            id = 1
        elif isinstance(doc, PluckerDocs.PluckerBookmarkDocument):
            id = 6
        elif isinstance(doc, PluckerDocs.PluckerLinkIndexDocument):
            id = 3
        elif isinstance(doc, PluckerDocs.PluckerCategoryDocument):
            id = 4
        elif isinstance(doc, PluckerDocs.PluckerMetadataDocument):
            id = 5
        else:
            id = self._current_id
            self._current_id = self._current_id + 1
        self._doc_to_id_mapping[doc] = id
        # keep the URL -> document mapping consistent; warn when the URL
        # already points at a different document
        url_mapping = self._url_to_doc_mapping.get(doc.get_url())
        if (url_mapping != doc):
            if (url_mapping != None):
                message("URL %s for doc %s points to doc %s\n" % (doc.get_url(), str(doc), str(url_mapping)))
            self._url_to_doc_mapping[doc.get_url()] = doc
        # message("new document " + str(doc) + " => " + str(id) + "\n")
    if type(idoc) == type(()):
        return (id, idoc[1])
    else:
        return id
def convert(self, width, height, bpp, section):
    """Convert self._image into Palm image data using the JIU toolkit.

    Jython-only: drives net.sourceforge.jiu through Java interop.
    Optionally crops to `section` (x, y, width, height), scales the
    image to width x height, reduces it to `bpp` bits per pixel
    (1, 2, 4 or 8; anything else falls back to 16-bit color), and
    returns the RLE-compressed Palm image bytes as a string.

    Raises RuntimeError on any failure during conversion.
    """
    import java, jarray
    import net.sourceforge.jiu
    try:
        if section:
            # crop the (x, y, w, h) region out of the source image first
            cropper = net.sourceforge.jiu.geometry.Crop()
            cropper.setInputImage(self._image)
            cropper.setBounds(section[0], section[1],
                              section[0] + section[2],
                              section[1] + section[3])
            cropper.process()
            im = cropper.getOutputImage()
        else:
            im = self._image
        # scale if necessary
        if width != im.getWidth() or height != im.getHeight():
            message(
                2, "Scaling original %dx%d image by %f/%f to %dx%dx%d" %
                (im.getWidth(), im.getHeight(),
                 float(width) / float(im.getWidth()),
                 float(height) / float(im.getHeight()), width, height, bpp))
            scaler = net.sourceforge.jiu.geometry.Resample()
            scaler.setInputImage(im)
            scaler.setSize(width, height)
            # bell filter is reasonably fast and reasonably accurate
            # scaler.setFilter(net.sourceforge.jiu.geometry.Resample.FILTER_TYPE_BELL);
            # b-spline is more accurate
            scaler.setFilter(
                net.sourceforge.jiu.geometry.Resample.FILTER_TYPE_B_SPLINE)
            scaler.process()
            im = scaler.getOutputImage()
        # convert to proper bit depth
        if bpp == 1:
            # 1 bpp: grayscale conversion then Floyd-Steinberg dither to 1 bit
            reducer = net.sourceforge.jiu.color.reduction.RGBToGrayConversion(
            )
            reducer.setInputImage(im)
            reducer.process()
            ditherer = net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering(
            )
            ditherer.setType(net.sourceforge.jiu.color.dithering.
                             ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
            ditherer.setGrayscaleOutputBits(1)
            ditherer.setInputImage(reducer.getOutputImage())
            ditherer.process()
            im = ditherer.getOutputImage()
        elif bpp in (2, 4, 8):
            # pick a system palette; 8-bit uses the palette without dithering
            if bpp == 2:
                palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem2BitGrayscalePalette(
                )
                dither = 1
            elif bpp == 4:
                palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem4BitGrayscalePalette(
                )
                dither = 1
            elif bpp == 8:
                palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem8BitPalette(
                )
                dither = 0
            quantizer = net.sourceforge.jiu.color.quantization.ArbitraryPaletteQuantizer(
                palette)
            # dithering can also be disabled globally or per-image
            nodither = self._config.get_bool(
                'no_dithering_in_java_image_quantization'
            ) or self._attribs.has_key('nodither')
            if nodither or not dither:
                quantizer.setInputImage(im)
                quantizer.process()
                im = quantizer.getOutputImage()
            else:
                ditherer = net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering(
                )
                ditherer.setType(
                    net.sourceforge.jiu.color.dithering.
                    ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                ditherer.setQuantizer(quantizer)
                ditherer.setTruecolorOutput(0)
                ditherer.setInputImage(im)
                ditherer.process()
                im = ditherer.getOutputImage()
        elif bpp != 16:
            message(
                0,
                "%d bpp images not supported with JIU imaging yet. Using 16 bit color.\n"
                % (bpp, ))
        # finally, turn it into a Palm image
        codec = net.sourceforge.jiu.codecs.PalmCodec()
        codec.setImage(im)
        codec.setCompression(
            net.sourceforge.jiu.codecs.PalmCodec.COMPRESSION_RLE)
        outputStream = java.io.ByteArrayOutputStream()
        codec.setOutputStream(outputStream)
        codec.process()
        codec.close()
        # bytes = outputStream.toByteArray()
        # bits = string.join(map(lambda x: chr(((x < 0) and (0x100 + x)) or x), bytes), "")
        return outputStream.toString(0)
    except:
        # NOTE(review): bare except deliberately converts any Java or Python
        # failure into RuntimeError; the original traceback is only shown
        # when verbose > 1.
        if self._verbose > 1:
            import traceback
            traceback.print_exc()
        raise RuntimeError("Error while converting image " + self._url + " with JIU")
def write(self, verbose, alias_list=None):
    """Write out the collection.  Returns the mapping that was used to
    generate the ids.

    Walks every collected document, resolving ids and charsets and
    gathering bookmarks; then writes the special records (index,
    bookmarks, link info, category, metadata) and finally every record
    in numeric id order via save_data().
    """
    def _print_convert_msg(url, verbose, config):
        # status line for each document as it is converted
        if verbose > 0:
            line_length = config.get_int('status_line_length', 60)
            urltext = str(url)
            if len(urltext) > line_length:
                urltext = urltext[:line_length - 20] + "....." + urltext[-15:]
            message("Converting %s..." % urltext)

    self._mapper = Mapper(self._collection, alias_list.as_dict())
    # figure default charset
    mibenum = self._config.get_int('default_charset', 0) or None
    charsets = {}  # MIBenum -> list of record ids using that charset
    if verbose > 2:
        self._mapper.print_mapping()
    out_dict = {}    # record id -> (dump, url, id, verbose), filled by _write_doc
    bookmarks = {}   # bookmark name -> (record id, paragraph)
    for pluckerdoc in self._mapper.get_docs():
        id = self._mapper.get_or_add(pluckerdoc)
        _print_convert_msg(pluckerdoc.get_url(), verbose, self._config)
        if pluckerdoc.is_multiimage_document():
            pluckerdoc.resolve_ids(self._mapper)
        if pluckerdoc.is_table_document():
            pluckerdoc.resolve_ids(self._mapper)
        if pluckerdoc.is_text_document():
            pluckerdoc.resolve_ids(self._mapper)
            # record which charset this page uses
            doc_mibenum = pluckerdoc.get_charset()
            if verbose > 2:
                charset_name = charset_mibenum_to_name(doc_mibenum)
                message(2, pluckerdoc.get_url() + ' has charset ' + str(doc_mibenum) + ((charset_name and " (" + charset_name + ")") or "") + "\n")
            if charsets.has_key(doc_mibenum):
                charsets[doc_mibenum].append(id)
            else:
                charsets[doc_mibenum] = [id]
            # Add doc.bookmarks to bookmark list
            if self._config and self._config.get_bool('bookmark_pages', 0):
                # derive a bookmark name from the last path component of the URL
                key = pluckerdoc.get_url()
                pid = self._mapper.get_or_add(key)
                key = string.split(key, ":")
                key = key[-1]
                key = string.split(key, "/")
                key = key[-1]
                key = string.split(key, "?")
                key = key[0]
                if not len(key):
                    key = 'Home Page'
                if not bookmarks.has_key(key):
                    bookmarks[key] = (pid, 0)
            if self._config and self._config.get_bool('bookmarks', 0):
                tmp_book = pluckerdoc.get_bookmark_ids()
                for key in tmp_book.keys():
                    if not bookmarks.has_key(key):
                        bookmarks[key] = tmp_book[key]
        self._write_doc(out_dict, pluckerdoc, pluckerdoc.get_url(), id, verbose)
    ## Do some error checking
    if not out_dict.has_key(2):
        # record 2 is presumably the reserved 'home' document id
        raise RuntimeError("The collection process failed to generate a 'home' document")
    ## set up the metadata mapping, if any
    metadata = {}
    # set the default to the charset which has the 'most' pages
    items = charsets.items()
    if len(items) > 0:
        # have to allow for image-only document
        # sort charsets by descending page count; the most common one wins
        items.sort(lambda x, y: ((len(x[1]) < len(y[1]) and 1) or ((len(x[1]) > len(y[1])) and -1) or 0))
        mibenum = items[0][0]
        odd_charsets = []
        if len(items) > 1:
            # every page not using the default charset gets an exception entry
            for item in items[1:]:
                for id in item[1]:
                    odd_charsets.append((id, item[0] or 0,))
    else:
        mibenum = None
        odd_charsets = []
    if mibenum != None:
        metadata['CharSet'] = mibenum
        if verbose > 1:
            charset_name = charset_mibenum_to_name(mibenum)
            message('Default charset is MIBenum ' + str(mibenum) + ((charset_name and " (" + charset_name + ")") or ""))
    else:
        message('No default charset')
    if len(odd_charsets) > 0:
        metadata['ExceptionalCharSets'] = odd_charsets
        message("ExceptionalCharSets is " + str(odd_charsets) + "\n")
    intended_owner = self._config.get_string('owner_id_build')
    if intended_owner:
        metadata['OwnerID'] = intended_owner
        message(2, "OwnerID is '%s'", intended_owner)
    author = self._config.get_string('author_md')
    if author:
        metadata['Author'] = author
        message(2, "Author is '%s'", author)
    title = self._config.get_string('title_md')
    if title:
        metadata['Title'] = title
        message(2, "Title is '%s'", title)
    ## write the index record
    tmp_url = "plucker:/~special~/index"
    type = PluckerDocs.PluckerIndexDocument(tmp_url, self._config, metadata, bookmarks)
    self._write_doc(out_dict, type, tmp_url, 1, verbose)
    ## write the bookmark record (if any)
    if len(bookmarks):
        tmp_url = "plucker:/~special~/bookmarks"
        bookdoc = PluckerDocs.PluckerBookmarkDocument(tmp_url, bookmarks)
        self._write_doc(out_dict, bookdoc, tmp_url, 6, verbose)
    ## write the URL information, if desired
    if not self._config.get_bool('no_urlinfo', 0):
        links = self._mapper.build_links()
        # for i in range(len(links)):
        #     message(0, "%3d: '%s'", i, links[i])
        # links are batched 200 per record
        linksdocs = []
        for i in range(1, len(links), 200):
            tmp_url = "plucker:/~special~/links" + str(i)
            linksdoc = PluckerDocs.PluckerLinksDocument(tmp_url, links, i)
            self._mapper.get_or_add(linksdoc)
            linksdocs.append(linksdoc)
        # now make links index
        tmp_url = "plucker:/~special~/pluckerlinks"
        indexdoc = PluckerDocs.PluckerLinkIndexDocument(tmp_url, linksdocs, self._mapper)
        self._mapper.get_or_add(indexdoc)
        # OK, write the links index document
        self._write_doc(out_dict, indexdoc, tmp_url, 3, verbose)
        # and write the various links documents
        for doc in linksdocs:
            self._write_doc(out_dict, doc, doc.get_url(), self._mapper.get_or_add(doc), verbose)
    ## write the category information, if present
    if self._config.get_string('category') is not None:
        tmp_url = "plucker:/~special~/category"
        type = PluckerDocs.PluckerCategoryDocument(tmp_url, self._config)
        self._write_doc(out_dict, type, tmp_url, 4, verbose)
    ## write the metadata record, if any
    if metadata:
        tmp_url = "plucker:/~special~/metadata"
        type = PluckerDocs.PluckerMetadataDocument(tmp_url, metadata)
        self._write_doc(out_dict, type, tmp_url, 5, verbose)
    ## now write everything else
    the_ids = out_dict.keys()
    the_ids.sort()  # they are numeric, so sort does the right thing
    for id in the_ids:
        dump, the_url, the_id, verbose = out_dict[id]
        self.save_data(dump, the_url, the_id, verbose)
        if verbose:
            line_length = self._config.get_int('status_line_length', 60)
            urltext = str(the_url)
            if len(urltext) > line_length:
                urltext = urltext[:line_length - 20] + "....." + urltext[-15:]
            message("Wrote %d <= %s" % (the_id, urltext))
    return self._mapper
def print_mapping(self):
    """Dump every known URL with its assigned record id (debugging aid)."""
    message(0, '*********\n')
    message(0, 'PluckerDoc record ids:')
    for (entry, rec_id) in self._doc_to_id_mapping.items():
        # A tuple entry is (document, fragment); plain entries are documents.
        if type(entry) == type(()):
            message(0, '%70s => %3d (%s)\n' % (entry[0].get_url(), rec_id, str(entry[1])))
        else:
            message(0, '%70s => %3d\n' % (entry.get_url(), rec_id))
    if len(self._url_to_id_mapping) > 0:
        message(0, 'Non-included URL record ids:')
        for (target, rec_id) in self._url_to_id_mapping.items():
            message(0, '%70s => %3d\n' % (target, rec_id))
    message(0, '*********\n')
def convert(self, width, height, bpp, section):
    """Convert self._image into Palm image data via the JIU toolkit.

    Jython-only (Java interop).  Optionally crops to `section`
    (x, y, width, height), scales to width x height, reduces to `bpp`
    bits per pixel (1, 2, 4 or 8; anything else falls back to 16-bit
    color) and returns the RLE-compressed Palm image bytes as a string.

    Raises RuntimeError on any failure during conversion.
    """
    import java, jarray
    import net.sourceforge.jiu
    jiu = net.sourceforge.jiu  # shorthand for the deeply-nested package
    try:
        # optional crop happens first so scaling sees only the region
        if section:
            cropper = jiu.geometry.Crop()
            cropper.setInputImage(self._image)
            cropper.setBounds(section[0], section[1],
                              section[0] + section[2],
                              section[1] + section[3])
            cropper.process()
            im = cropper.getOutputImage()
        else:
            im = self._image
        # scale only when the target size differs from the source
        if width != im.getWidth() or height != im.getHeight():
            message(2, "Scaling original %dx%d image by %f/%f to %dx%dx%d" %
                    (im.getWidth(), im.getHeight(),
                     float(width) / float(im.getWidth()),
                     float(height) / float(im.getHeight()),
                     width, height, bpp))
            scaler = jiu.geometry.Resample()
            scaler.setInputImage(im)
            scaler.setSize(width, height)
            # b-spline is more accurate than the (faster) bell filter
            scaler.setFilter(jiu.geometry.Resample.FILTER_TYPE_B_SPLINE)
            scaler.process()
            im = scaler.getOutputImage()
        # reduce to the requested bit depth
        if bpp == 1:
            # grayscale conversion followed by Floyd-Steinberg 1-bit dither
            gray = jiu.color.reduction.RGBToGrayConversion()
            gray.setInputImage(im)
            gray.process()
            fs = jiu.color.dithering.ErrorDiffusionDithering()
            fs.setType(jiu.color.dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
            fs.setGrayscaleOutputBits(1)
            fs.setInputImage(gray.getOutputImage())
            fs.process()
            im = fs.getOutputImage()
        elif bpp in (2, 4, 8):
            # pick the matching Palm system palette; 8-bit skips dithering
            if bpp == 2:
                palette = jiu.codecs.PalmCodec.createSystem2BitGrayscalePalette()
                dither = 1
            elif bpp == 4:
                palette = jiu.codecs.PalmCodec.createSystem4BitGrayscalePalette()
                dither = 1
            else:
                palette = jiu.codecs.PalmCodec.createSystem8BitPalette()
                dither = 0
            quantizer = jiu.color.quantization.ArbitraryPaletteQuantizer(palette)
            # dithering may also be disabled globally or per-image
            nodither = (self._config.get_bool('no_dithering_in_java_image_quantization')
                        or self._attribs.has_key('nodither'))
            if nodither or not dither:
                quantizer.setInputImage(im)
                quantizer.process()
                im = quantizer.getOutputImage()
            else:
                fs = jiu.color.dithering.ErrorDiffusionDithering()
                fs.setType(jiu.color.dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                fs.setQuantizer(quantizer)
                fs.setTruecolorOutput(0)
                fs.setInputImage(im)
                fs.process()
                im = fs.getOutputImage()
        elif bpp != 16:
            message(0, "%d bpp images not supported with JIU imaging yet. Using 16 bit color.\n" % (bpp,))
        # finally, turn it into a Palm image
        codec = jiu.codecs.PalmCodec()
        codec.setImage(im)
        codec.setCompression(jiu.codecs.PalmCodec.COMPRESSION_RLE)
        stream = java.io.ByteArrayOutputStream()
        codec.setOutputStream(stream)
        codec.process()
        codec.close()
        return stream.toString(0)
    except:
        # best-effort diagnostics, then re-raise as a generic conversion error
        if self._verbose > 1:
            import traceback
            traceback.print_exc()
        raise RuntimeError("Error while converting image " + self._url + " with JIU")