Ejemplo n.º 1
0
 def _print_convert_msg (url, verbose, config):
     if verbose > 0:
         line_length = config.get_int('status_line_length', 60)
         urltext = str (url)
         if len (urltext) > line_length:
             urltext = urltext[:line_length - 20] + "....." + urltext[-15:]
         message("Converting %s..." % urltext)
Ejemplo n.º 2
0
 def _print_convert_msg (url, id, verbose, config):
     if verbose > 1:
         line_length = config.get_int('status_line_length', 60)
         urltext = str (url)
         if len (urltext) > line_length:
             urltext = urltext[:line_length - 20] + "....." + urltext[-15:]
         message("Converted %4d:  %s" % (id, urltext))
Ejemplo n.º 3
0
    def _write_doc (self, out_dict, pluckerdoc, url, id, verbose):
        """Dump one document into `out_dict`, keyed by record id.

        `id` must be the id the mapper hands out for `pluckerdoc`; a
        mismatch raises ValueError.  Text documents are dumped through
        dump_record_with_splits() and may contribute several records; any
        other document contributes a single record from dump_record().
        Each stored value is the tuple (dump, url, id, verbose).
        """

        def _report_converted (shown_url, shown_id, level, config):
            # Progress line, only above verbosity 1; long URLs are
            # abbreviated to the configured status line length.
            if not level > 1:
                return
            max_chars = config.get_int('status_line_length', 60)
            shown = str (shown_url)
            if len (shown) > max_chars:
                shown = shown[:max_chars - 20] + "....." + shown[-15:]
            message("Converted %4d:  %s" % (shown_id, shown))

        if id != self._mapper.get_or_add(pluckerdoc):
            raise ValueError("bad id %d instead of %d" % (id, self._mapper.get_or_add(pluckerdoc)))

        if not pluckerdoc.is_text_document ():
            record = pluckerdoc.dump_record (id)
            out_dict [id] = (record, url, id, verbose)
            _report_converted(url, id, verbose, self._config)
            return

        pieces = pluckerdoc.dump_record_with_splits (self._mapper)
        first_id = pieces[0][1]
        if first_id != id:
            # only reported, not fatal: the records are still stored below
            message("****** bad id %d instead of %d" % (first_id, id,))
        for (piece_url, piece_id, piece_dump) in pieces:
            if piece_id == 0:
                # an id of zero stands for the original document's record
                piece_id = id
            out_dict [piece_id] = (piece_dump, piece_url, piece_id, verbose)
            _report_converted(piece_url, piece_id, verbose, self._config)
Ejemplo n.º 4
0
 def get_or_add (self, url_or_doc):
     """Return the record id registered for a URL string or a document.

     A plain string is split into URL and fragment tag, the URL is passed
     through the alias table, and the lookup is delegated to
     _get_id_for_url() -- with a (url, tag) pair when a fragment is
     present, with the bare URL otherwise.  According to the original
     notes, a fragment naming a paragraph of a text page yields a
     (record-id, paragraph-id) pair and everything else a plain record id.

     A PluckerDocument is registered under its URL when that URL is not
     yet known, and the id from _get_id_for_doc() is returned.

     Any other argument -- integers included -- raises ValueError.
     """
     if type(url_or_doc) == type(''):
         import urllib
         base_url, fragment = urllib.splittag(url_or_doc)
         resolved = self._alias_list.get(base_url, base_url)
         if fragment:
             return self._get_id_for_url((resolved, fragment))
         return self._get_id_for_url(resolved)

     if isinstance(url_or_doc, PluckerDocs.PluckerDocument):
         doc_url = url_or_doc.get_url()
         if not self._url_to_doc_mapping.has_key(doc_url):
             self._url_to_doc_mapping[doc_url] = url_or_doc
         if not self._doc_to_id_mapping.has_key(url_or_doc):
             if self._url_to_id_mapping.has_key(doc_url):
                 # the URL already owns an id: let the document share it
                 self._doc_to_id_mapping[url_or_doc] = self._url_to_id_mapping[doc_url]
             else:
                 message(2, "New document %s added", url_or_doc)
         return self._get_id_for_doc(url_or_doc)

     raise ValueError("not a URL or an instance of " + str(PluckerDocs.PluckerDocument))
Ejemplo n.º 5
0
 def _get_id_for_doc(self, idoc, add=1):
     if type(idoc) == type(()):
         doc = idoc[0]
     else:
         doc = idoc
     id = self._doc_to_id_mapping.get(doc)
     if not id:
         id = self._url_to_id_mapping.get(doc.get_url())
         if id:
             self._doc_to_id_mapping[doc] = id
     if not id:
         if not add:
             return None
         if isinstance(doc, PluckerDocs.PluckerIndexDocument):
             # there's only one, and it always has record # 1
             id = 1
         elif isinstance(doc, PluckerDocs.PluckerBookmarkDocument):
             id = 6
         elif isinstance(doc, PluckerDocs.PluckerLinkIndexDocument):
             id = 3
         elif isinstance(doc, PluckerDocs.PluckerCategoryDocument):
             id = 4
         elif isinstance(doc, PluckerDocs.PluckerMetadataDocument):
             id = 5
         else:
             id = self._current_id
             self._current_id = self._current_id + 1
         self._doc_to_id_mapping[doc] = id
         url_mapping = self._url_to_doc_mapping.get(doc.get_url())
         if (url_mapping != doc):
             if (url_mapping != None):
                 message("URL %s for doc %s points to doc %s\n" %
                         (doc.get_url(), str(doc), str(url_mapping)))
             self._url_to_doc_mapping[doc.get_url()] = doc           
         # message("new document " + str(doc) + " => " + str(id) + "\n")
     if type(idoc) == type(()):
         return (id, idoc[1])
     else:
         return id
    def convert(self, width, height, bpp, section):
        """Turn self._image into Palm image data using the JIU library.

        Steps, in order:
          * when `section` is true, crop to it (it is indexed [0]..[3];
            presumably x, y, width, height -- confirm against callers);
          * resample with the B-spline filter when the image is not
            already `width` x `height`;
          * reduce to the requested depth: 1 bpp is gray-converted and
            Floyd-Steinberg dithered; 2 and 4 bpp use the Palm system
            grayscale palettes and are dithered unless the config option
            'no_dithering_in_java_image_quantization' or a 'nodither'
            attribute says otherwise; 8 bpp uses the Palm system palette
            without dithering; any other depth except 16 only logs a
            message and leaves the image unchanged;
          * encode with PalmCodec using RLE compression.

        Returns the encoded bytes as produced by
        ByteArrayOutputStream.toString(0).  Any failure inside the
        pipeline is re-raised as RuntimeError (after printing a traceback
        when self._verbose > 1).
        """

        import java, jarray
        import net.sourceforge.jiu

        try:
            jiu = net.sourceforge.jiu

            # crop first, if a section was requested
            im = self._image
            if section:
                left, top = section[0], section[1]
                cropper = jiu.geometry.Crop()
                cropper.setInputImage(self._image)
                cropper.setBounds(left, top, left + section[2], top + section[3])
                cropper.process()
                im = cropper.getOutputImage()

            # scale if necessary
            src_width = im.getWidth()
            src_height = im.getHeight()
            if width != src_width or height != src_height:
                message(
                    2, "Scaling original %dx%d image by %f/%f to %dx%dx%d" %
                    (src_width, src_height,
                     float(width) / float(src_width),
                     float(height) / float(src_height),
                     width, height, bpp))
                scaler = jiu.geometry.Resample()
                scaler.setInputImage(im)
                scaler.setSize(width, height)
                # the bell filter would be faster; b-spline is more accurate
                scaler.setFilter(jiu.geometry.Resample.FILTER_TYPE_B_SPLINE)
                scaler.process()
                im = scaler.getOutputImage()

            # convert to proper bit depth
            if bpp == 1:
                to_gray = jiu.color.reduction.RGBToGrayConversion()
                to_gray.setInputImage(im)
                to_gray.process()
                dithering = jiu.color.dithering
                ditherer = dithering.ErrorDiffusionDithering()
                ditherer.setType(dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                ditherer.setGrayscaleOutputBits(1)
                ditherer.setInputImage(to_gray.getOutputImage())
                ditherer.process()
                im = ditherer.getOutputImage()
            elif bpp in (2, 4, 8):
                palm = jiu.codecs.PalmCodec
                if bpp == 8:
                    palette = palm.createSystem8BitPalette()
                    dither = 0
                else:
                    dither = 1
                    if bpp == 2:
                        palette = palm.createSystem2BitGrayscalePalette()
                    else:
                        palette = palm.createSystem4BitGrayscalePalette()
                quantizer = jiu.color.quantization.ArbitraryPaletteQuantizer(palette)
                nodither = self._config.get_bool(
                    'no_dithering_in_java_image_quantization'
                ) or self._attribs.has_key('nodither')
                if dither and not nodither:
                    dithering = jiu.color.dithering
                    ditherer = dithering.ErrorDiffusionDithering()
                    ditherer.setType(dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                    ditherer.setQuantizer(quantizer)
                    ditherer.setTruecolorOutput(0)
                    ditherer.setInputImage(im)
                    ditherer.process()
                    im = ditherer.getOutputImage()
                else:
                    quantizer.setInputImage(im)
                    quantizer.process()
                    im = quantizer.getOutputImage()
            elif bpp != 16:
                message(
                    0,
                    "%d bpp images not supported with JIU imaging yet.  Using 16 bit color.\n"
                    % (bpp, ))

            # finally, turn it into a Palm image
            encoded = java.io.ByteArrayOutputStream()
            codec = jiu.codecs.PalmCodec()
            codec.setImage(im)
            codec.setCompression(jiu.codecs.PalmCodec.COMPRESSION_RLE)
            codec.setOutputStream(encoded)
            codec.process()
            codec.close()
            return encoded.toString(0)

        except:
            if self._verbose > 1:
                import traceback
                traceback.print_exc()
            raise RuntimeError("Error while converting image " + self._url +
                               " with JIU")
Ejemplo n.º 7
0
    def write (self, verbose, alias_list=None):
        """Write out the collection.

        Record ids are assigned through a fresh Mapper.  Every collected
        document and the special records (index = 1, link index = 3,
        category = 4, metadata = 5, bookmarks = 6, plus the link pages) are
        dumped into an id-keyed table, and each record is finally handed
        to self.save_data() in ascending id order.

        Arguments:
          verbose    -- verbosity level; progress is reported when it is
                        above 0, with more detail at higher levels.
          alias_list -- optional object whose as_dict() supplies the URL
                        alias table for the Mapper.  None (the default)
                        means "no aliases".

        Returns the Mapper that was used to generate the ids.

        Raises RuntimeError when no record with id 2 (the 'home'
        document) was produced.
        """

        def _shorten_url (url, config):
            # Abbreviate long URL text to the configured status line length.
            line_length = config.get_int('status_line_length', 60)
            urltext = str (url)
            if len (urltext) > line_length:
                urltext = urltext[:line_length - 20] + "....." + urltext[-15:]
            return urltext

        def _print_convert_msg (url, verbose, config):
            if verbose > 0:
                message("Converting %s..." % _shorten_url(url, config))

        # alias_list defaults to None; calling as_dict() on it would raise
        # AttributeError, so fall back to an empty alias table.
        # NOTE(review): assumes Mapper only needs a dict-like alias table.
        if alias_list is None:
            aliases = {}
        else:
            aliases = alias_list.as_dict()
        self._mapper = Mapper(self._collection, aliases)

        # figure default charset
        # NOTE(review): this configured value is always overwritten further
        # down (by the most common charset, or by None) -- confirm whether
        # it was meant to act as the fallback.
        mibenum = self._config.get_int('default_charset', 0) or None
        charsets = {}

        if verbose > 2:
            self._mapper.print_mapping()

        out_dict = {}
        bookmarks = {}
        for pluckerdoc in self._mapper.get_docs():
            doc_id = self._mapper.get_or_add(pluckerdoc)
            _print_convert_msg(pluckerdoc.get_url(), verbose, self._config)
            if pluckerdoc.is_multiimage_document ():
                pluckerdoc.resolve_ids (self._mapper)
            if pluckerdoc.is_table_document ():
                pluckerdoc.resolve_ids (self._mapper)
            if pluckerdoc.is_text_document ():
                pluckerdoc.resolve_ids (self._mapper)
                doc_mibenum = pluckerdoc.get_charset()
                if verbose > 2:
                    charset_name = charset_mibenum_to_name(doc_mibenum)
                    message(2, pluckerdoc.get_url() + ' has charset ' + str(doc_mibenum) + ((charset_name and " (" + charset_name + ")") or "") + "\n")
                # group the text documents by charset
                if doc_mibenum in charsets:
                    charsets[doc_mibenum].append(doc_id)
                else:
                    charsets[doc_mibenum] = [doc_id]

                # Add doc.bookmarks to bookmark list
                if self._config and self._config.get_bool('bookmark_pages', 0):
                    key = pluckerdoc.get_url()
                    pid = self._mapper.get_or_add(key)
                    # bookmark name: the part after the last ":" and "/",
                    # with any "?query" removed
                    key = key.split(":")[-1]
                    key = key.split("/")[-1]
                    key = key.split("?")[0]
                    if not key:
                        key = 'Home Page'
                    if key not in bookmarks:
                        bookmarks[key] = (pid, 0)

                if self._config and self._config.get_bool('bookmarks', 0):
                    tmp_book = pluckerdoc.get_bookmark_ids()
                    for key in tmp_book.keys():
                        # the first definition of a bookmark name wins
                        if key not in bookmarks:
                            bookmarks[key] = tmp_book[key]

            self._write_doc (out_dict, pluckerdoc, pluckerdoc.get_url(), doc_id, verbose)

        ## Do some error checking
        if 2 not in out_dict:
            raise RuntimeError("The collection process failed to generate a 'home' document")

        ## set up the metadata mapping, if any
        metadata = {}
        # set the default to the charset which has the 'most' pages
        items = charsets.items()
        if len(items) > 0:        # have to allow for image-only document
            # descending by number of documents using each charset
            items.sort(lambda x, y: ((len(x[1]) < len(y[1]) and 1) or ((len(x[1]) > len(y[1])) and -1) or 0))
            mibenum = items[0][0]
            odd_charsets = []
            # every document not in the most common charset is an exception
            for item in items[1:]:
                for odd_id in item[1]:
                    odd_charsets.append((odd_id, item[0] or 0,))
        else:
            mibenum = None
            odd_charsets = []
        if mibenum is not None:
            metadata['CharSet'] = mibenum
            if verbose > 1:
                charset_name = charset_mibenum_to_name(mibenum)
                message('Default charset is MIBenum ' + str(mibenum) + ((charset_name and " (" + charset_name + ")") or ""))
        else:
            message('No default charset')
        if len(odd_charsets) > 0:
            metadata['ExceptionalCharSets'] = odd_charsets
            message("ExceptionalCharSets is " + str(odd_charsets) + "\n")
        intended_owner = self._config.get_string('owner_id_build')
        if intended_owner:
            metadata['OwnerID'] = intended_owner
            message(2, "OwnerID is '%s'", intended_owner)
        author = self._config.get_string('author_md')
        if author:
            metadata['Author'] = author
            message(2, "Author is '%s'", author)
        title = self._config.get_string('title_md')
        if title:
            metadata['Title'] = title
            message(2, "Title is '%s'", title)

        ## write the index record
        tmp_url = "plucker:/~special~/index"
        index_record = PluckerDocs.PluckerIndexDocument (tmp_url, self._config, metadata, bookmarks)
        self._write_doc (out_dict, index_record, tmp_url, 1, verbose)

        ## write the bookmark record (if any)
        if len(bookmarks):
            tmp_url = "plucker:/~special~/bookmarks"
            bookdoc = PluckerDocs.PluckerBookmarkDocument(tmp_url, bookmarks)
            self._write_doc (out_dict, bookdoc, tmp_url, 6, verbose)

        ## write the URL information, if desired
        if not self._config.get_bool ('no_urlinfo', 0):
            links = self._mapper.build_links()
            linksdocs = []
            # one links document per step of 200, starting at index 1
            for i in range(1, len(links), 200):
                tmp_url = "plucker:/~special~/links" + str(i)
                linksdoc = PluckerDocs.PluckerLinksDocument(tmp_url, links, i)
                self._mapper.get_or_add(linksdoc)
                linksdocs.append(linksdoc)
            # now make links index
            tmp_url = "plucker:/~special~/pluckerlinks"
            indexdoc = PluckerDocs.PluckerLinkIndexDocument(tmp_url, linksdocs, self._mapper)
            self._mapper.get_or_add(indexdoc)
            # OK, write the links index document
            self._write_doc (out_dict, indexdoc, tmp_url, 3, verbose)
            # and write the various links documents
            for doc in linksdocs:
                self._write_doc (out_dict, doc, doc.get_url(), self._mapper.get_or_add(doc), verbose)

        ## write the category information, if present
        if self._config.get_string ('category') is not None:
            tmp_url = "plucker:/~special~/category"
            category_record = PluckerDocs.PluckerCategoryDocument (tmp_url, self._config)
            self._write_doc (out_dict, category_record, tmp_url, 4, verbose)

        ## write the metadata record, if any
        if metadata:
            tmp_url = "plucker:/~special~/metadata"
            metadata_record = PluckerDocs.PluckerMetadataDocument (tmp_url, metadata)
            self._write_doc (out_dict, metadata_record, tmp_url, 5, verbose)

        ## now write everything else
        the_ids = out_dict.keys ()
        the_ids.sort ()  # they are numeric, so sort does the right thing
        for record_id in the_ids:
            # use the verbosity stored with the record, without rebinding
            # this method's own `verbose` argument
            dump, the_url, the_id, record_verbose = out_dict[record_id]
            self.save_data (dump, the_url, the_id, record_verbose)
            if record_verbose:
                message("Wrote %d <= %s" % (the_id, _shorten_url(the_url, self._config)))

        return self._mapper
Ejemplo n.º 8
0
 def print_mapping(self):
     """Report every known document and URL with its record id.

     All output goes through message() at level 0: first the entries of
     the document table (a tuple key is shown as its first item's URL
     followed by its second item in parentheses), then, when the URL table
     is not empty, the entries of that table.
     """
     separator = '*********\n'
     message(0, separator)
     message(0, 'PluckerDoc record ids:')
     for (entry, record_id) in self._doc_to_id_mapping.items():
         if type(entry) == type(()):
             entry_url = entry[0].get_url()
             message(0, '%70s => %3d (%s)\n' % (entry_url, record_id, str(entry[1])))
             continue
         message(0, '%70s => %3d\n' % (entry.get_url(), record_id))
     if len(self._url_to_id_mapping) > 0:
         message(0, 'Non-included URL record ids:')
         for (plain_url, record_id) in self._url_to_id_mapping.items():
             message(0, '%70s => %3d\n' % (plain_url, record_id))
     message(0, separator)
    def convert(self, width, height, bpp, section):
        """Turn self._image into Palm image data using the JIU library.

        The image is cropped to `section` when one is given (it is indexed
        [0]..[3]; presumably x, y, width, height -- confirm against
        callers), resampled to `width` x `height` when its size differs,
        reduced to `bpp` bits per pixel and encoded by PalmCodec with RLE
        compression.

        Depth handling:
          1       -- gray conversion, then Floyd-Steinberg dithering
          2, 4    -- Palm system grayscale palette; dithered unless the
                     config option 'no_dithering_in_java_image_quantization'
                     or a 'nodither' attribute is set
          8       -- Palm system 8-bit palette, never dithered
          16      -- image passed through unchanged
          other   -- a message is logged and the image is passed through

        Returns the encoded data from ByteArrayOutputStream.toString(0).
        Raises RuntimeError on any failure inside the pipeline; the
        traceback is printed first when self._verbose > 1.
        """

        import java, jarray
        import net.sourceforge.jiu

        try:
            if section:
                cropper = net.sourceforge.jiu.geometry.Crop()
                cropper.setInputImage(self._image)
                cropper.setBounds(section[0], section[1], section[0] + section[2], section[1] + section[3])
                cropper.process()
                im = cropper.getOutputImage()
            else:
                im = self._image

            # scale if necessary
            if width != im.getWidth() or height != im.getHeight():
                message(2, "Scaling original %dx%d image by %f/%f to %dx%dx%d" % (im.getWidth(), im.getHeight(), float(width)/float(im.getWidth()), float(height)/float(im.getHeight()), width, height, bpp))
                scaler = net.sourceforge.jiu.geometry.Resample()
                scaler.setInputImage(im)
                scaler.setSize(width, height)
                # the bell filter is reasonably fast and reasonably accurate,
                # but b-spline is more accurate
                scaler.setFilter(net.sourceforge.jiu.geometry.Resample.FILTER_TYPE_B_SPLINE)
                scaler.process()
                im = scaler.getOutputImage()

            # convert to proper bit depth
            if bpp == 1:
                reducer = net.sourceforge.jiu.color.reduction.RGBToGrayConversion()
                reducer.setInputImage(im)
                reducer.process()
                ditherer = net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering()
                ditherer.setType(net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                ditherer.setGrayscaleOutputBits(1)
                ditherer.setInputImage(reducer.getOutputImage())
                ditherer.process()
                im = ditherer.getOutputImage()
            elif bpp in (2, 4, 8):
                if bpp == 2:
                    palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem2BitGrayscalePalette()
                    dither = 1
                elif bpp == 4:
                    palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem4BitGrayscalePalette()
                    dither = 1
                elif bpp == 8:
                    palette = net.sourceforge.jiu.codecs.PalmCodec.createSystem8BitPalette()
                    dither = 0
                quantizer = net.sourceforge.jiu.color.quantization.ArbitraryPaletteQuantizer(palette)
                nodither = self._config.get_bool('no_dithering_in_java_image_quantization') or self._attribs.has_key('nodither')
                if nodither or not dither:
                    # plain nearest-palette quantization
                    quantizer.setInputImage(im)
                    quantizer.process()
                    im = quantizer.getOutputImage()
                else:
                    ditherer = net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering()
                    ditherer.setType(net.sourceforge.jiu.color.dithering.ErrorDiffusionDithering.TYPE_FLOYD_STEINBERG)
                    ditherer.setQuantizer(quantizer)
                    ditherer.setTruecolorOutput(0)
                    ditherer.setInputImage(im)
                    ditherer.process()
                    im = ditherer.getOutputImage()
            elif bpp != 16:
                message(0, "%d bpp images not supported with JIU imaging yet.  Using 16 bit color.\n" % (bpp,))

            # finally, turn it into a Palm image
            codec = net.sourceforge.jiu.codecs.PalmCodec()
            codec.setImage(im)
            codec.setCompression(net.sourceforge.jiu.codecs.PalmCodec.COMPRESSION_RLE)
            outputStream = java.io.ByteArrayOutputStream()
            codec.setOutputStream(outputStream)
            codec.process()
            codec.close()
            return outputStream.toString(0)

        except:
            # NOTE(review): the bare except is kept on purpose -- exceptions
            # thrown by the Java classes may not derive from Python's
            # Exception under Jython; confirm before narrowing it.
            if self._verbose > 1:
                import traceback
                traceback.print_exc()
            raise RuntimeError("Error while converting image " + self._url + " with JIU")