Example #1
0
 def begin_tag(self, tag, props=None):
   """Write the opening markup for `tag` and remember it as the open tag.

   `props`, when given and non-empty, is a mapping of attribute names to
   values. Attributes are written in sorted order, with the name and the
   stringified value each passed through `enc`.
   """
   attrs = ''
   if props:
     # items() is available on both Python 2 and Python 3 mappings, while
     # iteritems() exists on Python 2 only; sorted() accepts either result.
     attrs = ''.join(' %s="%s"' % (enc(k), enc(str(v)))
                     for (k, v) in sorted(props.items()))
   self.outfp.write('<%s%s>' % (enc(tag.name), attrs))
   # Stored so that a later end_tag-style call knows which tag to close.
   self.tag = tag
   return
Example #2
0
 def __renderChar(self, item):
     """Write one character, opening and closing font/word <span>s as needed.

     A new font <span> is opened when there is no current font, when the
     character's font name or size differs from the current font, or when
     no font <span> is open. When self.__icu is set, word <span>s with
     class "ocrx_word" are opened/closed around the characters according
     to self.__divs and self.__whites.

     NOTE(review): __divs presumably holds the character indices at which
     words begin/end and __whites the indices of whitespace-only words;
     confirm against the code that fills them.
     """
     font = enc(item.fontname)
     size = item.size
     fontChanged = False
     if (self.__font is None or
         (self.__font.fullName != font
          or self.__font.size != size)) or not self.__hasFont:
         # The font changed, or we are not inside a font <span> yet.
         if self.__hasFont:
             if self.__icu is not None and self.__inWord:
                 # A word <span> is nested inside the font <span>; close it
                 # first (it was only opened for non-white words).
                 if not (self.__wordInd - 1) in self.__whites:
                     self.__outfp.write("</span>")
                 fontChanged = True
             self.__endSpecialTags(self.__font)
             self.__outfp.write("</span>")
         self.__font = self.__lib.findFont(self.__page,
                                           font).instantiate(size)
         self.__outfp.write("<span style=\"")
         # Prefer the family name configured in the font map, falling back
         # to the font's own name when there is no map or no entry.
         name = None
         if self.__fontMap is not None:
             name = self.__fontMap.get(self.__font.name)
         if name is None:
             name = self.__font.name
         # Previously the resolved name was computed and then ignored.
         self.__outfp.write("font-family: " + name)
         if self.__font.bold:
             self.__outfp.write("; font-weight: bold")
         if self.__font.italic:
             self.__outfp.write("; font-style: italic")
         self.__outfp.write("; font-size: " + str(self.__font.size))
         self.__outfp.write("\">")
         self.__startSpecialTags(self.__font)
         self.__hasFont = True
     if self.__icu is not None:
         # TODO: if hasFont was False when renderChar was called and we
         # emitted a new font, this cannot be the middle of a word (so
         # __ind will be in self.__divs because it is 0).
         if self.__ind in self.__divs or fontChanged:
             if not self.__wordInd in self.__whites:
                 self.__outfp.write(
                     "<span class=\"ocrx_word\" title=\"bbox " + bbox2str(
                         changeCoords(self.__pagebbox, self.__divbboxes[
                             self.__wordInd])) + "\">")
             self.__wordInd += 1
             self.__inWord = True
     # Escape "&" before "<": doing it the other way round re-escapes the
     # "&" of the freshly produced "&lt;" and yields "&amp;lt;".
     text = item.get_text().replace("&", "&amp;").replace("<", "&lt;")
     self.__outfp.write(text.encode("utf-8"))
     if self.__icu is not None:
         self.__ind += 1
         if self.__ind in self.__divs:
             # Reached a boundary: close the word <span> if one was opened.
             if not (self.__wordInd - 1) in self.__whites:
                 self.__outfp.write("</span>")
             self.__inWord = False
Example #3
0
 def render(item):
     """Recursively write `item` and its children to self.outfp as markup.

     `self` is taken from the enclosing scope. Containers (page, figure,
     text line, text box) recurse into their children; an item of an
     unhandled type trips the final assertion.
     """
     if isinstance(item, LTPage):
         # Pages are marked by an empty anchor; their content follows it.
         self.outfp.write('<a id="page_%s" data-bbox="%s" data-rotate="%d"></a>\n' %
                          (item.pageid, bbox2str(item.bbox), item.rotate))
         for child in item:
             render(child)
     elif isinstance(item, LTLine):
         self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
                          (item.linewidth, bbox2str(item.bbox)))
     elif isinstance(item, LTRect):
         self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
                          (item.linewidth, bbox2str(item.bbox)))
     elif isinstance(item, LTCurve):
         self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                          (item.linewidth, bbox2str(item.bbox), item.get_pts()))
     elif isinstance(item, LTFigure):
         self.outfp.write('<figure name="%s" bbox="%s">\n' %
                          (item.name, bbox2str(item.bbox)))
         for child in item:
             render(child)
         self.outfp.write('</figure>\n')
     elif isinstance(item, LTTextLine):
         # The line is marked by a self-closing span; its characters are
         # written after the marker rather than inside it.
         self.outfp.write('<span data-bbox="%s"/>\n' % bbox2str(item.bbox))
         for child in item:
             render(child)
     elif isinstance(item, LTTextBox):
         # data-wmode carries only the mode value. Interpolating a whole
         # ' wmode="vertical"' attribute here produced nested quotes and
         # therefore malformed markup for vertical boxes.
         wmode = ''
         if isinstance(item, LTTextBoxVertical):
             wmode = 'vertical'
         self.outfp.write('<div id="%d" data-bbox="%s" data-wmode="%s"><p>\n' %
                          (item.index, bbox2str(item.bbox), wmode))
         for child in item:
             render(child)
         self.outfp.write('</p></div>\n')
     elif isinstance(item, LTChar):
         # Characters go through write_text rather than straight to outfp.
         self.write_text(item.get_text())
     elif isinstance(item, LTText):
         self.outfp.write(item.get_text())
     elif isinstance(item, LTImage):
         if self.imagewriter is not None:
             # Export the image and reference the exported name.
             name = self.imagewriter.export_image(item)
             self.outfp.write('<img src="%s" width="%d" height="%d" />\n' %
                              (enc(name), item.width, item.height))
         else:
             self.outfp.write('<img width="%d" height="%d" />\n' %
                              (item.width, item.height))
     else:
         assert 0, item
     return
Example #4
0
 def render_string(self, textstate, seq):
   """Decode the str elements of `seq` with the current font and write them.

   Non-str elements of `seq` are skipped, as are character ids for which
   the font raises PDFUnicodeNotDefined. The collected text is passed
   through `enc` with self.codec and written to self.outfp in one call.
   """
   font = textstate.font
   pieces = []
   for obj in seq:
     if isinstance(obj, str):
       for cid in font.decode(obj):
         try:
           pieces.append(font.to_unicode(cid))
         except PDFUnicodeNotDefined:
           # No unicode mapping for this cid: drop it silently.
           pass
   self.outfp.write(enc(''.join(pieces), self.codec))
   return
Example #5
0
 def write_text(self, text, item=None):
     """Strip control characters from `text` and append it to self.buffer.

     While self.ignoring() is true, nothing is written; the state is only
     stepped with a newline. Otherwise, when `item` is given and the
     cleaned text is not blank, the item's font is registered and any
     pending new-line / new-chunk handling runs before the text is
     written.
     """
     if self.ignoring():
         self.state.step(u'\n')
         return

     cleaned = self.CONTROL.sub(u'', text)
     visible_with_item = item and cleaned.strip()
     if visible_with_item:
         self.register_font(item)
         # Checked one after the other, in this order.
         if self.has_new_line:
             self.handle_new_line()
         if self.has_new_chunk:
             self.handle_new_chunk()

     if cleaned:
         self.state.step(unicode(cleaned))
     self.handle_style()
     encoded = enc(cleaned, 'utf-8')
     self.buffer.write(encoded)
     # Remember what was written last (possibly an empty string).
     self.last_char = cleaned
     return
Example #6
0
 def write_text(self, text, item=None):
     """Append `text`, minus control characters, to self.buffer.

     When self.ignoring() is true the call only steps the state with a
     newline and returns. Font registration and new-line / new-chunk
     handling happen only for a truthy `item` whose cleaned text contains
     something other than whitespace.
     """
     if self.ignoring():
         self.state.step(u'\n')
         return
     stripped = self.CONTROL.sub(u'', text)
     if item:
         if stripped.strip():
             self.register_font(item)
             if self.has_new_line:
                 self.handle_new_line()
             if self.has_new_chunk:
                 self.handle_new_chunk()
     # The state is only stepped for non-empty text; style handling and
     # the buffer write happen regardless.
     if stripped:
         self.state.step(unicode(stripped))
     self.handle_style()
     self.buffer.write(enc(stripped, 'utf-8'))
     self.last_char = stripped
     return
Example #7
0
 def write_header(self):
     """Write the XML declaration, <html>/<head> section and opening <body>.

     When self.document is set and its info sequence has a first entry,
     each key of that entry becomes either the <title> (keys containing
     'itle') or a <meta name=... content=...> element.
     """
     self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
     self.outfp.write('<html xmlns="http://www.w3.org/1999/xhtml">\n')
     self.outfp.write('<head>\n')
     self.outfp.write('<meta charset="%s"/>\n' % self.codec)
     if self.document is not None:
         info = self.document.info
         # A document without any info entry must not abort the header:
         # indexing an empty sequence would raise IndexError.
         contents = info[0] if info else None
         if contents is not None:
             for name in contents:
                 # Substring test: matches 'Title' as well as 'title'.
                 if 'itle' in name:
                     # NOTE(review): metadata values are written without
                     # `enc`; confirm they cannot contain markup.
                     self.outfp.write('<title>%s</title>\n' % contents[name])
                 else:
                     self.outfp.write('<meta name="%s" content="%s"/>\n' % (enc(name), contents[name]) )
     self.outfp.write('<meta name="Note" content="Converted with PDFminer.py for xhtml format"/>\n')
     self.outfp.write('<link rel="stylesheet" type="text/css" href="css/style.css"/>\n')
     self.outfp.write('</head>\n')
     self.outfp.write('<body>\n')
     return
Example #8
0
 def render(item):
   """Recursively serialise a layout item to self.outfp as XML-like markup.

   `self` comes from the enclosing scope. Container items (page, figure,
   text line, text box) write an opening tag, their children, and a
   closing tag. An item of an unhandled type trips the final assertion.
   """
   def render_children():
     for child in item:
       render(child)

   if isinstance(item, LTPage):
     self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                      (item.id, item.get_bbox(), item.rotate))
     render_children()
     self.outfp.write('</page>\n')
     return

   if isinstance(item, LTLine):
     line_attrs = (item.linewidth, item.direction, item.get_bbox())
     self.outfp.write('<line linewidth="%d" direction="%s" bbox="%s" />' % line_attrs)
     return

   if isinstance(item, LTRect):
     rect_attrs = (item.linewidth, item.get_bbox())
     self.outfp.write('<rect linewidth="%d" bbox="%s" />' % rect_attrs)
     return

   if isinstance(item, LTFigure):
     self.outfp.write('<figure id="%s">\n' % (item.id))
     render_children()
     self.outfp.write('</figure>\n')
     return

   if isinstance(item, LTTextLine):
     self.outfp.write('<textline bbox="%s">\n' % (item.get_bbox()))
     render_children()
     self.outfp.write('</textline>\n')
     return

   if isinstance(item, LTTextBox):
     self.outfp.write('<textbox id="%s" bbox="%s">\n' % (item.id, item.get_bbox()))
     render_children()
     self.outfp.write('</textbox>\n')
     return

   if isinstance(item, LTTextItem):
     self.outfp.write('<text font="%s" vertical="%s" bbox="%s" fontsize="%.3f">' %
                      (enc(item.font.fontname), item.is_vertical(),
                       item.get_bbox(), item.fontsize))
     # The text itself goes through self.write, not straight to outfp.
     self.write(item.text)
     self.outfp.write('</text>\n')
     return

   if isinstance(item, LTText):
     self.outfp.write('<text>%s</text>\n' % item.text)
     return

   assert 0, item
   return
Example #9
0
        def render(item):
            """Recursively collect layout items into the self.page dict.

            `self` comes from the enclosing scope. An LTPage starts a
            fresh self.page, renders its children and groups, and appends
            the finished page to self.doc. Every other item type appends
            one record to the matching list in self.page. An item of an
            unhandled type trips the final assertion.
            """
            def with_bbox(record):
                """Add the item's x0/y0/x1/y1 corners to record; return it."""
                box = item.bbox
                record['x0'] = box[0]
                record['y0'] = box[1]
                record['x1'] = box[2]
                record['y1'] = box[3]
                return record

            def render_children():
                for child in item:
                    render(child)

            if isinstance(item, LTPage):
                metainfo = with_bbox({
                    'pid': item.pageid,
                    'rotate': item.rotate,
                })
                kinds = ('text', 'line', 'rect', 'curve', 'figure',
                         'textline', 'textbox', 'textgroup', 'image')
                page = {'metainfo': metainfo}
                for kind in kinds:
                    page[kind] = []
                self.page = page

                render_children()

                if item.groups is not None:
                    for group in item.groups:
                        show_group(group)

                self.doc.append(self.page)
            elif isinstance(item, LTLine):
                self.page['line'].append(
                    with_bbox({'linewidth': item.linewidth}))
            elif isinstance(item, LTRect):
                self.page['rect'].append(
                    with_bbox({'linewidth': item.linewidth}))
            elif isinstance(item, LTCurve):
                self.page['curve'].append(with_bbox({
                    'linewidth': item.linewidth,
                    'pts': item.get_pts(),
                }))
            elif isinstance(item, LTFigure):
                self.page['figure'].append(with_bbox({'name': item.name}))
                render_children()
            elif isinstance(item, LTTextLine):
                self.page['textline'].append(with_bbox({}))
                render_children()
            elif isinstance(item, LTTextBox):
                if isinstance(item, LTTextBoxVertical):
                    wmode = 'vertical'
                else:
                    wmode = 'horizontal'
                self.page['textbox'].append(with_bbox({
                    'id': item.index,
                    'wmode': wmode,
                }))
                render_children()
            elif isinstance(item, LTChar):
                # bbox is (x0, y0, x1, y1):
                #   x0 - distance from the left of the page to the box's left edge
                #   y0 - distance from the bottom of the page to the box's lower edge
                #   x1 - distance from the left of the page to the box's right edge
                #   y1 - distance from the bottom of the page to the box's upper edge
                self.page['text'].append(with_bbox({
                    'text': item.get_text(),
                    'font': enc(item.fontname),
                    'size': item.size,
                    'colorspace': item.ncs.name,
                    'color': json.dumps(item.graphicstate.ncolor),
                }))
            elif isinstance(item, LTText):
                # Text-bearing items that are not LTChar (e.g. LTAnno)
                # carry no position, only their text.
                self.page['text'].append({'text': item.get_text()})
            elif isinstance(item, LTImage):
                if self.imagewriter is None:
                    image = {
                        'width': item.width,
                        'height': item.height
                    }
                else:
                    exported = self.imagewriter.export_image(item)
                    image = {
                        'src': enc(exported),
                        'width': item.width,
                        'height': item.height
                    }
                self.page['image'].append(image)
            else:
                assert False, str(('Unhandled', item))
            return
Example #10
0
        def render(item):
            """Recursively write `item` as XML-like markup and record it.

            `self` comes from the enclosing scope. Besides writing through
            self.write / self.write_text, pages are added to
            self.rpa_pdf_document, and figures and text boxes are added to
            self.current_page. An item of an unhandled type trips the
            final assertion.
            """
            if isinstance(item, LTPage):
                s = '<page id="%s" bbox="%s" rotate="%d">\n' % (
                    item.pageid,
                    bbox2str(item.bbox),
                    item.rotate,
                )
                self.current_page = RpaPdfPage(item.pageid, item.bbox,
                                               item.rotate)

                self.write(s)
                for child in item:
                    render(child)
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
                self.rpa_pdf_document.add_page(self.current_page)
            elif isinstance(item, LTLine):
                s = '<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTRect):
                s = '<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                )
                self.write(s)
            elif isinstance(item, LTCurve):
                s = '<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                )
                self.write(s)
            elif isinstance(item, LTFigure):
                s = '<figure name="%s" bbox="%s">\n' % (item.name,
                                                        bbox2str(item.bbox))
                self.write(s)
                # Keep this figure in a local and restore the enclosing
                # one afterwards. Rendering a nested figure used to leave
                # self.figure as None, so the outer loop then failed on
                # None.set_item and None was added to the page.
                figure = RpaFigure(item.name, item.bbox)
                enclosing = getattr(self, "figure", None)
                self.figure = figure
                for child in item:
                    figure.set_item(item)
                    render(child)
                self.write("</figure>\n")
                self.current_page.add_content(figure)
                self.figure = enclosing
            elif isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                for child in item:
                    render(child)
                self.write("</textline>\n")
            elif isinstance(item, LTTextBox):
                # wmode is a complete attribute (with leading space) for
                # vertical boxes and empty otherwise.
                wmode = ""

                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                s = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                box = RpaTextBox(item.index, item.bbox, wmode)
                self.write(s)
                box.set_item(item)
                self.current_page.add_content(box)
                for child in item:
                    render(child)
                self.write("</textbox>\n")
            elif isinstance(item, LTChar):
                s = ('<text font="%s" bbox="%s" colourspace="%s" '
                     'ncolour="%s" size="%.3f">' % (
                         enc(item.fontname),
                         bbox2str(item.bbox),
                         item.ncs.name,
                         item.graphicstate.ncolor,
                         item.size,
                     ))
                self.write(s)
                self.write_text(item.get_text())
                self.write("</text>\n")
            elif isinstance(item, LTText):
                self.write("<text>%s</text>\n" % item.get_text())
            elif isinstance(item, LTImage):
                # Attach the image to the figure being rendered, if any.
                if self.figure:
                    self.figure.set_item(item)
                if self.imagewriter is not None:
                    name = self.imagewriter.export_image(item)
                    self.write('<image src="%s" width="%d" height="%d" />\n' %
                               (enc(name), item.width, item.height))
                else:
                    self.write('<image width="%d" height="%d" />\n' %
                               (item.width, item.height))
            else:
                assert False, str(("Unhandled", item))
Example #11
0
 def encode_text(self):
     """Rewrite every textline's "text" in place.

     Each value is passed through `enc` and then through
     `subst_control_chars`.
     """
     for entry in self.taglists['textline']:
         encoded = enc(entry["text"])
         entry["text"] = subst_control_chars(encoded)
Example #12
0
 def write_text(self, text: str):
     """Write `text` through `enc`, first removing self.CONTROL matches
     when self.stripcontrol is set."""
     cleaned = self.CONTROL.sub("", text) if self.stripcontrol else text
     self.write(enc(cleaned))
Example #13
0
 def write_text(self, text):
     """Write `text` to self.outfp, passed through `enc` with self.codec.

     Matches of self.CONTROL are removed first when self.stripcontrol is
     set.
     """
     cleaned = self.CONTROL.sub(u'', text) if self.stripcontrol else text
     encoded = enc(cleaned, self.codec)
     self.outfp.write(encoded)
     return
Example #14
0
 def end_tag(self):
   """Write the closing markup for the currently open tag and clear it.

   Asserts that a tag is open (self.tag is truthy).
   """
   open_tag = self.tag
   assert open_tag
   closing = '</%s>' % enc(open_tag.name)
   self.outfp.write(closing)
   self.tag = None
   return
Example #15
0
 def write(self, text):
   """Write `text` to self.outfp after passing it through `enc` with self.codec."""
   encoded = enc(text, self.codec)
   self.outfp.write(encoded)
   return
Example #16
0
 def render(item):
     """Recursively mirror a layout item as PDFMinerNode objects.

     `self` comes from the enclosing scope. The node that is current on
     entry is pushed on self.__stack and acts as the parent of whatever
     node this call creates; it is restored as the current node before
     returning. Lines, rects, curves, figures, non-character text and
     images produce no node. An item of an unhandled type trips the
     assertion.
     """
     self.__stack.append(self.__node)
     parent = self.__node

     def attach(node):
         """Make node current, stamp the page number, link it under
         parent and set its bounding box from the item."""
         self.__node = node
         node.setPageId(self.__num)
         parent.add(node)
         self.__setBbox(node, bbox2str(normalize(item.bbox)))

     def render_children():
         for child in item:
             render(child)

     if isinstance(item, LTPage):
         self.__num += 1
         page = PDFMinerNode("page")
         self.__page = page
         page.setPageId(self.__num)
         self.__setBbox(page, bbox2str(normalize(item.bbox)))
         if self.__lib != None:
             self.__lib.addBbox(page.getPageId(), page.getBbox())
         self.__node = page
         render_children()
     elif isinstance(item, (LTLine, LTRect, LTCurve, LTFigure)):
         # Graphics and figures are not represented in the node tree.
         pass
     elif isinstance(item, LTTextLine):
         attach(PDFMinerNode("textline"))
         render_children()
     elif isinstance(item, LTTextBox):
         attach(PDFMinerNode("textbox", item.index))
         render_children()
     elif isinstance(item, LTChar):
         font = enc(item.fontname)
         size = item.size
         # Font instances are cached per (font name, size) combination.
         key = font + str(size)
         self.__font = self.__fontDict.get(key)
         if self.__font == None:
             self.__font = self.__fontDict.setdefault(
                 key,
                 self.__lib.findFont(self.__pdfminerpage,
                                     font).instantiate(size))
         leaf = PDFMinerNode("text")
         self.__node = leaf
         leaf.setPageId(self.__num)
         leaf.setLeaf()
         parent.add(leaf)
         self.__setBbox(leaf, bbox2str(normalize(item.bbox)))
         if self.__font != None:
             leaf.add(self.__font)
         leaf.add(item.get_text())
     elif isinstance(item, (LTText, LTImage)):
         # TODO: NOTE we ignore empty text (there were only spaces here).
         pass
     elif isinstance(item, LTTextGroup):
         attach(PDFMinerNode("textgroup"))
         render_children()
     else:
         assert 0, item
     self.__node = self.__stack.pop()
     return
 def write_text(self, text):
     """Write `text` through `enc` via self.write.

     Matches of self.CONTROL are removed first when self.stripcontrol is
     set.
     """
     cleaned = self.CONTROL.sub('', text) if self.stripcontrol else text
     self.write(enc(cleaned))
     return
Example #18
0
        def render(item):
            """Recursively write `item` as XML-like markup and record it.

            `self` comes from the enclosing scope. Pages are added to
            self.active_pdf_document, text boxes to self.current_page, and
            figures/images are registered through self._add_unique_figure.
            Unknown item types are logged as a warning, not raised.
            """
            def render_children():
                for child in item:
                    render(child)

            if isinstance(item, LTPage):
                self.current_page = Page(item.pageid, item.bbox, item.rotate)
                self.write(self.current_page.tag + "\n")
                render_children()
                if item.groups is not None:
                    self.write("<layout>\n")
                    for group in item.groups:
                        show_group(group)
                    self.write("</layout>\n")
                self.write("</page>\n")
                self.active_pdf_document.add_page(self.current_page)
                return

            if isinstance(item, LTLine):
                self.write('<line linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                ))
                return

            if isinstance(item, LTRect):
                self.write('<rect linewidth="%d" bbox="%s" />\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                ))
                return

            if isinstance(item, LTCurve):
                self.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' % (
                    item.linewidth,
                    bbox2str(item.bbox),
                    item.get_pts(),
                ))
                return

            if isinstance(item, LTFigure):
                # The Figure is built before its children are rendered and
                # registered only after the closing tag is written.
                figure = Figure(item)
                opening = '<figure name="%s" bbox="%s">\n' % (
                    item.name,
                    bbox2str(item.bbox),
                )
                self.write(opening)
                render_children()
                self.write("</figure>\n")
                self._add_unique_figure(figure)
                return

            if isinstance(item, LTTextLine):
                self.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                render_children()
                self.write("</textline>\n")
                return

            if isinstance(item, LTTextBox):
                # Vertical boxes get an extra, complete wmode attribute.
                if isinstance(item, LTTextBoxVertical):
                    wmode = ' wmode="vertical"'
                else:
                    wmode = ""
                opening = '<textbox id="%d" bbox="%s"%s>\n' % (
                    item.index,
                    bbox2str(item.bbox),
                    wmode,
                )
                box = TextBox(item.index, item=item, trim=self.trim)
                self.write(opening)
                self.current_page.add_content(box)
                render_children()
                self.write("</textbox>\n")
                return

            if isinstance(item, LTChar):
                opening = ('<text font="%s" bbox="%s" colourspace="%s" '
                           'ncolour="%s" size="%.3f">' % (
                               enc(item.fontname),
                               bbox2str(item.bbox),
                               item.ncs.name,
                               item.graphicstate.ncolor,
                               item.size,
                           ))
                self.write(opening)
                self.write_text(item.get_text())
                self.write("</text>\n")
                return

            if isinstance(item, LTText):
                self.write("<text>%s</text>\n" % item.get_text())
                return

            if isinstance(item, LTImage):
                figure = Figure(item)
                if self.imagewriter is None:
                    self.write('<image width="%d" height="%d" />\n' %
                               (item.width, item.height))
                else:
                    name = self.imagewriter.export_image(item)
                    self.write('<image src="%s" width="%d" height="%d" />\n' %
                               (enc(name), item.width, item.height))
                self._add_unique_figure(figure)
                return

            self._logger.warning("Unknown item: %r", item)