Ejemplo n.º 1
0
    def add_graphemes(self, it):
        """
        Adds a number of graphemes to the current scope (either line or segment).

        A line and/or segment has to be created beforehand. All glyphs are
        validated before any of them is modified or inserted, so a failing
        call leaves both the record and the supplied dictionaries untouched.

        Args:
            it (iterable): An iterable returning a dictionary which at least
                           contains a key 'grapheme' with the recognition
                           result. A bounding box has to be placed under the
                           key 'bbox'; a confidence value in the range 0-100
                           (int) is expected under 'confidence'. Additional
                           data (style etc.) will be retained for serializer
                           use. The dictionaries are stored as-is and get the
                           keys 'type' and, if a responsibility statement is
                           scoped, 'resp' added.

        Returns:
            A list containing the identifiers of the added graphemes in
            insertion order.

        Raises:
            NidabaRecordException if no line is scoped, if a confidence value
            is outside the range 0-100, or if a glyph has no 'grapheme' key.
        """
        if self.line_scope is None:
            raise NidabaRecordException('No element scoped.')
        target = self.lines[self.line_scope]['content']
        if self.segment_scope is not None:
            target = target[self.segment_scope]['content']

        # Consume the iterable once so that validation can finish before the
        # first glyph is touched; otherwise a bad glyph in the middle would
        # leave a partially extended record behind.
        glyphs = list(it)
        for glyph in glyphs:
            if 'confidence' in glyph and (glyph['confidence'] < 0
                                          or glyph['confidence'] > 100):
                raise NidabaRecordException(
                    'Glyph confidence {} outside valid '
                    'range'.format(glyph['confidence']))
            if 'grapheme' not in glyph:
                raise NidabaRecordException(
                    'Mandatory field missing when adding graphemes.')

        # Identifiers continue the numbering of the graphemes already present.
        gr_cnt = len(self.graphemes)
        ids = []
        for glyph in glyphs:
            gr_cnt += 1
            grapheme_id = u'grapheme_{}'.format(gr_cnt)
            glyph['type'] = 'grapheme'
            if self.resp_scope:
                glyph['resp'] = self.resp_scope
            target[grapheme_id] = glyph
            ids.append(grapheme_id)
        return ids
Ejemplo n.º 2
0
    def scope_line(self, id):
        """
        Makes an already existing line the current line scope.

        Args:
            id (unicode): Identifier of the line that becomes the current
                          scope.

        Raises:
            NidabaRecordException if the record contains no line with the
            given identifier; the current scope is left unchanged then.
        """
        if id in self.lines:
            self.line_scope = id
            return
        raise NidabaRecordException('Invalid line ID.')
Ejemplo n.º 3
0
    def scope_respstmt(self, id):
        """
        Makes an already existing responsibility statement the current one.

        Args:
            id (unicode): Identifier of the targeted responsibility
                          statement.

        Raises:
            NidabaRecordException if no responsibility statement with the
            given identifier exists; the current scope is left unchanged
            then.
        """
        if id in self.respstmt:
            self.resp_scope = id
            return
        raise NidabaRecordException('No such responsibility statement')
Ejemplo n.º 4
0
    def add_segment(self, dim, language=None, confidence=None, **kwargs):
        """
        Marks the beginning of a new topographical segment in the current
        scope. Most often this corresponds to a word recognized by an engine.

        The new segment is appended to the currently scoped line and becomes
        the current segment scope.

        Args:
            dim (tuple): A tuple containing the bounding box (x0, y0, x1, y1)
            language (unicode): Optional identifier of the segment language.
            confidence (int): Optional confidence value between 0 and 100
                              (both inclusive).
            kwargs (dict): Additional data used by the final serialization.

        Returns:
            A string containing the segment's identifier.

        Raises:
            NidabaRecordException if no line is scoped or the confidence is
            outside the range 0-100.
        """
        if not self.line_scope:
            raise NidabaRecordException('No line scoped.')
        # Compare against None instead of relying on truthiness: 0 is a
        # legitimate confidence and must neither skip validation nor be
        # dropped from the stored segment.
        has_confidence = confidence is not None
        if has_confidence and (confidence < 0 or confidence > 100):
            raise NidabaRecordException(
                'Segment confidence {} outside valid '
                'range'.format(confidence))

        segment_id = u'seg_{}'.format(len(self.segments) + 1)
        kwargs['type'] = 'segment'
        kwargs['bbox'] = dim
        if language:
            kwargs['language'] = language
        if has_confidence:
            kwargs['confidence'] = confidence
        if self.resp_scope:
            kwargs['resp'] = self.resp_scope

        # Graphemes added while this segment is scoped end up in here.
        kwargs['content'] = OrderedDict()
        self.lines[self.line_scope]['content'][segment_id] = kwargs
        self.segment_scope = segment_id
        return segment_id
Ejemplo n.º 5
0
    def scope_segment(self, id):
        """
        Scopes a segment (and by association its line).

        Only nodes of type 'segment' are accepted. Graphemes placed directly
        beneath a line live in the same mapping as segments but have no
        'content' of their own, so scoping one of them would break any
        following add_graphemes() call.

        Args:
            id (unicode): ID of the segment to scope.

        Raises:
            NidabaRecordException if no segment with the ID could be found.
            The current line and segment scopes are left unchanged then.
        """
        for line_id, line in self.lines.items():
            if id not in line['content']:
                continue
            if line['content'][id].get('type') != 'segment':
                continue
            self.line_scope = line_id
            self.segment_scope = id
            return
        raise NidabaRecordException('Invalid segment ID.')
Ejemplo n.º 6
0
    def write_hocr(self, fp):
        """
        Serializes the OCR record in hOCR format.

        Of the metadata only the title, the image source and the dimensions
        are written; of the responsibility statements only the name of the
        last one is kept (as 'ocr-system'). Alternatives EXCEPT grapheme
        alternatives are inserted using the INS-DEL syntax described in
        section 10 of the hOCR standard [0]. Grapheme coordinates and
        confidences are added as cuts/x_confs to the ocr_line element.

        [0] https://docs.google.com/document/d/1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0/preview

        Args:
            fp (file): File descriptor to write to.

        Raises:
            NidabaRecordException if a node beneath a line is neither of type
            'grapheme' nor of type 'segment'.
        """
        page = etree.Element('html', xmlns="http://www.w3.org/1999/xhtml")
        head = SubElement(page, 'head')
        if 'title' in self.meta:
            SubElement(head, 'title').text = self.meta['title']
        if self.respstmt:
            # NOTE(review): indexing the result of values() requires it to be
            # a list (Python 2 behaviour).
            SubElement(head, 'meta', name="ocr-system",
                       content=self.respstmt.values()[-1]['name'])
        # Advertise only the element classes that will actually be emitted.
        capa = "ocr_page"
        if self.lines:
            capa += ", ocr_line"
        if self.segments:
            capa += ", ocrx_word"
        SubElement(head, 'meta', name='ocr-capabilities', content=capa)

        body = SubElement(page, 'body')
        # The page properties are collected in a _micro_hocr object whose
        # string form becomes the title attribute of the page div.
        p_hocr = _micro_hocr()
        if 'dimensions' in self.meta:
            p_hocr.add('bbox', 0, 0, *self.meta['dimensions'])
        if self.img is not None:
            p_hocr.add('image', self.img)
        ocr_page = SubElement(body, 'div', title=str(p_hocr))
        ocr_page.set('class', 'ocr_page')

        def _wrap_alternatives(alternatives, ins, parent):
            # Appends <span class="alternatives"> to `parent`, puts the
            # element `ins` into an <ins class="alt"> child and adds one
            # <del class="alt"> per alternative reading. A confidence is
            # written as 'x_cost' (100 - confidence).
            span = SubElement(parent, 'span')
            span.set('class', 'alternatives')
            ins_el = SubElement(span, 'ins')
            ins_el.set('class', 'alt')
            ins_el.append(ins)
            for alt in alternatives['content']:
                corr = SubElement(span, 'del')
                corr.set('class', 'alt')
                corr.text = alt['alternative']
                if 'confidence' in alt:
                    corr.set('title', 'x_cost {}'.format(100 - alt['confidence']))

        for line_id, line in self.lines.iteritems():
            ocr_line = Element('span', id=line_id)
            ocr_line.set('class', 'ocr_line')
            ocr_line.text = u''
            l_hocr = _micro_hocr()
            # NOTE(review): the bounding box is passed as a single tuple here
            # but unpacked for the page above -- confirm _micro_hocr.add
            # handles both forms.
            l_hocr.add('bbox', line['bbox'])
            # Boxes/confidences of all graphemes of the line, whether they
            # sit directly beneath the line or inside a segment.
            gr_boxes = []
            gr_confidences = []
            # The line element is attached to the page before its content is
            # filled in; the title attribute is set last, once cuts and
            # x_confs are known.
            if 'alternatives' in line:
                _wrap_alternatives(line['alternatives'], ocr_line, ocr_page)
            else:
                ocr_page.append(ocr_line)
            SubElement(ocr_page, 'br')

            for seg_id, seg in line['content'].iteritems():
                if seg['type'] == 'grapheme':
                    if 'bbox' in seg:
                        gr_boxes.append(seg['bbox'])
                    if 'confidence' in seg:
                        gr_confidences.append(seg['confidence'])
                    # NOTE(review): graphemes directly beneath the line are
                    # always added to the line's leading text, i.e. they are
                    # serialized before all word spans even when they follow
                    # a segment in line['content'] -- confirm this is
                    # intended.
                    ocr_line.text += seg['grapheme']
                elif seg['type'] == 'segment':
                    ocrx_word = Element('span', id=seg_id)
                    ocrx_word.set('class', 'ocrx_word')
                    s_hocr = _micro_hocr()
                    if 'bbox' in seg:
                        s_hocr.add('bbox', seg['bbox'])
                    if 'confidence' in seg:
                        s_hocr.add('x_wconf', seg['confidence'])
                    ocrx_word.set('title', str(s_hocr))
                    ocrx_word.text = u''
                    if 'alternatives' in seg:
                        _wrap_alternatives(seg['alternatives'], ocrx_word, ocr_line)
                    else:
                        ocr_line.append(ocrx_word)
                    # The word text is the concatenation of its graphemes.
                    for g in seg['content'].itervalues():
                        if 'bbox' in g:
                            gr_boxes.append(g['bbox'])
                        if 'confidence' in g:
                            gr_confidences.append(g['confidence'])
                        ocrx_word.text += g['grapheme']
                else:
                    raise NidabaRecordException('Unknown nodes beneath line records')
            if gr_boxes:
                # _delta derives the cuts from the line box and the grapheme
                # boxes.
                l_hocr.add('cuts', *list(_delta(line['bbox'], gr_boxes)))
            if gr_confidences:
                l_hocr.add('x_confs', *gr_confidences)
            ocr_line.set('title', str(l_hocr))
        fp.write(etree.tostring(page, pretty_print=True,
                 xml_declaration=True, encoding='utf-8'))
        fp.flush()
Ejemplo n.º 7
0
    def write_abbyyxml(self, fp):
        """
        Writes the TEI document in a format reminiscent of Abbyy FineReader's
        XML output. Its basic format is:

        <document>
        <page>
        <text>
        <line l="0" r="111" t="6" b="89">
        <charParams l="0" r="78" t="6" b="89" charConfidence="76" wordStart="true">D</charParams>
        <charParams l="86" r="111" t="24" b="89" charConfidence="76" wordStart="false">e</charParams>
        </line>
        ....
        </text>
        </page>
        </document>

        Each segment, and each grapheme placed directly beneath a line, is
        wrapped in its own <formatting> element. Only graphemes inside a
        segment carry a wordStart attribute.

        Please note that alternative readings as produced for example by spell
        checking are dropped from the output. Responsibility statements,
        metadata, and source image information is likewise lost.

        Args:
            fp (file): File descriptor to write to.

        Raises:
            NidabaRecordException if a grapheme has no bounding box or a node
            beneath a line is neither a segment nor a grapheme.
        """
        def _add_char_params(parent, grapheme, word_start=None):
            # Appends one <charParams> element describing `grapheme` to
            # `parent`; wordStart is only written when word_start is given.
            if 'bbox' not in grapheme:
                raise NidabaRecordException('No bounding box for grapheme')
            el = SubElement(parent, 'charParams')
            if word_start is not None:
                el.set('wordStart', 'true' if word_start else 'false')
            el.text = grapheme['grapheme']
            bbox = grapheme['bbox']
            for idx, attr in enumerate(('l', 't', 'r', 'b')):
                el.set(attr, str(bbox[idx]))
            if 'confidence' in grapheme:
                el.set('charConfidence', str(grapheme['confidence']))

        page = Element('document',
                       xmlns='http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml',
                       version='1.0',
                       producer='nidaba')
        p = SubElement(page, 'page')
        p.set('width', str(self.dimensions[0]))
        p.set('height', str(self.dimensions[1]))
        p.set('resolution', '0')
        p.set('originalCoords', '1')
        b = SubElement(p, 'block', blockType='Text')
        text = SubElement(b, 'text')
        par = SubElement(text, 'par')
        for line in self.lines.values():
            lel = SubElement(par, 'line')
            # XXX: meaning of baseline is nowhere documented
            lel.set('baseline', '0')
            lel.set('l', str(line['bbox'][0]))
            lel.set('t', str(line['bbox'][1]))
            lel.set('r', str(line['bbox'][2]))
            lel.set('b', str(line['bbox'][3]))
            for seg in line['content'].values():
                if seg['type'] not in ('segment', 'grapheme'):
                    raise NidabaRecordException('Unknown nodes beneath line records')
                formatting = SubElement(lel, 'formatting')
                if 'language' in seg:
                    formatting.set('lang', seg['language'])
                if seg['type'] == 'segment':
                    # Only the first grapheme of a segment starts a word.
                    word_start = True
                    for g in seg['content'].values():
                        _add_char_params(formatting, g, word_start)
                        word_start = False
                else:
                    # A grapheme directly beneath the line: the node itself
                    # holds the glyph data (the previous code read the loop
                    # variable of the segment branch here, which is either
                    # undefined or left over from an earlier segment).
                    _add_char_params(formatting, seg)
        fp.write(etree.tostring(page, xml_declaration=True, encoding='utf-8'))
        fp.flush()
Ejemplo n.º 8
0
    def write_tei(self, fp):
        """
        Serializes the record to a TEI facsimile.

        The output consists of a teiHeader/fileDesc holding the metadata
        statements listed in self._tei_fields plus one respStmt per
        responsibility statement, and a sourceDoc/surface/zone holding one
        'line' element per line with its segments and graphemes as nested
        zones. Alternative readings are written as choice/sic/corr,
        confidences as certainty elements.

        Args:
            fp (File): Target file descriptor.

        Raises:
            NidabaRecordException if a node beneath a line is neither of type
            'segment' nor of type 'grapheme'.
        """
        doc = Element('TEI', nsmap={None: 'http://www.tei-c.org/ns/1.0'},
                      version='5.0')
        header = SubElement(doc, self.tei_ns + 'teiHeader')
        fileDesc = SubElement(header, self.tei_ns + 'fileDesc')

        sourceDoc = SubElement(doc, self.tei_ns + 'sourceDoc')
        # The surface spans the whole page when the dimensions are known and
        # carries no coordinates otherwise.
        kwargs = {}
        if self.dimensions:
            kwargs = {'ulx': '0', 'uly': '0',
                      'lrx': str(self.dimensions[0]),
                      'lry': str(self.dimensions[1])}
        surface = SubElement(sourceDoc, self.tei_ns + 'surface', **kwargs)
        if self.img:
            SubElement(surface, self.tei_ns + 'graphic', url=self.img)

        # All lines are placed in this single zone.
        surface_zone = SubElement(surface, self.tei_ns + 'zone')

        # Each field is indexed as: [0] key in self.meta, [1] element path,
        # [2] attribute name used when the metadata value is a list.
        for stmt, fields in self._tei_fields:
            # create *Stmt in correct order
            parent = Element(self.tei_ns + stmt)
            for field in fields:
                if field[0] in self.meta:
                    el = parent
                    # Build the nested elements of the path; the part before
                    # the first '/{' is skipped and the '{' removed by the
                    # split is restored for each remaining component.
                    for node in field[1].split('/{')[1:]:
                        el = SubElement(el, '{' + node)
                    value = self.meta[field[0]]
                    if isinstance(value, list):
                        # [text, attribute value] pairs
                        el.set(field[2], value[1])
                        value = value[0]
                    el.text = value
            # insert *Stmt only when needed
            if list(parent):
                fileDesc.append(parent)

        titleStmt = doc.find('{0}teiHeader/{0}fileDesc/{0}titleStmt'.format(self.tei_ns))

        # The responsibility statements need a titleStmt to live in, so one
        # is created as first child of fileDesc if the metadata produced
        # none.
        if titleStmt is None:
            titleStmt = Element(self.tei_ns + 'titleStmt')
            fileDesc.insert(0, titleStmt)

        for id, resp in self.respstmt.iteritems():
            r = SubElement(titleStmt, self.tei_ns + 'respStmt')
            r.set(self.xml_ns + 'id', id)
            SubElement(r, self.tei_ns + 'resp').text = resp['resp']
            SubElement(r, self.tei_ns + 'name').text = resp['name']

        def _set_confidence(el, up, dic):
            # Adds a <certainty> child to `el` when `dic` has a confidence
            # (written as a fraction with two decimals) and points it at
            # `el` if that element has an xml:id. A 'resp' entry of `up` is
            # referenced from both `el` and the certainty element.
            cert = None
            if 'confidence' in dic:
                cert = SubElement(el, self.tei_ns + 'certainty',
                                  degree=u'{0:.2f}'.format(dic['confidence'] / 100.0),
                                  locus='value')
                if el.get(self.xml_ns + 'id'):
                    cert.set('target', '#' + el.get(self.xml_ns + 'id'))
            if 'resp' in up:
                el.set('resp', '#' + up['resp'])
                if cert is not None:
                    cert.set('resp', '#' + up['resp'])

        def _wrap_choices(alternatives, sic, parent):
            # Appends a <choice> to `parent` that contains the element `sic`
            # inside <sic> followed by one <corr> per alternative reading.
            # The responsibility is taken from the alternatives record, the
            # confidence from the individual alternative.
            choice = SubElement(parent, self.tei_ns + 'choice')
            sic_el = SubElement(choice, self.tei_ns + 'sic')
            sic_el.append(sic)
            for alt in alternatives['content']:
                corr = SubElement(choice, self.tei_ns + 'corr')
                corr.text = alt['alternative']
                _set_confidence(corr, alternatives, alt)

        def _add_grapheme(grapheme_id, grapheme, parent):
            # Serializes a grapheme as <zone type="grapheme"> holding
            # <seg><g>text</g></seg>, attached to `parent` either directly or
            # wrapped in a <choice> when it has alternatives.
            g_el = Element(self.tei_ns + 'zone',
                           type='grapheme')
            g_el.set(self.xml_ns + 'id', grapheme_id)
            if 'bbox' in grapheme:
                g_el.set('ulx', str(grapheme['bbox'][0]))
                g_el.set('uly', str(grapheme['bbox'][1]))
                g_el.set('lrx', str(grapheme['bbox'][2]))
                g_el.set('lry', str(grapheme['bbox'][3]))

            if 'alternatives' in grapheme:
                _wrap_choices(grapheme['alternatives'], g_el, parent)
            else:
                parent.append(g_el)
            glyph = SubElement(SubElement(g_el, self.tei_ns + 'seg'), self.tei_ns + 'g')
            glyph.text = grapheme['grapheme']
            # The certainty element therefore follows the <seg> child.
            _set_confidence(g_el, grapheme, grapheme)

        for line_id, line in self.lines.iteritems():
            line_el = Element(self.tei_ns + 'line', ulx=str(line['bbox'][0]),
                              uly=str(line['bbox'][1]),
                              lrx=str(line['bbox'][2]),
                              lry=str(line['bbox'][3]))
            line_el.set(self.xml_ns + 'id', line_id)
            # The line's certainty element precedes its content.
            _set_confidence(line_el, line, line)
            if 'alternatives' in line:
                _wrap_choices(line['alternatives'], line_el, surface_zone)
            else:
                surface_zone.append(line_el)
            for seg_id, seg in line['content'].iteritems():
                if seg['type'] == 'segment':
                    # Unlike graphemes, segments are required to have a
                    # bounding box here.
                    seg_el = Element(self.tei_ns + 'zone',
                                     ulx=str(seg['bbox'][0]),
                                     uly=str(seg['bbox'][1]),
                                     lrx=str(seg['bbox'][2]),
                                     lry=str(seg['bbox'][3]),
                                     type=seg['type'])
                    seg_el.set(self.xml_ns + 'id', seg_id)
                    _set_confidence(seg_el, seg, seg)
                    for grapheme_id, grapheme in seg['content'].iteritems():
                        _add_grapheme(grapheme_id, grapheme, seg_el)
                    if 'alternatives' in seg:
                        _wrap_choices(seg['alternatives'], seg_el, line_el)
                    else:
                        line_el.append(seg_el)
                elif seg['type'] == 'grapheme':
                    # grapheme directly beneath the line
                    _add_grapheme(seg_id, seg, line_el)
                else:
                    raise NidabaRecordException('Unknown nodes beneath line records')
        fp.write(etree.tostring(doc, xml_declaration=True, encoding='utf-8', pretty_print=True))
        fp.flush()
Ejemplo n.º 9
0
    def load_tei(self, fp):
        """
        Reads in a TEI facsimile and populates the record.

        Metadata, responsibility statements, page dimensions and the image
        URL are read from their fixed locations. The first zone of the
        surface is then walked in document order and lines, segments and
        graphemes are re-created through add_line(), add_segment() and
        add_graphemes(), so they receive newly generated identifiers.

        Responsibility statements and the responsibility scope are reset
        before loading; lines are not reset in this method.

        Args:
            fp (File): Source file descriptor.

        Raises:
            NidabaRecordException if an unexpected element is found beneath
            the root zone, or if an element references a responsibility
            statement that scope_respstmt() rejects.
        """

        doc = etree.parse(fp)

        self.respstmt = OrderedDict()
        self.resp_scope = None

        # Each field is indexed as: [0] key in self.meta, [1] element path
        # below the statement, [2] (optional) attribute stored alongside the
        # text.
        for stmt, fields in self._tei_fields:
            stmt_el = doc.find('{0}teiHeader/{0}fileDesc/{0}{1}'.format(self.tei_ns, stmt))
            if stmt_el is None:
                continue
            for field in fields:
                f_el = stmt_el.find('./' + field[1])
                if f_el is not None:
                    if len(field) == 3 and f_el.get(field[2]):
                        # stored as [text, attribute value]
                        self.meta[field[0]] = [f_el.text, f_el.get(field[2])]
                    else:
                        self.meta[field[0]] = f_el.text

        # NOTE(review): a respStmt without resp or name child makes find()
        # return None and the .text access fail with an AttributeError.
        for resp in doc.iter(self.tei_ns + 'respStmt'):
            id = resp.get(self.xml_ns + 'id')
            r = resp.find('.//{}resp'.format(self.tei_ns)).text
            n = resp.find('.//{}name'.format(self.tei_ns)).text
            self.respstmt[id] = {'resp': r, 'name': n}

        # NOTE(review): `surface` is None for documents without
        # sourceDoc/surface, in which case the next line fails with an
        # AttributeError.
        surface = doc.find('{0}sourceDoc/{0}surface'.format(self.tei_ns))
        if surface.get('lrx') is not None and surface.get('lry') is not None:
            self.dimensions = (int(surface.get('lrx')), int(surface.get('lry')))
        graphic = surface.find('{}graphic'.format(self.tei_ns))
        if graphic is not None:
            self.img = graphic.get('url')

        root_zone = doc.find('{0}sourceDoc/{0}surface/{0}zone'.format(self.tei_ns))

        # State of the walk below:
        #   corr_flag -- the previous element was a <corr>
        #   alts      -- alternative readings collected since the last flush
        #   sic       -- identifier handed to add_choices() on a flush
        #   last_el   -- dictionary that receives the next <certainty> value
        corr_flag = False
        alts = []
        sic = None
        last_el = None

        def _get_dict_from_key(id):
            # Returns the record dictionary of the line, the node directly
            # beneath a line, or the grapheme inside a segment with the given
            # identifier; implicitly returns None when nothing matches.
            if id in self.lines:
                return self.lines[id]
            for line in self.lines.itervalues():
                if id in line['content']:
                    return line['content'][id]
                for seg in line['content'].itervalues():
                    if 'content' in seg and id in seg['content']:
                        return seg['content'][id]

        # islice(..., 1, None) drops the first element of the iteration.
        # NOTE(review): presumably that is root_zone itself -- confirm
        # against the iter() documentation.
        for el in islice(root_zone.iter(), 1, None):
            if el.tag != self.tei_ns + 'corr' and corr_flag:
                corr_flag = False
                # flush alternatives
                # NOTE(review): because the remaining branches are chained
                # with elif, the element that triggers the flush is not
                # processed any further in this iteration -- confirm this is
                # intended.
                self.add_choices(sic, alts)
                alts = []
            elif el.tag == self.tei_ns + 'sic':
                # The next line/segment/grapheme becomes the flush target.
                sic = None
            elif el.tag == self.tei_ns + 'corr':
                corr_flag = True
                # NOTE(review): a <corr> without text has el.text None, which
                # join() rejects with a TypeError.
                alts.append({'alternative': ''.join(el.text)})
                # A following <certainty> is stored with this alternative.
                last_el = alts[-1]
            elif el.tag == self.tei_ns + 'line':
                if el.get('resp') is not None:
                    # drop the first character of the reference (write_tei
                    # prefixes the identifier with '#')
                    self.scope_respstmt(el.get('resp')[1:])
                id = self.add_line((int(el.get('ulx')), int(el.get('uly')),
                                    int(el.get('lrx')), int(el.get('lry'))))
                last_el = _get_dict_from_key(id)
                # Takes the new identifier when `sic` is unset, otherwise
                # resets `sic` to None.
                sic = id if not sic else None
            elif el.tag == self.tei_ns + 'zone' and el.get('type') == 'segment':
                if el.get('resp') is not None:
                    self.scope_respstmt(el.get('resp')[1:])
                id = self.add_segment((int(el.get('ulx')), int(el.get('uly')),
                                       int(el.get('lrx')), int(el.get('lry'))))
                last_el = _get_dict_from_key(id)
                sic = id if not sic else None
            elif el.tag == self.tei_ns + 'zone' and el.get('type') == 'grapheme':
                # Graphemes are required to carry coordinates here although
                # write_tei only writes them when a 'bbox' is present.
                gr = {'bbox': (int(el.get('ulx')), int(el.get('uly')),
                               int(el.get('lrx')), int(el.get('lry'))),
                      'grapheme': el.findtext('./{0}seg/{0}g'.format(self.tei_ns))}
                id = self.add_graphemes([gr])[0]
                last_el = _get_dict_from_key(id)
                sic = id if not sic else None
            elif el.tag == self.tei_ns + 'certainty':
                # The degree is a fraction; confidences are kept on a 0-100
                # scale (as a float).
                last_el['confidence'] = float(el.get('degree')) * 100
            elif el.tag in [self.tei_ns + 'seg', self.tei_ns + 'g', self.tei_ns + 'choice']:
                # structural wrappers without information of their own
                pass
            else:
                raise NidabaRecordException('Unknown tag {} encountered'.format(el.tag))