def add_graphemes(self, it):
    """
    Appends graphemes to whatever is currently scoped.

    Graphemes end up in the scoped segment if there is one, otherwise
    directly beneath the scoped line. A line (and optionally a segment) has
    to be scoped before calling this method.

    Args:
        it (iterable): Yields dictionaries, one per grapheme. The key
                       'grapheme' (recognition result) is mandatory. A
                       bounding box is expected under 'bbox' and a
                       confidence in the range 0-100 under 'confidence';
                       only the confidence range is checked here. Any other
                       keys are stored untouched for the serializers. The
                       dictionaries are modified in place ('type' and, when
                       a responsibility statement is scoped, 'resp' are
                       set) and stored by reference.

    Returns:
        A list of the identifiers assigned to the graphemes, in input
        order.

    Raises:
        NidabaRecordException if no line is scoped, a confidence lies
        outside 0-100, or the 'grapheme' key is missing.
    """
    if self.line_scope is None:
        raise NidabaRecordException('No element scoped.')

    # Start at the line level and descend into the segment if one is scoped.
    container = self.lines[self.line_scope]['content']
    if self.segment_scope is not None:
        container = container[self.segment_scope]['content']

    # Identifiers continue the numbering of the graphemes already present.
    first_index = len(self.graphemes) + 1
    new_ids = []
    for index, glyph in enumerate(it, first_index):
        grapheme_id = u'grapheme_' + unicode(index)
        new_ids.append(grapheme_id)
        glyph['type'] = 'grapheme'
        if 'confidence' in glyph:
            confidence = glyph['confidence']
            if confidence < 0 or confidence > 100:
                raise NidabaRecordException(
                    'Glyph confidence {} outside valid '
                    'range'.format(confidence))
        if 'grapheme' not in glyph:
            raise NidabaRecordException(
                'Mandatory field missing when adding graphemes.')
        if self.resp_scope:
            glyph['resp'] = self.resp_scope
        container[grapheme_id] = glyph
    return new_ids
def scope_line(self, id):
    """
    Makes an existing line the current line scope.

    Only the line scope is changed; the segment scope is left as it is.

    Args:
        id (unicode): ID of the line to scope.

    Raises:
        NidabaRecordException if no line with the ID could be found.
    """
    if id in self.lines:
        self.line_scope = id
        return
    raise NidabaRecordException('Invalid line ID.')
def scope_respstmt(self, id):
    """
    Makes an existing responsibility statement the current one.

    Args:
        id (unicode): Identifier of the targeted responsibility statement.

    Raises:
        NidabaRecordException if the responsibility statement couldn't be
        found.
    """
    if id in self.respstmt:
        self.resp_scope = id
        return
    raise NidabaRecordException('No such responsibility statement')
def add_segment(self, dim, language=None, confidence=None, **kwargs):
    """
    Marks the beginning of a new topographical segment in the current
    scope. Most often this corresponds to a word recognized by an engine.

    The segment is appended to the currently scoped line and becomes the
    scoped segment, so subsequently added graphemes end up inside it.

    Args:
        dim (tuple): A tuple containing the bounding box (x0, y0, x1, y1)
        language (unicode): Optional identifier of the segment language.
        confidence (int): Optional confidence value between 0 and 100
                          (both inclusive). None means that no confidence
                          is recorded.
        kwargs (dict): Additional data used by the final serialization.
                       The keys 'type', 'bbox', and 'content' are always
                       overwritten by this method.

    Returns:
        A string containing the segment's identifier.

    Raises:
        NidabaRecordException if no line is scoped or the confidence is
        outside the valid range.
    """
    if not self.line_scope:
        raise NidabaRecordException('No line scoped.')

    seg_id = u'seg_' + unicode(len(self.segments) + 1)
    kwargs['type'] = 'segment'
    kwargs['bbox'] = dim
    if language:
        kwargs['language'] = language
    # Compare against None explicitly: 0 is a valid confidence and must not
    # be dropped just because it is falsy.
    if confidence is not None:
        if confidence < 0 or confidence > 100:
            raise NidabaRecordException(
                'Segment confidence {} outside valid '
                'range'.format(confidence))
        kwargs['confidence'] = confidence
    if self.resp_scope:
        kwargs['resp'] = self.resp_scope
    kwargs['content'] = OrderedDict()

    self.lines[self.line_scope]['content'][seg_id] = kwargs
    self.segment_scope = seg_id
    return seg_id
def scope_segment(self, id):
    """
    Scopes a segment (and by association its line).

    Only entries of type 'segment' are accepted. Graphemes placed directly
    beneath a line share the line's content mapping with segments, but
    they have no 'content' of their own and therefore cannot serve as a
    scope for further graphemes.

    Args:
        id (unicode): ID of the segment to scope.

    Raises:
        NidabaRecordException if no segment with the ID could be found.
    """
    for line_id, line in self.lines.iteritems():
        content = line['content']
        if id in content and content[id].get('type') == 'segment':
            self.line_scope = line_id
            self.segment_scope = id
            return
    raise NidabaRecordException('Invalid segment ID.')
def write_hocr(self, fp):
    """
    Serializes the OCR record in hOCR format.

    Metadata except image source and dimensions are lost, as are
    responsibility statements. Alternatives EXCEPT grapheme alternatives
    are inserted using the INS-DEL syntax described in section 10 of the
    hOCR standard [0]. Grapheme coordinates and confidences are added as
    cuts/x_confs to the ocr_line element.

    The document is laid out as html/head + html/body/div.ocr_page. Each
    line becomes a span.ocr_line followed by a br element, and each segment
    becomes a span.ocrx_word inside its line.

    [0] https://docs.google.com/document/d/1QQnIQtvdAC_8n92-LhwPcjtAUFwBlzE8EWnKAxlgVf0/preview

    Args:
        fp (file): File descriptor to write to.

    Raises:
        NidabaRecordException if a line contains an entry that is neither a
        segment nor a grapheme.
    """
    page = etree.Element('html', xmlns="http://www.w3.org/1999/xhtml")
    head = SubElement(page, 'head')
    if 'title' in self.meta:
        SubElement(head, 'title').text = self.meta['title']
    if self.respstmt:
        # The name of the last responsibility statement is advertised as
        # the OCR system. Indexing the result of values() relies on it
        # being a list (Python 2 dictionary semantics).
        SubElement(head, 'meta', name="ocr-system",
                   content=self.respstmt.values()[-1]['name'])
    # Only advertise the element classes that will actually be emitted.
    capa = "ocr_page"
    if self.lines:
        capa += ", ocr_line"
    if self.segments:
        capa += ", ocrx_word"
    SubElement(head, 'meta', name='ocr-capabilities', content=capa)
    body = SubElement(page, 'body')
    # _micro_hocr is a helper defined elsewhere; its string form is used as
    # the hOCR 'title' attribute value.
    p_hocr = _micro_hocr()
    if 'dimensions' in self.meta:
        p_hocr.add('bbox', 0, 0, *self.meta['dimensions'])
    if self.img is not None:
        p_hocr.add('image', self.img)
    ocr_page = SubElement(body, 'div', title=str(p_hocr))
    ocr_page.set('class', 'ocr_page')

    def _wrap_alternatives(alternatives, ins, parent):
        # Appends a span.alternatives to parent that holds the original
        # element (ins) inside an ins.alt element, followed by one del.alt
        # element per alternative reading.
        span = SubElement(parent, 'span')
        span.set('class', 'alternatives')
        ins_el = SubElement(span, 'ins')
        ins_el.set('class', 'alt')
        ins_el.append(ins)
        for alt in alternatives['content']:
            corr = SubElement(span, 'del')
            corr.set('class', 'alt')
            corr.text = alt['alternative']
            if 'confidence' in alt:
                # hOCR expresses a cost, hence the inverted confidence.
                corr.set('title', 'x_cost {}'.format(100 - alt['confidence']))

    for line_id, line in self.lines.iteritems():
        ocr_line = Element('span', id=line_id)
        ocr_line.set('class', 'ocr_line')
        ocr_line.text = u''
        l_hocr = _micro_hocr()
        l_hocr.add('bbox', line['bbox'])
        # Bounding boxes and confidences of all graphemes of the line, both
        # those directly beneath it and those inside its segments.
        gr_boxes = []
        gr_confidences = []
        # The line element is attached to the page before it is populated;
        # its children and title attribute are filled in below.
        if 'alternatives' in line:
            _wrap_alternatives(line['alternatives'], ocr_line, ocr_page)
        else:
            ocr_page.append(ocr_line)
        SubElement(ocr_page, 'br')
        for seg_id, seg in line['content'].iteritems():
            if seg['type'] == 'grapheme':
                # Grapheme directly beneath the line: its text is appended
                # to the text of the line element itself.
                # NOTE(review): in the ElementTree model .text precedes all
                # child elements, so such graphemes presumably end up
                # before any ocrx_word spans regardless of their position
                # in the record -- confirm this is acceptable.
                if 'bbox' in seg:
                    gr_boxes.append(seg['bbox'])
                if 'confidence' in seg:
                    gr_confidences.append(seg['confidence'])
                ocr_line.text += seg['grapheme']
            elif seg['type'] == 'segment':
                ocrx_word = Element('span', id=seg_id)
                ocrx_word.set('class', 'ocrx_word')
                s_hocr = _micro_hocr()
                if 'bbox' in seg:
                    s_hocr.add('bbox', seg['bbox'])
                if 'confidence' in seg:
                    s_hocr.add('x_wconf', seg['confidence'])
                ocrx_word.set('title', str(s_hocr))
                ocrx_word.text = u''
                if 'alternatives' in seg:
                    _wrap_alternatives(seg['alternatives'], ocrx_word, ocr_line)
                else:
                    ocr_line.append(ocrx_word)
                # The word text is the concatenation of its graphemes;
                # alternatives attached to graphemes are not serialized.
                for g in seg['content'].itervalues():
                    if 'bbox' in g:
                        gr_boxes.append(g['bbox'])
                    if 'confidence' in g:
                        gr_confidences.append(g['confidence'])
                    ocrx_word.text += g['grapheme']
            else:
                raise NidabaRecordException('Unknown nodes beneath line records')
        if gr_boxes:
            # _delta is defined elsewhere; presumably it converts the
            # grapheme boxes into hOCR cuts relative to the line box --
            # TODO confirm.
            l_hocr.add('cuts', *list(_delta(line['bbox'], gr_boxes)))
        if gr_confidences:
            l_hocr.add('x_confs', *gr_confidences)
        ocr_line.set('title', str(l_hocr))
    fp.write(etree.tostring(page, pretty_print=True,
                            xml_declaration=True, encoding='utf-8'))
    fp.flush()
def write_abbyyxml(self, fp):
    """
    Writes the record in a format reminiscent of Abbyy FineReader's XML
    output. Its basic format is:

    <document>
    <page>
    <block blockType="Text">
    <text>
    <par>
    <line baseline="0" l="0" t="6" r="111" b="89">
    <formatting>
    <charParams wordStart="true" l="0" t="6" r="78" b="89" charConfidence="76">D</charParams>
    <charParams wordStart="false" l="86" t="24" r="111" b="89" charConfidence="76">e</charParams>
    </formatting>
    </line>
    ....
    </par>
    </text>
    </block>
    </page>
    </document>

    Every segment becomes one formatting element containing its graphemes.
    A grapheme placed directly beneath a line gets a formatting element of
    its own and carries no wordStart attribute.

    Please note that alternative readings as produced for example by spell
    checking are dropped from the output. Responsibility statements,
    metadata, and source image information is likewise lost.

    Args:
        fp (file): File descriptor to write to.

    Raises:
        NidabaRecordException if a grapheme has no bounding box or a line
        contains an entry that is neither a segment nor a grapheme.
    """
    def _add_char(parent, grapheme, word_start=None):
        # Emits a single charParams element for a grapheme record. The
        # wordStart attribute is only written when word_start is given.
        if 'bbox' not in grapheme:
            raise NidabaRecordException('No bounding box for grapheme')
        el = SubElement(parent, 'charParams')
        if word_start is not None:
            el.set('wordStart', 'true' if word_start else 'false')
        el.text = grapheme['grapheme']
        for attr, value in zip(('l', 't', 'r', 'b'), grapheme['bbox']):
            el.set(attr, str(value))
        if 'confidence' in grapheme:
            el.set('charConfidence', str(grapheme['confidence']))

    page = Element('document',
                   xmlns='http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml',
                   version='1.0',
                   producer='nidaba')
    p = SubElement(page, 'page')
    # Page dimensions are optional in a record (the TEI serializer treats
    # them the same way); leave the attributes out when they are unknown
    # instead of failing on the subscript.
    if self.dimensions:
        p.set('width', str(self.dimensions[0]))
        p.set('height', str(self.dimensions[1]))
    p.set('resolution', '0')
    p.set('originalCoords', '1')
    b = SubElement(p, 'block', blockType='Text')
    text = SubElement(b, 'text')
    par = SubElement(text, 'par')

    for line in self.lines.itervalues():
        lel = SubElement(par, 'line')
        # XXX: meaning of baseline is nowhere documented
        lel.set('baseline', '0')
        lel.set('l', str(line['bbox'][0]))
        lel.set('t', str(line['bbox'][1]))
        lel.set('r', str(line['bbox'][2]))
        lel.set('b', str(line['bbox'][3]))
        for seg in line['content'].itervalues():
            if seg['type'] == 'segment':
                formatting = SubElement(lel, 'formatting')
                if 'language' in seg:
                    formatting.set('lang', seg['language'])
                # Only the first grapheme of a segment starts a word.
                for index, g in enumerate(seg['content'].itervalues()):
                    _add_char(formatting, g, word_start=(index == 0))
            elif seg['type'] == 'grapheme':
                # Here the entry itself is the grapheme record; it must be
                # serialized from 'seg', not from a grapheme left over from
                # a previously processed segment.
                formatting = SubElement(lel, 'formatting')
                if 'language' in seg:
                    formatting.set('lang', seg['language'])
                _add_char(formatting, seg)
            else:
                raise NidabaRecordException('Unknown nodes beneath line records')
    fp.write(etree.tostring(page, xml_declaration=True, encoding='utf-8'))
    fp.flush()
def write_tei(self, fp):
    """
    Serializes the record to a TEI facsimile.

    The output consists of a teiHeader/fileDesc built from the metadata and
    responsibility statements, and a sourceDoc/surface/zone holding one
    line element per line. Segments and graphemes are written as nested
    zone elements, alternatives as choice/sic/corr constructs, and
    confidences as certainty elements.

    Args:
        fp (File): Target file descriptor.

    Raises:
        NidabaRecordException if a line contains an entry that is neither a
        segment nor a grapheme.
    """
    doc = Element('TEI', nsmap={None: 'http://www.tei-c.org/ns/1.0'},
                  version='5.0')
    header = SubElement(doc, self.tei_ns + 'teiHeader')
    fileDesc = SubElement(header, self.tei_ns + 'fileDesc')
    sourceDoc = SubElement(doc, self.tei_ns + 'sourceDoc')
    # The surface only carries coordinates when page dimensions are known.
    kwargs = {}
    if self.dimensions:
        kwargs = {'ulx': '0',
                  'uly': '0',
                  'lrx': str(self.dimensions[0]),
                  'lry': str(self.dimensions[1])}
    surface = SubElement(sourceDoc, self.tei_ns + 'surface', **kwargs)
    if self.img:
        SubElement(surface, self.tei_ns + 'graphic', url=self.img)
    # All lines are collected beneath this single zone.
    surface_zone = SubElement(surface, self.tei_ns + 'zone')

    for stmt, fields in self._tei_fields:
        # create *Stmt in correct order
        parent = Element(self.tei_ns + stmt)
        for field in fields:
            # field[0] is the metadata key, field[1] a path of namespaced
            # tags relative to the *Stmt element, and field[2] (used only
            # for list values) the name of an attribute.
            if field[0] in self.meta:
                el = parent
                # Splitting on '/{' strips the opening brace of every tag,
                # so it is put back when the element is created.
                for node in field[1].split('/{')[1:]:
                    el = SubElement(el, '{' + node)
                value = self.meta[field[0]]
                # List values are [text, attribute value] pairs.
                if isinstance(value, list):
                    el.set(field[2], value[1])
                    value = value[0]
                el.text = value
        # insert *Stmt only when needed
        if list(parent):
            fileDesc.append(parent)

    # Responsibility statements live in the titleStmt, which is created as
    # the first child of fileDesc if the metadata did not produce one.
    titleStmt = doc.find('{0}teiHeader/{0}fileDesc/{0}titleStmt'.format(self.tei_ns))
    if titleStmt is None:
        titleStmt = Element(self.tei_ns + 'titleStmt')
        fileDesc.insert(0, titleStmt)
    for id, resp in self.respstmt.iteritems():
        r = SubElement(titleStmt, self.tei_ns + 'respStmt')
        r.set(self.xml_ns + 'id', id)
        SubElement(r, self.tei_ns + 'resp').text = resp['resp']
        SubElement(r, self.tei_ns + 'name').text = resp['name']

    def _set_confidence(el, up, dic):
        # Adds a certainty child to el when dic carries a confidence
        # (scaled from 0-100 to 0.00-1.00), and copies the responsibility
        # reference found in up onto el and that certainty element.
        cert = None
        if 'confidence' in dic:
            cert = SubElement(el, self.tei_ns + 'certainty',
                              degree=u'{0:.2f}'.format(dic['confidence'] / 100.0),
                              locus='value')
            if el.get(self.xml_ns + 'id'):
                cert.set('target', '#' + el.get(self.xml_ns + 'id'))
        if 'resp' in up:
            el.set('resp', '#' + up['resp'])
            if cert is not None:
                cert.set('resp', '#' + up['resp'])

    def _wrap_choices(alternatives, sic, parent):
        # Appends a choice element to parent that holds the original
        # element inside sic and one corr element per alternative reading.
        choice = SubElement(parent, self.tei_ns + 'choice')
        sic_el = SubElement(choice, self.tei_ns + 'sic')
        sic_el.append(sic)
        for alt in alternatives['content']:
            corr = SubElement(choice, self.tei_ns + 'corr')
            corr.text = alt['alternative']
            _set_confidence(corr, alternatives, alt)

    def _add_grapheme(grapheme_id, grapheme, parent):
        # Serializes one grapheme as zone[@type='grapheme']/seg/g and
        # attaches it to parent, wrapped in a choice if it has
        # alternatives.
        g_el = Element(self.tei_ns + 'zone', type='grapheme')
        g_el.set(self.xml_ns + 'id', grapheme_id)
        if 'bbox' in grapheme:
            g_el.set('ulx', str(grapheme['bbox'][0]))
            g_el.set('uly', str(grapheme['bbox'][1]))
            g_el.set('lrx', str(grapheme['bbox'][2]))
            g_el.set('lry', str(grapheme['bbox'][3]))
        if 'alternatives' in grapheme:
            _wrap_choices(grapheme['alternatives'], g_el, parent)
        else:
            parent.append(g_el)
        glyph = SubElement(SubElement(g_el, self.tei_ns + 'seg'),
                           self.tei_ns + 'g')
        glyph.text = grapheme['grapheme']
        # The certainty element is added after the seg element.
        _set_confidence(g_el, grapheme, grapheme)

    for line_id, line in self.lines.iteritems():
        line_el = Element(self.tei_ns + 'line',
                          ulx=str(line['bbox'][0]),
                          uly=str(line['bbox'][1]),
                          lrx=str(line['bbox'][2]),
                          lry=str(line['bbox'][3]))
        line_el.set(self.xml_ns + 'id', line_id)
        _set_confidence(line_el, line, line)
        if 'alternatives' in line:
            _wrap_choices(line['alternatives'], line_el, surface_zone)
        else:
            surface_zone.append(line_el)
        for seg_id, seg in line['content'].iteritems():
            if seg['type'] == 'segment':
                seg_el = Element(self.tei_ns + 'zone',
                                 ulx=str(seg['bbox'][0]),
                                 uly=str(seg['bbox'][1]),
                                 lrx=str(seg['bbox'][2]),
                                 lry=str(seg['bbox'][3]),
                                 type=seg['type'])
                seg_el.set(self.xml_ns + 'id', seg_id)
                _set_confidence(seg_el, seg, seg)
                for grapheme_id, grapheme in seg['content'].iteritems():
                    _add_grapheme(grapheme_id, grapheme, seg_el)
                # The segment is attached to the line only after its
                # graphemes have been added to it.
                if 'alternatives' in seg:
                    _wrap_choices(seg['alternatives'], seg_el, line_el)
                else:
                    line_el.append(seg_el)
            elif seg['type'] == 'grapheme':
                # Grapheme directly beneath the line.
                _add_grapheme(seg_id, seg, line_el)
            else:
                raise NidabaRecordException('Unknown nodes beneath line records')
    fp.write(etree.tostring(doc, xml_declaration=True, encoding='utf-8',
                            pretty_print=True))
    fp.flush()
def load_tei(self, fp):
    """
    Reads in a TEI facsimile and populates the record.

    Responsibility statements and the responsibility scope of the record
    are reset before loading. Metadata, page dimensions, and the image URL
    are read from the header and the surface element; lines, segments,
    graphemes, alternatives, and confidences are rebuilt from the elements
    beneath the first zone of the surface.

    Args:
        fp (File): Source file descriptor.

    Raises:
        NidabaRecordException if an unknown element is encountered beneath
        the root zone.
    """
    doc = etree.parse(fp)
    self.respstmt = OrderedDict()
    self.resp_scope = None

    # Metadata: look up every known field beneath its *Stmt element.
    for stmt, fields in self._tei_fields:
        stmt_el = doc.find('{0}teiHeader/{0}fileDesc/{0}{1}'.format(self.tei_ns, stmt))
        if stmt_el is None:
            continue
        for field in fields:
            f_el = stmt_el.find('./' + field[1])
            if f_el is not None:
                # Fields with a third entry may carry an attribute value
                # in addition to their text; both are kept as a list.
                if len(field) == 3 and f_el.get(field[2]):
                    self.meta[field[0]] = [f_el.text, f_el.get(field[2])]
                else:
                    self.meta[field[0]] = f_el.text

    for resp in doc.iter(self.tei_ns + 'respStmt'):
        id = resp.get(self.xml_ns + 'id')
        r = resp.find('.//{}resp'.format(self.tei_ns)).text
        n = resp.find('.//{}name'.format(self.tei_ns)).text
        self.respstmt[id] = {'resp': r, 'name': n}

    # NOTE(review): a document without sourceDoc/surface makes the next
    # statement fail on None -- presumably inputs are always facsimiles.
    surface = doc.find('{0}sourceDoc/{0}surface'.format(self.tei_ns))
    if surface.get('lrx') is not None and surface.get('lry') is not None:
        self.dimensions = (int(surface.get('lrx')), int(surface.get('lry')))
    graphic = surface.find('{}graphic'.format(self.tei_ns))
    if graphic is not None:
        self.img = graphic.get('url')

    root_zone = doc.find('{0}sourceDoc/{0}surface/{0}zone'.format(self.tei_ns))

    # State of the traversal below:
    #   corr_flag - the previously handled element was a corr
    #   alts      - alternatives collected since the last flush
    #   sic       - ID handed to add_choices together with alts
    #   last_el   - dictionary that receives the next certainty value
    corr_flag = False
    alts = []
    sic = None
    last_el = None

    def _get_dict_from_key(id):
        # Returns the record dictionary for a line, segment, or grapheme
        # ID, searching lines first, then line contents, then segment
        # contents. Falls through (returning None) if nothing matches.
        if id in self.lines:
            return self.lines[id]
        for line in self.lines.itervalues():
            if id in line['content']:
                return line['content'][id]
            for seg in line['content'].itervalues():
                if 'content' in seg and id in seg['content']:
                    return seg['content'][id]

    # Walk all descendants of the root zone in document order; the first
    # item produced by iter() is the root zone itself and is skipped.
    for el in islice(root_zone.iter(), 1, None):
        if el.tag != self.tei_ns + 'corr' and corr_flag:
            corr_flag = False
            # flush alternatives
            # NOTE(review): this is the first branch of the if/elif chain,
            # so the element that triggers the flush is not handled by any
            # of the branches below (this includes a certainty element
            # following a corr) -- confirm that this is intended.
            self.add_choices(sic, alts)
            alts = []
        elif el.tag == self.tei_ns + 'sic':
            sic = None
        elif el.tag == self.tei_ns + 'corr':
            corr_flag = True
            # NOTE(review): join() fails if the corr element has no text.
            alts.append({'alternative': ''.join(el.text)})
            last_el = alts[-1]
        elif el.tag == self.tei_ns + 'line':
            # The leading '#' of the resp reference is stripped.
            if el.get('resp') is not None:
                self.scope_respstmt(el.get('resp')[1:])
            id = self.add_line((int(el.get('ulx')),
                                int(el.get('uly')),
                                int(el.get('lrx')),
                                int(el.get('lry'))))
            last_el = _get_dict_from_key(id)
            # NOTE(review): this alternates between an ID and None for
            # consecutive elements (a set sic is reset to None by the next
            # one) -- presumably only the first element after a sic tag is
            # meant to be remembered; verify.
            sic = id if not sic else None
        elif el.tag == self.tei_ns + 'zone' and el.get('type') == 'segment':
            if el.get('resp') is not None:
                self.scope_respstmt(el.get('resp')[1:])
            id = self.add_segment((int(el.get('ulx')),
                                   int(el.get('uly')),
                                   int(el.get('lrx')),
                                   int(el.get('lry'))))
            last_el = _get_dict_from_key(id)
            sic = id if not sic else None
        elif el.tag == self.tei_ns + 'zone' and el.get('type') == 'grapheme':
            # Coordinates are required here; a grapheme zone without them
            # makes int() fail.
            gr = {'bbox': (int(el.get('ulx')),
                           int(el.get('uly')),
                           int(el.get('lrx')),
                           int(el.get('lry'))),
                  'grapheme': el.findtext('./{0}seg/{0}g'.format(self.tei_ns))}
            id = self.add_graphemes([gr])[0]
            last_el = _get_dict_from_key(id)
            sic = id if not sic else None
        elif el.tag == self.tei_ns + 'certainty':
            # Stored as a float on a 0-100 scale on the most recently
            # created record or alternative.
            last_el['confidence'] = float(el.get('degree')) * 100
        elif el.tag in [self.tei_ns + 'seg',
                        self.tei_ns + 'g',
                        self.tei_ns + 'choice']:
            # Structural elements; their content is picked up through the
            # surrounding zone and sic/corr elements.
            pass
        else:
            raise NidabaRecordException('Unknown tag {} encountered'.format(el.tag))
    # NOTE(review): no flush of still pending alternatives is visible after
    # the loop; if a corr is the last element beneath the root zone its
    # alternatives would be dropped -- confirm against the full file.