Beispiel #1
0
def archive_pybossa(doc, method=u'archive_pybossa', name='', description=''):
    """
    Adds recognition result to a pybossa service for postcorrection.

    Args:
        doc [(unicode, unicode), ...]: The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The list of input storage tuples.
    """
    logger.debug('Creating pybossa project named {}'.format(name))
    proj = pbclient.create_project('{} ({})'.format(name, doc[0][0]), doc[0][0], description)
    logger.debug('Creating pybossa tasks for docs {}'.format(doc))
    for d in doc:
        data = tei.OCRRecord()
        with storage.StorageFile(*d, mode='rb') as fp:
            data.load_tei(fp)
            for line_id, line in data.lines.iteritems():
                text = u''
                for seg in line['content'].itervalues():
                    text += u''.join(x['grapheme'] for x in seg['content'].itervalues())
                pbclient.create_task(proj.id, {
                    'image': data.img,
                    'dimensions': data.dimensions,
                    'line_text': text.encode('utf-8'),
                    'bbox': [
                        line['bbox'][0],
                        line['bbox'][1],
                        line['bbox'][2],
                        line['bbox'][3]
                    ]
                })
    return doc
Beispiel #2
0
    def test_tei(self):
        """
        Test TEI de-/serialization.
        """
        fp = StringIO.StringIO()

        self.record.write_tei(fp)

        doc = etree.fromstring(fp.getvalue())
        
        # responsibility statements
        self.assertEqual(len(doc.findall('.//{}respStmt'.format(self.record.tei_ns))), 2)

        # number of lines, segments and graphemes
        lines = doc.findall('.//{}line'.format(self.record.tei_ns))
        segments = doc.findall('.//{}zone[@type="segment"]'.format(self.record.tei_ns))
        graphemes = doc.findall('.//{}zone[@type="grapheme"]'.format(self.record.tei_ns))

        self.assertEqual(len(lines), 11)
        self.assertEqual(len(lines[-2].findall('.//{}zone[@type="segment"]'.format(self.record.tei_ns))), 10)
        self.assertEqual(len(segments), 11)
        self.assertEqual(len(segments[-2].findall('.//{}zone[@type="grapheme"]'.format(self.record.tei_ns))),
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(graphemes), 1+len(list(itertools.permutations('ABCD', 2))))

        for x in lines[:-1]:
            self.assertIsNotNone(x.get('resp'))
        for x in segments[:-1]:
            self.assertIsNotNone(x.get('resp'))
        for x in graphemes[:-1]:
            self.assertIsNotNone(x.get('resp'))

        self.assertIsNone(lines[-1].get('resp'))
        self.assertIsNone(segments[-1].get('resp'))
        self.assertIsNone(graphemes[-1].get('resp'))

        # check that confidence values are preserverd correctly
        self.assertEqual(len(doc.findall('.//{}certainty[@degree="0.95"]'.format(self.record.tei_ns))), 61)

        # choices on lines, segments and graphemes
        choices = doc.findall('.//{}choice'.format(self.record.tei_ns))
        self.assertEqual(len(choices), 4)
        self.assertIsNotNone(choices[0].find('{0}sic/{0}line[@{1}id="line_3"]'.format(self.record.tei_ns, self.record.xml_ns)))
        self.assertIsNotNone(choices[1].find('{0}sic/{0}zone[@{1}id="seg_1"]'.format(self.record.tei_ns, self.record.xml_ns)))
        self.assertIsNotNone(choices[2].find('{0}sic/{0}zone[@{1}id="grapheme_1"]'.format(self.record.tei_ns, self.record.xml_ns)))
        self.assertIsNotNone(choices[3].find('{0}sic/{0}line[@{1}id="line_11"]'.format(self.record.tei_ns, self.record.xml_ns)))

        self.assertEqual(len(choices[0].findall('{0}corr'.format(self.record.tei_ns))), 
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(choices[1].findall('{0}corr'.format(self.record.tei_ns))), 
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(choices[2].findall('{0}corr'.format(self.record.tei_ns))), 
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(choices[3].findall('{0}corr'.format(self.record.tei_ns))), 
                         len(list(itertools.permutations('ABCD', 2))))

        # check that the loaded self.record is the same
        self.record2 = tei.OCRRecord()
        fp.seek(0)
        self.record2.load_tei(fp)
Beispiel #3
0
 def setUp(self):
     self.record = tei.OCRRecord()
     for x in self.record.fields:
         setattr(self.record, x, str(uuid.uuid4()))
     id_1 = self.record.add_respstmt('bar', 'foo')
     id = self.record.add_respstmt('foo', 'bar')
     self.record.dimensions = (100, 100)
     for x in range(0, 10):
         self.record.add_line((0, 0, 0, 0))
     for x in range(0, 10):
         self.record.add_segment((0, 0, 0, 0), language='foo', confidence=80)
     self.record.add_graphemes([{'bbox': (0, 0, 0, 0), 
                            'confidence': 95,
                            'grapheme': ''.join(x)} for x in itertools.permutations('ABCD', 2)])
     self.record.add_choices('line_3', [{'confidence': 95,
                                    'alternative': ''.join(x)} for x in itertools.permutations('ABCD', 2)])
     self.record.add_choices('seg_1', [{'confidence': 95,
                                    'alternative': ''.join(x)} for x in itertools.permutations('ABCD', 2)])
     self.record.add_choices('grapheme_1', [{'confidence': 95,
                                        'alternative': ''.join(x)} for x in itertools.permutations('ABCD', 2)])
     # elements without responsibility statements
     self.record.reset_respstmt_scope()
     self.record.add_line((0, 0, 0, 0))
     self.record.add_segment((0, 0, 0, 0), language='foo', confidence=80)
     self.record.add_graphemes([{'bbox': (0, 0, 0, 0), 
                            'confidence': 95,
                            'grapheme': 'AB'}])
     self.record.add_choices('line_11', [{'confidence': 95,
                                        'alternative': ''.join(x)} for x in itertools.permutations('ABCD', 2)])
Beispiel #4
0
def archive_pybossa(doc, method=u'archive_pybossa'):
    """
    Adds recognition result to a pybossa service for postcorrection.

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to all output files.

    Returns:
        The input storage tuple.
    """
    logger.debug('Creating pybossa task {} {}'.format(*doc))
    for d in doc:
        data = tei.OCRRecord()
        data.load_tei(d[1])
        for line_id, line in data.lines.iteritems():
            pbclient.create_task(
                project, {
                    'image':
                    data.img,
                    'dimensions':
                    data.dimensions,
                    'line_text':
                    line,
                    'bbox': [
                        str(line['bbox'][0]),
                        str(line['bbox'][1]),
                        str(line['bbox'][2]),
                        str(line['bbox'][3])
                    ]
                })
    return doc
Beispiel #5
0
    def test_meta(self):
        """
        Test metadata methods.
        """
        self.record = tei.OCRRecord()

        # test that the document field
        for x in self.record.fields:
            setattr(self.record, x, uuid.uuid4())
        for x in self.record.fields:
            self.assertIsNotNone(getattr(self.record, x))
Beispiel #6
0
    def test_hocr(self):
        """
        Test hOCR de-/serialization.
        """
        fp = StringIO.StringIO()
        self.record.write_hocr(fp)

        doc = etree.HTML(fp.getvalue())
        
        lines = doc.findall('.//span[@class="ocr_line"]')
        segments = doc.findall('.//span[@class="ocrx_word"]')

        self.assertEqual(len(lines), 11)
        self.assertEqual(len(lines[-2].findall('.//span[@class="ocrx_word"]')), 10)
        self.assertEqual(len(segments), 11)
        self.assertEqual(len(''.join(segments[-2].itertext())),
                         2*len(list(itertools.permutations('ABCD', 2))))

        # choices on lines, segments. check grapheme choices are discarded
        choices = doc.findall('.//span[@class="alternatives"]')
        self.assertEqual(len(choices), 3)

        self.assertEqual(len(choices[0].findall('del')),
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(choices[1].findall('del')),
                         len(list(itertools.permutations('ABCD', 2))))
        self.assertEqual(len(choices[2].findall('del')),
                         len(list(itertools.permutations('ABCD', 2))))

        # check that the loaded self.record is "same-ish". Multi-codepoint graphemes
        # unfortunately can't be encoded using hOCR.
        record2 = tei.OCRRecord()
        fp.seek(0)
        record2.load_hocr(fp)
        self.assertEqual(self.record.dimensions, record2.dimensions)
        self.assertEqual(self.record.img, record2.img)
        self.assertEqual(len(self.record.lines), len(record2.lines))
        self.assertEqual(len(self.record.segments), len(record2.segments))
Beispiel #7
0
    def test_respstmt(self):
        """
        Tests responsibility statement methods.
        """
        self.record = tei.OCRRecord()

        # test adding responsibility statements
        id_1 = self.record.add_respstmt('bar', 'foo')
        id = self.record.add_respstmt('foo', 'bar')
        self.assertEqual(2, len(self.record.respstmt))
        self.assertIn(id, self.record.respstmt)
        self.assertEqual(id, self.record.resp_scope)

        # test scoping invalid responsibility statements
        with self.assertRaises(NidabaRecordException):
            self.record.scope_respstmt('foo')

        # test scoping valid responsibility statements
        self.record.scope_respstmt(id_1)
        self.assertEqual(id_1, self.record.resp_scope)

        # test respstmt is added to elements
        for x in range(0, 10):
            self.record.add_line((0, 0, 0, 0))
        for x in range(0, 10):
            self.record.add_segment((0, 0, 0, 0), language='foo', confidence=80)
        self.record.add_graphemes([{'bbox': (0, 0, 0, 0), 
                               'confidence': 95,
                               'grapheme': x} for x in itertools.permutations('ABCD', 2)])
        self.record.add_choices('seg_1', [{'confidence': 95,
                                       'alternative': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(id_1, self.record.lines['line_3']['resp'])
        self.assertEqual(id_1, self.record.segments['seg_8']['resp'])
        self.assertEqual(id_1, self.record.graphemes['grapheme_5']['resp'])
     
        # test resetting responsibility scope
        self.record.reset_respstmt_scope()
        self.assertIsNone(self.record.resp_scope)
Beispiel #8
0
    def test_basic(self):
        """
        Tests basic functionality (adding lines, segments, graphemes,
        alternatives)
        """
        self.record = tei.OCRRecord()

        # test adding lines
        for x in range(0, 10):
            self.record.add_line((0, 0, 0, 0))
        self.assertEqual(len(self.record.lines), 10)

        # test scoping invalid line
        with self.assertRaises(NidabaRecordException):
            self.record.scope_line('foo')

        # test scoping valid line
        self.record.scope_line('line_1')
        self.assertEqual(self.record.line_scope, 'line_1')

        # test adding graphemes directly beneath lines
        self.record.scope_line('line_1')
        self.record.add_graphemes([{'bbox': (0, 0, 0, 0), 
                               'confidence': 95,
                               'grapheme': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(len(self.record.lines['line_1']['content']),
                         len(list(itertools.permutations('ABCD', 2))))

        # test adding segments
        self.record.scope_line('line_2')
        for x in range(0, 10):
            self.record.add_segment((0, 0, 0, 0), language='foo', confidence=80)
        self.assertEqual(len(self.record.lines), 10)

        # test scoping invalid segment
        with self.assertRaises(NidabaRecordException):
            self.record.scope_segment('foo')

        # test scoping valid segment inside current line
        self.record.scope_segment('seg_1')
        self.assertEqual(self.record.segment_scope, 'seg_1')
      
        # test segment scope also updates line scope
        self.record.scope_line('line_8')
        self.record.scope_segment('seg_1')
        self.assertEqual(self.record.line_scope, 'line_2')

        # test adding graphemes beneath segment
        self.record.add_graphemes([{'bbox': (0, 0, 0, 0), 
                               'confidence': 95,
                               'grapheme': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(len(self.record.segments[self.record.segment_scope]['content']),
                         len(list(itertools.permutations('ABCD', 2))))
        
        # test adding alternatives to lines
        self.record.add_choices('line_3', [{'confidence': 95,
                                       'alternative': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(len(self.record.lines['line_3']['alternatives']['content']),
                         len(list(itertools.permutations('ABCD', 2))))

        # test adding alternatives to segments
        self.record.add_choices('seg_1', [{'confidence': 95,
                                       'alternative': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(len(self.record.segments['seg_1']['alternatives']['content']),
                         len(list(itertools.permutations('ABCD', 2))))

        # test adding alternatives to graphemes
        self.record.add_choices('grapheme_1', [{'confidence': 95,
                                           'alternative': x} for x in itertools.permutations('ABCD', 2)])
        self.assertEqual(len(self.record.graphemes['grapheme_1']['alternatives']['content']),
                         len(list(itertools.permutations('ABCD', 2))))

        # test resetting scopes
        self.record.reset_line_scope()
        self.assertIsNone(self.record.line_scope)
        self.assertIsNone(self.record.segment_scope)

        self.record.scope_segment('seg_1')
        self.record.reset_segment_scope()
        self.assertIsNotNone(self.record.line_scope)
        self.assertIsNone(self.record.segment_scope)

        # test clearing segments
        self.record.clear_segments()
        self.assertEqual(len(self.record.segments), 0)

        # test clearing lines
        self.record.clear_lines()
        self.assertEqual(len(self.record.lines), 0)